git.ipfire.org Git - thirdparty/vim.git/commitdiff
updated for version 7.3.1011 v7.3.1011
author: Bram Moolenaar <Bram@vim.org>
Fri, 24 May 2013 19:59:54 +0000 (21:59 +0200)
committer: Bram Moolenaar <Bram@vim.org>
Fri, 24 May 2013 19:59:54 +0000 (21:59 +0200)
Problem:    New regexp engine is inefficient with multi-byte characters.
Solution:   Handle a character at a time instead of a byte at a time.  Also
            make \Z partly work.

src/regexp_nfa.c
src/testdir/test95.in
src/testdir/test95.ok
src/version.c

index 4f2b9254010928f251dade0d6496740224677952..8a90248deeebd86c66d814178a6a78e1dab6d623 100644 (file)
@@ -46,9 +46,6 @@ enum
     NFA_NCLOSE,                            /* End of subexpr. marked with \%( ... \) */
     NFA_START_INVISIBLE,
     NFA_END_INVISIBLE,
-    NFA_MULTIBYTE,                 /* Next nodes in NFA are part of the same
-                                      multibyte char */
-    NFA_END_MULTIBYTE,             /* End of multibyte char in the NFA */
     NFA_COMPOSING,                 /* Next nodes in NFA are part of the
                                       composing multibyte char */
     NFA_END_COMPOSING,             /* End of a composing char in the NFA */
@@ -195,26 +192,6 @@ static long nfa_regexec_multi __ARGS((regmmatch_T *rmp, win_T *win, buf_T *buf,
                    *post_ptr++ = c;            \
                } while (0)
 
-#define EMIT_MBYTE(c)                                      \
-                       len = (*mb_char2bytes)(c, buf);     \
-                       EMIT(buf[0]);                       \
-                       for (i = 1; i < len; i++)           \
-                       {                                   \
-                           EMIT(buf[i]);                   \
-                           EMIT(NFA_CONCAT);               \
-                       }                                   \
-                       EMIT(NFA_MULTIBYTE);
-
-#define EMIT_COMPOSING_UTF(input)                          \
-                       len = utfc_ptr2len(input);          \
-                       EMIT(input[0]);                     \
-                       for (i = 1; i < len; i++)           \
-                       {                                   \
-                           EMIT(input[i]);                 \
-                           EMIT(NFA_CONCAT);               \
-                       }                                   \
-                       EMIT(NFA_COMPOSING);
-
 /*
  * Initialize internal variables before NFA compilation.
  * Return OK on success, FAIL otherwise.
@@ -611,8 +588,6 @@ nfa_regatom()
 #ifdef FEAT_MBYTE
     char_u     *old_regparse = regparse;
     int                clen;
-    int                len;
-    static char_u      buf[30];
     int                i;
 #endif
     int                extra = 0;
@@ -845,14 +820,7 @@ nfa_regatom()
                    return FAIL;
 
                    c = coll_get_char();
-#ifdef FEAT_MBYTE
-                   if ((*mb_char2len)(c) > 1)
-                   {
-                       EMIT_MBYTE(c);
-                   }
-                   else
-#endif
-                       EMIT(c);
+                   EMIT(c);
                    break;
 
                /* Catch \%^ and \%$ regardless of where they appear in the
@@ -1135,12 +1103,7 @@ collection:
                             * skip it. */
                            for (c = startc + 1; c <= endc; c++)
                            {
-                               if ((*mb_char2len)(c) > 1)
-                               {
-                                   EMIT_MBYTE(c);
-                               }
-                               else
-                                   EMIT(c);
+                               EMIT(c);
                                TRY_NEG();
                                EMIT_GLUE();
                            }
@@ -1187,14 +1150,7 @@ collection:
                        if (got_coll_char == TRUE && startc == 0)
                            EMIT(0x0a);
                        else
-#ifdef FEAT_MBYTE
-                           if ((*mb_char2len)(startc) > 1)
-                           {
-                               EMIT_MBYTE(startc);
-                           }
-                           else
-#endif
-                               EMIT(startc);
+                           EMIT(startc);
                        TRY_NEG();
                        EMIT_GLUE();
                    }
@@ -1242,30 +1198,30 @@ collection:
                int     plen;
 
 nfa_do_multibyte:
-               /* length of current char, with composing chars,
-                * from pointer */
-               plen = (*mb_ptr2len)(old_regparse);
-               if (enc_utf8 && clen != plen)
+               /* Length of current char with composing chars. */
+               if (enc_utf8 && clen != (plen = (*mb_ptr2len)(old_regparse)))
                {
-                   /* A composing character is always handled as a
-                    * separate atom, surrounded by NFA_COMPOSING and
-                    * NFA_END_COMPOSING. Note that right now we are
+                   /* A base character plus composing characters.
+                    * This requires creating a separate atom as if enclosing
+                    * the characters in (), where NFA_COMPOSING is the ( and
+                    * NFA_END_COMPOSING is the ). Note that right now we are
                     * building the postfix form, not the NFA itself;
                     * a composing char could be: a, b, c, NFA_COMPOSING
-                    * where 'a', 'b', 'c' are chars with codes > 256.
-                    */
-                   EMIT_COMPOSING_UTF(old_regparse);
+                    * where 'b' and 'c' are chars with codes > 256. */
+                   i = 0;
+                   for (;;)
+                   {
+                       EMIT(c);
+                       if (i > 0)
+                           EMIT(NFA_CONCAT);
+                       if (i += utf_char2len(c) >= plen)
+                           break;
+                       c = utf_ptr2char(old_regparse + i);
+                   }
+                   EMIT(NFA_COMPOSING);
                    regparse = old_regparse + plen;
                }
                else
-                   /* A multi-byte character is always handled as a
-                    * separate atom, surrounded by NFA_MULTIBYTE and
-                    * NFA_END_MULTIBYTE */
-                   if (plen > 1)
-                   {
-                       EMIT_MBYTE(c);
-                   }
-                   else
 #endif
                {
                    c = no_Magic(c);
@@ -1702,9 +1658,6 @@ nfa_set_code(c)
        case NFA_START_INVISIBLE:   STRCPY(code, "NFA_START_INVISIBLE"); break;
        case NFA_END_INVISIBLE:     STRCPY(code, "NFA_END_INVISIBLE"); break;
 
-       case NFA_MULTIBYTE:         STRCPY(code, "NFA_MULTIBYTE"); break;
-       case NFA_END_MULTIBYTE:     STRCPY(code, "NFA_END_MULTIBYTE"); break;
-
        case NFA_COMPOSING:         STRCPY(code, "NFA_COMPOSING"); break;
        case NFA_END_COMPOSING:     STRCPY(code, "NFA_END_COMPOSING"); break;
 
@@ -2194,7 +2147,7 @@ post2nfa(postfix, end, nfa_calc_size)
            }
            e1 = POP();
            e1.start->negated = TRUE;
-           if (e1.start->c == NFA_MULTIBYTE || e1.start->c == NFA_COMPOSING)
+           if (e1.start->c == NFA_COMPOSING)
                e1.start->out1->negated = TRUE;
            PUSH(e1);
            break;
@@ -2311,6 +2264,16 @@ post2nfa(postfix, end, nfa_calc_size)
            PUSH(frag(s, list1(&s1->out)));
            break;
 
+       case NFA_COMPOSING:     /* char with composing char */
+#if 0
+           /* TODO */
+           if (regflags & RF_ICOMBINE)
+           {
+               goto normalchar;
+           }
+#endif
+           /* FALLTHROUGH */
+
        case NFA_MOPEN + 0:     /* Submatch */
        case NFA_MOPEN + 1:
        case NFA_MOPEN + 2:
@@ -2322,8 +2285,6 @@ post2nfa(postfix, end, nfa_calc_size)
        case NFA_MOPEN + 8:
        case NFA_MOPEN + 9:
        case NFA_NOPEN:         /* \%( "Invisible Submatch" */
-       case NFA_MULTIBYTE:     /* mbyte char */
-       case NFA_COMPOSING:     /* composing char */
            if (nfa_calc_size == TRUE)
            {
                nstate += 2;
@@ -2336,9 +2297,6 @@ post2nfa(postfix, end, nfa_calc_size)
                case NFA_NOPEN:
                    mclose = NFA_NCLOSE;
                    break;
-               case NFA_MULTIBYTE:
-                   mclose = NFA_END_MULTIBYTE;
-                   break;
                case NFA_COMPOSING:
                    mclose = NFA_END_COMPOSING;
                    break;
@@ -2377,9 +2335,8 @@ post2nfa(postfix, end, nfa_calc_size)
                goto theend;
            patch(e.out, s1);
 
-           if (mopen == NFA_MULTIBYTE || mopen == NFA_COMPOSING)
-               /* MULTIBYTE->out1 = END_MULTIBYTE
-               * COMPOSING->out1 = END_COMPOSING */
+           if (mopen == NFA_COMPOSING)
+               /* COMPOSING->out1 = END_COMPOSING */
                patch(list1(&s->out1), s1);
 
            PUSH(frag(s, list1(&s1->out)));
@@ -2540,17 +2497,8 @@ addstate(l, state, m, off, lid, match)
        case NFA_COMPOSING:
            /* nfa_regmatch() will match all the bytes of this composing char. */
            break;
-
-       case NFA_MULTIBYTE:
-           /* nfa_regmatch() will match all the bytes of this multibyte char. */
-           break;
 #endif
 
-       case NFA_END_MULTIBYTE:
-           /* Successfully matched this mbyte char */
-           addstate(l, state->out, m, off, lid, match);
-           break;
-
        case NFA_NOPEN:
        case NFA_NCLOSE:
            addstate(l, state->out, m, off, lid, match);
@@ -2841,7 +2789,7 @@ nfa_regmatch(start, submatch, m)
     regsub_T           *submatch;
     regsub_T           *m;
 {
-    int                c = -1;
+    int                c;
     int                n;
     int                i = 0;
     int                result;
@@ -2859,7 +2807,6 @@ nfa_regmatch(start, submatch, m)
     List       *listtbl[2][2];
     List       *ll;
     int                listid = 1;
-    int                endnode;
     List       *thislist;
     List       *nextlist;
     List       *neglist;
@@ -3190,33 +3137,35 @@ nfa_regmatch(start, submatch, m)
                break;
            }
 
-           case NFA_MULTIBYTE:
+#ifdef FEAT_MBYTE
            case NFA_COMPOSING:
-               endnode = t->state->c + 1;
+           {
+               int mc = c;
+
                result = OK;
                sta = t->state->out;
-               len = 1;
-               while (sta->c != endnode && len <= n)
+               len = 0;
+               while (sta->c != NFA_END_COMPOSING && len < n)
                {
-                   if (reginput[len-1] != sta->c)
-                   {
-                       result = FAIL;
+                   if (len > 0)
+                       mc = mb_ptr2char(reginput + len);
+                   if (mc != sta->c)
                        break;
-                   }
-                   len++;
+                   len += mb_char2len(mc);
                    sta = sta->out;
                }
 
                /* if input char length doesn't match regexp char length */
-               if (len -1 < n || sta->c != endnode)
+               if (len < n || sta->c != NFA_END_COMPOSING)
                    result = FAIL;
-               end = t->state->out1;       /* NFA_END_MULTIBYTE or
-                                              NFA_END_COMPOSING */
+               end = t->state->out1;       /* NFA_END_COMPOSING */
                /* If \Z was present, then ignore composing characters */
-               if (ireg_icombine && endnode == NFA_END_COMPOSING)
+               if (ireg_icombine)
                    result = 1 ^ sta->negated;
                ADD_POS_NEG_STATE(end);
                break;
+           }
+#endif
 
            case NFA_NEWL:
                if (!reg_line_lbr && REG_MULTI
@@ -3425,6 +3374,14 @@ nfa_regmatch(start, submatch, m)
                if (!result)
                    result = ireg_ic == TRUE
                                && MB_TOLOWER(t->state->c) == MB_TOLOWER(c);
+#ifdef FEAT_MBYTE
+               /* If there is a composing character which is not being
+                * ignored there can be no match. Match with composing
+                * character uses NFA_COMPOSING above. */
+               if (result && enc_utf8 && !ireg_icombine
+                                                     && n != utf_char2len(c))
+                   result = FALSE;
+#endif
                ADD_POS_NEG_STATE(t->state);
                break;
            }
index e332b9708ba258e0b1f50b6529101f8d87a14044..3451cc511f66c4c23b6c58fafa71c3dc62990b9c 100644 (file)
@@ -35,6 +35,10 @@ STARTTEST
 :call add(tl, ['\f\+', '&*\9ffname ', 'fname'])
 :call add(tl, ['\%#=1\f\+', '&*\9ffname ', 'fname'])
 
+:"""" Test composing character matching
+:call add(tl, ['.ม', 'xม่x yมy', 'yม'])
+:call add(tl, ['.ม่', 'xม่x yมy', 'xม่'])
+
 :"""" Test \Z
 :call add(tl, ['ú\Z', 'x'])
 
index 23d228494938f45db48b9e735e190933b7e189b5..57c28d9175500bfd63c4138572a8fd4b6eccad17 100644 (file)
@@ -9,5 +9,7 @@ OK - \i\+
 OK - \%#=1\i\+
 OK - \f\+
 OK - \%#=1\f\+
+OK - .ม
+OK - .ม่
 OK - ú\Z
 OK - [^[=a=]]\+
index ffc138ec82a712e2dfee231426603a65d85ce8f1..1a60933663e9b49caec92e9529d34bb2b482f58a 100644 (file)
@@ -728,6 +728,8 @@ static char *(features[]) =
 
 static int included_patches[] =
 {   /* Add new patch number below this line */
+/**/
+    1011,
 /**/
     1010,
 /**/