-*version9.txt* For Vim version 9.1. Last change: 2024 Jul 28
+*version9.txt* For Vim version 9.1. Last change: 2024 Jul 30
VIM REFERENCE MANUAL by Bram Moolenaar
behaviour/inconsistency (see |d-special| and |cw|).
- allow to specify additional attributes in the completion menu (allows to
mark deprecated attributes from LSP server) |complete-items|
+- the regex engines correctly match multi-byte characters case-insensitively
+ (and apply proper case folding)
*added-9.2*
Added ~
* Returns zero if s1 and s2 are equal (ignoring case), the difference between
* two characters otherwise.
*/
+ int
+mb_strnicmp2(char_u *s1, char_u *s2, size_t n1, size_t n2)
+{
+    // "n1" is the length to compare in "s1", "n2" the length in "s2".
+    // When the encoding is not UTF-8, or both lengths agree, the
+    // single-length comparison is used with "n1".
+    if (!enc_utf8 || n1 == n2)
+        return mb_strnicmp(s1, s2, n1);
+
+    // UTF-8 with differing lengths: pass both lengths on.
+    return utf_strnicmp(s1, s2, n1, n2);
+}
+
int
mb_strnicmp(char_u *s1, char_u *s2, size_t nn)
{
int utf_tolower(int a);
int utf_isupper(int a);
int mb_strnicmp(char_u *s1, char_u *s2, size_t nn);
+int mb_strnicmp2(char_u *s1, char_u *s2, size_t n1, size_t n2);
void show_utf8(void);
int latin_head_off(char_u *base, char_u *p);
int dbcs_screen_head_off(char_u *base, char_u *p);
/*
* Compare two strings, ignore case if rex.reg_ic set.
* Return 0 if strings match, non-zero otherwise.
- * Correct the length "*n" when composing characters are ignored.
+ * Correct the length "*n" when composing characters are ignored,
+ * or for UTF-8 when two codepoints are considered equal because of
+ * case-folding but have a different byte length (e.g. 's' and 'ſ').
*/
static int
cstrncmp(char_u *s1, char_u *s2, int *n)
if (!rex.reg_ic)
result = STRNCMP(s1, s2, *n);
+ else if (enc_utf8)
+ {
+     // Case folding can make characters of different byte length compare
+     // equal (e.g. 's' and 'ſ'), so the part of "s2" to compare is found
+     // by counting characters instead of reusing the "*n" bytes of "s1".
+     char_u *p = s1;
+     size_t n2 = 0;
+     int n1 = *n;
+
+     // count the number of characters for byte-length of s1
+     while (n1 > 0 && *p != NUL)
+     {
+         // Use the length of the character at "p"; taking it from "s1"
+         // would reuse the first character's length on every round and
+         // miscount when the characters differ in width.
+         n1 -= mb_ptr2len(p);
+         MB_PTR_ADV(p);
+         n2++;
+     }
+     // count the number of bytes to advance the same number of chars for s2
+     p = s2;
+     while (n2-- > 0 && *p != NUL)
+         MB_PTR_ADV(p);
+
+     // from here on "n2" is the byte length of that part of s2
+     n2 = p - s2;
+
+     result = MB_STRNICMP2(s1, s2, *n, n2);
+     // On a match that covers fewer bytes in s2, report that length.
+     // NOTE(review): a longer s2 span leaves "*n" unchanged -- confirm
+     // that callers expect this.
+     if (result == 0 && (int)n2 < *n)
+         *n = n2;
+ }
else
result = MB_STRNICMP(s1, s2, *n);
cstrchr(char_u *s, int c)
{
char_u *p;
- int cc;
+ int cc, lc;
if (!rex.reg_ic || (!enc_utf8 && mb_char2len(c) > 1))
return vim_strchr(s, c);
// faster (esp. when using MS Visual C++!).
// For UTF-8 need to use folded case.
if (enc_utf8 && c > 0x80)
+ {
cc = utf_fold(c);
+ lc = cc;
+ }
else
- if (MB_ISUPPER(c))
- cc = MB_TOLOWER(c);
- else if (MB_ISLOWER(c))
- cc = MB_TOUPPER(c);
- else
- return vim_strchr(s, c);
+ if (MB_ISUPPER(c))
+ {
+ cc = MB_TOLOWER(c);
+ lc = cc;
+ }
+ else if (MB_ISLOWER(c))
+ {
+ cc = MB_TOUPPER(c);
+ lc = c;
+ }
+ else
+ return vim_strchr(s, c);
if (has_mbyte)
{
for (p = s; *p != NUL; p += (*mb_ptr2len)(p))
{
- if (enc_utf8 && c > 0x80)
+ int uc = utf_ptr2char(p);
+ if (enc_utf8 && (c > 0x80 || uc > 0x80))
{
- int uc = utf_ptr2char(p);
-
// Do not match an illegal byte. E.g. 0xff matches 0xc3 0xbf,
// not 0xff.
- if ((uc < 0x80 || uc != *p) && utf_fold(uc) == cc)
+ // compare with lower case of the character
+ if ((uc < 0x80 || uc != *p) && utf_fold(uc) == lc)
return p;
}
else if (*p == c || *p == cc)
}
}
}
+ else if (enc_utf8)
+ {
+ if (cstrncmp(opnd, rex.input, &len) != 0)
+ {
+ status = RA_NOMATCH;
+ break;
+ }
+ }
else
for (i = 0; i < len; ++i)
if (opnd[i] != rex.input[i])
for (;;)
{
match = TRUE;
- len2 = MB_CHAR2LEN(regstart); // skip regstart
+ // skip regstart
+ len2 = MB_CHAR2LEN(regstart);
+ if (enc_utf8 && len2 > 1 && MB_CHAR2LEN(PTR2CHAR(rex.line + col)) != len2)
+ // because of case-folding of the previously matched text, we may need
+ // to skip fewer bytes than mb_char2len(regstart)
+ len2 = mb_char2len(utf_fold(regstart));
for (len1 = 0; match_text[len1] != NUL; len1 += MB_CHAR2LEN(c1))
{
c1 = PTR2CHAR(match_text + len1);
// If match_text is set it contains the full text that must match.
// Nothing else to try. Doesn't handle combining chars well.
- if (prog->match_text != NULL && !rex.reg_icombine)
+ if (prog->match_text != NULL && *prog->match_text != NUL && !rex.reg_icombine)
{
retval = find_match_text(&col, prog->regstart, prog->match_text);
if (REG_MULTI)
bw!
endfunc
+func Test_search_multibyte_match_ascii()
+  " A case-insensitive search for U+017F 'ſ' must also match ASCII 's', while
+  " a case-sensitive search must match 'ſ' only.  Every pattern is checked
+  " with each value of 'regexpengine' (0, 1 and 2).
+  new
+  " Match single 'ſ' and 's'
+  call setline(1, 'das abc heraus abc ſich abc ſind')
+  for i in range(0, 2)
+    exe "set re="..i
+    let ic_match = matchbufline('%', '\c\%u17f', 1, '$')->mapnew({idx, val -> val.text})
+    let noic_match = matchbufline('%', '\C\%u17f', 1, '$')->mapnew({idx, val -> val.text})
+    call assert_equal(['s', 's', 'ſ','ſ'], ic_match, "Ignorecase Regex-engine: " .. &re)
+    call assert_equal(['ſ','ſ'], noic_match, "No-Ignorecase Regex-engine: " .. &re)
+  endfor
+  " Match several 'ſſ' and 'ss'
+  call setline(1, 'das abc herauss abc ſſich abc ſind')
+  for i in range(0, 2)
+    exe "set re="..i
+    " two literal characters, a multi, and a collection with a multi
+    let ic_match = matchbufline('%', '\c\%u17f\%u17f', 1, '$')->mapnew({idx, val -> val.text})
+    let noic_match = matchbufline('%', '\C\%u17f\%u17f', 1, '$')->mapnew({idx, val -> val.text})
+    let ic_match2 = matchbufline('%', '\c\%u17f\+', 1, '$')->mapnew({idx, val -> val.text})
+    let noic_match2 = matchbufline('%', '\C\%u17f\+', 1, '$')->mapnew({idx, val -> val.text})
+    let ic_match3 = matchbufline('%', '\c[\u17f]\+', 1, '$')->mapnew({idx, val -> val.text})
+    let noic_match3 = matchbufline('%', '\C[\u17f]\+', 1, '$')->mapnew({idx, val -> val.text})
+
+    call assert_equal(['ss', 'ſſ'], ic_match, "Ignorecase Regex-engine: " .. &re)
+    call assert_equal(['ſſ'], noic_match, "No-Ignorecase Regex-engine: " .. &re)
+    call assert_equal(['s', 'ss', 'ſſ', 'ſ'], ic_match2, "Ignorecase Regex-engine: " .. &re)
+    call assert_equal(['ſſ','ſ'], noic_match2, "No-Ignorecase Regex-engine: " .. &re)
+    call assert_equal(['s', 'ss', 'ſſ', 'ſ'], ic_match3, "Ignorecase Collection Regex-engine: " .. &re)
+    call assert_equal(['ſſ','ſ'], noic_match3, "No-Ignorecase Collection Regex-engine: " .. &re)
+  endfor
+  " The loops leave 'regexpengine' at 2; put the default back so that the
+  " setting does not leak into other tests.
+  set regexpengine&
+  bw!
+endfunc
+
" vim: shiftwidth=2 sts=2 expandtab
static int included_patches[] =
{ /* Add new patch number below this line */
+/**/
+ 645,
/**/
644,
/**/
# define MB_STRICMP(d, s) mb_strnicmp((char_u *)(d), (char_u *)(s), (int)MAXCOL)
# define MB_STRNICMP(d, s, n) mb_strnicmp((char_u *)(d), (char_u *)(s), (int)(n))
+// Like MB_STRNICMP(), but with a separate length for each string: "n1" for
+// "d" and "n2" for "s".  Both strings are cast to "char_u *".
+# define MB_STRNICMP2(d, s, n1, n2) mb_strnicmp2((char_u *)(d), (char_u *)(s), (n1), (n2))
#define STRCAT(d, s) strcat((char *)(d), (char *)(s))
#define STRNCAT(d, s, n) strncat((char *)(d), (char *)(s), (size_t)(n))