git.ipfire.org Git - thirdparty/vim.git/commitdiff
patch 9.1.1276: inline word diff treats multibyte chars as word char v9.1.1276
author: Yee Cheng Chin <ychin.git@gmail.com>
Fri, 4 Apr 2025 17:16:21 +0000 (19:16 +0200)
committer: Christian Brabandt <cb@256bit.org>
Fri, 4 Apr 2025 17:16:21 +0000 (19:16 +0200)
Problem:  inline word diff treats multibyte chars as word char
          (after 9.1.1243)
Solution: treat all non-alphanumeric characters as non-word characters
          (Yee Cheng Chin)

Previously inline word diff simply used Vim's definition of keyword to
determine what is a word, which leads to multi-byte character classes
such as emojis and CJK (Chinese/Japanese/Korean) characters all
classifying as word characters, leading to entire sentences being
grouped as a single word which does not provide meaningful information
in a diff highlight.

Fix this by treating all non-alphanumeric characters (with class number
above 2) as non-word characters, as there is usually no benefit in using
word diff on them. These include CJK characters, emojis, and also
subscript/superscript numbers. Meanwhile, multi-byte characters like
Cyrillic and Greek letters will continue to be considered words.

Note that this is slightly inconsistent with how words are defined
elsewhere, as Vim usually considers any character with class >=2 to be
a "word".

related: #16881 (diff inline highlight)
closes: #17050

Signed-off-by: Yee Cheng Chin <ychin.git@gmail.com>
Signed-off-by: Christian Brabandt <cb@256bit.org>
runtime/doc/options.txt
src/diff.c
src/mbyte.c
src/testdir/dumps/Test_diff_inline_word_03.dump [new file with mode: 0644]
src/testdir/test_diffmode.vim
src/version.c

index 84deecae6fdca8e13e92948a66d992c02e0dab6e..e2206f0b46c5bd7a2ffe5d88e12cfe5970e2b21c 100644 (file)
@@ -1,4 +1,4 @@
-*options.txt*  For Vim version 9.1.  Last change: 2025 Mar 28
+*options.txt*  For Vim version 9.1.  Last change: 2025 Apr 04
 
 
                  VIM REFERENCE MANUAL    by Bram Moolenaar
@@ -2989,7 +2989,10 @@ A jump table for the options with a short description can be found at |Q_op|.
                                        difference.
                                word    Use internal diff to perform a
                                        |word|-wise diff and highlight the
-                                       difference.
+                                       difference.  Non-alphanumeric
+                                       multi-byte characters such as emoji
+                                       and CJK characters are considered
+                                       individual words.
 
                internal        Use the internal diff library.  This is
                                ignored when 'diffexpr' is set.  *E960*
index 3adcdb7dba67a493e325c561990780d183e5a97e..e694cf20cd49acf96aad8b4ed14761d5cb6b6908 100644 (file)
@@ -3309,10 +3309,17 @@ diff_find_change_inline_diff(
            char_u *s;
            for (s = curline; *s != NUL;)
            {
-               // Always use the first buffer's 'iskeyword' to have a consistent diff
                int new_in_keyword = FALSE;
                if (diff_flags & DIFF_INLINE_WORD)
-                   new_in_keyword = vim_iswordp_buf(s, curtab->tp_diffbuf[file1_idx]);
+               {
+                   // Always use the first buffer's 'iskeyword' to have a
+                   // consistent diff.
+                   // For multibyte chars, only treat alphanumeric chars
+                   // (class 2) as "word", as other classes such as emojis and
+                   // CJK ideographs do not usually benefit from word diff as
+                   // Vim doesn't have a good way to segment them.
+                   new_in_keyword = (mb_get_class_buf(s, curtab->tp_diffbuf[file1_idx]) == 2);
+               }
                if (in_keyword && !new_in_keyword)
                {
                    ga_append(curstr, NL);
index a38ab24f32613e9f52d1a56190e5061a7a154f46..cc8d628ed59b88192b771c3d3b5f28825360acfa 100644 (file)
@@ -828,8 +828,8 @@ remove_bom(char_u *s)
  * Get class of pointer:
  * 0 for blank or NUL
  * 1 for punctuation
- * 2 for an (ASCII) word character
- * >2 for other word characters
+ * 2 for an alphanumeric word character
+ * >2 for other word characters, including CJK and emoji
  */
     int
 mb_get_class(char_u *p)
diff --git a/src/testdir/dumps/Test_diff_inline_word_03.dump b/src/testdir/dumps/Test_diff_inline_word_03.dump
new file mode 100644 (file)
index 0000000..30efaed
--- /dev/null
@@ -0,0 +1,20 @@
+| +0#0000e05#a8a8a8255@1|🚀*0#0000000#ffd7ff255|⛵️*2&#ff404010|一*0&#ffd7ff255|二|三*2&#ff404010|ひ*0&#ffd7ff255|ら|が*0&#4040ff13|な*0&#ffd7ff255|Δ+2&#ff404010|έ|λ|τ|α| +0&#ffd7ff255|Δ+2&#ff404010|e|l|t|a| +0&#ffd7ff255|f|o@1|b|a||+1&#ffffff0| +0#0000e05#a8a8a8255@1|🚀*0#0000000#ffd7ff255|🛸*2&#ff404010|一*0&#ffd7ff255|二|四*2&#ff404010|ひ*0&#ffd7ff255|ら|な|δ+2&#ff404010|έ|λ|τ|α| +0&#ffd7ff255|δ+2&#ff404010|e|l|t|a| +0&#ffd7ff255|f|o@1|b|a|r| 
+|~+0#4040ff13#ffffff0| @35||+1#0000000&|~+0#4040ff13&| @35
+|~| @35||+1#0000000&|~+0#4040ff13&| @35
+|~| @35||+1#0000000&|~+0#4040ff13&| @35
+|~| @35||+1#0000000&|~+0#4040ff13&| @35
+|~| @35||+1#0000000&|~+0#4040ff13&| @35
+|~| @35||+1#0000000&|~+0#4040ff13&| @35
+|~| @35||+1#0000000&|~+0#4040ff13&| @35
+|~| @35||+1#0000000&|~+0#4040ff13&| @35
+|~| @35||+1#0000000&|~+0#4040ff13&| @35
+|~| @35||+1#0000000&|~+0#4040ff13&| @35
+|~| @35||+1#0000000&|~+0#4040ff13&| @35
+|~| @35||+1#0000000&|~+0#4040ff13&| @35
+|~| @35||+1#0000000&|~+0#4040ff13&| @35
+|~| @35||+1#0000000&|~+0#4040ff13&| @35
+|~| @35||+1#0000000&|~+0#4040ff13&| @35
+|~| @35||+1#0000000&|~+0#4040ff13&| @35
+|~| @35||+1#0000000&|~+0#4040ff13&| @35
+|X+3#0000000&|d|i|f|i|l|e|1| @10|1|,|1| @11|A|l@1| |X+1&&|d|i|f|i|l|e|2| @10|1|,|1| @11|A|l@1
+|:+0&&> @73
index 1b5e5c081969d38763191f71516db15efb62bd4b..d0c2f18e4ee8cfba9c032cb3dfe01540a7e85b7d 100644 (file)
@@ -2444,6 +2444,11 @@ func Test_diff_inline()
 
   call term_sendkeys(buf, ":windo set iskeyword&\<CR>:1wincmd w\<CR>")
 
+  " word diff: test handling of multi-byte characters. Only alphanumeric chars
+  " (e.g. Greek alphabet, but not CJK/emoji) count as words.
+  call WriteDiffFiles(buf, ["🚀⛵️一二三ひらがなΔέλτα Δelta foobar"], ["🚀🛸一二四ひらなδέλτα δelta foobar"])
+  call VerifyInternal(buf, "Test_diff_inline_word_03", " diffopt+=inline:word")
+
   " char diff: should slide highlight to whitespace boundary if possible for
   " better readability (by using forced indent-heuristics). A wrong result
   " would be if the highlight is "Bar, prefix". It should be "prefixBar, "
index d1ba7adf5ab14d2d22221458dfa11edfdd5e7f97..3e45e2f95f9b6af78725ca9ec81544e4d261b23c 100644 (file)
@@ -704,6 +704,8 @@ static char *(features[]) =
 
 static int included_patches[] =
 {   /* Add new patch number below this line */
+/**/
+    1276,
 /**/
     1275,
 /**/