From: Hirohito Higashi Date: Sat, 13 Jun 2026 18:41:28 +0000 (+0000) Subject: patch 9.2.0635: checking the syntax contains/cluster list is slow X-Git-Tag: v9.2.0635^0 X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=48fbae4378ab9bb6e7fcbd2480544847eac10cd7;p=thirdparty%2Fvim.git patch 9.2.0635: checking the syntax contains/cluster list is slow Problem: Deciding whether a group is in a "contains"/cluster list scans the list and expands clusters on every check, which is slow for syntaxes with large lists (e.g. plugins such as netrw). Solution: Resolve each list once into a sorted, cluster-expanded set of group IDs and use a binary search; cache it per syntax block and drop the cache when syntax definitions change (Hirohito Higashi). closes: #20490 Co-Authored-By: Claude Opus 4.7 (1M context) Signed-off-by: Hirohito Higashi Signed-off-by: Christian Brabandt --- diff --git a/runtime/doc/testing.txt b/runtime/doc/testing.txt index 24e3a4aa25..8e8831b422 100644 --- a/runtime/doc/testing.txt +++ b/runtime/doc/testing.txt @@ -409,6 +409,8 @@ test_override({name}, {val}) *test_override()* redraw disable the redrawing() function. redraw_flag ignore the RedrawingDisabled flag. starting reset the "starting" variable, see below. + syn_idlist_cache disable the cache for resolving + syntax "contains"/cluster lists. term_props reset all terminal properties when the version string is detected. ui_delay time in msec to use in ui_delay(); overrules a diff --git a/src/globals.h b/src/globals.h index 96d6785453..8550725235 100644 --- a/src/globals.h +++ b/src/globals.h @@ -2007,6 +2007,7 @@ EXTERN int disable_char_avail_for_testing INIT(= FALSE); EXTERN int disable_redraw_for_testing INIT(= FALSE); EXTERN int ignore_redraw_flag_for_testing INIT(= FALSE); EXTERN int nfa_fail_for_testing INIT(= FALSE); +EXTERN int disable_syn_idlist_cache_for_testing INIT(= FALSE); EXTERN int no_query_mouse_for_testing INIT(= FALSE); EXTERN int ui_delay_for_testing INIT(= 0); EXTERN int reset_term_props_on_termresponse INIT(= FALSE); diff --git a/src/structs.h b/src/structs.h index c831e3341f..92d4441bea 100644 --- a/src/structs.h +++ b/src/structs.h @@ -3193,6 +3193,9 @@ typedef struct { int b_sst_freecount; linenr_T b_sst_check_lnum; short_u b_sst_lasttick; // last display tick + + // Cache for in_id_list(); see idl_cache_T in syntax.c. + void *b_idlist_cache; #endif // FEAT_SYN_HL #ifdef FEAT_SPELL diff --git a/src/syntax.c b/src/syntax.c index c845a1051f..4333c31839 100644 --- a/src/syntax.c +++ b/src/syntax.c @@ -345,6 +345,7 @@ static void init_syn_patterns(void); static char_u *get_syn_pattern(char_u *arg, synpat_T *ci); static int get_id_list(char_u **arg, int keylen, short **list, int skip); static void syn_combine_list(short **clstr1, short **clstr2, int list_op); +static void idl_cache_clear(synblock_T *block); /* * Start the syntax recognition for a line. This function is normally called @@ -1027,6 +1028,9 @@ syn_stack_free_block(synblock_T *block) { synstate_T *p; + // Syntax definitions may have changed: drop the in_id_list() cache. + idl_cache_clear(block); + if (block->b_sst_array == NULL) return; @@ -6110,6 +6114,194 @@ copy_id_list(short *list) return retval; } +/* + * Cache for in_id_list() {{{ + * + * A "contains"/cluster list is otherwise scanned with recursive cluster + * expansion on every check. Each distinct list is resolved once into a + * cluster-expanded, sorted set of group IDs so membership becomes a binary + * search. The cache lives in the synblock and is cleared by + * syn_stack_free_block() when syntax definitions change (which also covers + * list pointer reuse). + */ +#define IDL_CACHE_SIZE 16 + +typedef struct +{ + short *idl_key; // the list this entry resolves (NULL when empty) + short *idl_ids; // sorted, cluster-expanded group IDs (allocated) + int idl_count; // number of IDs in idl_ids + short idl_marker; // leading ALLBUT/TOP/CONTAINED item, or 0 + bool idl_usable; // false: not cacheable, use the slow path +} idl_entry_T; + +typedef struct +{ + idl_entry_T ent[IDL_CACHE_SIZE]; + int round; // round-robin replacement index + int last; // index of the last returned entry (fast path) +} idl_cache_T; + +/* + * Collect the explicit group IDs of "list" into "ga", expanding clusters. + * "seen" records visited clusters to break cycles and avoid duplicate work. + * Return false when the list cannot be represented as a plain set (a nested + * ALLBUT/TOP/CONTAINED marker) or on out of memory. + */ + static bool +idl_collect(short *list, garray_T *ga, garray_T *seen) +{ + if (list == NULL || list == ID_LIST_ALL) + return false; + for (short item = *list; item != 0; item = *++list) + { + if (item >= SYNID_ALLBUT && item < SYNID_CLUSTER) + return false; // nested ALLBUT/TOP/CONTAINED: not cacheable + if (item >= SYNID_CLUSTER) + { + short *scl_list; + bool seen_it = false; + + for (int i = 0; i < seen->ga_len; ++i) + if (((short *)seen->ga_data)[i] == item) + { + seen_it = true; + break; + } + if (seen_it) + continue; + if (ga_grow(seen, 1) == FAIL) + return false; + ((short *)seen->ga_data)[seen->ga_len++] = item; + // Record the cluster id itself, not just its members: the slow + // path matches "item == id" for a cluster id present in the list. + if (ga_grow(ga, 1) == FAIL) + return false; + ((short *)ga->ga_data)[ga->ga_len++] = item; + scl_list = SYN_CLSTR(syn_block)[item - SYNID_CLUSTER].scl_list; + if (scl_list != NULL && !idl_collect(scl_list, ga, seen)) + return false; + } + else + { + if (ga_grow(ga, 1) == FAIL) + return false; + ((short *)ga->ga_data)[ga->ga_len++] = item; + } + } + return true; +} + + static int +idl_id_cmp(const void *a, const void *b) +{ + return (int)*(const short *)a - (int)*(const short *)b; +} + +/* + * Get the cache entry that resolves "list", building it if needed. + * Returns NULL only on allocation failure. + */ + static idl_entry_T * +idl_get_entry(short *list) +{ + idl_cache_T *cache = (idl_cache_T *)syn_block->b_idlist_cache; + idl_entry_T *e; + garray_T ga, seen; + + if (cache == NULL) + { + cache = ALLOC_CLEAR_ONE(idl_cache_T); + if (cache == NULL) + return NULL; + syn_block->b_idlist_cache = cache; + } + // Fast path: the same list is usually queried many times in a row. + if (cache->ent[cache->last].idl_key == list) + return &cache->ent[cache->last]; + for (int i = 0; i < IDL_CACHE_SIZE; ++i) + if (cache->ent[i].idl_key == list) + { + cache->last = i; + return &cache->ent[i]; + } + + // Build a new entry, replacing one in round-robin order. + cache->last = cache->round; + e = &cache->ent[cache->round]; + cache->round = (cache->round + 1) % IDL_CACHE_SIZE; + VIM_CLEAR(e->idl_ids); + e->idl_count = 0; + e->idl_key = list; + e->idl_marker = 0; + e->idl_usable = true; + + // A leading ALLBUT/TOP/CONTAINED marker is kept aside; the rest is the set. + if (*list >= SYNID_ALLBUT && *list < SYNID_CLUSTER) + { + e->idl_marker = *list; + ++list; + } + + ga_init2(&ga, sizeof(short), 50); + ga_init2(&seen, sizeof(short), 10); + if (!idl_collect(list, &ga, &seen)) + { + e->idl_usable = false; + ga_clear(&ga); + ga_clear(&seen); + return e; + } + ga_clear(&seen); + if (ga.ga_len > 0) + { + qsort(ga.ga_data, (size_t)ga.ga_len, sizeof(short), idl_id_cmp); + e->idl_ids = (short *)ga.ga_data; // take ownership + e->idl_count = ga.ga_len; + } + else + ga_clear(&ga); + return e; +} + +/* + * Return true if "id" is in the resolved set of cache entry "e". + */ + static bool +idl_contains(idl_entry_T *e, short id) +{ + int lo = 0; + int hi = e->idl_count - 1; + + while (lo <= hi) + { + int mid = (lo + hi) / 2; + short v = e->idl_ids[mid]; + + if (v == id) + return true; + if (v < id) + lo = mid + 1; + else + hi = mid - 1; + } + return false; +} + + static void +idl_cache_clear(synblock_T *block) +{ + idl_cache_T *cache = (idl_cache_T *)block->b_idlist_cache; + + if (cache == NULL) + return; + for (int i = 0; i < IDL_CACHE_SIZE; ++i) + vim_free(cache->ent[i].idl_ids); + vim_free(cache); + block->b_idlist_cache = NULL; +} +// }}} + /* * Check if syntax group "ssp" is in the ID list "list" of "cur_si". * "cur_si" can be NULL if not checking the "containedin" list. @@ -6131,6 +6323,7 @@ in_id_list( static int depth = 0; int r; int toplevel; + idl_entry_T *e; // If ssp has a "containedin" list and "cur_si" is in it, return TRUE. if (cur_si != NULL && ssp->cont_in_list != NULL @@ -6166,6 +6359,42 @@ in_id_list( toplevel = !(flags & HL_CONTAINED) || (flags & HL_INCLUDED_TOPLEVEL); /* + * Fast path: use the resolved, sorted set cached for this list. Only the + * leading ALLBUT/TOP/CONTAINED gate depends on "ssp"/"flags", so it is + * applied here; membership itself is a binary search. Can be turned off + * with test_override('syn_idlist_cache', 1) to verify the result matches + * the slow path. + */ + e = disable_syn_idlist_cache_for_testing ? NULL : idl_get_entry(list); + if (e != NULL && e->idl_usable) + { + if (e->idl_marker != 0) + { + item = e->idl_marker; + if (item < SYNID_TOP) + { + if (item - SYNID_ALLBUT != ssp->inc_tag) + return FALSE; + } + else if (item < SYNID_CONTAINED) + { + if (item - SYNID_TOP != ssp->inc_tag || !toplevel) + return FALSE; + } + else + { + if (item - SYNID_CONTAINED != ssp->inc_tag || toplevel) + return FALSE; + } + retval = FALSE; + } + else + retval = TRUE; + return idl_contains(e, id) ? retval : !retval; + } + + /* + * Slow path (uncacheable list, e.g. a nested ALLBUT). * If the first item is "ALLBUT", return TRUE if "id" is NOT in the * contains list. We also require that "id" is at the same ":syn include" * level as the list. diff --git a/src/testdir/test_syntax.vim b/src/testdir/test_syntax.vim index 739fee528d..170e2e3b93 100644 --- a/src/testdir/test_syntax.vim +++ b/src/testdir/test_syntax.vim @@ -689,6 +689,43 @@ func Test_syn_zsub() bw! endfunc +func s:SynDumpBuffer(ft, lines) + new + syntax on + execute 'setfiletype ' .. a:ft + call setline(1, a:lines) + let result = [] + for lnum in range(1, line('$')) + for col in range(1, max([1, len(getline(lnum))])) + call add(result, synIDattr(synID(lnum, col, 1), 'name')) + endfor + endfor + bwipe! + return result +endfunc + +" The in_id_list() cache is a speed optimization only: highlighting must be +" identical with the cache on (default) and off. Compare full-buffer synID() +" output both ways across filetypes that use "contains"/cluster lists. +func Test_syntax_idlist_cache_unchanged() + let samples = { + \ 'c': ['#define M 0x1F', "char c = '\\n';", + \ 'int f(int a) { return a + 0xAB; }'], + \ 'vim': ['func Foo() abort', ' let x = [1, 2] + {"k": 0z1f}', + \ 'endfunc'], + \ 'python': ['class C:', ' def m(self): return {1: "s", 2: r"\d"}'], + \ 'sh': ['case $x in', ' a) echo "${y:-0xAB}";;', 'esac'], + \ } + for ft in sort(keys(samples)) + call test_override('syn_idlist_cache', 0) + let l:on = s:SynDumpBuffer(ft, samples[ft]) + call test_override('syn_idlist_cache', 1) + let l:off = s:SynDumpBuffer(ft, samples[ft]) + call test_override('ALL', 0) + call assert_equal(l:off, l:on, 'filetype ' .. ft) + endfor +endfunc + " Using \z() in a region with NFA failing should not crash. func Test_syn_wrong_z_one() new diff --git a/src/testing.c b/src/testing.c index eaa7c69f84..e6ba03963a 100644 --- a/src/testing.c +++ b/src/testing.c @@ -1055,6 +1055,8 @@ f_test_override(typval_T *argvars, typval_T *rettv UNUSED) } else if (STRCMP(name, (char_u *)"nfa_fail") == 0) nfa_fail_for_testing = val; + else if (STRCMP(name, (char_u *)"syn_idlist_cache") == 0) + disable_syn_idlist_cache_for_testing = val; else if (STRCMP(name, (char_u *)"no_query_mouse") == 0) no_query_mouse_for_testing = val; else if (STRCMP(name, (char_u *)"no_wait_return") == 0) @@ -1081,6 +1083,7 @@ f_test_override(typval_T *argvars, typval_T *rettv UNUSED) disable_redraw_for_testing = FALSE; ignore_redraw_flag_for_testing = FALSE; nfa_fail_for_testing = FALSE; + disable_syn_idlist_cache_for_testing = FALSE; no_query_mouse_for_testing = FALSE; ui_delay_for_testing = 0; reset_term_props_on_termresponse = FALSE; diff --git a/src/version.c b/src/version.c index 756793446f..e99db49712 100644 --- a/src/version.c +++ b/src/version.c @@ -759,6 +759,8 @@ static char *(features[]) = static int included_patches[] = { /* Add new patch number below this line */ +/**/ + 635, /**/ 634, /**/