From: Yasuhiro Matsumoto Date: Wed, 11 Mar 2026 20:18:26 +0000 (+0000) Subject: patch 9.2.0140: file reading performance can be improved X-Git-Tag: v9.2.0140^0 X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=2ca96b09d751c35189dd587d9b201e2aeb5559c0;p=thirdparty%2Fvim.git patch 9.2.0140: file reading performance can be improved Problem: Reading large files is slow because UTF-8 validation and newline scanning are performed byte-by-byte. Initial file loading also triggers listener and channel processing. Solution: Use memchr() for SIMD-optimized newline scanning, implement word-at-a-time ASCII skipping during UTF-8 validation using a bitmask, skip listener/netbeans/channel notifications when the ML_APPEND_NEW flag is set during readfile() (Yasuhiro Matsumoto). closes: #19612 Co-authored-by: NRK Signed-off-by: Yasuhiro Matsumoto Signed-off-by: Christian Brabandt --- diff --git a/src/fileio.c b/src/fileio.c index 0ba9905c35..8917a7e291 100644 --- a/src/fileio.c +++ b/src/fileio.c @@ -27,6 +27,10 @@ // Is there any system that doesn't have access()? #define USE_MCH_ACCESS +// Bitmask with 0x80 set in each byte of a long_u word, used to detect +// non-ASCII bytes (high bit set) in multiple bytes at once. +#define NONASCII_MASK (((long_u)-1 / 0xFF) * 0x80) + #if defined(__hpux) && !defined(HAVE_DIRFD) # define dirfd(x) ((x)->__dd_fd) # define HAVE_DIRFD @@ -2056,11 +2060,27 @@ retry: int incomplete_tail = FALSE; // Reading UTF-8: Check if the bytes are valid UTF-8. - for (p = ptr; ; ++p) + for (p = ptr; ; ) { - int todo = (int)((ptr + size) - p); + int todo; int l; + // Skip ASCII bytes quickly using word-at-a-time check. + { + char_u *ascii_end = ptr + size; + while (ascii_end - p >= (long)sizeof(long_u)) + { + long_u word; + memcpy(&word, p, sizeof(long_u)); + if (word & NONASCII_MASK) + break; + p += sizeof(long_u); + } + while (p < ascii_end && *p < 0x80) + ++p; + } + + todo = (int)((ptr + size) - p); if (todo <= 0) break; if (*p >= 0x80) @@ -2109,14 +2129,17 @@ retry: if (bad_char_behavior == BAD_DROP) { mch_memmove(p, p + 1, todo - 1); - --p; --size; } - else if (bad_char_behavior != BAD_KEEP) - *p = bad_char_behavior; + else + { + if (bad_char_behavior != BAD_KEEP) + *p = bad_char_behavior; + ++p; + } } else - p += l - 1; + p += l; } } if (p < ptr + size && !incomplete_tail) @@ -2255,73 +2278,101 @@ rewind_retry: } else { - --ptr; - while (++ptr, --size >= 0) + // Use memchr() for SIMD-optimized newline scanning instead + // of scanning each byte individually. + char_u *end = ptr + size; + + while (ptr < end) { - if ((c = *ptr) != NUL && c != NL) // catch most common case - continue; - if (c == NUL) - *ptr = NL; // NULs are replaced by newlines! - else + char_u *nl = (char_u *)memchr(ptr, NL, end - ptr); + char_u *nul_scan; + + if (nl == NULL) { - if (skip_count == 0) + // No more newlines in buffer. + // Replace any NUL bytes with NL in remaining data. + while ((nul_scan = (char_u *)memchr(ptr, NUL, + end - ptr)) != NULL) + { + *nul_scan = NL; + ptr = nul_scan + 1; + } + ptr = end; + break; + } + + // Replace NUL bytes with NL before the newline. + { + char_u *scan = ptr; + while ((nul_scan = (char_u *)memchr(scan, NUL, + nl - scan)) != NULL) + { + *nul_scan = NL; + scan = nul_scan + 1; + } + } + + // Process the newline. + ptr = nl; + if (skip_count == 0) + { + *ptr = NUL; // end of line + len = (colnr_T)(ptr - line_start + 1); + if (fileformat == EOL_DOS) { - *ptr = NUL; // end of line - len = (colnr_T)(ptr - line_start + 1); - if (fileformat == EOL_DOS) + if (ptr > line_start && ptr[-1] == CAR) { - if (ptr > line_start && ptr[-1] == CAR) - { - // remove CR before NL - ptr[-1] = NUL; - --len; - } - /* - * Reading in Dos format, but no CR-LF found! - * When 'fileformats' includes "unix", delete all - * the lines read so far and start all over again. - * Otherwise give an error message later. - */ - else if (ff_error != EOL_DOS) - { - if ( try_unix - && !read_stdin - && (read_buffer - || vim_lseek(fd, (off_T)0L, SEEK_SET) - == 0)) - { - fileformat = EOL_UNIX; - if (set_options) - set_fileformat(EOL_UNIX, OPT_LOCAL); - file_rewind = TRUE; - keep_fileformat = TRUE; - goto retry; - } - ff_error = EOL_DOS; - } + // remove CR before NL + ptr[-1] = NUL; + --len; } - if (ml_append(lnum, line_start, len, newfile) == FAIL) + /* + * Reading in Dos format, but no CR-LF found! + * When 'fileformats' includes "unix", delete all + * the lines read so far and start all over again. + * Otherwise give an error message later. + */ + else if (ff_error != EOL_DOS) { - error = TRUE; - break; + if ( try_unix + && !read_stdin + && (read_buffer + || vim_lseek(fd, (off_T)0L, SEEK_SET) + == 0)) + { + fileformat = EOL_UNIX; + if (set_options) + set_fileformat(EOL_UNIX, OPT_LOCAL); + file_rewind = TRUE; + keep_fileformat = TRUE; + goto retry; + } + ff_error = EOL_DOS; } + } + if (ml_append(lnum, line_start, len, newfile) == FAIL) + { + error = TRUE; + break; + } #ifdef FEAT_PERSISTENT_UNDO - if (read_undo_file) - sha256_update(&sha_ctx, line_start, len); + if (read_undo_file) + sha256_update(&sha_ctx, line_start, len); #endif - ++lnum; - if (--read_count == 0) - { - error = TRUE; // break loop - line_start = ptr; // nothing left to write - break; - } + ++lnum; + if (--read_count == 0) + { + error = TRUE; // break loop + line_start = ptr; // nothing left to write + break; } - else - --skip_count; - line_start = ptr + 1; } + else + --skip_count; + line_start = ptr + 1; + ++ptr; } + size = -1; } linerest = (long)(ptr - line_start); ui_breakcheck(); diff --git a/src/memline.c b/src/memline.c index 2d92aadb6d..58824ad118 100644 --- a/src/memline.c +++ b/src/memline.c @@ -3472,7 +3472,7 @@ ml_append_int( #endif #ifdef FEAT_NETBEANS_INTG - if (netbeans_active()) + if (!(flags & ML_APPEND_NEW) && netbeans_active()) { int line_len = (int)STRLEN(line); if (line_len > 0) @@ -3481,7 +3481,7 @@ ml_append_int( } #endif #ifdef FEAT_JOB_CHANNEL - if (buf->b_write_to_channel) + if (!(flags & ML_APPEND_NEW) && buf->b_write_to_channel) channel_write_new_lines(buf); #endif ret = OK; @@ -3512,11 +3512,15 @@ ml_append_flush( ml_flush_line(buf); #ifdef FEAT_EVAL - // When inserting above recorded changes: flush the changes before changing - // the text. Then flush the cached line, it may become invalid. - may_invoke_listeners(buf, lnum + 1, lnum + 1, 1); - if (buf->b_ml.ml_line_lnum != 0) - ml_flush_line(buf); + if (!(flags & ML_APPEND_NEW)) + { + // When inserting above recorded changes: flush the changes before + // changing the text. Then flush the cached line, it may become + // invalid. Skip during initial file read for performance. + may_invoke_listeners(buf, lnum + 1, lnum + 1, 1); + if (buf->b_ml.ml_line_lnum != 0) + ml_flush_line(buf); + } #endif return ml_append_int(buf, lnum, line, len, flags); diff --git a/src/version.c b/src/version.c index ad7d6ab32f..8c1f6c30a0 100644 --- a/src/version.c +++ b/src/version.c @@ -734,6 +734,8 @@ static char *(features[]) = static int included_patches[] = { /* Add new patch number below this line */ +/**/ + 140, /**/ 139, /**/