From: Enzo Matsumiya Date: Mon, 13 Apr 2026 19:07:10 +0000 (-0300) Subject: smb: client: compress: LZ77 optimizations X-Git-Url: http://git.ipfire.org/gitweb.cgi?a=commitdiff_plain;h=4460e9c68d1a8d1bd5b892c01f10f2cd06b1fd8b;p=thirdparty%2Fkernel%2Fstable.git smb: client: compress: LZ77 optimizations This patch implements several micro-optimizations on lz77_compress() with the goal of reducing the number of instructions per [input] byte (a.k.a. IPB). Changes: - change hashtable entries to be u32 (instead of u64) -- change the hash function to reflect that (adds lz77_hash() and lz77_read32() helpers) - batch-write literals instead of one by one -- now that we have a well-defined hot path (match finding) and a cold path (encode literals + match), batch writing makes a significant difference - implement adaptive skipping of input bytes -- skip input bytes more aggressively if too few matches are being found - name some constants to give them more meaningful context Signed-off-by: Enzo Matsumiya Signed-off-by: Steve French --- diff --git a/fs/smb/client/compress/lz77.c b/fs/smb/client/compress/lz77.c index 480927dcd4c6..96744f52e364 100644 --- a/fs/smb/client/compress/lz77.c +++ b/fs/smb/client/compress/lz77.c @@ -1,6 +1,6 @@ // SPDX-License-Identifier: GPL-2.0-only /* - * Copyright (C) 2024, SUSE LLC + * Copyright (C) 2024-2026, SUSE LLC * * Authors: Enzo Matsumiya * @@ -16,17 +16,26 @@ /* * Compression parameters. 
*/ -#define LZ77_MATCH_MIN_LEN 4 #define LZ77_MATCH_MAX_DIST SZ_8K #define LZ77_HASH_LOG 15 #define LZ77_HASH_SIZE (1 << LZ77_HASH_LOG) -#define LZ77_STEP_SIZE sizeof(u64) +#define LZ77_RSTEP_SIZE sizeof(u32) +#define LZ77_MSTEP_SIZE sizeof(u64) +#define LZ77_SKIP_TRIGGER 4 + +#define LZ77_PREFETCH(ptr) __builtin_prefetch((ptr), 0, 3) +#define LZ77_FLAG_MAX 32 static __always_inline u8 lz77_read8(const u8 *ptr) { return get_unaligned(ptr); } +static __always_inline u32 lz77_read32(const u32 *ptr) +{ + return get_unaligned(ptr); +} + static __always_inline u64 lz77_read64(const u64 *ptr) { return get_unaligned(ptr); @@ -50,14 +59,14 @@ static __always_inline void lz77_write32(u32 *ptr, u32 v) static __always_inline u32 lz77_match_len(const void *match, const void *cur, const void *end) { const void *start = cur; - u64 diff; /* Safe for a do/while because otherwise we wouldn't reach here from the main loop. */ do { - diff = lz77_read64(cur) ^ lz77_read64(match); + const u64 diff = lz77_read64(cur) ^ lz77_read64(match); + if (!diff) { - cur += LZ77_STEP_SIZE; - match += LZ77_STEP_SIZE; + cur += LZ77_MSTEP_SIZE; + match += LZ77_MSTEP_SIZE; continue; } @@ -66,7 +75,7 @@ static __always_inline u32 lz77_match_len(const void *match, const void *cur, co cur += count_trailing_zeros(diff) >> 3; return (cur - start); - } while (likely(cur + LZ77_STEP_SIZE <= end)); + } while (likely(cur + LZ77_MSTEP_SIZE <= end)); /* Fallback to byte-by-byte comparison for last <8 bytes. 
*/ while (cur < end && lz77_read8(cur) == lz77_read8(match)) { @@ -77,7 +86,7 @@ static __always_inline u32 lz77_match_len(const void *match, const void *cur, co return (cur - start); } -static __always_inline void *lz77_write_match(void *dst, void **nib, u32 dist, u32 len) +static __always_inline void *lz77_encode_match(void *dst, void **nib, u16 dist, u32 len) { len -= 3; dist--; @@ -131,94 +140,124 @@ static __always_inline void *lz77_write_match(void *dst, void **nib, u32 dist, u return dst + 4; } -noinline int lz77_compress(const void *src, u32 slen, void *dst, u32 *dlen) +static __always_inline void *lz77_encode_literals(const void *start, const void *end, void *dst, + long *f, u32 *fc, void **fp) +{ + if (start >= end) + return dst; + + do { + const u32 len = umin(end - start, LZ77_FLAG_MAX - *fc); + + memcpy(dst, start, len); + + dst += len; + start += len; + + *f <<= len; + *fc += len; + if (*fc == LZ77_FLAG_MAX) { + lz77_write32(*fp, *f); + *fc = 0; + *fp = dst; + dst += 4; + } + } while (start < end); + + return dst; +} + +static __always_inline u32 lz77_hash(const u32 v) +{ + return ((v ^ 0x9E3779B9) * 0x85EBCA6B) >> (32 - LZ77_HASH_LOG); +} + +noinline int lz77_compress(const void *src, const u32 slen, void *dst, u32 *dlen) { - const void *srcp, *end; + const void *srcp, *rlim, *end, *anchor; + u32 *htable, hash, flag_count = 0; void *dstp, *nib, *flag_pos; - u32 flag_count = 0; long flag = 0; - u64 *htable; /* This is probably a bug, so throw a warning. */ if (WARN_ON_ONCE(*dlen < lz77_compressed_alloc_size(slen))) return -EINVAL; - srcp = src; - end = src + slen; + srcp = anchor = src; + end = srcp + slen; /* absolute end */ + rlim = end - LZ77_MSTEP_SIZE; /* read limit (for lz77_match_len()) */ dstp = dst; - nib = NULL; flag_pos = dstp; dstp += 4; + nib = NULL; htable = kvcalloc(LZ77_HASH_SIZE, sizeof(*htable), GFP_KERNEL); if (!htable) return -ENOMEM; - /* Main loop. 
*/ - do { - u32 dist, len = 0; - const void *wnd; - u64 hash; - - hash = ((lz77_read64(srcp) << 24) * 889523592379ULL) >> (64 - LZ77_HASH_LOG); - wnd = src + htable[hash]; - htable[hash] = srcp - src; - dist = srcp - wnd; - - if (dist && dist < LZ77_MATCH_MAX_DIST) - len = lz77_match_len(wnd, srcp, end); + LZ77_PREFETCH(srcp + LZ77_RSTEP_SIZE); - if (len < LZ77_MATCH_MIN_LEN) { - lz77_write8(dstp, lz77_read8(srcp)); - - dstp++; - srcp++; - - flag <<= 1; - flag_count++; - if (flag_count == 32) { - lz77_write32(flag_pos, flag); - flag_count = 0; - flag_pos = dstp; - dstp += 4; - } - - continue; - } + hash = lz77_hash(lz77_read32(srcp++)); + htable[hash] = 0; + hash = lz77_hash(lz77_read32(srcp)); - dstp = lz77_write_match(dstp, &nib, dist, len); + /* + * Main loop. + * + * @dlen is >= lz77_compressed_alloc_size(), so run without bound-checking @dstp. + * + * This code was crafted in a way to best utilise fetch-decode-execute CPU flow. + * Any attempt to optimize it, or even organize it, can lead to huge performance loss. + */ + do { + const void *match, *next = srcp; + u32 len, step = 1, skip = 1U << LZ77_SKIP_TRIGGER; + + /* Match finding (hot path -- don't change the read/check/write order). 
*/ + do { + const u32 cur_hash = hash; + + srcp = next; + next += step; + step = (skip++ >> LZ77_SKIP_TRIGGER); + if (unlikely(next > rlim)) + goto out; + + hash = lz77_hash(lz77_read32(next)); + match = src + htable[cur_hash]; + htable[cur_hash] = srcp - src; + } while (likely(match + LZ77_MATCH_MAX_DIST < srcp) || + lz77_read32(match) != lz77_read32(srcp)); + + dstp = lz77_encode_literals(anchor, srcp, dstp, &flag, &flag_count, &flag_pos); + len = lz77_match_len(match, srcp, end); + dstp = lz77_encode_match(dstp, &nib, srcp - match, len); srcp += len; + anchor = srcp; + + LZ77_PREFETCH(srcp); flag = (flag << 1) | 1; flag_count++; - if (flag_count == 32) { + if (flag_count == LZ77_FLAG_MAX) { lz77_write32(flag_pos, flag); flag_count = 0; flag_pos = dstp; dstp += 4; } - } while (likely(srcp + LZ77_STEP_SIZE <= end)); - - while (srcp < end) { - u32 c = umin(end - srcp, 32 - flag_count); - memcpy(dstp, srcp, c); + if (unlikely(srcp > rlim)) + break; - dstp += c; - srcp += c; - - flag <<= c; - flag_count += c; - if (flag_count == 32) { - lz77_write32(flag_pos, flag); - flag_count = 0; - flag_pos = dstp; - dstp += 4; - } - } + /* Prepare for next loop. 
*/ + hash = lz77_hash(lz77_read32(srcp)); + } while (srcp < end); +out: + dstp = lz77_encode_literals(anchor, end, dstp, &flag, &flag_count, &flag_pos); - flag <<= (32 - flag_count); - flag |= (1UL << (32 - flag_count)) - 1; + flag_count = LZ77_FLAG_MAX - flag_count; + flag <<= flag_count; + flag |= (1UL << flag_count) - 1; lz77_write32(flag_pos, flag); *dlen = dstp - dst; diff --git a/fs/smb/client/compress/lz77.h b/fs/smb/client/compress/lz77.h index 2603eab9e071..4e570846aefa 100644 --- a/fs/smb/client/compress/lz77.h +++ b/fs/smb/client/compress/lz77.h @@ -1,6 +1,6 @@ /* SPDX-License-Identifier: GPL-2.0-only */ /* - * Copyright (C) 2024, SUSE LLC + * Copyright (C) 2024-2026, SUSE LLC * * Authors: Enzo Matsumiya * @@ -39,5 +39,5 @@ static __always_inline u32 lz77_compressed_alloc_size(const u32 size) return size + (size >> 3) + 8; } -int lz77_compress(const void *src, u32 slen, void *dst, u32 *dlen); +int lz77_compress(const void *src, const u32 slen, void *dst, u32 *dlen); #endif /* _SMB_COMPRESS_LZ77_H */