From: Hans Kristian Rosbach Date: Tue, 11 Nov 2025 21:47:52 +0000 (+0100) Subject: - Unify crc32_chorba, chorba_sse2 and chorba_sse41 dispatch functions. X-Git-Tag: 2.3.0-rc2~6 X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=a72d5f249b31d80990a0b687bd7a822301205f0c;p=thirdparty%2Fzlib-ng.git - Unify crc32_chorba, chorba_sse2 and chorba_sse41 dispatch functions. - Fixed alignment diff calculation in crc32_chorba. - Fixed length check to happen early, avoiding extra branches for too short lengths, this also allows removing one function call to crc32_braid_internal to handle those. Gbench shows ~0.15-0.25ns saved per call for lengths shorter than CHORBA_SMALL_THRESHOLD. - Avoid calculating aligned len if buffer is already aligned --- diff --git a/arch/generic/crc32_chorba_c.c b/arch/generic/crc32_chorba_c.c index 4041abd46..6f90d3c09 100644 --- a/arch/generic/crc32_chorba_c.c +++ b/arch/generic/crc32_chorba_c.c @@ -1448,32 +1448,31 @@ Z_INTERNAL uint32_t crc32_chorba_small_nondestructive_32bit (uint32_t crc, const #endif // OPTIMAL_CMP == 64 Z_INTERNAL uint32_t crc32_chorba(uint32_t crc, const uint8_t *buf, size_t len) { + uint64_t* aligned_buf; uint32_t c = (~crc) & 0xffffffff; + uintptr_t algn_diff = ((uintptr_t)8 - ((uintptr_t)buf & 7)) & 7; - uint64_t* aligned_buf; - size_t aligned_len; - unsigned long algn_diff = ((uintptr_t)8 - ((uintptr_t)buf & 0xF)) & 0xF; - if (algn_diff < len) { + if (len > algn_diff + CHORBA_SMALL_THRESHOLD) { if (algn_diff) { c = crc32_braid_internal(c, buf, algn_diff); + len -= algn_diff; } aligned_buf = (uint64_t*) (buf + algn_diff); - aligned_len = len - algn_diff; - if(aligned_len > CHORBA_LARGE_THRESHOLD) - c = crc32_chorba_118960_nondestructive(c, (z_word_t*) aligned_buf, aligned_len); + if(len > CHORBA_LARGE_THRESHOLD) { + c = crc32_chorba_118960_nondestructive(c, (z_word_t*) aligned_buf, len); +# if OPTIMAL_CMP == 64 + } else if (len > CHORBA_MEDIUM_LOWER_THRESHOLD && len <= CHORBA_MEDIUM_UPPER_THRESHOLD) { + c = crc32_chorba_32768_nondestructive(c, (uint64_t*) aligned_buf, len); +# endif + } else { # if OPTIMAL_CMP == 64 - else if (aligned_len > CHORBA_MEDIUM_LOWER_THRESHOLD && aligned_len <= CHORBA_MEDIUM_UPPER_THRESHOLD) - c = crc32_chorba_32768_nondestructive(c, (uint64_t*) aligned_buf, aligned_len); - else if (aligned_len > CHORBA_SMALL_THRESHOLD_64BIT) - c = crc32_chorba_small_nondestructive(c, (uint64_t*) aligned_buf, aligned_len); + c = crc32_chorba_small_nondestructive(c, (uint64_t*) aligned_buf, len); # else - else if (aligned_len > CHORBA_SMALL_THRESHOLD_32BIT) - c = crc32_chorba_small_nondestructive_32bit(c, (uint32_t*) aligned_buf, aligned_len); + c = crc32_chorba_small_nondestructive_32bit(c, (uint32_t*) aligned_buf, len); # endif - else - c = crc32_braid_internal(c, (uint8_t*) aligned_buf, aligned_len); - } - else { + } + } else { + // Process too short lengths using crc32_braid c = crc32_braid_internal(c, buf, len); } diff --git a/arch/x86/chorba_sse2.c b/arch/x86/chorba_sse2.c index 3e25d7586..f79a5ac00 100644 --- a/arch/x86/chorba_sse2.c +++ b/arch/x86/chorba_sse2.c @@ -847,30 +847,26 @@ Z_INTERNAL uint32_t chorba_small_nondestructive_sse2(uint32_t crc, const uint64_ } Z_INTERNAL uint32_t crc32_chorba_sse2(uint32_t crc, const uint8_t *buf, size_t len) { - uint32_t c; uint64_t* aligned_buf; - size_t aligned_len; + uint32_t c = (~crc) & 0xffffffff; + uintptr_t algn_diff = ((uintptr_t)16 - ((uintptr_t)buf & 15)) & 15; - c = (~crc) & 0xffffffff; - unsigned long algn_diff = ((uintptr_t)16 - ((uintptr_t)buf & 15)) & 15; - if (algn_diff < len) { + if (len > algn_diff + CHORBA_SMALL_THRESHOLD_64BIT) { if (algn_diff) { c = crc32_braid_internal(c, buf, algn_diff); + len -= algn_diff; } aligned_buf = (uint64_t*) (buf + algn_diff); - aligned_len = len - algn_diff; #if !defined(WITHOUT_CHORBA) - if(aligned_len > CHORBA_LARGE_THRESHOLD) { - c = crc32_chorba_118960_nondestructive(c, (z_word_t*) aligned_buf, aligned_len); + if(len > CHORBA_LARGE_THRESHOLD) { + c = crc32_chorba_118960_nondestructive(c, (z_word_t*) aligned_buf, len); } else #endif - if (aligned_len > CHORBA_SMALL_THRESHOLD_64BIT) { - c = chorba_small_nondestructive_sse2(c, aligned_buf, aligned_len); - } else { - c = crc32_braid_internal(c, (uint8_t*) aligned_buf, aligned_len); + { + c = chorba_small_nondestructive_sse2(c, aligned_buf, len); } - } - else { + } else { + // Process too short lengths using crc32_braid c = crc32_braid_internal(c, buf, len); } diff --git a/arch/x86/chorba_sse41.c b/arch/x86/chorba_sse41.c index aebede45e..a7568a280 100644 --- a/arch/x86/chorba_sse41.c +++ b/arch/x86/chorba_sse41.c @@ -305,33 +305,28 @@ static Z_FORCEINLINE uint32_t crc32_chorba_32768_nondestructive_sse41(uint32_t c } Z_INTERNAL uint32_t crc32_chorba_sse41(uint32_t crc, const uint8_t *buf, size_t len) { - uint32_t c; uint64_t* aligned_buf; - size_t aligned_len; - - c = (~crc) & 0xffffffff; + uint32_t c = (~crc) & 0xffffffff; uintptr_t algn_diff = ((uintptr_t)16 - ((uintptr_t)buf & 15)) & 15; - if (algn_diff < len) { + + if (len > algn_diff + CHORBA_SMALL_THRESHOLD_64BIT) { if (algn_diff) { c = crc32_braid_internal(c, buf, algn_diff); + len -= algn_diff; } aligned_buf = (uint64_t*) (buf + algn_diff); - aligned_len = len - algn_diff; #if !defined(WITHOUT_CHORBA) - if(aligned_len > CHORBA_LARGE_THRESHOLD) { - c = crc32_chorba_118960_nondestructive(c, (z_word_t*) aligned_buf, aligned_len); + if(len > CHORBA_LARGE_THRESHOLD) { + c = crc32_chorba_118960_nondestructive(c, (z_word_t*) aligned_buf, len); } else #endif - if (aligned_len > CHORBA_MEDIUM_LOWER_THRESHOLD && - aligned_len <= CHORBA_MEDIUM_UPPER_THRESHOLD) { - c = crc32_chorba_32768_nondestructive_sse41(c, aligned_buf, aligned_len); - } else if (aligned_len > CHORBA_SMALL_THRESHOLD_64BIT) { - c = chorba_small_nondestructive_sse2(c, aligned_buf, aligned_len); + if (len > CHORBA_MEDIUM_LOWER_THRESHOLD && len <= CHORBA_MEDIUM_UPPER_THRESHOLD) { + c = crc32_chorba_32768_nondestructive_sse41(c, aligned_buf, len); } else { - c = crc32_braid_internal(c, (uint8_t*) aligned_buf, aligned_len); + c = chorba_small_nondestructive_sse2(c, aligned_buf, len); } - } - else { + } else { + // Process too short lengths using crc32_braid c = crc32_braid_internal(c, buf, len); } diff --git a/crc32.h b/crc32.h index e26b59e52..d41af8f01 100644 --- a/crc32.h +++ b/crc32.h @@ -13,7 +13,11 @@ #define CHORBA_MEDIUM_UPPER_THRESHOLD 32768 #define CHORBA_MEDIUM_LOWER_THRESHOLD 8192 #define CHORBA_SMALL_THRESHOLD_64BIT 72 -#define CHORBA_SMALL_THRESHOLD_32BIT 80 +#if OPTIMAL_CMP == 64 +# define CHORBA_SMALL_THRESHOLD 72 +#else +# define CHORBA_SMALL_THRESHOLD 80 +#endif typedef struct crc32_fold_s { uint8_t fold[CRC32_FOLD_BUFFER_SIZE];