The current hash involves 3 simple shifts and additions so that it can
be mapped to a multiply on architectures that have a fast multiply. This
is indeed what the compiler does on x86_64. A large range of values was
scanned to find better factors on machines supporting such a fast
multiply, and it turned out that the new factor 0x1af42f produced
smoother hashes that gave on average 0.4% better compression on both
the Silesia corpus and an mbox file composed of very compressible emails
and incompressible attachments. It is even slightly better than CRC32C
while being faster on Skylake. This patch enables this factor on
architectures with a fast multiply.
This is slz upstream commit
82ad1e75c13245a835c1c09764c89f2f6e8e2a40.
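
For illustration only, here is a small standalone C sketch (not part of
the patch) showing why the shift form maps to a multiply: ((a << 19) +
(a << 6) - a) is exactly a * 0x8003f modulo 2^32, and the new variant
simply multiplies by 0x1af42f instead. HASH_BITS is assumed to be 13
here purely for the sake of the example.

    #include <stdint.h>
    #include <stdio.h>

    #define HASH_BITS 13  /* assumed value for this sketch */

    /* old hash: three shifts/adds, equivalent to multiplying by 0x8003f */
    static uint32_t hash_shifts(uint32_t a)
    {
            return ((a << 19) + (a << 6) - a) >> (32 - HASH_BITS);
    }

    /* the same computation written as the multiply a compiler may emit */
    static uint32_t hash_mult_old(uint32_t a)
    {
            return (a * 0x8003fU) >> (32 - HASH_BITS);
    }

    /* the new factor from the patch, for archs with a fast multiply */
    static uint32_t hash_mult_new(uint32_t a)
    {
            return (a * 0x1af42fU) >> (32 - HASH_BITS);
    }

    int main(void)
    {
            /* sanity check: the shift form and the 0x8003f multiply agree */
            for (uint32_t a = 0; a < 1000000; a++) {
                    if (hash_shifts(a) != hash_mult_old(a)) {
                            printf("mismatch at %u\n", a);
                            return 1;
                    }
            }
            printf("shift form == multiply by 0x8003f; new factor 0x1af42f -> %u\n",
                   hash_mult_new(12345));
            return 0;
    }
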
#define UNALIGNED_LE_OK
#define UNALIGNED_FASTER
#define USE_64BIT_QUEUE
+#define HAVE_FAST_MULT
#elif defined(__i386__) || defined(__i486__) || defined(__i586__) || defined(__i686__)
#define UNALIGNED_LE_OK
//#define UNALIGNED_FASTER
#elif defined(__ARM_ARCH_8A) || defined(__ARM_FEATURE_UNALIGNED)
#define UNALIGNED_LE_OK
#define UNALIGNED_FASTER
+#define HAVE_FAST_MULT
#endif
/* Log2 of the size of the hash table used for the references table. */
// but provides a slightly smoother hash
__asm__ volatile("crc32l %1,%0" : "+r"(a) : "r"(0));
return a >> (32 - HASH_BITS);
+#elif defined(HAVE_FAST_MULT)
+ // optimal factor for HASH_BITS=12 and HASH_BITS=13 among 48k tested: 0x1af42f
+ return (a * 0x1af42f) >> (32 - HASH_BITS);
#else
return ((a << 19) + (a << 6) - a) >> (32 - HASH_BITS);
#endif