If building for SSE4 and USE_CRC32C_HASH is defined, then we can use
crc32c to calculate the lookup hash. By default we don't do it because
even on Skylake it's slower (~5%) than the current hash, which only
involves a short multiply, and the gain from the slightly smoother
hash is marginal (0.3%).
This is slz upstream commit
44ae4f3f85eb275adba5844d067d281e727d8850.
Note: this is not used by default and is only merged in order to avoid
divergence between the code bases.
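
For illustration, below is a rough standalone equivalent of the two
hash variants, using the SSE4.2 intrinsic instead of inline asm. This
is only a sketch: the HASH_BITS value is an assumption for the example
and is not taken from this patch. Build with -msse4.2 to enable the
crc32c path.

	#include <stdint.h>
	#include <stdio.h>
	#ifdef __SSE4_2__
	#include <nmmintrin.h>   /* _mm_crc32_u32 */
	#endif

	#define HASH_BITS 13     /* assumed table width, not from this patch */

	/* portable variant: (a << 19) + (a << 6) - a == a * 524351 */
	static inline uint32_t hash_mult(uint32_t a)
	{
		return ((a << 19) + (a << 6) - a) >> (32 - HASH_BITS);
	}

	#ifdef __SSE4_2__
	/* crc32c variant: same operation as the asm in the patch, with
	 * the input fed as the CRC accumulator and a zero data word
	 * folded in.
	 */
	static inline uint32_t hash_crc(uint32_t a)
	{
		return _mm_crc32_u32(a, 0) >> (32 - HASH_BITS);
	}
	#endif

	int main(void)
	{
		uint32_t a = 0x12345678;

		printf("mult: %#x\n", hash_mult(a));
	#ifdef __SSE4_2__
		printf("crc : %#x\n", hash_crc(a));
	#endif
		return 0;
	}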
__asm__ volatile("crc32w %0,%0,%1" : "+r"(a) : "r"(0));
# endif
return a >> (32 - HASH_BITS);
+#elif defined(__SSE4_2__) && defined(USE_CRC32C_HASH)
+ // SSE 4.2 offers CRC32C which is a bit slower than the multiply
+ // but provides a slightly smoother hash
+ __asm__ volatile("crc32l %1,%0" : "+r"(a) : "r"(0));
+ return a >> (32 - HASH_BITS);
#else
return ((a << 19) + (a << 6) - a) >> (32 - HASH_BITS);
#endif
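
Enabling the new path requires both conditions at build time: the
compiler must target SSE4.2 (e.g. gcc/clang -msse4.2, which defines
__SSE4_2__) and USE_CRC32C_HASH must be defined on the command line
(e.g. -DUSE_CRC32C_HASH); in all other cases the multiply-based hash
above remains in use.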