#include "crc32_braid_tbl.h"
/*
- A CRC of a message is computed on N braids of words in the message, where
- each word consists of W bytes (4 or 8). If N is 3, for example, then three
- running sparse CRCs are calculated respectively on each braid, at these
+ A CRC of a message is computed on BRAID_N braids of words in the message, where
+ each word consists of BRAID_W bytes (4 or 8). If BRAID_N is 3, for example, then
+ three running sparse CRCs are calculated respectively on each braid, at these
indices in the array of words: 0, 3, 6, ..., 1, 4, 7, ..., and 2, 5, 8, ...
- This is done starting at a word boundary, and continues until as many blocks
- of N * W bytes as are available have been processed. The results are combined
- into a single CRC at the end. For this code, N must be in the range 1..6 and
- W must be 4 or 8. The upper limit on N can be increased if desired by adding
- more #if blocks, extending the patterns apparent in the code. In addition,
- crc32 tables would need to be regenerated, if the maximum N value is increased.
-
- N and W are chosen empirically by benchmarking the execution time on a given
- processor. The choices for N and W below were based on testing on Intel Kaby
- Lake i7, AMD Ryzen 7, ARM Cortex-A57, Sparc64-VII, PowerPC POWER9, and MIPS64
- Octeon II processors. The Intel, AMD, and ARM processors were all fastest
- with N=5, W=8. The Sparc, PowerPC, and MIPS64 were all fastest at N=5, W=4.
+ This is done starting at a word boundary, and continues until as many blocks of
+ BRAID_N * BRAID_W bytes as are available have been processed. The results are
+ combined into a single CRC at the end. For this code, BRAID_N must be in the
+ range 1..6 and BRAID_W must be 4 or 8. The upper limit on BRAID_N can be increased
+ if desired by adding more #if blocks, extending the patterns apparent in the code.
+ In addition, the crc32 tables would need to be regenerated if the maximum BRAID_N
+ value is increased.
+
+ BRAID_N and BRAID_W are chosen empirically by benchmarking the execution time
+ on a given processor. The choices for BRAID_N and BRAID_W below were based on
+ testing on Intel Kaby Lake i7, AMD Ryzen 7, ARM Cortex-A57, Sparc64-VII, PowerPC
+ POWER9, and MIPS64 Octeon II processors.
+ The Intel, AMD, and ARM processors were all fastest with BRAID_N=5, BRAID_W=8.
+ The Sparc, PowerPC, and MIPS64 were all fastest at BRAID_N=5, BRAID_W=4.
They were all tested with either gcc or clang, all using the -O3 optimization
level. Your mileage may vary.
*/
/* ========================================================================= */
-#ifdef W
+#ifdef BRAID_W
/*
- Return the CRC of the W bytes in the word_t data, taking the
+ Return the CRC of the BRAID_W bytes in the word_t data, taking the
least-significant byte of the word as the first byte of data, without any pre
or post conditioning. This is used to combine the CRCs of each braid.
*/
-#if BYTE_ORDER == LITTLE_ENDIAN
+# if BYTE_ORDER == LITTLE_ENDIAN
static uint32_t crc_word(z_word_t data) {
int k;
- for (k = 0; k < W; k++)
+ for (k = 0; k < BRAID_W; k++)
data = (data >> 8) ^ crc_table[data & 0xff];
return (uint32_t)data;
}
-#elif BYTE_ORDER == BIG_ENDIAN
+# elif BYTE_ORDER == BIG_ENDIAN
static z_word_t crc_word(z_word_t data) {
int k;
- for (k = 0; k < W; k++)
+ for (k = 0; k < BRAID_W; k++)
data = (data << 8) ^
- crc_big_table[(data >> ((W - 1) << 3)) & 0xff];
+ crc_big_table[(data >> ((BRAID_W - 1) << 3)) & 0xff];
return data;
}
-#endif /* BYTE_ORDER */
-
-#endif /* W */
+# endif /* BYTE_ORDER */
+#endif /* BRAID_W */
/* ========================================================================= */
Z_INTERNAL uint32_t crc32_braid_internal(uint32_t c, const uint8_t *buf, size_t len) {
-#ifdef W
+#ifdef BRAID_W
/* If provided enough bytes, do a braided CRC calculation. */
- if (len >= N * W + W - 1) {
+ if (len >= BRAID_N * BRAID_W + BRAID_W - 1) {
size_t blks;
z_word_t const *words;
int k;
/* Compute the CRC up to a z_word_t boundary. */
- while (len && ((uintptr_t)buf & (W - 1)) != 0) {
+ while (len && ((uintptr_t)buf & (BRAID_W - 1)) != 0) {
len--;
DO1;
}
- /* Compute the CRC on as many N z_word_t blocks as are available. */
- blks = len / (N * W);
- len -= blks * N * W;
+ /* Compute the CRC on as many blocks of BRAID_N z_word_t words as are available. */
+ blks = len / (BRAID_N * BRAID_W);
+ len -= blks * BRAID_N * BRAID_W;
words = (z_word_t const *)buf;
z_word_t crc0, word0, comb;
-#if N > 1
+#if BRAID_N > 1
z_word_t crc1, word1;
-#if N > 2
+#if BRAID_N > 2
z_word_t crc2, word2;
-#if N > 3
+#if BRAID_N > 3
z_word_t crc3, word3;
-#if N > 4
+#if BRAID_N > 4
z_word_t crc4, word4;
-#if N > 5
+#if BRAID_N > 5
z_word_t crc5, word5;
#endif
#endif
#endif
/* Initialize the CRC for each braid. */
crc0 = ZSWAPWORD(c);
-#if N > 1
+#if BRAID_N > 1
crc1 = 0;
-#if N > 2
+#if BRAID_N > 2
crc2 = 0;
-#if N > 3
+#if BRAID_N > 3
crc3 = 0;
-#if N > 4
+#if BRAID_N > 4
crc4 = 0;
-#if N > 5
+#if BRAID_N > 5
crc5 = 0;
#endif
#endif
while (--blks) {
/* Load the word for each braid into registers. */
word0 = crc0 ^ words[0];
-#if N > 1
+#if BRAID_N > 1
word1 = crc1 ^ words[1];
-#if N > 2
+#if BRAID_N > 2
word2 = crc2 ^ words[2];
-#if N > 3
+#if BRAID_N > 3
word3 = crc3 ^ words[3];
-#if N > 4
+#if BRAID_N > 4
word4 = crc4 ^ words[4];
-#if N > 5
+#if BRAID_N > 5
word5 = crc5 ^ words[5];
#endif
#endif
#endif
#endif
#endif
- words += N;
+ words += BRAID_N;
/* Compute and update the CRC for each word. The loop should get unrolled. */
crc0 = BRAID_TABLE[0][word0 & 0xff];
-#if N > 1
+#if BRAID_N > 1
crc1 = BRAID_TABLE[0][word1 & 0xff];
-#if N > 2
+#if BRAID_N > 2
crc2 = BRAID_TABLE[0][word2 & 0xff];
-#if N > 3
+#if BRAID_N > 3
crc3 = BRAID_TABLE[0][word3 & 0xff];
-#if N > 4
+#if BRAID_N > 4
crc4 = BRAID_TABLE[0][word4 & 0xff];
-#if N > 5
+#if BRAID_N > 5
crc5 = BRAID_TABLE[0][word5 & 0xff];
#endif
#endif
#endif
#endif
#endif
- for (k = 1; k < W; k++) {
+ for (k = 1; k < BRAID_W; k++) {
crc0 ^= BRAID_TABLE[k][(word0 >> (k << 3)) & 0xff];
-#if N > 1
+#if BRAID_N > 1
crc1 ^= BRAID_TABLE[k][(word1 >> (k << 3)) & 0xff];
-#if N > 2
+#if BRAID_N > 2
crc2 ^= BRAID_TABLE[k][(word2 >> (k << 3)) & 0xff];
-#if N > 3
+#if BRAID_N > 3
crc3 ^= BRAID_TABLE[k][(word3 >> (k << 3)) & 0xff];
-#if N > 4
+#if BRAID_N > 4
crc4 ^= BRAID_TABLE[k][(word4 >> (k << 3)) & 0xff];
-#if N > 5
+#if BRAID_N > 5
crc5 ^= BRAID_TABLE[k][(word5 >> (k << 3)) & 0xff];
#endif
#endif
}
}
- /* Process the last block, combining the CRCs of the N braids at the same time. */
+ /* Process the last block, combining the CRCs of the BRAID_N braids at the same time. */
comb = crc_word(crc0 ^ words[0]);
-#if N > 1
+#if BRAID_N > 1
comb = crc_word(crc1 ^ words[1] ^ comb);
-#if N > 2
+#if BRAID_N > 2
comb = crc_word(crc2 ^ words[2] ^ comb);
-#if N > 3
+#if BRAID_N > 3
comb = crc_word(crc3 ^ words[3] ^ comb);
-#if N > 4
+#if BRAID_N > 4
comb = crc_word(crc4 ^ words[4] ^ comb);
-#if N > 5
+#if BRAID_N > 5
comb = crc_word(crc5 ^ words[5] ^ comb);
#endif
#endif
#endif
#endif
#endif
- words += N;
+ words += BRAID_N;
Assert(comb <= UINT32_MAX, "comb should fit in uint32_t");
c = (uint32_t)ZSWAPWORD(comb);
buf = (const unsigned char *)words;
}
-#endif /* W */
+#endif /* BRAID_W */
/* Complete the computation of the CRC on any remaining bytes. */
while (len >= 8) {
return c;
}
-uint32_t PREFIX(crc32_braid)(uint32_t c, const uint8_t *buf, size_t len) {
+Z_INTERNAL uint32_t crc32_braid(uint32_t c, const uint8_t *buf, size_t len) {
c = (~c) & 0xffffffff;
c = crc32_braid_internal(c, buf, len);
aligned_len = len - algn_diff;
if(aligned_len > CHORBA_LARGE_THRESHOLD)
c = crc32_chorba_118960_nondestructive(c, (z_word_t*) aligned_buf, aligned_len);
-# if W == 8
+# if BRAID_W == 8
else if (aligned_len > CHORBA_MEDIUM_LOWER_THRESHOLD && aligned_len <= CHORBA_MEDIUM_UPPER_THRESHOLD)
c = crc32_chorba_32768_nondestructive(c, (uint64_t*) aligned_buf, aligned_len);
else if (aligned_len > CHORBA_SMALL_THRESHOLD_64BIT)
#include "zendian.h"
-/* Define N */
-#ifdef Z_TESTN
-# define N Z_TESTN
-#else
-# define N 5
-#endif
-#if N < 1 || N > 6
-# error N must be in 1..6
-#endif
+/* Define BRAID_N; the valid range is 1..6 */
+#define BRAID_N 5
-/*
- Define W and the associated z_word_t type. If W is not defined, then a
- braided calculation is not used, and the associated tables and code are not
- compiled.
+/* Define BRAID_W and the associated z_word_t type: 8 bytes (uint64_t) on the 64-bit targets
+ listed below, otherwise 4 bytes (uint32_t). If BRAID_W is not defined, then a braided
+ calculation is not used, and the associated tables and code are not compiled.
*/
-#ifdef Z_TESTW
-# if Z_TESTW-1 != -1
-# define W Z_TESTW
-# endif
+#if defined(__x86_64__) || defined(_M_AMD64) || defined(__aarch64__) || defined(_M_ARM64) || defined(__powerpc64__)
+# define BRAID_W 8
+ typedef uint64_t z_word_t;
#else
-# ifndef W
-# if defined(__x86_64__) || defined(_M_AMD64) || defined(__aarch64__) || defined(_M_ARM64) || defined(__powerpc64__)
-# define W 8
-# else
-# define W 4
-# endif
-# endif
-#endif
-#ifdef W
-# if W == 8
- typedef uint64_t z_word_t;
-# else
-# undef W
-# define W 4
- typedef uint32_t z_word_t;
-# endif
+# define BRAID_W 4
+ typedef uint32_t z_word_t;
#endif
#if BYTE_ORDER == LITTLE_ENDIAN
# define ZSWAPWORD(word) (word)
# define BRAID_TABLE crc_braid_table
#elif BYTE_ORDER == BIG_ENDIAN
-# if W == 8
+# if BRAID_W == 8
# define ZSWAPWORD(word) ZSWAP64(word)
-# elif W == 4
+# elif BRAID_W == 4
# define ZSWAPWORD(word) ZSWAP32(word)
# endif
# define BRAID_TABLE crc_braid_big_table
0x5d681b02, 0x2a6f2b94, 0xb40bbe37, 0xc30c8ea1, 0x5a05df1b,
0x2d02ef8d};
-#ifdef W
-
-#if W == 8
+#ifdef BRAID_W
+# if BRAID_W == 8
static const z_word_t crc_big_table[] = {
0x0000000000000000, 0x9630077700000000, 0x2c610eee00000000,
0x37be0bb400000000, 0xa18e0cc300000000, 0x1bdf055a00000000,
0x8def022d00000000};
-#else /* W == 4 */
+# else /* BRAID_W == 4 */
static const z_word_t crc_big_table[] = {
0x00000000, 0x96300777, 0x2c610eee, 0xba510999, 0x19c46d07,
0x021b685d, 0x942b6f2a, 0x37be0bb4, 0xa18e0cc3, 0x1bdf055a,
0x8def022d};
-#endif
-
-#endif /* W */
+# endif
+#endif /* BRAID_W */
-#if N == 1
-
-#if W == 8
+#if BRAID_N == 1
+# if BRAID_W == 8
static const uint32_t crc_braid_table[][256] = {
{0x00000000, 0xccaa009e, 0x4225077d, 0x8e8f07e3, 0x844a0efa,
0x0501c4a800000000, 0x9b016e6400000000, 0x7806e1ea00000000,
0xe6064b2600000000}};
-#else /* W == 4 */
+# else /* BRAID_W == 4 */
static const uint32_t crc_braid_table[][256] = {
{0x00000000, 0xb8bc6765, 0xaa09c88b, 0x12b5afee, 0x8f629757,
0xc3f6dbe9, 0xa6916751, 0x1fa9b0cc, 0x7ace0c74, 0x9461b966,
0xf10605de}};
-#endif /* W */
-
-#endif /* N == 1 */
-#if N == 2
-
-#if W == 8
+# endif /* BRAID_W */
+#endif /* BRAID_N == 1 */
+#if BRAID_N == 2
+# if BRAID_W == 8
static const uint32_t crc_braid_table[][256] = {
{0x00000000, 0xae689191, 0x87a02563, 0x29c8b4f2, 0xd4314c87,
0x258db92400000000, 0xb41cd18a00000000, 0x46a819a300000000,
0xd739710d00000000}};
-#else /* W == 4 */
+# else /* BRAID_W == 4 */
static const uint32_t crc_braid_table[][256] = {
{0x00000000, 0xccaa009e, 0x4225077d, 0x8e8f07e3, 0x844a0efa,
0x8208ab6e, 0x1c0801a2, 0x0501c4a8, 0x9b016e64, 0x7806e1ea,
0xe6064b26}};
-#endif /* W */
-
-#endif /* N == 2 */
-#if N == 3
-
-#if W == 8
+# endif /* BRAID_W */
+#endif /* BRAID_N == 2 */
+#if BRAID_N == 3
+# if BRAID_W == 8
static const uint32_t crc_braid_table[][256] = {
{0x00000000, 0x81256527, 0xd93bcc0f, 0x581ea928, 0x69069e5f,
0x792cd35100000000, 0x5e49f6d000000000, 0x76e0e88800000000,
0x5185cd0900000000}};
-#else /* W == 4 */
+# else /* BRAID_W == 4 */
static const uint32_t crc_braid_table[][256] = {
{0x00000000, 0x9ba54c6f, 0xec3b9e9f, 0x779ed2f0, 0x03063b7f,
0x7506baae, 0x1a4a1f35, 0x95a38741, 0xfaef22da, 0x0a3dbcad,
0x65711936}};
-#endif /* W */
-
-#endif /* N == 3 */
-#if N == 4
-
-#if W == 8
+# endif /* BRAID_W */
+#endif /* BRAID_N == 3 */
+#if BRAID_N == 4
+# if BRAID_W == 8
static const uint32_t crc_braid_table[][256] = {
{0x00000000, 0xf1da05aa, 0x38c50d15, 0xc91f08bf, 0x718a1a2a,
0xa951db2a00000000, 0x035401db00000000, 0xbc5c1e1200000000,
0x1659c4e300000000}};
-#else /* W == 4 */
+# else /* BRAID_W == 4 */
static const uint32_t crc_braid_table[][256] = {
{0x00000000, 0xae689191, 0x87a02563, 0x29c8b4f2, 0xd4314c87,
0xc1e42877, 0x507540d9, 0x258db924, 0xb41cd18a, 0x46a819a3,
0xd739710d}};
-#endif /* W */
-
-#endif /* N == 4 */
-#if N == 5
-
-#if W == 8
+# endif /* BRAID_W */
+#endif /* BRAID_N == 4 */
+#if BRAID_N == 5
+# if BRAID_W == 8
static const uint32_t crc_braid_table[][256] = {
{0x00000000, 0xaf449247, 0x85f822cf, 0x2abcb088, 0xd08143df,
0xedc528c300000000, 0xaa576c6c00000000, 0x22e7d04600000000,
0x657594e900000000}};
-#else /* W == 4 */
+# else /* BRAID_W == 4 */
static const uint32_t crc_braid_table[][256] = {
{0x00000000, 0x65673b46, 0xcace768c, 0xafa94dca, 0x4eedeb59,
0x2abb26f3, 0x6c804196, 0xff260577, 0xb91d6212, 0x7350cbbd,
0x356bacd8}};
-#endif /* W */
-
-#endif /* N == 5 */
-#if N == 6
-
-#if W == 8
+# endif /* BRAID_W */
+#endif /* BRAID_N == 5 */
+#if BRAID_N == 6
+# if BRAID_W == 8
static const uint32_t crc_braid_table[][256] = {
{0x00000000, 0x3db1ecdc, 0x7b63d9b8, 0x46d23564, 0xf6c7b370,
0xcc95bac300000000, 0x10790bfe00000000, 0x744cd9b800000000,
0xa8a0688500000000}};
-#else /* W == 4 */
+# else /* BRAID_W == 4 */
static const uint32_t crc_braid_table[][256] = {
{0x00000000, 0x81256527, 0xd93bcc0f, 0x581ea928, 0x69069e5f,
0x297eeee1, 0x0e1bcb60, 0x792cd351, 0x5e49f6d0, 0x76e0e888,
0x5185cd09}};
-#endif /* W */
-
-#endif /* N == 6 */
+# endif /* BRAID_W */
+#endif /* BRAID_N == 6 */
static const uint32_t x2n_table[] = {
0x40000000, 0x20000000, 0x08000000, 0x00800000, 0x00008000,
Z_INTERNAL uint32_t crc32_chorba_32768_nondestructive (uint32_t crc, const uint64_t* buf, size_t len);
Z_INTERNAL uint32_t crc32_chorba_small_nondestructive (uint32_t crc, const uint64_t* buf, size_t len);
Z_INTERNAL uint32_t crc32_chorba_small_nondestructive_32bit (uint32_t crc, const uint32_t* buf, size_t len);
-Z_INTERNAL uint32_t crc32_braid_internal(uint32_t c, const uint8_t *buf, size_t len);
and writes out the tables for the case that z_word_t is 32 bits.
*/
-#define W 8 /* Need a 64-bit integer type in order to generate crc32 tables. */
-
-#include "crc32_braid_p.h"
+#define POLY 0xedb88320 /* p(x) reflected, with x^32 implied */
+#define BRAID_W 8 /* Need a 64-bit integer type in order to generate crc32 tables. */
+typedef uint64_t z_word_t;
static uint32_t crc_table[256];
static z_word_t crc_big_table[256];
printf("};\n\n");
/* print big-endian CRC table for 64-bit z_word_t */
- printf("#ifdef W\n\n");
- printf("#if W == 8\n\n");
+ printf("#ifdef BRAID_W\n");
+ printf("# if BRAID_W == 8\n\n");
printf("static const z_word_t crc_big_table[] = {\n");
printf(" ");
write_table64(crc_big_table, 256);
printf("};\n\n");
/* print big-endian CRC table for 32-bit z_word_t */
- printf("#else /* W == 4 */\n\n");
+ printf("# else /* BRAID_W == 4 */\n\n");
printf("static const z_word_t crc_big_table[] = {\n");
printf(" ");
write_table32hi(crc_big_table, 256);
printf("};\n\n");
- printf("#endif\n\n");
- printf("#endif /* W */\n\n");
+ printf("# endif\n");
+ printf("#endif /* BRAID_W */\n\n");
/* write out braid tables for each value of N */
for (n = 1; n <= 6; n++) {
- printf("#if N == %d\n", n);
+ printf("#if BRAID_N == %d\n", n);
/* compute braid tables for this N and 64-bit word_t */
braid(ltl, big, n, 8);
/* write out braid tables for 64-bit z_word_t */
- printf("\n");
- printf("#if W == 8\n\n");
+ printf("# if BRAID_W == 8\n\n");
printf("static const uint32_t crc_braid_table[][256] = {\n");
for (k = 0; k < 8; k++) {
printf(" {");
/* write out braid tables for 32-bit z_word_t */
printf("\n");
- printf("#else /* W == 4 */\n\n");
+ printf("# else /* BRAID_W == 4 */\n\n");
printf("static const uint32_t crc_braid_table[][256] = {\n");
for (k = 0; k < 4; k++) {
printf(" {");
printf("}%s", k < 3 ? ",\n" : "");
}
printf("};\n\n");
- printf("#endif /* W */\n\n");
-
- printf("#endif /* N == %d */\n", n);
+ printf("# endif /* BRAID_W */\n");
+ printf("#endif /* BRAID_N == %d */\n", n);
}
printf("\n");