}
#endif
-#ifndef WITHOUT_CHORBA
- /* Implement Chorba algorithm from https://arxiv.org/abs/2412.16398
- * We interleave the PCLMUL-base folds with 8x scaled generator
- * polynomial copies; we read 8x QWORDS and then XOR them into
- * the stream at the following offsets: 6, 9, 10, 16, 20, 22,
- * 24, 25, 27, 28, 30, 31, 32 - this is detailed in the paper
- * as "generator_64_bits_unrolled_8" */
- while (len >= 512 + 64 + 16*8) {
- __m128i chorba8 = _mm_loadu_si128((__m128i *)src);
- __m128i chorba7 = _mm_loadu_si128((__m128i *)src + 1);
- __m128i chorba6 = _mm_loadu_si128((__m128i *)src + 2);
- __m128i chorba5 = _mm_loadu_si128((__m128i *)src + 3);
- __m128i chorba4 = _mm_loadu_si128((__m128i *)src + 4);
- __m128i chorba3 = _mm_loadu_si128((__m128i *)src + 5);
- __m128i chorba2 = _mm_loadu_si128((__m128i *)src + 6);
- __m128i chorba1 = _mm_loadu_si128((__m128i *)src + 7);
+ /* Implement Chorba algorithm from https://arxiv.org/abs/2412.16398
+ * We interleave the PCLMUL-based folds with 8x scaled generator
+ * polynomial copies; we read 8x QWORDS and then XOR them into
+ * the stream at the following offsets: 6, 9, 10, 16, 20, 22,
+ * 24, 25, 27, 28, 30, 31, 32 - this is detailed in the paper
+ * as "generator_64_bits_unrolled_8" */
+ while (len >= 512 + 64 + 16*8) {
+ __m128i chorba8 = _mm_loadu_si128((__m128i *)src);
+ __m128i chorba7 = _mm_loadu_si128((__m128i *)src + 1);
+ __m128i chorba6 = _mm_loadu_si128((__m128i *)src + 2);
+ __m128i chorba5 = _mm_loadu_si128((__m128i *)src + 3);
+ __m128i chorba4 = _mm_loadu_si128((__m128i *)src + 4);
+ __m128i chorba3 = _mm_loadu_si128((__m128i *)src + 5);
+ __m128i chorba2 = _mm_loadu_si128((__m128i *)src + 6);
+ __m128i chorba1 = _mm_loadu_si128((__m128i *)src + 7);
#ifdef COPY
- _mm_storeu_si128((__m128i *)dst, chorba8);
- _mm_storeu_si128((__m128i *)dst + 1, chorba7);
- _mm_storeu_si128((__m128i *)dst + 2, chorba6);
- _mm_storeu_si128((__m128i *)dst + 3, chorba5);
- _mm_storeu_si128((__m128i *)dst + 4, chorba4);
- _mm_storeu_si128((__m128i *)dst + 5, chorba3);
- _mm_storeu_si128((__m128i *)dst + 6, chorba2);
- _mm_storeu_si128((__m128i *)dst + 7, chorba1);
- dst += 16*8;
+ _mm_storeu_si128((__m128i *)dst, chorba8);
+ _mm_storeu_si128((__m128i *)dst + 1, chorba7);
+ _mm_storeu_si128((__m128i *)dst + 2, chorba6);
+ _mm_storeu_si128((__m128i *)dst + 3, chorba5);
+ _mm_storeu_si128((__m128i *)dst + 4, chorba4);
+ _mm_storeu_si128((__m128i *)dst + 5, chorba3);
+ _mm_storeu_si128((__m128i *)dst + 6, chorba2);
+ _mm_storeu_si128((__m128i *)dst + 7, chorba1);
+ dst += 16*8;
#else
- XOR_INITIAL128(chorba8);
+ XOR_INITIAL128(chorba8);
#endif
- chorba2 = _mm_xor_si128(chorba2, chorba8);
- chorba1 = _mm_xor_si128(chorba1, chorba7);
- src += 16*8;
- len -= 16*8;
+ chorba2 = _mm_xor_si128(chorba2, chorba8);
+ chorba1 = _mm_xor_si128(chorba1, chorba7);
+ src += 16*8;
+ len -= 16*8;
- xmm_t0 = _mm_loadu_si128((__m128i *)src);
- xmm_t1 = _mm_loadu_si128((__m128i *)src + 1);
- xmm_t2 = _mm_loadu_si128((__m128i *)src + 2);
- xmm_t3 = _mm_loadu_si128((__m128i *)src + 3);
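+ /* First of eight 64-byte data blocks in this iteration (bytes 128..191,
+ * following the 128-byte chorba batch); fold_12() rather than fold_4()
+ * here appears to advance the CRC state over the chorba batch as well
+ * as this block. The remaining seven blocks use fold_4(). */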
+ xmm_t0 = _mm_loadu_si128((__m128i *)src);
+ xmm_t1 = _mm_loadu_si128((__m128i *)src + 1);
+ xmm_t2 = _mm_loadu_si128((__m128i *)src + 2);
+ xmm_t3 = _mm_loadu_si128((__m128i *)src + 3);
- fold_12(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);
+ fold_12(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);
#ifdef COPY
- _mm_storeu_si128((__m128i *)dst, xmm_t0);
- _mm_storeu_si128((__m128i *)dst + 1, xmm_t1);
- _mm_storeu_si128((__m128i *)dst + 2, xmm_t2);
- _mm_storeu_si128((__m128i *)dst + 3, xmm_t3);
- dst += 64;
+ _mm_storeu_si128((__m128i *)dst, xmm_t0);
+ _mm_storeu_si128((__m128i *)dst + 1, xmm_t1);
+ _mm_storeu_si128((__m128i *)dst + 2, xmm_t2);
+ _mm_storeu_si128((__m128i *)dst + 3, xmm_t3);
+ dst += 64;
#endif
- xmm_t0 = _mm_xor_si128(xmm_t0, chorba6);
- xmm_t1 = _mm_xor_si128(_mm_xor_si128(xmm_t1, chorba5), chorba8);
- xmm_t2 = _mm_xor_si128(_mm_xor_si128(_mm_xor_si128(xmm_t2, chorba4), chorba8), chorba7);
- xmm_t3 = _mm_xor_si128(_mm_xor_si128(_mm_xor_si128(xmm_t3, chorba3), chorba7), chorba6);
- xmm_crc0 = _mm_xor_si128(xmm_t0, xmm_crc0);
- xmm_crc1 = _mm_xor_si128(xmm_t1, xmm_crc1);
- xmm_crc2 = _mm_xor_si128(xmm_t2, xmm_crc2);
- xmm_crc3 = _mm_xor_si128(xmm_t3, xmm_crc3);
-
- xmm_t0 = _mm_loadu_si128((__m128i *)src + 4);
- xmm_t1 = _mm_loadu_si128((__m128i *)src + 5);
- xmm_t2 = _mm_loadu_si128((__m128i *)src + 6);
- xmm_t3 = _mm_loadu_si128((__m128i *)src + 7);
-
- fold_4(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);
+ xmm_t0 = _mm_xor_si128(xmm_t0, chorba6);
+ xmm_t1 = _mm_xor_si128(_mm_xor_si128(xmm_t1, chorba5), chorba8);
+ xmm_t2 = _mm_xor_si128(_mm_xor_si128(_mm_xor_si128(xmm_t2, chorba4), chorba8), chorba7);
+ xmm_t3 = _mm_xor_si128(_mm_xor_si128(_mm_xor_si128(xmm_t3, chorba3), chorba7), chorba6);
+ xmm_crc0 = _mm_xor_si128(xmm_t0, xmm_crc0);
+ xmm_crc1 = _mm_xor_si128(xmm_t1, xmm_crc1);
+ xmm_crc2 = _mm_xor_si128(xmm_t2, xmm_crc2);
+ xmm_crc3 = _mm_xor_si128(xmm_t3, xmm_crc3);
+
+ xmm_t0 = _mm_loadu_si128((__m128i *)src + 4);
+ xmm_t1 = _mm_loadu_si128((__m128i *)src + 5);
+ xmm_t2 = _mm_loadu_si128((__m128i *)src + 6);
+ xmm_t3 = _mm_loadu_si128((__m128i *)src + 7);
+
+ fold_4(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);
#ifdef COPY
- _mm_storeu_si128((__m128i *)dst, xmm_t0);
- _mm_storeu_si128((__m128i *)dst + 1, xmm_t1);
- _mm_storeu_si128((__m128i *)dst + 2, xmm_t2);
- _mm_storeu_si128((__m128i *)dst + 3, xmm_t3);
- dst += 64;
+ _mm_storeu_si128((__m128i *)dst, xmm_t0);
+ _mm_storeu_si128((__m128i *)dst + 1, xmm_t1);
+ _mm_storeu_si128((__m128i *)dst + 2, xmm_t2);
+ _mm_storeu_si128((__m128i *)dst + 3, xmm_t3);
+ dst += 64;
#endif
- xmm_t0 = _mm_xor_si128(_mm_xor_si128(_mm_xor_si128(xmm_t0, chorba2), chorba6), chorba5);
- xmm_t1 = _mm_xor_si128(_mm_xor_si128(_mm_xor_si128(xmm_t1, chorba1), chorba4), chorba5);
- xmm_t2 = _mm_xor_si128(_mm_xor_si128(xmm_t2, chorba3), chorba4);
- xmm_t3 = _mm_xor_si128(_mm_xor_si128(xmm_t3, chorba2), chorba3);
- xmm_crc0 = _mm_xor_si128(xmm_t0, xmm_crc0);
- xmm_crc1 = _mm_xor_si128(xmm_t1, xmm_crc1);
- xmm_crc2 = _mm_xor_si128(xmm_t2, xmm_crc2);
- xmm_crc3 = _mm_xor_si128(xmm_t3, xmm_crc3);
-
- xmm_t0 = _mm_loadu_si128((__m128i *)src + 8);
- xmm_t1 = _mm_loadu_si128((__m128i *)src + 9);
- xmm_t2 = _mm_loadu_si128((__m128i *)src + 10);
- xmm_t3 = _mm_loadu_si128((__m128i *)src + 11);
-
- fold_4(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);
+ xmm_t0 = _mm_xor_si128(_mm_xor_si128(_mm_xor_si128(xmm_t0, chorba2), chorba6), chorba5);
+ xmm_t1 = _mm_xor_si128(_mm_xor_si128(_mm_xor_si128(xmm_t1, chorba1), chorba4), chorba5);
+ xmm_t2 = _mm_xor_si128(_mm_xor_si128(xmm_t2, chorba3), chorba4);
+ xmm_t3 = _mm_xor_si128(_mm_xor_si128(xmm_t3, chorba2), chorba3);
+ xmm_crc0 = _mm_xor_si128(xmm_t0, xmm_crc0);
+ xmm_crc1 = _mm_xor_si128(xmm_t1, xmm_crc1);
+ xmm_crc2 = _mm_xor_si128(xmm_t2, xmm_crc2);
+ xmm_crc3 = _mm_xor_si128(xmm_t3, xmm_crc3);
+
+ xmm_t0 = _mm_loadu_si128((__m128i *)src + 8);
+ xmm_t1 = _mm_loadu_si128((__m128i *)src + 9);
+ xmm_t2 = _mm_loadu_si128((__m128i *)src + 10);
+ xmm_t3 = _mm_loadu_si128((__m128i *)src + 11);
+
+ fold_4(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);
#ifdef COPY
- _mm_storeu_si128((__m128i *)dst, xmm_t0);
- _mm_storeu_si128((__m128i *)dst + 1, xmm_t1);
- _mm_storeu_si128((__m128i *)dst + 2, xmm_t2);
- _mm_storeu_si128((__m128i *)dst + 3, xmm_t3);
- dst += 64;
+ _mm_storeu_si128((__m128i *)dst, xmm_t0);
+ _mm_storeu_si128((__m128i *)dst + 1, xmm_t1);
+ _mm_storeu_si128((__m128i *)dst + 2, xmm_t2);
+ _mm_storeu_si128((__m128i *)dst + 3, xmm_t3);
+ dst += 64;
#endif
- xmm_t0 = _mm_xor_si128(_mm_xor_si128(_mm_xor_si128(xmm_t0, chorba1), chorba2), chorba8);
- xmm_t1 = _mm_xor_si128(_mm_xor_si128(xmm_t1, chorba1), chorba7);
- xmm_t2 = _mm_xor_si128(xmm_t2, chorba6);
- xmm_t3 = _mm_xor_si128(xmm_t3, chorba5);
- xmm_crc0 = _mm_xor_si128(xmm_t0, xmm_crc0);
- xmm_crc1 = _mm_xor_si128(xmm_t1, xmm_crc1);
- xmm_crc2 = _mm_xor_si128(xmm_t2, xmm_crc2);
- xmm_crc3 = _mm_xor_si128(xmm_t3, xmm_crc3);
-
- xmm_t0 = _mm_loadu_si128((__m128i *)src + 12);
- xmm_t1 = _mm_loadu_si128((__m128i *)src + 13);
- xmm_t2 = _mm_loadu_si128((__m128i *)src + 14);
- xmm_t3 = _mm_loadu_si128((__m128i *)src + 15);
-
- fold_4(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);
+ xmm_t0 = _mm_xor_si128(_mm_xor_si128(_mm_xor_si128(xmm_t0, chorba1), chorba2), chorba8);
+ xmm_t1 = _mm_xor_si128(_mm_xor_si128(xmm_t1, chorba1), chorba7);
+ xmm_t2 = _mm_xor_si128(xmm_t2, chorba6);
+ xmm_t3 = _mm_xor_si128(xmm_t3, chorba5);
+ xmm_crc0 = _mm_xor_si128(xmm_t0, xmm_crc0);
+ xmm_crc1 = _mm_xor_si128(xmm_t1, xmm_crc1);
+ xmm_crc2 = _mm_xor_si128(xmm_t2, xmm_crc2);
+ xmm_crc3 = _mm_xor_si128(xmm_t3, xmm_crc3);
+
+ xmm_t0 = _mm_loadu_si128((__m128i *)src + 12);
+ xmm_t1 = _mm_loadu_si128((__m128i *)src + 13);
+ xmm_t2 = _mm_loadu_si128((__m128i *)src + 14);
+ xmm_t3 = _mm_loadu_si128((__m128i *)src + 15);
+
+ fold_4(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);
#ifdef COPY
- _mm_storeu_si128((__m128i *)dst, xmm_t0);
- _mm_storeu_si128((__m128i *)dst + 1, xmm_t1);
- _mm_storeu_si128((__m128i *)dst + 2, xmm_t2);
- _mm_storeu_si128((__m128i *)dst + 3, xmm_t3);
- dst += 64;
+ _mm_storeu_si128((__m128i *)dst, xmm_t0);
+ _mm_storeu_si128((__m128i *)dst + 1, xmm_t1);
+ _mm_storeu_si128((__m128i *)dst + 2, xmm_t2);
+ _mm_storeu_si128((__m128i *)dst + 3, xmm_t3);
+ dst += 64;
#endif
- xmm_t0 = _mm_xor_si128(_mm_xor_si128(xmm_t0, chorba4), chorba8);
- xmm_t1 = _mm_xor_si128(_mm_xor_si128(_mm_xor_si128(xmm_t1, chorba3), chorba8), chorba7);
- xmm_t2 = _mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(xmm_t2, chorba2), chorba8), chorba7), chorba6);
- xmm_t3 = _mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(xmm_t3, chorba1), chorba7), chorba6), chorba5);
- xmm_crc0 = _mm_xor_si128(xmm_t0, xmm_crc0);
- xmm_crc1 = _mm_xor_si128(xmm_t1, xmm_crc1);
- xmm_crc2 = _mm_xor_si128(xmm_t2, xmm_crc2);
- xmm_crc3 = _mm_xor_si128(xmm_t3, xmm_crc3);
-
- xmm_t0 = _mm_loadu_si128((__m128i *)src + 16);
- xmm_t1 = _mm_loadu_si128((__m128i *)src + 17);
- xmm_t2 = _mm_loadu_si128((__m128i *)src + 18);
- xmm_t3 = _mm_loadu_si128((__m128i *)src + 19);
-
- fold_4(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);
+ xmm_t0 = _mm_xor_si128(_mm_xor_si128(xmm_t0, chorba4), chorba8);
+ xmm_t1 = _mm_xor_si128(_mm_xor_si128(_mm_xor_si128(xmm_t1, chorba3), chorba8), chorba7);
+ xmm_t2 = _mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(xmm_t2, chorba2), chorba8), chorba7), chorba6);
+ xmm_t3 = _mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(xmm_t3, chorba1), chorba7), chorba6), chorba5);
+ xmm_crc0 = _mm_xor_si128(xmm_t0, xmm_crc0);
+ xmm_crc1 = _mm_xor_si128(xmm_t1, xmm_crc1);
+ xmm_crc2 = _mm_xor_si128(xmm_t2, xmm_crc2);
+ xmm_crc3 = _mm_xor_si128(xmm_t3, xmm_crc3);
+
+ xmm_t0 = _mm_loadu_si128((__m128i *)src + 16);
+ xmm_t1 = _mm_loadu_si128((__m128i *)src + 17);
+ xmm_t2 = _mm_loadu_si128((__m128i *)src + 18);
+ xmm_t3 = _mm_loadu_si128((__m128i *)src + 19);
+
+ fold_4(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);
#ifdef COPY
- _mm_storeu_si128((__m128i *)dst, xmm_t0);
- _mm_storeu_si128((__m128i *)dst + 1, xmm_t1);
- _mm_storeu_si128((__m128i *)dst + 2, xmm_t2);
- _mm_storeu_si128((__m128i *)dst + 3, xmm_t3);
- dst += 64;
+ _mm_storeu_si128((__m128i *)dst, xmm_t0);
+ _mm_storeu_si128((__m128i *)dst + 1, xmm_t1);
+ _mm_storeu_si128((__m128i *)dst + 2, xmm_t2);
+ _mm_storeu_si128((__m128i *)dst + 3, xmm_t3);
+ dst += 64;
#endif
- xmm_t0 = _mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(xmm_t0, chorba4), chorba8), chorba6), chorba5);
- xmm_t1 = _mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(xmm_t1, chorba3), chorba4), chorba8), chorba7), chorba5);
- xmm_t2 = _mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(xmm_t2, chorba2), chorba3), chorba4), chorba7), chorba6);
- xmm_t3 = _mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(xmm_t3, chorba1), chorba2), chorba3), chorba8), chorba6), chorba5);
- xmm_crc0 = _mm_xor_si128(xmm_t0, xmm_crc0);
- xmm_crc1 = _mm_xor_si128(xmm_t1, xmm_crc1);
- xmm_crc2 = _mm_xor_si128(xmm_t2, xmm_crc2);
- xmm_crc3 = _mm_xor_si128(xmm_t3, xmm_crc3);
-
- xmm_t0 = _mm_loadu_si128((__m128i *)src + 20);
- xmm_t1 = _mm_loadu_si128((__m128i *)src + 21);
- xmm_t2 = _mm_loadu_si128((__m128i *)src + 22);
- xmm_t3 = _mm_loadu_si128((__m128i *)src + 23);
-
- fold_4(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);
+ xmm_t0 = _mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(xmm_t0, chorba4), chorba8), chorba6), chorba5);
+ xmm_t1 = _mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(xmm_t1, chorba3), chorba4), chorba8), chorba7), chorba5);
+ xmm_t2 = _mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(xmm_t2, chorba2), chorba3), chorba4), chorba7), chorba6);
+ xmm_t3 = _mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(xmm_t3, chorba1), chorba2), chorba3), chorba8), chorba6), chorba5);
+ xmm_crc0 = _mm_xor_si128(xmm_t0, xmm_crc0);
+ xmm_crc1 = _mm_xor_si128(xmm_t1, xmm_crc1);
+ xmm_crc2 = _mm_xor_si128(xmm_t2, xmm_crc2);
+ xmm_crc3 = _mm_xor_si128(xmm_t3, xmm_crc3);
+
+ xmm_t0 = _mm_loadu_si128((__m128i *)src + 20);
+ xmm_t1 = _mm_loadu_si128((__m128i *)src + 21);
+ xmm_t2 = _mm_loadu_si128((__m128i *)src + 22);
+ xmm_t3 = _mm_loadu_si128((__m128i *)src + 23);
+
+ fold_4(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);
#ifdef COPY
- _mm_storeu_si128((__m128i *)dst, xmm_t0);
- _mm_storeu_si128((__m128i *)dst + 1, xmm_t1);
- _mm_storeu_si128((__m128i *)dst + 2, xmm_t2);
- _mm_storeu_si128((__m128i *)dst + 3, xmm_t3);
- dst += 64;
+ _mm_storeu_si128((__m128i *)dst, xmm_t0);
+ _mm_storeu_si128((__m128i *)dst + 1, xmm_t1);
+ _mm_storeu_si128((__m128i *)dst + 2, xmm_t2);
+ _mm_storeu_si128((__m128i *)dst + 3, xmm_t3);
+ dst += 64;
#endif
- xmm_t0 = _mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(xmm_t0, chorba1), chorba2), chorba4), chorba8), chorba7), chorba5);
- xmm_t1 = _mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(xmm_t1, chorba1), chorba3), chorba4), chorba7), chorba6);
- xmm_t2 = _mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(xmm_t2, chorba2), chorba3), chorba8), chorba6), chorba5);
- xmm_t3 = _mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(xmm_t3, chorba1), chorba2), chorba4), chorba8), chorba7), chorba5);
- xmm_crc0 = _mm_xor_si128(xmm_t0, xmm_crc0);
- xmm_crc1 = _mm_xor_si128(xmm_t1, xmm_crc1);
- xmm_crc2 = _mm_xor_si128(xmm_t2, xmm_crc2);
- xmm_crc3 = _mm_xor_si128(xmm_t3, xmm_crc3);
-
- xmm_t0 = _mm_loadu_si128((__m128i *)src + 24);
- xmm_t1 = _mm_loadu_si128((__m128i *)src + 25);
- xmm_t2 = _mm_loadu_si128((__m128i *)src + 26);
- xmm_t3 = _mm_loadu_si128((__m128i *)src + 27);
-
- fold_4(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);
+ xmm_t0 = _mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(xmm_t0, chorba1), chorba2), chorba4), chorba8), chorba7), chorba5);
+ xmm_t1 = _mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(xmm_t1, chorba1), chorba3), chorba4), chorba7), chorba6);
+ xmm_t2 = _mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(xmm_t2, chorba2), chorba3), chorba8), chorba6), chorba5);
+ xmm_t3 = _mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(xmm_t3, chorba1), chorba2), chorba4), chorba8), chorba7), chorba5);
+ xmm_crc0 = _mm_xor_si128(xmm_t0, xmm_crc0);
+ xmm_crc1 = _mm_xor_si128(xmm_t1, xmm_crc1);
+ xmm_crc2 = _mm_xor_si128(xmm_t2, xmm_crc2);
+ xmm_crc3 = _mm_xor_si128(xmm_t3, xmm_crc3);
+
+ xmm_t0 = _mm_loadu_si128((__m128i *)src + 24);
+ xmm_t1 = _mm_loadu_si128((__m128i *)src + 25);
+ xmm_t2 = _mm_loadu_si128((__m128i *)src + 26);
+ xmm_t3 = _mm_loadu_si128((__m128i *)src + 27);
+
+ fold_4(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);
#ifdef COPY
- _mm_storeu_si128((__m128i *)dst, xmm_t0);
- _mm_storeu_si128((__m128i *)dst + 1, xmm_t1);
- _mm_storeu_si128((__m128i *)dst + 2, xmm_t2);
- _mm_storeu_si128((__m128i *)dst + 3, xmm_t3);
- dst += 64;
+ _mm_storeu_si128((__m128i *)dst, xmm_t0);
+ _mm_storeu_si128((__m128i *)dst + 1, xmm_t1);
+ _mm_storeu_si128((__m128i *)dst + 2, xmm_t2);
+ _mm_storeu_si128((__m128i *)dst + 3, xmm_t3);
+ dst += 64;
#endif
- xmm_t0 = _mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(xmm_t0, chorba1), chorba3), chorba4), chorba8), chorba7), chorba6);
- xmm_t1 = _mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(xmm_t1, chorba2), chorba3), chorba7), chorba6), chorba5);
- xmm_t2 = _mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(xmm_t2, chorba1), chorba2), chorba4), chorba6), chorba5);
- xmm_t3 = _mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(xmm_t3, chorba1), chorba3), chorba4), chorba5);
- xmm_crc0 = _mm_xor_si128(xmm_t0, xmm_crc0);
- xmm_crc1 = _mm_xor_si128(xmm_t1, xmm_crc1);
- xmm_crc2 = _mm_xor_si128(xmm_t2, xmm_crc2);
- xmm_crc3 = _mm_xor_si128(xmm_t3, xmm_crc3);
-
- xmm_t0 = _mm_loadu_si128((__m128i *)src + 28);
- xmm_t1 = _mm_loadu_si128((__m128i *)src + 29);
- xmm_t2 = _mm_loadu_si128((__m128i *)src + 30);
- xmm_t3 = _mm_loadu_si128((__m128i *)src + 31);
-
- fold_4(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);
+ xmm_t0 = _mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(xmm_t0, chorba1), chorba3), chorba4), chorba8), chorba7), chorba6);
+ xmm_t1 = _mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(xmm_t1, chorba2), chorba3), chorba7), chorba6), chorba5);
+ xmm_t2 = _mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(xmm_t2, chorba1), chorba2), chorba4), chorba6), chorba5);
+ xmm_t3 = _mm_xor_si128(_mm_xor_si128(_mm_xor_si128(_mm_xor_si128(xmm_t3, chorba1), chorba3), chorba4), chorba5);
+ xmm_crc0 = _mm_xor_si128(xmm_t0, xmm_crc0);
+ xmm_crc1 = _mm_xor_si128(xmm_t1, xmm_crc1);
+ xmm_crc2 = _mm_xor_si128(xmm_t2, xmm_crc2);
+ xmm_crc3 = _mm_xor_si128(xmm_t3, xmm_crc3);
+
+ xmm_t0 = _mm_loadu_si128((__m128i *)src + 28);
+ xmm_t1 = _mm_loadu_si128((__m128i *)src + 29);
+ xmm_t2 = _mm_loadu_si128((__m128i *)src + 30);
+ xmm_t3 = _mm_loadu_si128((__m128i *)src + 31);
+
+ fold_4(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);
#ifdef COPY
- _mm_storeu_si128((__m128i *)dst, xmm_t0);
- _mm_storeu_si128((__m128i *)dst + 1, xmm_t1);
- _mm_storeu_si128((__m128i *)dst + 2, xmm_t2);
- _mm_storeu_si128((__m128i *)dst + 3, xmm_t3);
- dst += 64;
+ _mm_storeu_si128((__m128i *)dst, xmm_t0);
+ _mm_storeu_si128((__m128i *)dst + 1, xmm_t1);
+ _mm_storeu_si128((__m128i *)dst + 2, xmm_t2);
+ _mm_storeu_si128((__m128i *)dst + 3, xmm_t3);
+ dst += 64;
#endif
- xmm_t0 = _mm_xor_si128(_mm_xor_si128(_mm_xor_si128(xmm_t0, chorba2), chorba3), chorba4);
- xmm_t1 = _mm_xor_si128(_mm_xor_si128(_mm_xor_si128(xmm_t1, chorba1), chorba2), chorba3);
- xmm_t2 = _mm_xor_si128(_mm_xor_si128(xmm_t2, chorba1), chorba2);
- xmm_t3 = _mm_xor_si128(xmm_t3, chorba1);
- xmm_crc0 = _mm_xor_si128(xmm_t0, xmm_crc0);
- xmm_crc1 = _mm_xor_si128(xmm_t1, xmm_crc1);
- xmm_crc2 = _mm_xor_si128(xmm_t2, xmm_crc2);
- xmm_crc3 = _mm_xor_si128(xmm_t3, xmm_crc3);
-
- len -= 512;
- src += 512;
- }
-#endif /* WITHOUT_CHORBA */
+ xmm_t0 = _mm_xor_si128(_mm_xor_si128(_mm_xor_si128(xmm_t0, chorba2), chorba3), chorba4);
+ xmm_t1 = _mm_xor_si128(_mm_xor_si128(_mm_xor_si128(xmm_t1, chorba1), chorba2), chorba3);
+ xmm_t2 = _mm_xor_si128(_mm_xor_si128(xmm_t2, chorba1), chorba2);
+ xmm_t3 = _mm_xor_si128(xmm_t3, chorba1);
+ xmm_crc0 = _mm_xor_si128(xmm_t0, xmm_crc0);
+ xmm_crc1 = _mm_xor_si128(xmm_t1, xmm_crc1);
+ xmm_crc2 = _mm_xor_si128(xmm_t2, xmm_crc2);
+ xmm_crc3 = _mm_xor_si128(xmm_t3, xmm_crc3);
+
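+ /* The eight blocks above account for 512 bytes; the 128-byte chorba
+ * batch was already deducted from len (and added to src) right after it
+ * was loaded, so a full iteration consumes 640 bytes. */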
+ len -= 512;
+ src += 512;
+ }
while (len >= 64) {
len -= 64;