#include "../../crc32_fold.h"
-Z_INTERNAL uint32_t crc32_fold_reset_pclmulqdq(crc32_fold *crc) {
- /* CRC_SAVE */
- _mm_storeu_si128((__m128i *)crc->fold + 0, _mm_cvtsi32_si128(0x9db42487));
- _mm_storeu_si128((__m128i *)crc->fold + 1, _mm_setzero_si128());
- _mm_storeu_si128((__m128i *)crc->fold + 2, _mm_setzero_si128());
- _mm_storeu_si128((__m128i *)crc->fold + 3, _mm_setzero_si128());
-
- return 0;
-}
-
static void fold_1(__m128i *xmm_crc0, __m128i *xmm_crc1, __m128i *xmm_crc2, __m128i *xmm_crc3) {
const __m128i xmm_fold4 = _mm_set_epi32( 0x00000001, 0x54442bd4,
0x00000001, 0xc6e41596);
*xmm_crc3 = _mm_castps_si128(ps_res);
}
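+/* Load the four 128-bit fold accumulators from a 16-byte aligned buffer. */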
+static inline void crc32_fold_load(__m128i *fold, __m128i *fold0, __m128i *fold1, __m128i *fold2, __m128i *fold3) {
+ *fold0 = _mm_load_si128(fold + 0);
+ *fold1 = _mm_load_si128(fold + 1);
+ *fold2 = _mm_load_si128(fold + 2);
+ *fold3 = _mm_load_si128(fold + 3);
+}
+
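+/* Store the four 128-bit fold accumulators; unaligned stores, so this also works for the copy destination. */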
+static inline void crc32_fold_save(__m128i *fold, __m128i fold0, __m128i fold1, __m128i fold2, __m128i fold3) {
+ _mm_storeu_si128(fold + 0, fold0);
+ _mm_storeu_si128(fold + 1, fold1);
+ _mm_storeu_si128(fold + 2, fold2);
+ _mm_storeu_si128(fold + 3, fold3);
+}
+
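+/* Store the partial-block accumulator in the fifth slot of the fold buffer. */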
+static inline void crc32_fold_save_partial(__m128i *fold, __m128i foldp) {
+ _mm_store_si128(fold + 4, foldp);
+}
+
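+/* Reset the fold state: seed accumulator 0 with the initial CRC fold constant and zero the remaining accumulators. */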
+Z_INTERNAL uint32_t crc32_fold_reset_pclmulqdq(crc32_fold *crc) {
+ __m128i xmm_crc0 = _mm_cvtsi32_si128(0x9db42487);
+ __m128i xmm_zero = _mm_setzero_si128();
+ crc32_fold_save((__m128i *)&crc->fold, xmm_crc0, xmm_zero, xmm_zero, xmm_zero);
+ return 0;
+}
+
Z_INTERNAL void crc32_fold_copy_pclmulqdq(crc32_fold *crc, uint8_t *dst, const uint8_t *src, size_t len) {
unsigned long algn_diff;
__m128i xmm_t0, xmm_t1, xmm_t2, xmm_t3;
+ __m128i xmm_crc0, xmm_crc1, xmm_crc2, xmm_crc3, xmm_crc_part;
char ALIGNED_(16) partial_buf[16] = { 0 };
- /* CRC_LOAD */
- __m128i xmm_crc0 = _mm_loadu_si128((__m128i *)crc->fold + 0);
- __m128i xmm_crc1 = _mm_loadu_si128((__m128i *)crc->fold + 1);
- __m128i xmm_crc2 = _mm_loadu_si128((__m128i *)crc->fold + 2);
- __m128i xmm_crc3 = _mm_loadu_si128((__m128i *)crc->fold + 3);
- __m128i xmm_crc_part;
+ crc32_fold_load((__m128i *)&crc->fold, &xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);
if (len < 16) {
if (len == 0)
return;
memcpy(partial_buf, src, len);
- xmm_crc_part = _mm_loadu_si128((const __m128i *)partial_buf);
+ xmm_crc_part = _mm_load_si128((const __m128i *)partial_buf);
memcpy(dst, partial_buf, len);
goto partial;
}
}
while (len >= 64) {
- /* CRC_LOAD */
- xmm_t0 = _mm_load_si128((__m128i *)src);
- xmm_t1 = _mm_load_si128((__m128i *)src + 1);
- xmm_t2 = _mm_load_si128((__m128i *)src + 2);
- xmm_t3 = _mm_load_si128((__m128i *)src + 3);
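+ /* Fold 64 bytes per iteration: load four vectors from src, fold the accumulators forward, copy the data to dst, then XOR it in. */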
+ crc32_fold_load((__m128i *)src, &xmm_t0, &xmm_t1, &xmm_t2, &xmm_t3);
fold_4(&xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);
- /* CRC_SAVE */
- _mm_storeu_si128((__m128i *)dst, xmm_t0);
- _mm_storeu_si128((__m128i *)dst + 1, xmm_t1);
- _mm_storeu_si128((__m128i *)dst + 2, xmm_t2);
- _mm_storeu_si128((__m128i *)dst + 3, xmm_t3);
+ crc32_fold_save((__m128i *)dst, xmm_t0, xmm_t1, xmm_t2, xmm_t3);
xmm_crc0 = _mm_xor_si128(xmm_crc0, xmm_t0);
xmm_crc1 = _mm_xor_si128(xmm_crc1, xmm_t1);
partial:
partial_fold((size_t)len, &xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3, &xmm_crc_part);
done:
- /* CRC_SAVE */
- _mm_storeu_si128((__m128i *)crc->fold + 0, xmm_crc0);
- _mm_storeu_si128((__m128i *)crc->fold + 1, xmm_crc1);
- _mm_storeu_si128((__m128i *)crc->fold + 2, xmm_crc2);
- _mm_storeu_si128((__m128i *)crc->fold + 3, xmm_crc3);
- _mm_storeu_si128((__m128i *)crc->fold + 4, xmm_crc_part);
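+ /* Write the accumulators and the partial block back into the crc32_fold state. */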
+ crc32_fold_save((__m128i *)&crc->fold, xmm_crc0, xmm_crc1, xmm_crc2, xmm_crc3);
+ crc32_fold_save_partial((__m128i *)&crc->fold, xmm_crc_part);
}
static const unsigned ALIGNED_(16) crc_k[] = {
Z_INTERNAL uint32_t crc32_fold_final_pclmulqdq(crc32_fold *crc) {
const __m128i xmm_mask = _mm_load_si128((__m128i *)crc_mask);
const __m128i xmm_mask2 = _mm_load_si128((__m128i *)crc_mask2);
-
+ __m128i xmm_crc0, xmm_crc1, xmm_crc2, xmm_crc3;
__m128i x_tmp0, x_tmp1, x_tmp2, crc_fold;
- /* CRC_LOAD */
- __m128i xmm_crc0 = _mm_loadu_si128((__m128i *)crc->fold + 0);
- __m128i xmm_crc1 = _mm_loadu_si128((__m128i *)crc->fold + 1);
- __m128i xmm_crc2 = _mm_loadu_si128((__m128i *)crc->fold + 2);
- __m128i xmm_crc3 = _mm_loadu_si128((__m128i *)crc->fold + 3);
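+ /* Reload the saved accumulators before the final reduction. */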
+ crc32_fold_load((__m128i *)&crc->fold, &xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3);
/*
* k1