#ifdef X86_PCLMULQDQ_CRC
-#include "../../zbuild.h"
#include <inttypes.h>
#include <immintrin.h>
#include <wmmintrin.h>
#include "crc_folding.h"
-Z_INTERNAL void crc_fold_init(deflate_state *const s) {
+Z_INTERNAL uint32_t crc_fold_init(unsigned int crc0[4 * 5]) {
/* CRC_SAVE */
- _mm_storeu_si128((__m128i *)s->crc0 + 0, _mm_cvtsi32_si128(0x9db42487));
- _mm_storeu_si128((__m128i *)s->crc0 + 1, _mm_setzero_si128());
- _mm_storeu_si128((__m128i *)s->crc0 + 2, _mm_setzero_si128());
- _mm_storeu_si128((__m128i *)s->crc0 + 3, _mm_setzero_si128());
+ _mm_storeu_si128((__m128i *)crc0 + 0, _mm_cvtsi32_si128(0x9db42487));
+ _mm_storeu_si128((__m128i *)crc0 + 1, _mm_setzero_si128());
+ _mm_storeu_si128((__m128i *)crc0 + 2, _mm_setzero_si128());
+ _mm_storeu_si128((__m128i *)crc0 + 3, _mm_setzero_si128());
- s->strm->adler = 0;
+ return 0;
}
static void fold_1(__m128i *xmm_crc0, __m128i *xmm_crc1, __m128i *xmm_crc2, __m128i *xmm_crc3) {
*xmm_crc3 = _mm_castps_si128(ps_res);
}
-Z_INTERNAL void crc_fold_copy(deflate_state *const s, unsigned char *dst, const unsigned char *src, long len) {
+Z_INTERNAL void crc_fold_copy(unsigned int crc0[4 * 5], unsigned char *dst, const unsigned char *src, long len) {
unsigned long algn_diff;
__m128i xmm_t0, xmm_t1, xmm_t2, xmm_t3;
char ALIGNED_(16) partial_buf[16] = { 0 };
/* CRC_LOAD */
- __m128i xmm_crc0 = _mm_loadu_si128((__m128i *)s->crc0 + 0);
- __m128i xmm_crc1 = _mm_loadu_si128((__m128i *)s->crc0 + 1);
- __m128i xmm_crc2 = _mm_loadu_si128((__m128i *)s->crc0 + 2);
- __m128i xmm_crc3 = _mm_loadu_si128((__m128i *)s->crc0 + 3);
+ __m128i xmm_crc0 = _mm_loadu_si128((__m128i *)crc0 + 0);
+ __m128i xmm_crc1 = _mm_loadu_si128((__m128i *)crc0 + 1);
+ __m128i xmm_crc2 = _mm_loadu_si128((__m128i *)crc0 + 2);
+ __m128i xmm_crc3 = _mm_loadu_si128((__m128i *)crc0 + 3);
__m128i xmm_crc_part;
if (len < 16) {
partial_fold((size_t)len, &xmm_crc0, &xmm_crc1, &xmm_crc2, &xmm_crc3, &xmm_crc_part);
done:
/* CRC_SAVE */
- _mm_storeu_si128((__m128i *)s->crc0 + 0, xmm_crc0);
- _mm_storeu_si128((__m128i *)s->crc0 + 1, xmm_crc1);
- _mm_storeu_si128((__m128i *)s->crc0 + 2, xmm_crc2);
- _mm_storeu_si128((__m128i *)s->crc0 + 3, xmm_crc3);
- _mm_storeu_si128((__m128i *)s->crc0 + 4, xmm_crc_part);
+ _mm_storeu_si128((__m128i *)crc0 + 0, xmm_crc0);
+ _mm_storeu_si128((__m128i *)crc0 + 1, xmm_crc1);
+ _mm_storeu_si128((__m128i *)crc0 + 2, xmm_crc2);
+ _mm_storeu_si128((__m128i *)crc0 + 3, xmm_crc3);
+ _mm_storeu_si128((__m128i *)crc0 + 4, xmm_crc_part);
}
static const unsigned ALIGNED_(16) crc_k[] = {
0x00000000, 0xFFFFFFFF, 0xFFFFFFFF, 0xFFFFFFFF
};
-uint32_t Z_INTERNAL crc_fold_512to32(deflate_state *const s) {
+uint32_t Z_INTERNAL crc_fold_512to32(unsigned int crc0[4 * 5]) {
const __m128i xmm_mask = _mm_load_si128((__m128i *)crc_mask);
const __m128i xmm_mask2 = _mm_load_si128((__m128i *)crc_mask2);
__m128i x_tmp0, x_tmp1, x_tmp2, crc_fold;
/* CRC_LOAD */
- __m128i xmm_crc0 = _mm_loadu_si128((__m128i *)s->crc0 + 0);
- __m128i xmm_crc1 = _mm_loadu_si128((__m128i *)s->crc0 + 1);
- __m128i xmm_crc2 = _mm_loadu_si128((__m128i *)s->crc0 + 2);
- __m128i xmm_crc3 = _mm_loadu_si128((__m128i *)s->crc0 + 3);
+ __m128i xmm_crc0 = _mm_loadu_si128((__m128i *)crc0 + 0);
+ __m128i xmm_crc1 = _mm_loadu_si128((__m128i *)crc0 + 1);
+ __m128i xmm_crc2 = _mm_loadu_si128((__m128i *)crc0 + 2);
+ __m128i xmm_crc3 = _mm_loadu_si128((__m128i *)crc0 + 3);
/*
* k1
Z_INTERNAL void crc_finalize(deflate_state *const s) {
if (x86_cpu_has_pclmulqdq)
- s->strm->adler = crc_fold_512to32(s);
+ s->strm->adler = crc_fold_512to32(s->crc0);
}
#endif
#ifdef X86_PCLMULQDQ_CRC
x86_check_features();
if (x86_cpu_has_pclmulqdq) {
- crc_fold_init(s);
+ s->strm->adler = crc_fold_init(s->crc0);
return;
}
#endif
Z_INTERNAL void copy_with_crc(PREFIX3(stream) *strm, unsigned char *dst, unsigned long size) {
#ifdef X86_PCLMULQDQ_CRC
if (x86_cpu_has_pclmulqdq) {
- crc_fold_copy(strm->state, dst, strm->next_in, size);
+ crc_fold_copy(strm->state->crc0, dst, strm->next_in, size);
return;
}
#endif