/* Copyright (C) 1995-2011, 2016 Mark Adler
* Copyright (C) 2017 ARM Holdings Inc.
- * Authors:
+ * Authors:
* Adenilson Cavalcanti <adenilson.cavalcanti@arm.com>
* Adam Stylinski <kungfujesus06@gmail.com>
* For conditions of distribution and use, see copyright notice in zlib.h
40, 39, 38, 37, 36, 35, 34, 33,
32, 31, 30, 29, 28, 27, 26, 25,
24, 23, 22, 21, 20, 19, 18, 17,
- 16, 15, 14, 13, 12, 11, 10, 9,
+ 16, 15, 14, 13, 12, 11, 10, 9,
8, 7, 6, 5, 4, 3, 2, 1 };
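/* The descending constants above are presumably the per-byte weights for the
 * s2 sum: byte i of an n-byte block contributes (n - i) copies of itself to
 * s2, so the weights count down to 1 for the final byte of the block. */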
uint32x4_t adacc = vdupq_n_u32(0);
hsum_fold.val[0] = vpadalq_u8(hsum.val[0], d0_d3.val[2]);
hsum_fold.val[1] = vpadalq_u8(hsum.val[1], d0_d3.val[3]);
- adacc = vpadalq_u16(adacc, hsum_fold.val[0]);
+ adacc = vpadalq_u16(adacc, hsum_fold.val[0]);
s3acc = vaddq_u32(s3acc, adacc_prev);
- adacc = vpadalq_u16(adacc, hsum_fold.val[1]);
-
+ adacc = vpadalq_u16(adacc, hsum_fold.val[1]);
+
/* If we do straight widening additions to the 16 bit values, we don't incur
* the usual penalties of a pairwise add. We can defer the multiplications
* until the very end. These will not overflow because we are incurring at
* most 408 loop iterations (NMAX / 64), and a given lane is only going to be
* summed into once. This means for the maximum input size, the largest value
* we will see is 255 * 102 = 26010, safely under uint16 max */
- s2_0 = vaddw_u8(s2_0, vget_low_u8(d0_d3.val[0]));
+ s2_0 = vaddw_u8(s2_0, vget_low_u8(d0_d3.val[0]));
s2_1 = vaddw_high_u8(s2_1, d0_d3.val[0]);
s2_2 = vaddw_u8(s2_2, vget_low_u8(d0_d3.val[1]));
s2_3 = vaddw_high_u8(s2_3, d0_d3.val[1]);
- s2_4 = vaddw_u8(s2_4, vget_low_u8(d0_d3.val[2]));
+ s2_4 = vaddw_u8(s2_4, vget_low_u8(d0_d3.val[2]));
s2_5 = vaddw_high_u8(s2_5, d0_d3.val[2]);
s2_6 = vaddw_u8(s2_6, vget_low_u8(d0_d3.val[3]));
s2_7 = vaddw_high_u8(s2_7, d0_d3.val[3]);
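/* A minimal sketch (not the upstream code) of how the deferred multiply can
 * be applied once the loop exits: each 16-bit partial sum is multiplied by
 * its distance-from-end weight and widened into 32-bit lanes. The weights
 * pointer and the one-accumulator signature are illustrative assumptions;
 * the real routine carries eight accumulators (s2_0..s2_7) and a weights
 * table like the one shown earlier. */
#include <arm_neon.h>
#include <stdint.h>

static inline uint32x4_t apply_deferred_weights(uint32x4_t s2acc, uint16x8_t partial,
                                                const uint16_t *weights /* 8 entries */) {
    uint16x8_t w = vld1q_u16(weights);
    /* widening multiply-accumulate: 16 x 16 -> 32 bit lanes */
    s2acc = vmlal_u16(s2acc, vget_low_u16(w),  vget_low_u16(partial));
    s2acc = vmlal_u16(s2acc, vget_high_u16(w), vget_high_u16(partial));
    return s2acc;
}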
n = size / (sizeof(uint16x8_t) * 8);
do {
- p0 = vld1q_u16_x4(table);
- p1 = vld1q_u16_x4(table+32);
+ p0 = vld1q_u16_x4(table);
+ p1 = vld1q_u16_x4(table+32);
vqsubq_u16_x4_x1(p0, p0, v);
vqsubq_u16_x4_x1(p1, p1, v);
vst1q_u16_x4(table, p0);
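/* vqsubq_u16_x4_x1() is a helper whose definition is not shown here; a
 * plausible expansion (an assumption for illustration, not the actual
 * definition) is a saturating subtract of the same vector from each member
 * of a uint16x8x4_t, which is what sliding a hash table's 16-bit entries
 * requires: entries smaller than the slide distance clamp to zero instead
 * of wrapping. */
#include <arm_neon.h>

static inline uint16x8x4_t qsub_u16_x4(uint16x8x4_t a, uint16x8_t v) {
    a.val[0] = vqsubq_u16(a.val[0], v);
    a.val[1] = vqsubq_u16(a.val[1], v);
    a.val[2] = vqsubq_u16(a.val[2], v);
    a.val[3] = vqsubq_u16(a.val[3], v);
    return a;
}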
/* The compiler is generating the following sequence for this integer modulus
* when done the scalar way, in GPRs:
-
+
adler = (s1_unpack[0] % BASE) + (s1_unpack[1] % BASE) + (s1_unpack[2] % BASE) + (s1_unpack[3] % BASE) +
(s1_unpack[4] % BASE) + (s1_unpack[5] % BASE) + (s1_unpack[6] % BASE) + (s1_unpack[7] % BASE);
...
vmovd %xmm1,%esi // move vector lane 0 to 32 bit register %esi
mov %rsi,%rax // zero-extend this value to 64 bit precision in %rax
- imul %rdi,%rsi // do a signed multiplication with magic constant and vector element
+ imul %rdi,%rsi // do a signed multiplication with magic constant and vector element
shr $0x2f,%rsi // shift right by 47
- imul $0xfff1,%esi,%esi // do a signed multiplication with value truncated to 32 bits with 0xfff1
+ imul $0xfff1,%esi,%esi // do a signed multiplication with value truncated to 32 bits with 0xfff1
sub %esi,%eax // subtract lower 32 bits of original vector value from modified one above
...
// repeats for each element with vpextract instructions
This is tricky with AVX2 for a number of reasons:
1.) There's no 64 bit multiplication instruction, but there is a sequence to get there
2.) There are ways to extend vectors to 64 bit precision, but no simple way to truncate
- back down to 32 bit precision later (there is in AVX512)
+ back down to 32 bit precision later (there is in AVX512)
3.) Full width integer multiplications aren't cheap
- We can, however, and do a relatively cheap sequence for horizontal sums.
+ We can, however, do a relatively cheap sequence for horizontal sums.
Then, we simply do the integer modulus on the resulting scalar value in a 64 bit GPR. It was
previously thought that casting to 64 bit precision was needed prior to the horizontal sum, but
that is simply not the case, as NMAX is defined as the maximum number of scalar sums that can be
performed on the maximum possible inputs before overflow.
*/
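/* A sketch of the approach the comment above lands on (the helper name and
 * exact shuffle order are assumptions): reduce the eight 32-bit lanes to a
 * single scalar without phadd, then pay for exactly one scalar modulus.
 * Per the NMAX argument above, the lane total cannot overflow 32 bits. */
#include <immintrin.h>
#include <stdint.h>

static inline uint32_t hsum256_epi32(__m256i v) {
    __m128i sum = _mm_add_epi32(_mm256_castsi256_si128(v),
                                _mm256_extracti128_si256(v, 1));
    sum = _mm_add_epi32(sum, _mm_unpackhi_epi64(sum, sum)); /* lanes 0+2, 1+3 */
    sum = _mm_add_epi32(sum, _mm_shuffle_epi32(sum, 1));    /* lane 0 += lane 1 */
    return (uint32_t)_mm_cvtsi128_si32(sum);
}

/* usage, e.g.: adler0 = hsum256_epi32(vs1) % 65521; */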
-
+
/* In AVX2-land, this trip through GPRs will probably be unavoidable, as there's no cheap and easy
* conversion from 64 bit integer to 32 bit (needed for the inexpensive modulus with a constant).
* This casting to 32 bit is cheap through GPRs (just register aliasing). See above for exactly
/* For impossibly tiny sizes, use the smaller width versions. We still need
* to check for compile time support for these but they are likely there */
-#ifdef X86_SSE41_ADLER32
- if (len < 32)
+#ifdef X86_SSE41_ADLER32
+ if (len < 32)
return adler32_sse41(adler, buf, len);
#endif
static inline uint32_t partial_hsum(__m512i x) {
/* We need a permutation vector to extract every other integer. The
* rest are going to be zeros. Marking this const so the compiler stands
- * a better chance of keeping this resident in a register through entire
+ * a better chance of keeping this resident in a register through the entire
* loop execution. We certainly have enough zmm registers (32) */
const __m512i perm_vec = _mm512_setr_epi32(0, 2, 4, 6, 8, 10, 12, 14,
1, 1, 1, 1, 1, 1, 1, 1);
/* From here, it's a simple 256 bit wide reduction sum */
__m256i non_zero_avx = _mm512_castsi512_si256(non_zero);
-
+
/* See Agner Fog's vectorclass for a decent reference. Essentially, phadd is
* pretty slow, much slower than the longer instruction sequence below */
__m128i sum1 = _mm_add_epi32(_mm256_extracti128_si256(non_zero_avx, 1),
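/* The reduction typically finishes by adding this upper 128-bit half to the
 * lower one, then folding with _mm_unpackhi_epi64 and _mm_shuffle_epi32
 * before _mm_cvtsi128_si32 moves the total to a GPR; this is the same
 * shuffle-based pattern sketched after the AVX2 modulus discussion above
 * (a general description, the exact instruction order here is an assumption). */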
/* For impossibly tiny sizes, use the smaller width versions. We still need
* to check for compile time support for these but they are likely there */
-#ifdef X86_SSE41_ADLER32
- if (len < 32)
+#ifdef X86_SSE41_ADLER32
+ if (len < 32)
return adler32_sse41(adler, buf, len);
#endif
/* lop off the max number of sums based on the scalar sums done
* above */
len -= align_offset;
- max_iters -= align_offset;
+ max_iters -= align_offset;
}
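/* A hedged sketch of the prologue this bookkeeping pairs with (the 16-byte
 * alignment target and all names besides len/align_offset are assumptions):
 * do scalar sums until the pointer is aligned, then charge those bytes
 * against len and the remaining vector iteration budget as done above. */
#include <stddef.h>
#include <stdint.h>

static size_t scalar_align_prologue(const uint8_t **pbuf, size_t len,
                                    uint32_t *adler, uint32_t *sum2) {
    const uint8_t *buf = *pbuf;
    size_t align_offset = ((uintptr_t)buf & 15) ? 16 - ((uintptr_t)buf & 15) : 0;
    if (align_offset > len)
        align_offset = len;
    for (size_t i = 0; i < align_offset; ++i) {
        *adler += *buf++;   /* standard scalar adler32 step */
        *sum2  += *adler;
    }
    *pbuf = buf;
    return align_offset;    /* caller subtracts this from len and max_iters */
}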
int32_t first = init_crc != 0;
/* Technically the CRC functions don't even call this for input < 64, but a bare minimum of 31
- * bytes of input is needed for the aligning load that occurs. If there's an initial CRC, to
+ * bytes of input is needed for the aligning load that occurs. If there's an initial CRC, to
* carry it forward through the folded CRC there must be 16 - src % 16 + 16 bytes available, which
* by definition can be up to 15 bytes + one full vector load. */
assert(len >= 31 || first == 0);
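/* Worked example of the bound asserted above: at the worst misalignment,
 * src % 16 == 1, the aligning load needs 16 - 1 = 15 bytes to reach the next
 * 16-byte boundary, plus one full 16-byte vector to fold the initial CRC
 * into, i.e. 15 + 16 = 31 bytes. */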
xmm_crc2 = _mm_xor_si128(xmm_crc2, xmm_t1);
xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_t2);
len -= 48;
- src += 48;
+ src += 48;
} else if (len >= 32) {
xmm_t0 = _mm_load_si128((__m128i *)src);
xmm_t1 = _mm_load_si128((__m128i *)src + 1);
uint32_t crc32_pclmulqdq(uint32_t crc32, const unsigned char* buf, uint64_t len) {
/* For lens < 64, the crc32_byfour method is faster. The CRC32 instruction for
* these short lengths might also prove effective */
- if (len < 64)
+ if (len < 64)
return crc32_byfour(crc32, buf, len);
crc32_fold ALIGNED_(16) crc_state;
bytes_remaining -= cpy_dist;
cur_chunk += cpy_dist;
/* This allows us to bypass an expensive integer division since we're effectively
- * counting in this loop, anyway. However, we may have to derive a similarly
+ * counting in this loop, anyway. However, we may have to derive a similarly
* sensible solution if we use a permutation table that allows us to construct
* this vector in one load and one permute instruction */
chunk_mod = cpy_dist;
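/* A minimal sketch of the division-avoidance idea described above (the
 * function and parameter names are illustrative, not the surrounding code's):
 * when the running offset only grows by a step smaller than the period,
 * one compare-and-subtract keeps it in [0, dist) with no '%' at all. */
#include <stdint.h>

static inline uint64_t advance_mod(uint64_t offset, uint64_t step, uint64_t dist) {
    offset += step;                          /* assumes step < dist */
    return offset >= dist ? offset - dist : offset;
}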
endif()
endif()
endif()
- # Check whether compiler supports loading 4 neon vecs into a register range
+ # Check whether the compiler supports loading 4 NEON vectors into a register range
set(CMAKE_REQUIRED_FLAGS "${NEONFLAG}")
check_c_source_compiles(
"#ifdef _M_ARM64