From: Dženan Zukić
Date: Mon, 25 Apr 2022 18:42:11 +0000 (-0400)
Subject: Remove trailing whitespace in several source code files
X-Git-Tag: 2.1.0-beta1~263
X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=ae433e7ee1fc70c507f7d5ce5a19c6f704bf4acf;p=thirdparty%2Fzlib-ng.git

Remove trailing whitespace in several source code files
---

diff --git a/arch/arm/adler32_neon.c b/arch/arm/adler32_neon.c
index dfbd54de..7c2c51fa 100644
--- a/arch/arm/adler32_neon.c
+++ b/arch/arm/adler32_neon.c
@@ -1,6 +1,6 @@
 /* Copyright (C) 1995-2011, 2016 Mark Adler
  * Copyright (C) 2017 ARM Holdings Inc.
- * Authors: 
+ * Authors:
  *   Adenilson Cavalcanti
  *   Adam Stylinski
  * For conditions of distribution and use, see copyright notice in zlib.h
@@ -23,7 +23,7 @@ static void NEON_accum32(uint32_t *s, const unsigned char *buf, size_t len) {
         40, 39, 38, 37, 36, 35, 34, 33,
         32, 31, 30, 29, 28, 27, 26, 25,
         24, 23, 22, 21, 20, 19, 18, 17,
-        16, 15, 14, 13, 12, 11, 10,  9, 
+        16, 15, 14, 13, 12, 11, 10,  9,
          8,  7,  6,  5,  4,  3,  2,  1 };
 
     uint32x4_t adacc = vdupq_n_u32(0);
@@ -59,21 +59,21 @@ static void NEON_accum32(uint32_t *s, const unsigned char *buf, size_t len) {
         hsum_fold.val[0] = vpadalq_u8(hsum.val[0], d0_d3.val[2]);
         hsum_fold.val[1] = vpadalq_u8(hsum.val[1], d0_d3.val[3]);
 
-        adacc = vpadalq_u16(adacc, hsum_fold.val[0]); 
+        adacc = vpadalq_u16(adacc, hsum_fold.val[0]);
         s3acc = vaddq_u32(s3acc, adacc_prev);
-        adacc = vpadalq_u16(adacc, hsum_fold.val[1]); 
-        
+        adacc = vpadalq_u16(adacc, hsum_fold.val[1]);
+
         /* If we do straight widening additions to the 16 bit values, we don't incur
          * the usual penalties of a pairwise add. We can defer the multiplications
          * until the very end. These will not overflow because we are incurring at
          * most 408 loop iterations (NMAX / 64), and a given lane is only going to be
          * summed into once. This means for the maximum input size, the largest value
          * we will see is 255 * 102 = 26010, safely under uint16 max */
-        s2_0 = vaddw_u8(s2_0, vget_low_u8(d0_d3.val[0])); 
+        s2_0 = vaddw_u8(s2_0, vget_low_u8(d0_d3.val[0]));
         s2_1 = vaddw_high_u8(s2_1, d0_d3.val[0]);
         s2_2 = vaddw_u8(s2_2, vget_low_u8(d0_d3.val[1]));
         s2_3 = vaddw_high_u8(s2_3, d0_d3.val[1]);
-        s2_4 = vaddw_u8(s2_4, vget_low_u8(d0_d3.val[2])); 
+        s2_4 = vaddw_u8(s2_4, vget_low_u8(d0_d3.val[2]));
         s2_5 = vaddw_high_u8(s2_5, d0_d3.val[2]);
         s2_6 = vaddw_u8(s2_6, vget_low_u8(d0_d3.val[3]));
         s2_7 = vaddw_high_u8(s2_7, d0_d3.val[3]);
diff --git a/arch/arm/slide_hash_neon.c b/arch/arm/slide_hash_neon.c
index 6ff7a0bb..8dc88737 100644
--- a/arch/arm/slide_hash_neon.c
+++ b/arch/arm/slide_hash_neon.c
@@ -32,8 +32,8 @@ static inline void slide_hash_chain(Pos *table, uint32_t entries, uint16_t wsize
     n = size / (sizeof(uint16x8_t) * 8);
 
     do {
-        p0 = vld1q_u16_x4(table); 
-        p1 = vld1q_u16_x4(table+32); 
+        p0 = vld1q_u16_x4(table);
+        p1 = vld1q_u16_x4(table+32);
         vqsubq_u16_x4_x1(p0, p0, v);
         vqsubq_u16_x4_x1(p1, p1, v);
         vst1q_u16_x4(table, p0);
diff --git a/arch/x86/adler32_avx2.c b/arch/x86/adler32_avx2.c
index bd3547d0..50cea317 100644
--- a/arch/x86/adler32_avx2.c
+++ b/arch/x86/adler32_avx2.c
@@ -92,7 +92,7 @@ Z_INTERNAL uint32_t adler32_avx2(uint32_t adler, const unsigned char *buf, size_
 
     /* The compiler is generating the following sequence for this integer modulus
      * when done the scalar way, in GPRs:
-     
+
      adler = (s1_unpack[0] % BASE) + (s1_unpack[1] % BASE) + (s1_unpack[2] % BASE) + (s1_unpack[3] % BASE) +
              (s1_unpack[4] % BASE) + (s1_unpack[5] % BASE) + (s1_unpack[6] % BASE) + (s1_unpack[7] % BASE);
 
@@ -100,9 +100,9 @@ Z_INTERNAL uint32_t adler32_avx2(uint32_t adler, const unsigned char *buf, size_
     ...
     vmovd %xmm1,%esi        // move vector lane 0 to 32 bit register %esi
     mov %rsi,%rax           // zero-extend this value to 64 bit precision in %rax
-    imul %rdi,%rsi          // do a signed multiplication with magic constant and vector element 
+    imul %rdi,%rsi          // do a signed multiplication with magic constant and vector element
     shr $0x2f,%rsi          // shift right by 47
-    imul $0xfff1,%esi,%esi  // do a signed multiplication with value truncated to 32 bits with 0xfff1 
+    imul $0xfff1,%esi,%esi  // do a signed multiplication with value truncated to 32 bits with 0xfff1
     sub %esi,%eax           // subtract lower 32 bits of original vector value from modified one above
     ...                     // repeats for each element with vpextract instructions
 
@@ -110,17 +110,17 @@ Z_INTERNAL uint32_t adler32_avx2(uint32_t adler, const unsigned char *buf, size_
     This is tricky with AVX2 for a number of reasons:
         1.) There's no 64 bit multiplication instruction, but there is a sequence to get there
         2.) There's ways to extend vectors to 64 bit precision, but no simple way to truncate
-            back down to 32 bit precision later (there is in AVX512) 
+            back down to 32 bit precision later (there is in AVX512)
         3.) Full width integer multiplications aren't cheap
 
-    We can, however, and do a relatively cheap sequence for horizontal sums. 
+    We can, however, and do a relatively cheap sequence for horizontal sums.
     Then, we simply do the integer modulus on the resulting 64 bit GPR, on a scalar value.
     It was previously thought that casting to 64 bit precision was needed prior to
     the horizontal sum, but that is simply not the case, as NMAX is defined as the
     maximum number of scalar sums that can be performed on the maximum possible
     inputs before overflow */
-    
+
     /* In AVX2-land, this trip through GPRs will probably be unvoidable, as there's no cheap and easy
      * conversion from 64 bit integer to 32 bit (needed for the inexpensive modulus with a constant).
      * This casting to 32 bit is cheap through GPRs (just register aliasing). See above for exactly
diff --git a/arch/x86/adler32_avx512.c b/arch/x86/adler32_avx512.c
index 05f8068a..5571be45 100644
--- a/arch/x86/adler32_avx512.c
+++ b/arch/x86/adler32_avx512.c
@@ -19,8 +19,8 @@ Z_INTERNAL uint32_t adler32_avx512(uint32_t adler, const unsigned char *buf, siz
 
     /* For impossibly tiny sizes, use the smaller width versions. We still need
      * to check for compile time support for these but they are likely there */
-#ifdef X86_SSE41_ADLER32 
-    if (len < 32) 
+#ifdef X86_SSE41_ADLER32
+    if (len < 32)
         return adler32_sse41(adler, buf, len);
 #endif
 
diff --git a/arch/x86/adler32_avx512_p.h b/arch/x86/adler32_avx512_p.h
index 3751a449..5b79d2ab 100644
--- a/arch/x86/adler32_avx512_p.h
+++ b/arch/x86/adler32_avx512_p.h
@@ -24,7 +24,7 @@ static inline uint32_t _mm512_reduce_add_epu32(__m512i x) {
 static inline uint32_t partial_hsum(__m512i x) {
     /* We need a permutation vector to extract every other integer. The
      * rest are going to be zeros. Marking this const so the compiler stands
-     * a better chance of keeping this resident in a register through entire 
+     * a better chance of keeping this resident in a register through entire
      * loop execution. We certainly have enough zmm registers (32) */
     const __m512i perm_vec = _mm512_setr_epi32(0, 2, 4, 6, 8, 10, 12, 14,
                                                1, 1, 1, 1, 1, 1, 1, 1);
@@ -33,7 +33,7 @@ static inline uint32_t partial_hsum(__m512i x) {
 
     /* From here, it's a simple 256 bit wide reduction sum */
     __m256i non_zero_avx = _mm512_castsi512_si256(non_zero);
-    
+
     /* See Agner Fog's vectorclass for a decent reference. Essentially, phadd is
      * pretty slow, much slower than the longer instruction sequence below */
     __m128i sum1 = _mm_add_epi32(_mm256_extracti128_si256(non_zero_avx, 1),
diff --git a/arch/x86/adler32_avx512_vnni.c b/arch/x86/adler32_avx512_vnni.c
index 180f7f41..253eed9c 100644
--- a/arch/x86/adler32_avx512_vnni.c
+++ b/arch/x86/adler32_avx512_vnni.c
@@ -20,8 +20,8 @@ Z_INTERNAL uint32_t adler32_avx512_vnni(uint32_t adler, const unsigned char *buf
 
     /* For impossibly tiny sizes, use the smaller width versions. We still need
      * to check for compile time support for these but they are likely there */
-#ifdef X86_SSE41_ADLER32 
-    if (len < 32) 
+#ifdef X86_SSE41_ADLER32
+    if (len < 32)
         return adler32_sse41(adler, buf, len);
 #endif
 
diff --git a/arch/x86/adler32_ssse3.c b/arch/x86/adler32_ssse3.c
index 014c89a8..b2ef0115 100644
--- a/arch/x86/adler32_ssse3.c
+++ b/arch/x86/adler32_ssse3.c
@@ -87,7 +87,7 @@ Z_INTERNAL uint32_t adler32_ssse3(uint32_t adler, const unsigned char *buf, size
 
         /* lop off the max number of sums based on the scalar sums done
          * above */
         len -= align_offset;
-        max_iters -= align_offset; 
+        max_iters -= align_offset;
     }
 
diff --git a/arch/x86/crc32_fold_pclmulqdq.c b/arch/x86/crc32_fold_pclmulqdq.c
index 9072a47e..800e3d17 100644
--- a/arch/x86/crc32_fold_pclmulqdq.c
+++ b/arch/x86/crc32_fold_pclmulqdq.c
@@ -402,7 +402,7 @@ Z_INTERNAL void crc32_fold_pclmulqdq(crc32_fold *crc, const uint8_t *src, size_t
     int32_t first = init_crc != 0;
 
     /* Technically the CRC functions don't even call this for input < 64, but a bare minimum of 31
-     * bytes of input is needed for the aligning load that occurs. If there's an initial CRC, to 
+     * bytes of input is needed for the aligning load that occurs. If there's an initial CRC, to
      * carry it forward through the folded CRC there must be 16 - src % 16 + 16 bytes available, which
      * by definition can be up to 15 bytes + one full vector load. */
     assert(len >= 31 || first == 0);
@@ -477,7 +477,7 @@ Z_INTERNAL void crc32_fold_pclmulqdq(crc32_fold *crc, const uint8_t *src, size_t
         xmm_crc2 = _mm_xor_si128(xmm_crc2, xmm_t1);
         xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_t2);
         len -= 48;
-        src += 48; 
+        src += 48;
     } else if (len >= 32) {
         xmm_t0 = _mm_load_si128((__m128i *)src);
         xmm_t1 = _mm_load_si128((__m128i *)src + 1);
@@ -596,7 +596,7 @@ Z_INTERNAL uint32_t crc32_fold_final_pclmulqdq(crc32_fold *crc) {
 uint32_t crc32_pclmulqdq(uint32_t crc32, const unsigned char* buf, uint64_t len) {
     /* For lens < 64, crc32_byfour method is faster. The CRC32 instruction for
      * these short lengths might also prove to be effective */
-    if (len < 64) 
+    if (len < 64)
         return crc32_byfour(crc32, buf, len);
 
     crc32_fold ALIGNED_(16) crc_state;
diff --git a/chunkset_tpl.h b/chunkset_tpl.h
index f0539921..25ee9559 100644
--- a/chunkset_tpl.h
+++ b/chunkset_tpl.h
@@ -109,7 +109,7 @@ Z_INTERNAL uint8_t* CHUNKMEMSET(uint8_t *out, unsigned dist, unsigned len) {
             bytes_remaining -= cpy_dist;
             cur_chunk += cpy_dist;
             /* This allows us to bypass an expensive integer division since we're effectively
-             * counting in this loop, anyway. However, we may have to derive a similarly 
+             * counting in this loop, anyway. However, we may have to derive a similarly
              * sensible solution for if we use a permutation table that allows us to construct
              * this vector in one load and one permute instruction */
             chunk_mod = cpy_dist;
diff --git a/cmake/detect-intrinsics.cmake b/cmake/detect-intrinsics.cmake
index 80fecf2b..578a489d 100644
--- a/cmake/detect-intrinsics.cmake
+++ b/cmake/detect-intrinsics.cmake
@@ -167,7 +167,7 @@ macro(check_neon_ld4_intrinsics)
             endif()
         endif()
     endif()
-    # Check whether compiler supports loading 4 neon vecs into a register range 
+    # Check whether compiler supports loading 4 neon vecs into a register range
     set(CMAKE_REQUIRED_FLAGS "${NEONFLAG}")
    check_c_source_compiles(
        "#ifdef _M_ARM64
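Editor's note (not part of the patch above): the comments touched in the adler32 hunks all refer to deferring the expensive modulo until the end of a block. A minimal scalar sketch of the Adler-32 recurrence these NEON/AVX kernels vectorize is shown below. The function name adler32_scalar is hypothetical; BASE (0xfff1 = 65521, the magic constant visible in the AVX2 assembly comment) and NMAX (5552) are the standard zlib constants.

#include <stddef.h>
#include <stdint.h>

#define BASE 65521U   /* largest prime smaller than 65536, i.e. 0xfff1 */
#define NMAX 5552U    /* max bytes per block before s1/s2 must be reduced */

/* Illustrative reference only; hypothetical helper, not a zlib-ng function. */
static uint32_t adler32_scalar(uint32_t adler, const unsigned char *buf, size_t len) {
    uint32_t s1 = adler & 0xffff;         /* running sum of bytes */
    uint32_t s2 = (adler >> 16) & 0xffff; /* running sum of s1 values */

    while (len > 0) {
        size_t block = len > NMAX ? NMAX : len;
        len -= block;
        /* Defer the modulo: within one NMAX-sized block of 0xff bytes the
         * 32-bit sums cannot overflow, which is the same reasoning the
         * vector code uses when it postpones its reductions. */
        while (block--) {
            s1 += *buf++;
            s2 += s1;
        }
        s1 %= BASE;
        s2 %= BASE;
    }
    return (s2 << 16) | s1;
}

Usage would be adler32_scalar(1, data, n), since the checksum is seeded with 1; the SIMD paths in the diff differ only in how many byte sums they accumulate before performing this same reduction.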
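A second illustrative sketch, also not part of the patch: the adler32_avx512_p.h hunk discusses reducing a vector to a scalar with an extract/add sequence instead of phadd (per the Agner Fog reference in the diff context). The helper below, hsum256_epi32, is a hypothetical name showing that style of horizontal sum for eight 32-bit lanes; it assumes compilation with AVX2 enabled (e.g. -mavx2) and is not the zlib-ng partial_hsum itself.

#include <immintrin.h>
#include <stdint.h>

/* Horizontal sum of the eight 32-bit lanes of a __m256i, avoiding phadd. */
static inline uint32_t hsum256_epi32(__m256i v) {
    /* Fold the high 128-bit half onto the low half. */
    __m128i sum1 = _mm_add_epi32(_mm256_castsi256_si128(v),
                                 _mm256_extracti128_si256(v, 1));
    /* Fold the high 64 bits onto the low 64 bits. */
    __m128i sum2 = _mm_add_epi32(sum1, _mm_unpackhi_epi64(sum1, sum1));
    /* Add the remaining two 32-bit lanes and move the result to a GPR. */
    __m128i sum3 = _mm_add_epi32(sum2, _mm_shuffle_epi32(sum2, 1));
    return (uint32_t)_mm_cvtsi128_si32(sum3);
}

Once the total sits in a general-purpose register, the cheap scalar "% BASE" discussed in the adler32_avx2.c comment applies directly, which is why the diff's comments argue that widening to 64 bits before the horizontal sum is unnecessary.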