/* Copyright (C) 1995-2011, 2016 Mark Adler
* Copyright (C) 2017 ARM Holdings Inc.
- * Authors:
+ * Authors:
* Adenilson Cavalcanti <adenilson.cavalcanti@arm.com>
* Adam Stylinski <kungfujesus06@gmail.com>
* For conditions of distribution and use, see copyright notice in zlib.h
40, 39, 38, 37, 36, 35, 34, 33,
32, 31, 30, 29, 28, 27, 26, 25,
24, 23, 22, 21, 20, 19, 18, 17,
- 16, 15, 14, 13, 12, 11, 10, 9,
+ 16, 15, 14, 13, 12, 11, 10, 9,
8, 7, 6, 5, 4, 3, 2, 1 };
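/* The descending constants above are presumably the per-byte weights for the
 * s2 sum: byte i of an n-byte block contributes (n - i) copies of itself to
 * s2, so the weights count down to 1 for the final byte of the block. */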
uint32x4_t adacc = vdupq_n_u32(0);
hsum_fold.val[0] = vpadalq_u8(hsum.val[0], d0_d3.val[2]);
hsum_fold.val[1] = vpadalq_u8(hsum.val[1], d0_d3.val[3]);
- adacc = vpadalq_u16(adacc, hsum_fold.val[0]);
+ adacc = vpadalq_u16(adacc, hsum_fold.val[0]);
s3acc = vaddq_u32(s3acc, adacc_prev);
- adacc = vpadalq_u16(adacc, hsum_fold.val[1]);
-
+ adacc = vpadalq_u16(adacc, hsum_fold.val[1]);
+
/* If we do straight widening additions to the 16 bit values, we don't incur
* the usual penalties of a pairwise add. We can defer the multiplications
* until the very end. These will not overflow because we are incurring at
* most 408 loop iterations (NMAX / 64), and a given lane is only going to be
* summed into once. This means for the maximum input size, the largest value
* we will see is 255 * 102 = 26010, safely under uint16 max */
- s2_0 = vaddw_u8(s2_0, vget_low_u8(d0_d3.val[0]));
+ s2_0 = vaddw_u8(s2_0, vget_low_u8(d0_d3.val[0]));
s2_1 = vaddw_high_u8(s2_1, d0_d3.val[0]);
s2_2 = vaddw_u8(s2_2, vget_low_u8(d0_d3.val[1]));
s2_3 = vaddw_high_u8(s2_3, d0_d3.val[1]);
- s2_4 = vaddw_u8(s2_4, vget_low_u8(d0_d3.val[2]));
+ s2_4 = vaddw_u8(s2_4, vget_low_u8(d0_d3.val[2]));
s2_5 = vaddw_high_u8(s2_5, d0_d3.val[2]);
s2_6 = vaddw_u8(s2_6, vget_low_u8(d0_d3.val[3]));
s2_7 = vaddw_high_u8(s2_7, d0_d3.val[3]);
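/* A minimal sketch (not the upstream code) of how the deferred multiply can
 * be applied once the loop exits: each 16-bit partial sum is multiplied by
 * its distance-from-end weight and widened into 32-bit lanes. The weights
 * pointer and the one-accumulator signature are illustrative assumptions;
 * the real routine carries eight accumulators (s2_0..s2_7) and a weights
 * table like the one shown earlier. */
#include <arm_neon.h>
#include <stdint.h>

static inline uint32x4_t apply_deferred_weights(uint32x4_t s2acc, uint16x8_t partial,
                                                const uint16_t *weights /* 8 entries */) {
    uint16x8_t w = vld1q_u16(weights);
    /* widening multiply-accumulate: 16 x 16 -> 32 bit lanes */
    s2acc = vmlal_u16(s2acc, vget_low_u16(w),  vget_low_u16(partial));
    s2acc = vmlal_u16(s2acc, vget_high_u16(w), vget_high_u16(partial));
    return s2acc;
}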
n = size / (sizeof(uint16x8_t) * 8);
do {
- p0 = vld1q_u16_x4(table);
- p1 = vld1q_u16_x4(table+32);
+ p0 = vld1q_u16_x4(table);
+ p1 = vld1q_u16_x4(table+32);
vqsubq_u16_x4_x1(p0, p0, v);
vqsubq_u16_x4_x1(p1, p1, v);
vst1q_u16_x4(table, p0);
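/* vqsubq_u16_x4_x1() is a helper whose definition is not shown here; a
 * plausible expansion (an assumption for illustration, not the actual
 * definition) is a saturating subtract of the same vector from each member
 * of a uint16x8x4_t, which is what sliding a hash table's 16-bit entries
 * requires: entries smaller than the slide distance clamp to zero instead
 * of wrapping. */
#include <arm_neon.h>

static inline uint16x8x4_t qsub_u16_x4(uint16x8x4_t a, uint16x8_t v) {
    a.val[0] = vqsubq_u16(a.val[0], v);
    a.val[1] = vqsubq_u16(a.val[1], v);
    a.val[2] = vqsubq_u16(a.val[2], v);
    a.val[3] = vqsubq_u16(a.val[3], v);
    return a;
}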
/* The compiler is generating the following sequence for this integer modulus
* when done the scalar way, in GPRs:
-
+
adler = (s1_unpack[0] % BASE) + (s1_unpack[1] % BASE) + (s1_unpack[2] % BASE) + (s1_unpack[3] % BASE) +
(s1_unpack[4] % BASE) + (s1_unpack[5] % BASE) + (s1_unpack[6] % BASE) + (s1_unpack[7] % BASE);
...
vmovd %xmm1,%esi // move vector lane 0 to 32 bit register %esi
mov %rsi,%rax // zero-extend this value to 64 bit precision in %rax
- imul %rdi,%rsi // do a signed multiplication with magic constant and vector element
+ imul %rdi,%rsi // do a signed multiplication with magic constant and vector element
shr $0x2f,%rsi // shift right by 47
- imul $0xfff1,%esi,%esi // do a signed multiplication with value truncated to 32 bits with 0xfff1
+ imul $0xfff1,%esi,%esi // do a signed multiplication with value truncated to 32 bits with 0xfff1
sub %esi,%eax // subtract lower 32 bits of original vector value from modified one above
...
// repeats for each element with vpextract instructions
This is tricky with AVX2 for a number of reasons:
1.) There's no 64 bit multiplication instruction, but there is a sequence to get there
2.) There are ways to extend vectors to 64 bit precision, but no simple way to truncate
- back down to 32 bit precision later (there is in AVX512)
+ back down to 32 bit precision later (there is in AVX512)
3.) Full width integer multiplications aren't cheap
- We can, however, and do a relatively cheap sequence for horizontal sums.
+ We can, however, do a relatively cheap sequence for horizontal sums.
Then, we simply do the integer modulus on the resulting scalar value in a 64 bit GPR. It was
previously thought that casting to 64 bit precision was needed prior to the horizontal sum, but
that is simply not the case, as NMAX is defined as the maximum number of scalar sums that can be
performed on the maximum possible inputs before overflow.
*/
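/* A sketch of the approach the comment above lands on (the helper name and
 * exact shuffle order are assumptions): reduce the eight 32-bit lanes to a
 * single scalar without phadd, then pay for exactly one scalar modulus.
 * Per the NMAX argument above, the lane total cannot overflow 32 bits. */
#include <immintrin.h>
#include <stdint.h>

static inline uint32_t hsum256_epi32(__m256i v) {
    __m128i sum = _mm_add_epi32(_mm256_castsi256_si128(v),
                                _mm256_extracti128_si256(v, 1));
    sum = _mm_add_epi32(sum, _mm_unpackhi_epi64(sum, sum)); /* lanes 0+2, 1+3 */
    sum = _mm_add_epi32(sum, _mm_shuffle_epi32(sum, 1));    /* lane 0 += lane 1 */
    return (uint32_t)_mm_cvtsi128_si32(sum);
}

/* usage, e.g.: adler0 = hsum256_epi32(vs1) % 65521; */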
-
+
/* In AVX2-land, this trip through GPRs will probably be unavoidable, as there's no cheap and easy
* conversion from 64 bit integer to 32 bit (needed for the inexpensive modulus with a constant).
* This casting to 32 bit is cheap through GPRs (just register aliasing). See above for exactly
/* For impossibly tiny sizes, use the smaller width versions. We still need
* to check for compile time support for these but they are likely there */
-#ifdef X86_SSE41_ADLER32
- if (len < 32)
+#ifdef X86_SSE41_ADLER32
+ if (len < 32)
return adler32_sse41(adler, buf, len);
#endif
static inline uint32_t partial_hsum(__m512i x) {
/* We need a permutation vector to extract every other integer. The
* rest are going to be zeros. Marking this const so the compiler stands
- * a better chance of keeping this resident in a register through entire
+ * a better chance of keeping this resident in a register through the entire
* loop execution. We certainly have enough zmm registers (32) */
const __m512i perm_vec = _mm512_setr_epi32(0, 2, 4, 6, 8, 10, 12, 14,
1, 1, 1, 1, 1, 1, 1, 1);
/* From here, it's a simple 256 bit wide reduction sum */
__m256i non_zero_avx = _mm512_castsi512_si256(non_zero);
-
+
/* See Agner Fog's vectorclass for a decent reference. Essentially, phadd is
* pretty slow, much slower than the longer instruction sequence below */
__m128i sum1 = _mm_add_epi32(_mm256_extracti128_si256(non_zero_avx, 1),
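/* The reduction typically finishes by adding this upper 128-bit half to the
 * lower one, then folding with _mm_unpackhi_epi64 and _mm_shuffle_epi32
 * before _mm_cvtsi128_si32 moves the total to a GPR; this is the same
 * shuffle-based pattern sketched after the AVX2 modulus discussion above
 * (a general description, the exact instruction order here is an assumption). */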
/* For impossibly tiny sizes, use the smaller width versions. We still need
* to check for compile time support for these but they are likely there */
-#ifdef X86_SSE41_ADLER32
- if (len < 32)
+#ifdef X86_SSE41_ADLER32
+ if (len < 32)
return adler32_sse41(adler, buf, len);
#endif
/* lop off the max number of sums based on the scalar sums done
* above */
len -= align_offset;
- max_iters -= align_offset;
+ max_iters -= align_offset;
}
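/* A hedged sketch of the prologue this bookkeeping pairs with (the 16-byte
 * alignment target and all names besides len/align_offset are assumptions):
 * do scalar sums until the pointer is aligned, then charge those bytes
 * against len and the remaining vector iteration budget as done above. */
#include <stddef.h>
#include <stdint.h>

static size_t scalar_align_prologue(const uint8_t **pbuf, size_t len,
                                    uint32_t *adler, uint32_t *sum2) {
    const uint8_t *buf = *pbuf;
    size_t align_offset = ((uintptr_t)buf & 15) ? 16 - ((uintptr_t)buf & 15) : 0;
    if (align_offset > len)
        align_offset = len;
    for (size_t i = 0; i < align_offset; ++i) {
        *adler += *buf++;   /* standard scalar adler32 step */
        *sum2  += *adler;
    }
    *pbuf = buf;
    return align_offset;    /* caller subtracts this from len and max_iters */
}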
int32_t first = init_crc != 0;
/* Technically the CRC functions don't even call this for input < 64, but a bare minimum of 31
- * bytes of input is needed for the aligning load that occurs. If there's an initial CRC, to
+ * bytes of input is needed for the aligning load that occurs. If there's an initial CRC, to
* carry it forward through the folded CRC there must be 16 - src % 16 + 16 bytes available, which
* by definition can be up to 15 bytes + one full vector load. */
assert(len >= 31 || first == 0);
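/* Worked example of the bound asserted above: at the worst misalignment,
 * src % 16 == 1, the aligning load needs 16 - 1 = 15 bytes to reach the next
 * 16-byte boundary, plus one full 16-byte vector to fold the initial CRC
 * into, i.e. 15 + 16 = 31 bytes. */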
xmm_crc2 = _mm_xor_si128(xmm_crc2, xmm_t1);
xmm_crc3 = _mm_xor_si128(xmm_crc3, xmm_t2);
len -= 48;
- src += 48;
+ src += 48;
} else if (len >= 32) {
xmm_t0 = _mm_load_si128((__m128i *)src);
xmm_t1 = _mm_load_si128((__m128i *)src + 1);
uint32_t crc32_pclmulqdq(uint32_t crc32, const unsigned char* buf, uint64_t len) {
/* For lens < 64, the crc32_byfour method is faster. The CRC32 instruction for
* these short lengths might also prove effective */
- if (len < 64)
+ if (len < 64)
return crc32_byfour(crc32, buf, len);
crc32_fold ALIGNED_(16) crc_state;
bytes_remaining -= cpy_dist;
cur_chunk += cpy_dist;
/* This allows us to bypass an expensive integer division since we're effectively
- * counting in this loop, anyway. However, we may have to derive a similarly
+ * counting in this loop, anyway. However, we may have to derive a similarly
* sensible solution if we use a permutation table that allows us to construct
* this vector in one load and one permute instruction */
chunk_mod = cpy_dist;
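/* A minimal sketch of the division-avoidance idea described above (the
 * function and parameter names are illustrative, not the surrounding code's):
 * when the running offset only grows by a step smaller than the period,
 * one compare-and-subtract keeps it in [0, dist) with no '%' at all. */
#include <stdint.h>

static inline uint64_t advance_mod(uint64_t offset, uint64_t step, uint64_t dist) {
    offset += step;                          /* assumes step < dist */
    return offset >= dist ? offset - dist : offset;
}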
endif()
endif()
endif()
- # Check whether compiler supports loading 4 neon vecs into a register range
+ # Check whether the compiler supports loading 4 NEON vectors into a register range
set(CMAKE_REQUIRED_FLAGS "${NEONFLAG}")
check_c_source_compiles(
"#ifdef _M_ARM64