]> git.ipfire.org Git - thirdparty/xz.git/commitdiff
Speed up CRC32 calculation on ARM64
authorChenxi Mao <chenxi.mao2013@gmail.com>
Tue, 9 Jan 2024 09:23:11 +0000 (17:23 +0800)
committerJia Tan <jiat0218@gmail.com>
Sat, 27 Jan 2024 13:49:26 +0000 (21:49 +0800)
The CRC32 instructions in ARM64 can calculate the CRC32 result
for 8 bytes in a single operation, making the use of ARM64
instructions much faster compared to the general CRC32 algorithm.

The optimized CRC32 is enabled on Linux when the ARM64 CPU supports
the CRC extension.

Signed-off-by: Chenxi Mao <chenxi.mao2013@gmail.com>
CMakeLists.txt
src/liblzma/check/Makefile.inc
src/liblzma/check/crc32_aarch64.h [new file with mode: 0644]
src/liblzma/check/crc32_fast.c
src/liblzma/check/crc64_fast.c
src/liblzma/check/crc_common.h

index c5573d7b32a07604d5d41044098df09d50c7d49d..0cb08fc7b8a0f041fff661289afdb4ab6559bc9a 100644 (file)
@@ -230,6 +230,7 @@ add_library(liblzma
     src/liblzma/check/check.h
     src/liblzma/check/crc_common.h
     src/liblzma/check/crc_x86_clmul.h
+    src/liblzma/check/crc32_aarch64.h
     src/liblzma/common/block_util.c
     src/liblzma/common/common.c
     src/liblzma/common/common.h
index acff40c38d93f21d203a768ee08ea682f0b29017..e7f87c8585a3fa5f060a4277ec25558190dbfa76 100644 (file)
@@ -15,7 +15,8 @@ liblzma_la_SOURCES += \
        check/check.c \
        check/check.h \
        check/crc_common.h \
-       check/crc_x86_clmul.h
+       check/crc_x86_clmul.h \
+       check/crc32_aarch64.h
 
 if COND_SMALL
 liblzma_la_SOURCES += check/crc32_small.c
diff --git a/src/liblzma/check/crc32_aarch64.h b/src/liblzma/check/crc32_aarch64.h
new file mode 100644 (file)
index 0000000..77b14af
--- /dev/null
@@ -0,0 +1,109 @@
+///////////////////////////////////////////////////////////////////////////////
+//
+/// \file       crc32_aarch64.h
+/// \brief      CRC32 calculation with aarch64 optimization
+//
+//  Authors:    Chenxi Mao
+//
+//  This file has been put into the public domain.
+//  You can do whatever you want with this file.
+//
+///////////////////////////////////////////////////////////////////////////////
+#ifdef LZMA_CRC_CRC32_AARCH64_H
+#      error crc_arm64_clmul.h was included twice.
+#endif
+#define LZMA_CRC_CRC32_AARCH64_H
+#include <sys/auxv.h>
+// crc_attr_target compiles the annotated function with the "+crc" target.
+// EDG-based compilers (Intel's classic compiler and compiler for E2K) can
+// define __GNUC__ but the attribute must not be used with them.
+// The new Clang-based ICX needs the attribute.
+// NOTE: Build systems check for this too, keep them in sync with this.
+#if (defined(__GNUC__) || defined(__clang__)) && !defined(__EDG__)
+#      define crc_attr_target \
+        __attribute__((__target__("+crc")))
+#else
+#      define crc_attr_target
+#endif
+#ifdef BUILDING_CRC32_AARCH64
+// Calculate CRC32 using the ARM64 CRC32 instructions.
+//
+// buf points to size bytes of input and crc is the previous CRC32 value
+// to continue from. Returns the updated CRC32.
+//
+// The state is kept inverted while processing. Leading bytes are consumed
+// one at a time until buf is 8-byte aligned, then 8 bytes are processed
+// per instruction, and finally the remaining tail bytes one at a time.
+crc_attr_target
+crc_attr_no_sanitize_address
+static uint32_t
+crc32_arch_optimized(const uint8_t *buf, size_t size, uint32_t crc)
+{
+       crc = ~crc;
+
+       // Stop aligning once the input is exhausted. Without the size check
+       // a short unaligned buffer would be read past its end and size
+       // would wrap around to a huge value.
+       while (size > 0 && ((uintptr_t)(buf) & 7)) {
+               crc = __builtin_aarch64_crc32b(crc, *buf);
+               buf++;
+               size--;
+       }
+
+       // buf is now 8-byte aligned (or size is zero).
+       for (;size>=8;size-=8,buf+=8) {
+               crc = __builtin_aarch64_crc32x(crc, aligned_read64le(buf));
+       }
+
+       // Tail: fewer than 8 bytes remain.
+       for (;size>0;size--,buf++)
+               crc = __builtin_aarch64_crc32b(crc, *buf);
+
+       return ~crc;
+}
+#endif
+#ifdef BUILDING_CRC64_AARCH64
+// FIXME: There is no ARM64-optimized CRC64 implementation yet. To keep the
+// build working, table-based CRC64 code is provided under this name instead.
+#ifdef WORDS_BIGENDIAN
+#      define A1(x) ((x) >> 56)
+#else
+#      define A1 A
+#endif
+crc_attr_target // NOTE(review): no CRC32 builtins are used below; attribute looks unneeded - confirm.
+crc_attr_no_sanitize_address
+static uint64_t
+crc64_arch_optimized(const uint8_t *buf, size_t size, uint64_t crc)
+{
+       crc = ~crc; // The state is kept inverted; it is inverted back on return.
+
+#ifdef WORDS_BIGENDIAN
+       crc = bswap64(crc);
+#endif
+
+       if (size > 4) { // Word-at-a-time path; shorter inputs use only the byte loop at the end.
+               while ((uintptr_t)(buf) & 3) { // Single bytes until buf is 4-byte aligned (at most 3).
+                       crc = lzma_crc64_table[0][*buf++ ^ A1(crc)] ^ S8(crc);
+                       --size;
+               }
+
+               const uint8_t *const limit = buf + (size & ~(size_t)(3)); // End of the last whole 4-byte word.
+               size &= (size_t)(3); // Bytes left over for the tail loop.
+
+               while (buf < limit) {
+#ifdef WORDS_BIGENDIAN
+                       const uint32_t tmp = (uint32_t)(crc >> 32)
+                                       ^ aligned_read32ne(buf);
+#else
+                       const uint32_t tmp = (uint32_t)crc
+                                       ^ aligned_read32ne(buf);
+#endif
+                       buf += 4;
+
+                       crc = lzma_crc64_table[3][A(tmp)]
+                           ^ lzma_crc64_table[2][B(tmp)]
+                           ^ S32(crc)
+                           ^ lzma_crc64_table[1][C(tmp)]
+                           ^ lzma_crc64_table[0][D(tmp)];
+               }
+       }
+
+       while (size-- != 0) // Remaining bytes, one table lookup per byte.
+               crc = lzma_crc64_table[0][*buf++ ^ A1(crc)] ^ S8(crc);
+
+#ifdef WORDS_BIGENDIAN
+       crc = bswap64(crc);
+#endif
+
+       return ~crc;
+}
+#endif
+// Returns true if the HWCAP_CRC32 bit is set in the AT_HWCAP entry of the
+// auxiliary vector, as read with getauxval().
+static inline bool
+is_arch_extension_supported(void)
+{
+       const unsigned long hwcaps = getauxval(AT_HWCAP);
+       return (hwcaps & HWCAP_CRC32) != 0;
+}
+
index cf7d75dafd92107a1ad3125a85e3ddd363c37f0b..07d5afb10ddcb8e9d53a37fb699aa19e8b842469 100644 (file)
 #include "check.h"
 #include "crc_common.h"
 
-#ifdef CRC_X86_CLMUL
+#if defined(CRC_X86_CLMUL)
 #      define BUILDING_CRC32_CLMUL
 #      include "crc_x86_clmul.h"
+#elif defined(CRC32_ARM64)
+#      define BUILDING_CRC32_AARCH64
+#      include "crc32_aarch64.h"
 #endif
 
 
index eb1a4ae4e7677181951a6febc283277fb040269f..cb5d3e4c6b71b368c607aea1c58f8ff5cf4b8716 100644 (file)
 #include "check.h"
 #include "crc_common.h"
 
-#ifdef CRC_X86_CLMUL
+#if defined(CRC_X86_CLMUL)
 #      define BUILDING_CRC64_CLMUL
 #      include "crc_x86_clmul.h"
+#elif defined(CRC32_ARM64)
+#      define BUILDING_CRC64_AARCH64
+#      include "crc32_aarch64.h"
 #endif
 
 
index 417d88bb0281f490312c106ec320086dd56d9572..7c7f098d1dba3057006ea22d3a7ce331316548c4 100644 (file)
 #undef CRC_GENERIC
 #undef CRC_ARCH_OPTIMIZED
 #undef CRC_X86_CLMUL
+#undef CRC32_ARM64
 #undef CRC_USE_IFUNC
 #undef CRC_USE_GENERIC_FOR_SMALL_INPUTS
 
-// If CLMUL cannot be used then only the generic slice-by-eight (CRC32)
-// or slice-by-four (CRC64) is built.
-#if !defined(HAVE_USABLE_CLMUL)
-#      define CRC_GENERIC 1
-
 // If CLMUL is allowed unconditionally in the compiler options then the
 // generic version can be omitted. Note that this doesn't work with MSVC
 // as I don't know how to detect the features here.
 //
 // NOTE: Keep this in sync with crc32_table.c.
-#elif (defined(__SSSE3__) && defined(__SSE4_1__) && defined(__PCLMUL__)) \
+#if (defined(__SSSE3__) && defined(__SSE4_1__) && defined(__PCLMUL__)) \
                || (defined(__e2k__) && __iset__ >= 6)
 #      define CRC_ARCH_OPTIMIZED 1
 #      define CRC_X86_CLMUL 1
 
+#elif (defined(__aarch64__))
+#      define CRC_ARCH_OPTIMIZED 1
+#      define CRC32_ARM64 1
+// If CLMUL cannot be used then only the generic slice-by-eight (CRC32)
+// or slice-by-four (CRC64) is built.
+#elif !defined(HAVE_USABLE_CLMUL)
+#      define CRC_GENERIC 1
 // Otherwise build both and detect at runtime which version to use.
 #else
 #      define CRC_GENERIC 1
 #      define CRC_ARCH_OPTIMIZED 1
 #      define CRC_X86_CLMUL 1
+#      define CRC32_ARM64 1
 
 #      ifdef HAVE_FUNC_ATTRIBUTE_IFUNC
 #              define CRC_USE_IFUNC 1