aboutsummaryrefslogtreecommitdiff
path: root/src/liblzma/check/crc32_aarch64.h
diff options
context:
space:
mode:
author:    Chenxi Mao <chenxi.mao2013@gmail.com>  2024-01-09 17:23:11 +0800
committer: Jia Tan <jiat0218@gmail.com>           2024-01-27 21:49:26 +0800
commit:    849d0f282a6a890c5cf5a0e0f02980b12d9ebb0f (patch)
tree:      2f8da4c7fcf61a8b9f6425f3075727c972e6bf94 /src/liblzma/check/crc32_aarch64.h
parent:    Bump version number for 5.5.1alpha. (diff)
download:  xz-849d0f282a6a890c5cf5a0e0f02980b12d9ebb0f.tar.xz
Speed up CRC32 calculation on ARM64
The CRC32 instructions in ARM64 can calculate the CRC32 result for 8 bytes in a single operation, making the use of ARM64 instructions much faster compared to the general CRC32 algorithm. Optimized CRC32 will be enabled if ARM64 has CRC extension running on Linux. Signed-off-by: Chenxi Mao <chenxi.mao2013@gmail.com>
Diffstat (limited to 'src/liblzma/check/crc32_aarch64.h')
-rw-r--r--  src/liblzma/check/crc32_aarch64.h  109
1 file changed, 109 insertions, 0 deletions
diff --git a/src/liblzma/check/crc32_aarch64.h b/src/liblzma/check/crc32_aarch64.h
new file mode 100644
index 00000000..77b14af4
--- /dev/null
+++ b/src/liblzma/check/crc32_aarch64.h
@@ -0,0 +1,109 @@
///////////////////////////////////////////////////////////////////////////////
//
/// \file       crc32_aarch64.h
/// \brief      CRC32 calculation with aarch64 optimization
//
//  Authors:    Chenxi Mao
//
//  This file has been put into the public domain.
//  You can do whatever you want with this file.
//
///////////////////////////////////////////////////////////////////////////////

// This header may only be included once per translation unit; a second
// inclusion is a hard error. (The original #error message named
// crc_arm64_clmul.h — leftover from the x86 CLMUL header this guard was
// copied from — which would have misled anyone hitting the error.)
#ifdef LZMA_CRC_CRC32_AARCH64_H
#	error crc32_aarch64.h was included twice.
#endif
#define LZMA_CRC_CRC32_AARCH64_H

// NOTE(review): <sys/auxv.h>/getauxval() is Linux-specific; runtime CPU
// detection on other aarch64 operating systems would need another method.
#include <sys/auxv.h>

// EDG-based compilers (Intel's classic compiler and compiler for E2K) can
// define __GNUC__ but the attribute must not be used with them.
// The new Clang-based ICX needs the attribute.
//
// NOTE: Build systems check for this too, keep them in sync with this.
#if (defined(__GNUC__) || defined(__clang__)) && !defined(__EDG__)
#	define crc_attr_target \
		__attribute__((__target__("+crc")))
#else
#	define crc_attr_target
#endif
#ifdef BUILDING_CRC32_AARCH64
// CRC32 using the aarch64 CRC extension: the aligned middle of the buffer
// is processed 8 bytes per CRC32X instruction; the unaligned head and the
// short tail are processed one byte at a time with CRC32B.
crc_attr_target
crc_attr_no_sanitize_address
static uint32_t
crc32_arch_optimized(const uint8_t *buf, size_t size, uint32_t crc)
{
	crc = ~crc;

	// Process leading bytes one at a time until buf is 8-byte aligned.
	// size must be checked on every iteration: the original loop ignored
	// it, so a buffer shorter than the distance to alignment made the
	// size_t wrap around and the 8-byte loop below read far past the end
	// of the buffer.
	while (size != 0 && ((uintptr_t)buf & 7) != 0) {
		crc = __builtin_aarch64_crc32b(crc, *buf);
		buf++;
		size--;
	}

	// Main loop: 8 input bytes per CRC32X instruction.
	for (; size >= 8; size -= 8, buf += 8)
		crc = __builtin_aarch64_crc32x(crc, aligned_read64le(buf));

	// Remaining tail bytes (fewer than 8).
	for (; size > 0; size--, buf++)
		crc = __builtin_aarch64_crc32b(crc, *buf);

	return ~crc;
}
#endif
#ifdef BUILDING_CRC64_AARCH64
//FIXME: there is no crc64_arch_optimized implementation,
// to make compiler happy, add crc64_generic here.
//
// NOTE(review): despite the name, this is the generic table-driven
// (slicing-by-four) CRC64, not an aarch64-accelerated one — the aarch64
// CRC extension has no CRC64 instructions. It relies on the A/B/C/D
// byte-extraction macros, S8/S32 shift macros, aligned_read32ne() and
// lzma_crc64_table[][] defined elsewhere in liblzma — TODO confirm they
// are in scope wherever this header is included.

// A1(x) extracts the byte of the internal CRC value that is combined
// with the next input byte in the byte-at-a-time loops: the top byte on
// big endian (the CRC is kept byte-swapped there), otherwise the low
// byte via the A macro.
#ifdef WORDS_BIGENDIAN
#	define A1(x) ((x) >> 56)
#else
#	define A1 A
#endif
crc_attr_target
crc_attr_no_sanitize_address
static uint64_t
crc64_arch_optimized(const uint8_t *buf, size_t size, uint64_t crc)
{
	crc = ~crc;

#ifdef WORDS_BIGENDIAN
	// Keep the CRC byte-swapped internally so the table lookups below
	// work identically on both endiannesses; swapped back before return.
	crc = bswap64(crc);
#endif

	if (size > 4) {
		// Process bytes one at a time until buf is 4-byte aligned.
		while ((uintptr_t)(buf) & 3) {
			crc = lzma_crc64_table[0][*buf++ ^ A1(crc)] ^ S8(crc);
			--size;
		}

		// limit points just past the last full 4-byte chunk;
		// size keeps only the leftover tail byte count (0-3).
		const uint8_t *const limit = buf + (size & ~(size_t)(3));
		size &= (size_t)(3);

		// Slicing-by-four: consume 4 input bytes per iteration with
		// four table lookups.
		while (buf < limit) {
#ifdef WORDS_BIGENDIAN
			const uint32_t tmp = (uint32_t)(crc >> 32)
					^ aligned_read32ne(buf);
#else
			const uint32_t tmp = (uint32_t)crc
					^ aligned_read32ne(buf);
#endif
			buf += 4;

			crc = lzma_crc64_table[3][A(tmp)]
					^ lzma_crc64_table[2][B(tmp)]
					^ S32(crc)
					^ lzma_crc64_table[1][C(tmp)]
					^ lzma_crc64_table[0][D(tmp)];
		}
	}

	// Remaining tail bytes (or the whole buffer when size <= 4).
	while (size-- != 0)
		crc = lzma_crc64_table[0][*buf++ ^ A1(crc)] ^ S8(crc);

#ifdef WORDS_BIGENDIAN
	crc = bswap64(crc);
#endif

	return ~crc;
}
#endif
+static inline bool
+is_arch_extension_supported(void)
+{
+ return (getauxval(AT_HWCAP) & HWCAP_CRC32)!=0;
+}
+