diff options
author | Jia Tan <jiat0218@gmail.com> | 2024-01-22 20:54:56 +0800 |
---|---|---|
committer | Jia Tan <jiat0218@gmail.com> | 2024-02-01 20:09:11 +0800 |
commit | 761f5b69a4c778c8bcb09279b845b07c28790575 (patch) | |
tree | 10bfb5cd27861f5e062567532ec76246e06bd4b5 /src/liblzma/check/crc32_arm64.h | |
parent | liblzma: Refactor crc_common.h. (diff) | |
download | xz-761f5b69a4c778c8bcb09279b845b07c28790575.tar.xz |
liblzma: Rename crc32_aarch64.h to crc32_arm64.h.
Even though the proper name for the architecture is aarch64, this
project uses ARM64 throughout. So the rename is for consistency.
Additionally, crc32_arm64.h was slightly refactored for the following
changes:
* Added MSVC, FreeBSD, and macOS support in
is_arch_extension_supported().
* crc32_arch_optimized() now checks the size when aligning the
buffer.
* crc32_arch_optimized() loop conditions were slightly modified to
avoid both decrementing the size and incrementing the buffer
pointer.
* Use the intrinsic wrappers defined in <arm_acle.h> because GCC and
Clang name them differently.
* Minor spacing and comment changes.
Diffstat (limited to 'src/liblzma/check/crc32_arm64.h')
-rw-r--r-- | src/liblzma/check/crc32_arm64.h | 119 |
1 files changed, 119 insertions, 0 deletions
diff --git a/src/liblzma/check/crc32_arm64.h b/src/liblzma/check/crc32_arm64.h new file mode 100644 index 00000000..a1888ea7 --- /dev/null +++ b/src/liblzma/check/crc32_arm64.h @@ -0,0 +1,119 @@ +/////////////////////////////////////////////////////////////////////////////// +// +/// \file crc32_arm64.h +/// \brief CRC32 calculation with ARM64 optimization +// +// Authors: Chenxi Mao +// Jia Tan +// +// This file has been put into the public domain. +// You can do whatever you want with this file. +// +/////////////////////////////////////////////////////////////////////////////// + + +#ifndef LZMA_CRC32_ARM64_H +#define LZMA_CRC32_ARM64_H + +// MSVC always has the CRC intrinsics available when building for ARM64 +// there is no need to include any header files. +#ifndef _MSC_VER +# include <arm_acle.h> +#endif + +#if defined(CRC32_GENERIC) && defined(CRC32_ARCH_OPTIMIZED) +# if defined(HAVE_GETAUXVAL) || defined(HAVE_ELF_AUX_INFO) +# include <sys/auxv.h> +# elif defined(_WIN32) +# include <processthreadsapi.h> +# elif defined(__APPLE__) && defined(HAVE_SYSCTLBYNAME) +# include <sys/sysctl.h> +# endif +#endif + +// Some EDG-based compilers support ARM64 and define __GNUC__ +// (such as Nvidia's nvcc), but do not support function attributes. +// +// NOTE: Build systems check for this too, keep them in sync with this. +#if (defined(__GNUC__) || defined(__clang__)) && !defined(__EDG__) +# define crc_attr_target \ + __attribute__((__target__("+crc"))) +#else +# define crc_attr_target +#endif + + +crc_attr_target +static uint32_t +crc32_arch_optimized(const uint8_t *buf, size_t size, uint32_t crc) +{ + crc = ~crc; + + // Align the input buffer because this was shown to be + // significantly faster than unaligned accesses. + const size_t align_amount = my_min(size, (8 - (uintptr_t)buf) & 7); + + for (const uint8_t *limit = buf + align_amount; buf < limit; ++buf) + crc = __crc32b(crc, *buf); + + size -= align_amount; + + // Process 8 bytes at a time. The end point is determined by + // ignoring the least significant three bits of size to ensure + // we do not process past the bounds of the buffer. This guarentees + // that limit is a multiple of 8 and is strictly less than size. + for (const uint8_t *limit = buf + (size & ~((size_t)7)); + buf < limit; buf += 8) + crc = __crc32d(crc, aligned_read64le(buf)); + + // Process the remaining bytes that are not 8 byte aligned. + for (const uint8_t *limit = buf + (size & 7); buf < limit; ++buf) + crc = __crc32b(crc, *buf); + + return ~crc; +} + + +#if defined(CRC32_GENERIC) && defined(CRC32_ARCH_OPTIMIZED) +static inline bool +is_arch_extension_supported(void) +{ +#if defined(HAVE_GETAUXVAL) + return (getauxval(AT_HWCAP) & HWCAP_CRC32) != 0; + +#elif defined(HAVE_ELF_AUX_INFO) + unsigned long feature_flags; + + elf_aux_info(AT_HWCAP, &feature_flags, sizeof(feature_flags)); + return feature_flags & HWCAP_CRC32 != 0; + +#elif defined(_WIN32) + return IsProcessorFeaturePresent( + PF_ARM_V8_CRC32_INSTRUCTIONS_AVAILABLE); + +#elif defined(__APPLE__) && defined(HAVE_SYSCTLBYNAME) + int has_crc32 = 0; + size_t size = sizeof(has_crc32); + + // The sysctlbyname() function requires a string identifier for the + // CPU feature it tests. The Apple documentation lists the string + // "hw.optional.armv8_crc32", which can be found here: + // (https://developer.apple.com/documentation/kernel/1387446-sysctlbyname/determining_instruction_set_characteristics#3915619) + int err = sysctlbyname("hw.optional.armv8_crc32", &has_crc32, + &size, NULL, 0); + + return !err && has_crc32; + +#else + // If a runtime detection method cannot be found, then this must + // be a compile time error. The checks in crc_common.h should ensure + // a runtime detection method is always found if this function is + // built. It would be possible to just return false here, but this + // is inefficient for binary size and runtime since only the generic + // method could ever be used. +# error Runtime detection method unavailable. +#endif +} +#endif + +#endif // LZMA_CRC32_ARM64_H |