diff options
author | Lasse Collin <lasse.collin@tukaani.org> | 2023-10-20 23:35:10 +0300 |
---|---|---|
committer | Lasse Collin <lasse.collin@tukaani.org> | 2024-01-11 14:29:42 +0200 |
commit | 419f55f9dfc2df8792902b8953d50690121afeea (patch) | |
tree | aa95af5e4119ab90423c19ad64cfa73df0044f1c /src | |
parent | liblzma: crc_clmul.c: Add crc_attr_target macro. (diff) | |
download | xz-419f55f9dfc2df8792902b8953d50690121afeea.tar.xz |
liblzma: Avoid extern lzma_crc32_clmul() and lzma_crc64_clmul().
A CLMUL-only build will have the crcxx_clmul() inlined into
lzma_crcxx(). Previously a jump to the extern lzma_crcxx_clmul()
was needed. Notes about shared liblzma on ELF platforms:
- On platforms that support ifunc and -fvisibility=hidden, this
was silly because CLMUL-only build would have that single extra
jump instruction of extra overhead.
- On platforms that support neither -fvisibility=hidden nor linker
version script (liblzma*.map), jumping to lzma_crcxx_clmul()
would go via PLT so a few more instructions of overhead (still
not a big issue but silly nevertheless).
There was a downside with static liblzma too: if an application only
needs lzma_crc64(), static linking would make the linker include the
CLMUL code for both CRC32 and CRC64 from crc_x86_clmul.o even though
the CRC32 code wouldn't be needed, thus increasing code size of the
executable (assuming that -ffunction-sections isn't used).
Also, now compilers are likely to inline crc_simd_body()
even if they don't support the always_inline attribute
(or MSVC's __forceinline). Quite possibly all compilers
that build the code do support such an attribute. But now
it likely isn't a problem even if the attribute wasn't supported.
Now all x86-specific stuff is in crc_x86_clmul.h. If other archs
The other archs can then have their own headers with their own
is_clmul_supported() and crcxx_clmul().
Another bonus is that the build system doesn't need to care if
crc_clmul.c is needed.
is_clmul_supported() stays as inline function as it's not needed
when doing a CLMUL-only build (avoids a warning about unused function).
Diffstat (limited to 'src')
-rw-r--r-- | src/liblzma/check/Makefile.inc | 6 | ||||
-rw-r--r-- | src/liblzma/check/crc32_fast.c | 9 | ||||
-rw-r--r-- | src/liblzma/check/crc64_fast.c | 9 | ||||
-rw-r--r-- | src/liblzma/check/crc_common.h | 64 | ||||
-rw-r--r-- | src/liblzma/check/crc_x86_clmul.h (renamed from src/liblzma/check/crc_clmul.c) | 86 |
5 files changed, 89 insertions, 85 deletions
diff --git a/src/liblzma/check/Makefile.inc b/src/liblzma/check/Makefile.inc index 6186e10a..acff40c3 100644 --- a/src/liblzma/check/Makefile.inc +++ b/src/liblzma/check/Makefile.inc @@ -14,7 +14,8 @@ EXTRA_DIST += \ liblzma_la_SOURCES += \ check/check.c \ check/check.h \ - check/crc_common.h + check/crc_common.h \ + check/crc_x86_clmul.h if COND_SMALL liblzma_la_SOURCES += check/crc32_small.c @@ -27,9 +28,6 @@ if COND_ASM_X86 liblzma_la_SOURCES += check/crc32_x86.S else liblzma_la_SOURCES += check/crc32_fast.c -if COND_CRC_CLMUL -liblzma_la_SOURCES += check/crc_clmul.c -endif endif endif diff --git a/src/liblzma/check/crc32_fast.c b/src/liblzma/check/crc32_fast.c index 9fce94d3..6982836a 100644 --- a/src/liblzma/check/crc32_fast.c +++ b/src/liblzma/check/crc32_fast.c @@ -15,6 +15,11 @@ #include "check.h" #include "crc_common.h" +#ifdef CRC_CLMUL +# define BUILDING_CRC32_CLMUL +# include "crc_x86_clmul.h" +#endif + #ifdef CRC_GENERIC @@ -132,7 +137,7 @@ typedef uint32_t (*crc32_func_type)( static crc32_func_type crc32_resolve(void) { - return is_clmul_supported() ? &lzma_crc32_clmul : &crc32_generic; + return is_clmul_supported() ? &crc32_clmul : &crc32_generic; } #if defined(HAVE_FUNC_ATTRIBUTE_IFUNC) && defined(__clang__) @@ -221,7 +226,7 @@ lzma_crc32(const uint8_t *buf, size_t size, uint32_t crc) return crc32_func(buf, size, crc); #elif defined(CRC_CLMUL) - return lzma_crc32_clmul(buf, size, crc); + return crc32_clmul(buf, size, crc); #else return crc32_generic(buf, size, crc); diff --git a/src/liblzma/check/crc64_fast.c b/src/liblzma/check/crc64_fast.c index ce74901c..46b5c646 100644 --- a/src/liblzma/check/crc64_fast.c +++ b/src/liblzma/check/crc64_fast.c @@ -14,6 +14,11 @@ #include "check.h" #include "crc_common.h" +#ifdef CRC_CLMUL +# define BUILDING_CRC64_CLMUL +# include "crc_x86_clmul.h" +#endif + #ifdef CRC_GENERIC @@ -97,7 +102,7 @@ typedef uint64_t (*crc64_func_type)( static crc64_func_type crc64_resolve(void) { - return is_clmul_supported() ? &lzma_crc64_clmul : &crc64_generic; + return is_clmul_supported() ? &crc64_clmul : &crc64_generic; } #if defined(HAVE_FUNC_ATTRIBUTE_IFUNC) && defined(__clang__) @@ -160,7 +165,7 @@ lzma_crc64(const uint8_t *buf, size_t size, uint64_t crc) // // FIXME: Lookup table isn't currently omitted on 32-bit x86, // see crc64_table.c. - return lzma_crc64_clmul(buf, size, crc); + return crc64_clmul(buf, size, crc); #else return crc64_generic(buf, size, crc); diff --git a/src/liblzma/check/crc_common.h b/src/liblzma/check/crc_common.h index c949f793..552219fe 100644 --- a/src/liblzma/check/crc_common.h +++ b/src/liblzma/check/crc_common.h @@ -108,70 +108,6 @@ # define CRC_USE_GENERIC_FOR_SMALL_INPUTS 1 # endif */ - -# if defined(_MSC_VER) -# include <intrin.h> -# elif defined(HAVE_CPUID_H) -# include <cpuid.h> -# endif - -// is_clmul_supported() must be inlined in this header file because the -// ifunc resolver function may not support calling a function in another -// translation unit. Depending on compiler-toolchain and flags, a call to -// a function defined in another translation unit could result in a -// reference to the PLT, which is unsafe to do in an ifunc resolver. The -// ifunc resolver runs very early when loading a shared library, so the PLT -// entries may not be setup at that time. Inlining this function duplicates -// the function body in crc32_resolve() and crc64_resolve(), but this is -// acceptable because the function results in very few instructions. -static inline bool -is_clmul_supported(void) -{ - int success = 1; - uint32_t r[4]; // eax, ebx, ecx, edx - -#if defined(_MSC_VER) - // This needs <intrin.h> with MSVC. ICC has it as a built-in - // on all platforms. - __cpuid(r, 1); -#elif defined(HAVE_CPUID_H) - // Compared to just using __asm__ to run CPUID, this also checks - // that CPUID is supported and saves and restores ebx as that is - // needed with GCC < 5 with position-independent code (PIC). - success = __get_cpuid(1, &r[0], &r[1], &r[2], &r[3]); -#else - // Just a fallback that shouldn't be needed. - __asm__("cpuid\n\t" - : "=a"(r[0]), "=b"(r[1]), "=c"(r[2]), "=d"(r[3]) - : "a"(1), "c"(0)); #endif - // Returns true if these are supported: - // CLMUL (bit 1 in ecx) - // SSSE3 (bit 9 in ecx) - // SSE4.1 (bit 19 in ecx) - const uint32_t ecx_mask = (1 << 1) | (1 << 9) | (1 << 19); - return success && (r[2] & ecx_mask) == ecx_mask; - - // Alternative methods that weren't used: - // - ICC's _may_i_use_cpu_feature: the other methods should work too. - // - GCC >= 6 / Clang / ICX __builtin_cpu_supports("pclmul") - // - // CPUID decding is needed with MSVC anyway and older GCC. This keeps - // the feature checks in the build system simpler too. The nice thing - // about __builtin_cpu_supports would be that it generates very short - // code as is it only reads a variable set at startup but a few bytes - // doesn't matter here. -} - -#endif - -/// CRC32 implemented with the x86 CLMUL instruction. -extern uint32_t lzma_crc32_clmul(const uint8_t *buf, size_t size, - uint32_t crc); - -/// CRC64 implemented with the x86 CLMUL instruction. -extern uint64_t lzma_crc64_clmul(const uint8_t *buf, size_t size, - uint64_t crc); - #endif diff --git a/src/liblzma/check/crc_clmul.c b/src/liblzma/check/crc_x86_clmul.h index 381948a9..7a47204a 100644 --- a/src/liblzma/check/crc_clmul.c +++ b/src/liblzma/check/crc_x86_clmul.h @@ -1,11 +1,10 @@ /////////////////////////////////////////////////////////////////////////////// // -/// \file crc_clmul.c +/// \file crc_x86_clmul.h /// \brief CRC32 and CRC64 implementations using CLMUL instructions. /// -/// lzma_crc32_clmul() and lzma_crc64_clmul() use 32/64-bit x86 -/// SSSE3, SSE4.1, and CLMUL instructions. This is compatible with -/// Elbrus 2000 (E2K) too. +/// crc32_clmul() and crc64_clmul() use 32/64-bit x86 SSSE3, SSE4.1, and +/// CLMUL instructions. This is compatible with Elbrus 2000 (E2K) too. /// /// They were derived from /// https://www.researchgate.net/publication/263424619_Fast_CRC_computation @@ -27,9 +26,20 @@ // /////////////////////////////////////////////////////////////////////////////// -#include "crc_common.h" +// This file must not be included more than once. +#ifdef LZMA_CRC_X86_CLMUL_H +# error crc_x86_clmul.h was included twice. +#endif +#define LZMA_CRC_X86_CLMUL_H + #include <immintrin.h> +#if defined(_MSC_VER) +# include <intrin.h> +#elif defined(HAVE_CPUID_H) +# include <cpuid.h> +#endif + // EDG-based compilers (Intel's classic compiler and compiler for E2K) can // define __GNUC__ but the attribute must not be used with them. @@ -225,12 +235,12 @@ calc_hi(uint64_t p, uint64_t a, int n) } */ -#ifdef HAVE_CHECK_CRC32 +#ifdef BUILDING_CRC32_CLMUL crc_attr_target crc_attr_no_sanitize_address -extern uint32_t -lzma_crc32_clmul(const uint8_t *buf, size_t size, uint32_t crc) +static uint32_t +crc32_clmul(const uint8_t *buf, size_t size, uint32_t crc) { #ifndef CRC_USE_GENERIC_FOR_SMALL_INPUTS // The code assumes that there is at least one byte of input. @@ -265,7 +275,7 @@ lzma_crc32_clmul(const uint8_t *buf, size_t size, uint32_t crc) v0 = _mm_xor_si128(v0, v2); // [2] return ~(uint32_t)_mm_extract_epi32(v0, 2); } -#endif // HAVE_CHECK_CRC32 +#endif // BUILDING_CRC32_CLMUL ///////////////////// @@ -299,7 +309,7 @@ calc_hi(uint64_t poly, uint64_t a) } */ -#ifdef HAVE_CHECK_CRC64 +#ifdef BUILDING_CRC64_CLMUL // MSVC (VS2015 - VS2022) produces bad 32-bit x86 code from the CLMUL CRC // code when optimizations are enabled (release build). According to the bug @@ -318,8 +328,8 @@ calc_hi(uint64_t poly, uint64_t a) crc_attr_target crc_attr_no_sanitize_address -extern uint64_t -lzma_crc64_clmul(const uint8_t *buf, size_t size, uint64_t crc) +static uint64_t +crc64_clmul(const uint8_t *buf, size_t size, uint64_t crc) { #ifndef CRC_USE_GENERIC_FOR_SMALL_INPUTS // The code assumes that there is at least one byte of input. @@ -366,4 +376,54 @@ lzma_crc64_clmul(const uint8_t *buf, size_t size, uint64_t crc) # pragma optimize("", on) #endif -#endif // HAVE_CHECK_CRC64 +#endif // BUILDING_CRC64_CLMUL + + +// is_clmul_supported() must be inlined in this header file because the +// ifunc resolver function may not support calling a function in another +// translation unit. Depending on compiler-toolchain and flags, a call to +// a function defined in another translation unit could result in a +// reference to the PLT, which is unsafe to do in an ifunc resolver. The +// ifunc resolver runs very early when loading a shared library, so the PLT +// entries may not be setup at that time. Inlining this function duplicates +// the function body in crc32_resolve() and crc64_resolve(), but this is +// acceptable because the function results in very few instructions. +static inline bool +is_clmul_supported(void) +{ + int success = 1; + uint32_t r[4]; // eax, ebx, ecx, edx + +#if defined(_MSC_VER) + // This needs <intrin.h> with MSVC. ICC has it as a built-in + // on all platforms. + __cpuid(r, 1); +#elif defined(HAVE_CPUID_H) + // Compared to just using __asm__ to run CPUID, this also checks + // that CPUID is supported and saves and restores ebx as that is + // needed with GCC < 5 with position-independent code (PIC). + success = __get_cpuid(1, &r[0], &r[1], &r[2], &r[3]); +#else + // Just a fallback that shouldn't be needed. + __asm__("cpuid\n\t" + : "=a"(r[0]), "=b"(r[1]), "=c"(r[2]), "=d"(r[3]) + : "a"(1), "c"(0)); +#endif + + // Returns true if these are supported: + // CLMUL (bit 1 in ecx) + // SSSE3 (bit 9 in ecx) + // SSE4.1 (bit 19 in ecx) + const uint32_t ecx_mask = (1 << 1) | (1 << 9) | (1 << 19); + return success && (r[2] & ecx_mask) == ecx_mask; + + // Alternative methods that weren't used: + // - ICC's _may_i_use_cpu_feature: the other methods should work too. + // - GCC >= 6 / Clang / ICX __builtin_cpu_supports("pclmul") + // + // CPUID decding is needed with MSVC anyway and older GCC. This keeps + // the feature checks in the build system simpler too. The nice thing + // about __builtin_cpu_supports would be that it generates very short + // code as is it only reads a variable set at startup but a few bytes + // doesn't matter here. +} |