diff options
author | Lasse Collin <lasse.collin@tukaani.org> | 2022-11-14 21:34:57 +0200 |
---|---|---|
committer | Lasse Collin <lasse.collin@tukaani.org> | 2022-11-14 23:05:46 +0200 |
commit | f644473a211394447824ea00518d0a214ff3f7f2 (patch) | |
tree | 8fc19fa7f3811b090f3f73398ce8bd0708d10a53 /CMakeLists.txt | |
parent | Translations: Update the Swedish translation one more time. (diff) | |
download | xz-f644473a211394447824ea00518d0a214ff3f7f2.tar.xz |
liblzma: Add fast CRC64 for 32/64-bit x86 using SSSE3 + SSE4.1 + CLMUL.
It also works on E2K as it supports these intrinsics.
On x86-64 runtime detection is used so the code keeps working on
older processors too. A CLMUL-only build can be done by using
-msse4.1 -mpclmul in CFLAGS and this will reduce the library
size since the generic implementation and its 8 KiB lookup table
will be omitted.
On 32-bit x86 this isn't used by default for now because by default
on 32-bit x86 the separate assembly file crc64_x86.S is used.
If --disable-assembler is used then this new CLMUL code is used
the same way as on 64-bit x86. However, a CLMUL-only build
(-msse4.1 -mpclmul) won't omit the 8 KiB lookup table on
32-bit x86 due to a currently-missing check for disabled
assembler usage.
The configure.ac check should be such that the code won't be
built if something in the toolchain doesn't support it but
the --disable-clmul-crc option can be used to unconditionally
disable this feature.
CLMUL speeds up decompression of files that have compressed very
well (assuming CRC64 is used as a check type). It is known that
the CLMUL code is significantly slower than the generic code for
tiny inputs (especially 1-8 bytes but up to 16 bytes). If that
is a real-world problem then there is already a commented-out
variant that uses the generic version for small inputs.
Thanks to Ilya Kurdyukov for the original patch which was
derived from a white paper from Intel [1] (published in 2009)
and public domain code from [2] (released in 2016).
[1] https://www.intel.com/content/dam/www/public/us/en/documents/white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf
[2] https://github.com/rawrunprotected/crc
Diffstat (limited to 'CMakeLists.txt')
-rw-r--r-- | CMakeLists.txt | 26 |
1 files changed, 24 insertions, 2 deletions
diff --git a/CMakeLists.txt b/CMakeLists.txt index 11f3a055..ec44bac3 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -49,8 +49,10 @@ cmake_minimum_required(VERSION 3.13...3.16 FATAL_ERROR) include(CMakePushCheckState) +include(CheckIncludeFile) include(CheckSymbolExists) include(CheckStructHasMember) +include(CheckCSourceCompiles) include(cmake/tuklib_integer.cmake) include(cmake/tuklib_cpucores.cmake) include(cmake/tuklib_physmem.cmake) @@ -402,14 +404,16 @@ check_c_source_compiles(" cmake_pop_check_state() tuklib_add_definition_if(liblzma HAVE_FUNC_ATTRIBUTE_CONSTRUCTOR) +# cpuid.h +check_include_file(cpuid.h HAVE_CPUID_H) +tuklib_add_definition_if(liblzma HAVE_CPUID_H) + # immintrin.h: -include(CheckIncludeFile) check_include_file(immintrin.h HAVE_IMMINTRIN_H) if(HAVE_IMMINTRIN_H) target_compile_definitions(liblzma PRIVATE HAVE_IMMINTRIN_H) # SSE2 intrinsics: - include(CheckCSourceCompiles) check_c_source_compiles(" #include <immintrin.h> int main(void) @@ -421,6 +425,24 @@ if(HAVE_IMMINTRIN_H) " HAVE__MM_MOVEMASK_EPI8) tuklib_add_definition_if(liblzma HAVE__MM_MOVEMASK_EPI8) + + # CLMUL intrinsic: + check_c_source_compiles(" + #include <immintrin.h> + #if defined(__e2k__) && __iset__ < 6 + # error + #endif + #if (defined(__GNUC__) || defined(__clang__)) && !defined(__EDG__) + __attribute__((__target__(\"ssse3,sse4.1,pclmul\"))) + #endif + __m128i my_clmul(__m128i a, __m128i b) + { + return _mm_clmulepi64_si128(a, b, 0); + } + int main(void) { return 0; } + " + HAVE_USABLE_CLMUL) + tuklib_add_definition_if(liblzma HAVE_USABLE_CLMUL) endif() # Support -fvisiblity=hidden when building shared liblzma. |