From f644473a211394447824ea00518d0a214ff3f7f2 Mon Sep 17 00:00:00 2001 From: Lasse Collin Date: Mon, 14 Nov 2022 21:34:57 +0200 Subject: liblzma: Add fast CRC64 for 32/64-bit x86 using SSSE3 + SSE4.1 + CLMUL. It also works on E2K as it supports these intrinsics. On x86-64 runtime detection is used so the code keeps working on older processors too. A CLMUL-only build can be done by using -msse4.1 -mpclmul in CFLAGS and this will reduce the library size since the generic implementation and its 8 KiB lookup table will be omitted. On 32-bit x86 this isn't used by default for now because by default on 32-bit x86 the separate assembly file crc64_x86.S is used. If --disable-assembler is used then this new CLMUL code is used the same way as on 64-bit x86. However, a CLMUL-only build (-msse4.1 -mpclmul) won't omit the 8 KiB lookup table on 32-bit x86 due to a currently-missing check for disabled assembler usage. The configure.ac check should be such that the code won't be built if something in the toolchain doesn't support it but --disable-clmul-crc option can be used to unconditionally disable this feature. CLMUL speeds up decompression of files that have compressed very well (assuming CRC64 is used as a check type). It is know that the CLMUL code is significantly slower than the generic code for tiny inputs (especially 1-8 bytes but up to 16 bytes). If that is a real-world problem then there is already a commented-out variant that uses the generic version for small inputs. Thanks to Ilya Kurdyukov for the original patch which was derived from a white paper from Intel [1] (published in 2009) and public domain code from [2] (released in 2016). [1] https://www.intel.com/content/dam/www/public/us/en/documents/white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf [2] https://github.com/rawrunprotected/crc --- configure.ac | 59 +++++++++++++++++++++++++++++++++++++++++++++++++++++++++-- 1 file changed, 57 insertions(+), 2 deletions(-) (limited to 'configure.ac') diff --git a/configure.ac b/configure.ac index ddc82a3b..60a63825 100644 --- a/configure.ac +++ b/configure.ac @@ -370,6 +370,16 @@ esac AM_CONDITIONAL(COND_ASM_X86, test "x$enable_assembler" = xx86) +############# +# CLMUL CRC # +############# + +AC_ARG_ENABLE([clmul-crc], AS_HELP_STRING([--disable-clmul-crc], + [Do not use carryless multiplication for CRC calculation + even if support for it is detected.]), + [], [enable_clmul_crc=yes]) + + ##################### # Size optimization # ##################### @@ -733,8 +743,9 @@ AC_CHECK_HEADERS([fcntl.h limits.h sys/time.h], [], [AC_MSG_ERROR([Required header file(s) are missing.])]) -# This allows the use of the intrinsic functions if they are available. -AC_CHECK_HEADERS([immintrin.h]) +# immintrin.h allows the use of the intrinsic functions if they are available. +# cpuid.h may be used for detecting x86 processor features at runtime. +AC_CHECK_HEADERS([immintrin.h cpuid.h]) ############################################################################### @@ -874,6 +885,50 @@ AC_CHECK_DECL([_mm_movemask_epi8], #include #endif]) +# For faster CRC on 32/64-bit x86 and E2K (see also crc64_fast.c): +# +# - Check for the CLMUL intrinsic _mm_clmulepi64_si128 in . +# +# - Check that __attribute__((__target__("ssse3,sse4.1,pclmul"))) works +# together with _mm_clmulepi64_si128 from . The attribute +# was added in GCC 4.4 but some GCC 4.x versions don't allow intrinsics +# with it. Exception: it must be not be used with EDG-based compilers +# like ICC and the compiler on E2K. +# +# If everything above is supported, runtime detection will be used to keep the +# binaries working on systems that don't support the required extensions. +AC_MSG_CHECKING([if _mm_clmulepi64_si128 is usable]) +if test "x$enable_clmul_crc" = xno ; then + AC_MSG_RESULT([no, --disable-clmul-crc was used]) +else + AC_COMPILE_IFELSE([AC_LANG_SOURCE([[ +#include + +// CLMUL works on older E2K instruction set but it is slow due to emulation. +#if defined(__e2k__) && __iset__ < 6 +# error +#endif + +// Intel's old compiler (ICC) can define __GNUC__ but the attribute must not +// be used with it. The new Clang-based ICX needs the attribute. +// Checking for !defined(__EDG__) catches ICC and other EDG-based compilers. +#if (defined(__GNUC__) || defined(__clang__)) && !defined(__EDG__) +__attribute__((__target__("ssse3,sse4.1,pclmul"))) +#endif +__m128i my_clmul(__m128i a, __m128i b) +{ + return _mm_clmulepi64_si128(a, b, 0); +} + ]])], [ + AC_DEFINE([HAVE_USABLE_CLMUL], [1], + [Define to 1 if _mm_clmulepi64_si128 is usable. + See configure.ac for details.]) + AC_MSG_RESULT([yes]) + ], [ + AC_MSG_RESULT([no]) + ]) +fi + # Check for sandbox support. If one is found, set enable_sandbox=found. case $enable_sandbox in auto | capsicum) -- cgit v1.2.3