diff options
Diffstat (limited to 'src/liblzma/check/crc32_fast.c')
-rw-r--r-- | src/liblzma/check/crc32_fast.c | 88 |
1 files changed, 88 insertions, 0 deletions
diff --git a/src/liblzma/check/crc32_fast.c b/src/liblzma/check/crc32_fast.c new file mode 100644 index 00000000..698cf768 --- /dev/null +++ b/src/liblzma/check/crc32_fast.c @@ -0,0 +1,88 @@ +/////////////////////////////////////////////////////////////////////////////// +// +/// \file crc32.c +/// \brief CRC32 calculation +// +// This code has been put into the public domain. +// +// This library is distributed in the hope that it will be useful, +// but WITHOUT ANY WARRANTY; without even the implied warranty of +// MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. +// +/////////////////////////////////////////////////////////////////////////////// + +#include "check.h" +#include "crc_macros.h" + + +// If you make any changes, do some bench marking! Seemingly unrelated +// changes can very easily ruin the performance (and very probably is +// very compiler dependent). +extern LZMA_API uint32_t +lzma_crc32(const uint8_t *buf, size_t size, uint32_t crc) +{ + crc = ~crc; + +#ifdef WORDS_BIGENDIAN + crc = bswap_32(crc); +#endif + + if (size > 8) { + // Fix the alignment, if needed. The if statement above + // ensures that this won't read past the end of buf[]. + while ((uintptr_t)(buf) & 7) { + crc = lzma_crc32_table[0][*buf++ ^ A(crc)] ^ S8(crc); + --size; + } + + // Calculate the position where to stop. + const uint8_t *const limit = buf + (size & ~(size_t)(7)); + + // Calculate how many bytes must be calculated separately + // before returning the result. + size &= (size_t)(7); + + // Calculate the CRC32 using the slice-by-eight algorithm. + // It is explained in this document: + // http://www.intel.com/technology/comms/perfnet/download/CRC_generators.pdf + // + // The code below is different than the code in Intel's + // paper, but the principle is identical. This should be + // faster with GCC than Intel's code. This is tested only + // with GCC 3.4.6 and 4.1.2 on x86, so your results may vary. + // + // Using -Os and -fomit-frame-pointer seem to give the best + // results at least with GCC 4.1.2 on x86. It's sill far + // from the speed of hand-optimized assembler. + while (buf < limit) { + crc ^= *(uint32_t *)(buf); + buf += 4; + + crc = lzma_crc32_table[7][A(crc)] + ^ lzma_crc32_table[6][B(crc)] + ^ lzma_crc32_table[5][C(crc)] + ^ lzma_crc32_table[4][D(crc)]; + + const uint32_t tmp = *(uint32_t *)(buf); + buf += 4; + + // It is critical for performance, that + // the crc variable is XORed between the + // two table-lookup pairs. + crc = lzma_crc32_table[3][A(tmp)] + ^ lzma_crc32_table[2][B(tmp)] + ^ crc + ^ lzma_crc32_table[1][C(tmp)] + ^ lzma_crc32_table[0][D(tmp)]; + } + } + + while (size-- != 0) + crc = lzma_crc32_table[0][*buf++ ^ A(crc)] ^ S8(crc); + +#ifdef WORDS_BIGENDIAN + crc = bswap_32(crc); +#endif + + return ~crc; +} |