1 files changed, 88 insertions, 0 deletions
diff --git a/src/liblzma/check/crc32_fast.c b/src/liblzma/check/crc32_fast.c
new file mode 100644
index 00000000..698cf768
--- /dev/null
+++ b/src/liblzma/check/crc32_fast.c
@@ -0,0 +1,88 @@
+///////////////////////////////////////////////////////////////////////////////
+//
+/// \file       crc32_fast.c
+/// \brief      CRC32 calculation
+//
+//  This code has been put into the public domain.
+//
+//  This library is distributed in the hope that it will be useful,
+//  but WITHOUT ANY WARRANTY; without even the implied warranty of
+//  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
+//
+///////////////////////////////////////////////////////////////////////////////
+
+#include "check.h"
+#include "crc_macros.h"
+
+
+// Calculate the CRC32 of the size bytes at buf and return it. To continue a
+// running checksum, pass the previous return value as crc. Benchmark every
+// change: seemingly unrelated edits can very easily ruin the performance,
+// and the effect is very probably compiler dependent.
+extern LZMA_API uint32_t
+lzma_crc32(const uint8_t *buf, size_t size, uint32_t crc)
+{
+	crc = ~crc;
+
+#ifdef WORDS_BIGENDIAN
+	crc = bswap_32(crc);
+#endif
+
+	if (size > 8) {
+		// Feed single bytes until buf is 8-byte aligned. That takes
+		// at most seven bytes, so size > 8 keeps reads inside buf[].
+		while ((uintptr_t)(buf) & 7) {
+			crc = lzma_crc32_table[0][*buf++ ^ A(crc)] ^ S8(crc);
+			--size;
+		}
+
+		// End of the last complete 8-byte group.
+		const uint8_t *const limit = buf + (size & ~(size_t)(7));
+
+		// Trailing bytes left for the byte-wise loop further below.
+		size &= (size_t)(7);
+
+		// Calculate the CRC32 using the slice-by-eight algorithm.
+		// It is explained in this document:
+		// http://www.intel.com/technology/comms/perfnet/download/CRC_generators.pdf
+		//
+		// The code below differs from the code in Intel's paper, but
+		// the principle is identical. It should be faster with GCC
+		// than Intel's code, but is tested only with GCC 3.4.6 and
+		// 4.1.2 on x86. -Os and -fomit-frame-pointer seem to give the
+		// best results, at least with GCC 4.1.2 on x86. It is still
+		// far from the speed of hand-optimized assembler.
+		//
+		// buf is 8-byte aligned here, so each 32-bit load is aligned.
+		while (buf < limit) {
+			crc ^= *(const uint32_t *)(buf);
+			buf += 4;
+
+			crc = lzma_crc32_table[7][A(crc)]
+			    ^ lzma_crc32_table[6][B(crc)]
+			    ^ lzma_crc32_table[5][C(crc)]
+			    ^ lzma_crc32_table[4][D(crc)];
+
+			const uint32_t tmp = *(const uint32_t *)(buf);
+			buf += 4;
+
+			// It is critical for performance that crc is XORed
+			// between the two table-lookup pairs.
+			crc = lzma_crc32_table[3][A(tmp)]
+			    ^ lzma_crc32_table[2][B(tmp)]
+			    ^ crc
+			    ^ lzma_crc32_table[1][C(tmp)]
+			    ^ lzma_crc32_table[0][D(tmp)];
+		}
+	}
+
+	// Remaining bytes: the whole input if size <= 8, else the tail.
+	while (size-- != 0)
+		crc = lzma_crc32_table[0][*buf++ ^ A(crc)] ^ S8(crc);
+
+#ifdef WORDS_BIGENDIAN
+	crc = bswap_32(crc);
+#endif
+
+	return ~crc;
+}