liblzma: memcmplen.h: Add a comment why subtraction is used.

author: Lasse Collin <lasse.collin@tukaani.org> 2024-03-22 17:46:30 +0200
committer: Lasse Collin <lasse.collin@tukaani.org> 2024-03-22 17:46:30 +0200
commit: 0b99783d63f27606936bb79a16c52d0d70c0b56f (patch)
tree: 0796e5ca626b6bdd7948f1eb32a441422e76fe1b /src/liblzma
parent: INSTALL: Document arguments of --enable-symbol-versions. (diff)
download: xz-0b99783d63f27606936bb79a16c52d0d70c0b56f.tar.xz
1 files changed, 13 insertions, 0 deletions
diff --git a/src/liblzma/common/memcmplen.h b/src/liblzma/common/memcmplen.h
index d8c42040..394a4856 100644
--- a/src/liblzma/common/memcmplen.h
+++ b/src/liblzma/common/memcmplen.h
@@ -67,6 +67,19 @@ lzma_memcmplen(const uint8_t *buf1, const uint8_t *buf2,
 	// This is only for x86-64 and ARM64 for now. This might be fine on
 	// other 64-bit processors too. On big endian one should use xor
 	// instead of subtraction and switch to __builtin_clzll().
+	//
+	// Reasons to use subtraction instead of xor:
+	//
+	//   - On some x86-64 processors (Intel Sandy Bridge to Tiger Lake),
+	//     sub+jz and sub+jnz can be fused but xor+jz or xor+jnz cannot.
+	//     Thus using subtraction has potential to be a tiny amount faster
+	//     since the code checks if the difference is non-zero.
+	//
+	//   - Some processors (Intel Pentium 4) used to have more ALU
+	//     resources for add/sub instructions than for and/or/xor.
+	//
+	// The processor info is based on Agner Fog's microarchitecture.pdf
+	// version 2023-05-26. https://www.agner.org/optimize/
 #define LZMA_MEMCMPLEN_EXTRA 8
 	while (len < limit) {
 		const uint64_t x = read64ne(buf1 + len) - read64ne(buf2 + len);
author	Lasse Collin <lasse.collin@tukaani.org>	2024-03-22 17:46:30 +0200
committer	Lasse Collin <lasse.collin@tukaani.org>	2024-03-22 17:46:30 +0200
commit	0b99783d63f27606936bb79a16c52d0d70c0b56f (patch)
tree	0796e5ca626b6bdd7948f1eb32a441422e76fe1b /src/liblzma
parent	INSTALL: Document arguments of --enable-symbol-versions. (diff)
download	xz-0b99783d63f27606936bb79a16c52d0d70c0b56f.tar.xz