Replace the experimental ARM64 filter with a new experimental version.

This is incompatible with the previous version. This has space/tab fixes in filter_*.c and bcj.h too.
author: Lasse Collin <lasse.collin@tukaani.org> 2022-11-14 23:14:41 +0200
committer: Lasse Collin <lasse.collin@tukaani.org> 2022-11-14 23:16:38 +0200
commit: 8370ec8edf9ddf8d1d9fef03d8d1027503ec4c35 (patch)
tree: 5a5badf9edc6b6aaf3f8aa4e6cea15940c34b733 /src/liblzma/simple
parent: liblzma: Add fast CRC64 for 32/64-bit x86 using SSSE3 + SSE4.1 + CLMUL. (diff)
download: xz-8370ec8edf9ddf8d1d9fef03d8d1027503ec4c35.tar.xz
3 files changed, 107 insertions, 182 deletions
diff --git a/src/liblzma/simple/arm64.c b/src/liblzma/simple/arm64.c
index 0437f736..05b4be29 100644
--- a/src/liblzma/simple/arm64.c
+++ b/src/liblzma/simple/arm64.c
@@ -3,6 +3,22 @@
 /// \file       arm64.c
 /// \brief      Filter for ARM64 binaries
 ///
+/// This converts ARM64 relative addresses in the BL and ADRP immediates
+/// to absolute values to increase redundancy of ARM64 code.
+///
+/// Unlike the older BCJ filters, this handles zeros specially. This way
+/// the filter won't be counterproductive on Linux kernel modules, object
+/// files, and static libraries where the immediates are all zeros (to be
+/// filled later by a linker). Usually this has no downsides but with bad
+/// luck it can reduce the effectiveness of the filter and trying a different
+/// start offset can mitigate the problem.
+///
+/// Converting B or ADR instructions was also tested but it's not useful.
+/// A majority of the jumps for the B instruction are very small (+/- 0xFF).
+/// These are typical for loops and if-statements. Encoding them to their
+/// absolute address reduces redundancy since many of the small relative
+/// jump values are repeated, but very few of the absolute addresses are.
+//
 //  Authors:    Lasse Collin
 //              Jia Tan
 //
@@ -13,126 +29,110 @@
 
 #include "simple_private.h"
 
-#ifdef HAVE_ENCODER_ARM64
-#	include "simple_encoder.h"
-#endif
-
-#ifdef HAVE_DECODER_ARM64
-#	include "simple_decoder.h"
-#endif
-
-
-// In ARM64, there are two main branch instructions.
-// bl - branch and link: Calls a function and stores the return address.
-// b - branch: Jumps to a location, but does not store a return address.
-//
-// After some benchmarking, it was determined that only the bl instruction
-// is beneficial for compression. A majority of the jumps for the b
-// instruction are very small (+/- 0xFF). These are typical for loops
-// and if-statements. Encoding them to their absolute address reduces
-// redundancy since many of the small relative jump values are repeated,
-// but very few of the absolute addresses are.
-//
-// Thus, only the bl instruction will be encoded and decoded.
-// The bl instruction is 32 bits in size. The highest 6 bits contain
-// the opcode (10 0101 == 0x25) and the remaining 26 bits are
-// the immediate value. The immediate is a signed integer that
-// encodes the target address as a multiple of four bytes so
-// the range is +/-128 MiB.
-
-// The 6-bit op code for the bl instruction in ARM64
-#define ARM64_BL_OPCODE 0x25
 
-// Once the 26-bit immediate is multiple by four, the address is 28 bits
-// with the two lowest bits being zero. This mask is used to clear the
-// unwanted bits.
-#define ADDR28_MASK 0x0FFFFFFCU
+static uint32_t
+arm64_conv(uint32_t src, uint32_t pc, uint32_t mask, bool is_encoder)
+{
+	if (!is_encoder)
+		pc = 0U - pc;
 
+	uint32_t dest = src + pc;
+	if ((dest & mask) == 0)
+		dest = pc;
 
-typedef struct {
-	uint32_t sign_bit;
-	uint32_t sign_mask;
-} lzma_simple_arm64;
+	return dest;
+}
 
 
 static size_t
-arm64_code(void *simple_ptr, uint32_t now_pos, bool is_encoder,
+arm64_code(void *simple lzma_attribute((__unused__)),
+		uint32_t now_pos, bool is_encoder,
 		uint8_t *buffer, size_t size)
 {
-	const lzma_simple_arm64 *simple = simple_ptr;
-	const uint32_t sign_bit = simple->sign_bit;
-	const uint32_t sign_mask = simple->sign_mask;
-
 	size_t i;
+
+	// Clang 14.0.6 on x86-64 makes this four times bigger and 60 % slower
+	// with auto-vectorization that is enabled by default with -O2.
+	// Even -Os, which doesn't use vectorization, produces faster code.
+	// Disabling vectorization with -O2 gives good speed (faster than -Os)
+	// and reasonable code size.
+	//
+	// Such vectorization bloat happens with -O2 when targeting ARM64 too
+	// but performance hasn't been tested.
+	//
+	// Clang 14 and 15 won't auto-vectorize this loop if the condition
+	// for ADRP is replaced with the commented-out version. However,
+	// at least Clang 14.0.6 doesn't generate as fast code with that
+	// condition. The commented-out code is also bigger.
+	//
+	// GCC 12.2 on x86-64 with -O2 produces good code with both versions
+	// of the ADRP if-statement although the single-branch version is
+	// slightly faster and smaller than the commented-out version.
+	// Speed is similar to non-vectorized clang -O2.
+#ifdef __clang__
+#	pragma clang loop vectorize(disable)
+#endif
 	for (i = 0; i + 4 <= size; i += 4) {
-		if ((buffer[i + 3] >> 2) == ARM64_BL_OPCODE) {
-			// Get the relative 28-bit address from
-			// the 26-bit immediate.
-			uint32_t src = read32le(buffer + i);
-			src <<= 2;
-			src &= ADDR28_MASK;
-
-			// When the conversion width isn't the maximum,
-			// check that the highest bits are either all zero
-			// or all one.
-			if ((src & sign_mask) != 0
-					&& (src & sign_mask) != sign_mask)
+		const uint32_t pc = (uint32_t)(now_pos + i);
+		uint32_t instr = read32le(buffer + i);
+
+		if ((instr >> 26) == 0x25) {
+			// BL instruction:
+			// The full 26-bit immediate is converted.
+			// The range is +/-128 MiB.
+			//
+			// Using the full range is helps quite a lot with
+			// big executables. Smaller range would reduce false
+			// positives in non-code sections of the input though
+			// so this is a compromise that slightly favors big
+			// files. With the full range only six bits of the 32
+			// need to match to trigger a conversion.
+			const uint32_t mask26 = 0x03FFFFFF;
+			const uint32_t src = instr & mask26;
+			instr = 0x94000000;
+
+			if (src == 0)
 				continue;
 
-			// Some files like static libraries or Linux kernel
-			// modules have the immediate value filled with
-			// zeros. Converting these placeholder values would
-			// make compression worse so don't touch them.
+			instr |= arm64_conv(src, pc >> 2, mask26, is_encoder)
+					& mask26;
+			write32le(buffer + i, instr);
+
+/*
+		// This is a more readable version of the one below but this
+		// has two branches. It results in bigger and slower code.
+		} else if ((instr & 0x9FF00000) == 0x90000000
+				|| (instr & 0x9FF00000) == 0x90F00000) {
+*/
+		// This is only a rotation, addition, and testing that
+		// none of the bits covered by the bitmask are set.
+		} else if (((((instr << 8) | (instr >> 24))
+				+ (0x10000000 - 0x90)) & 0xE000009F) == 0) {
+			// ADRP instruction:
+			// Only values in the range +/-512 MiB are converted.
+			//
+			// Using less than the full +/-4 GiB range reduces
+			// false positives on non-code sections of the input
+			// while being excellent for executables up to 512 MiB.
+			// The positive effect of ADRP conversion is smaller
+			// than that of BL but it also doesn't hurt so much in
+			// non-code sections of input because, with +/-512 MiB
+			// range, nine bits of 32 need to match to trigger a
+			// conversion (two 10-bit match choices = 9 bits).
+			const uint32_t src = ((instr >> 29) & 3)
+					| ((instr >> 3) & 0x0003FFFC);
+			instr &= 0x9000001F;
+
 			if (src == 0)
 				continue;
 
-			const uint32_t pc = now_pos + (uint32_t)(i);
-
-			uint32_t dest;
-			if (is_encoder)
-				dest = pc + src;
-			else
-				dest = src - pc;
-
-			dest &= ADDR28_MASK;
-
-			// Sign-extend negative values or unset sign bits
-			// from positive values.
-			if (dest & sign_bit)
-				dest |= sign_mask;
-			else
-				dest &= ~sign_mask;
-
-			assert((dest & sign_mask) == 0
-					|| (dest & sign_mask) == sign_mask);
-
-			// Since also the decoder will ignore src values
-			// of 0, we must ensure that nothing is ever encoded
-			// to 0. This is achieved by encoding such values
-			// as pc instead. When decoding, pc will be first
-			// converted to 0 which we will catch here and fix.
-			if (dest == 0) {
-				// We cannot get here if pc is zero because
-				// then src would need to be zero too but we
-				// already ensured that src != 0.
-				assert((pc & ADDR28_MASK) != 0);
-				dest = is_encoder ? pc : 0U - pc;
-				dest &= ADDR28_MASK;
-
-				if (dest & sign_bit)
-					dest |= sign_mask;
-				else
-					dest &= ~sign_mask;
-			}
-
-			assert((dest & sign_mask) == 0
-					|| (dest & sign_mask) == sign_mask);
-			assert((dest & ~ADDR28_MASK) == 0);
-
-			// Construct and store the modified 32-bit instruction.
-			dest >>= 2;
-			dest |= (uint32_t)ARM64_BL_OPCODE << 26;
-			write32le(buffer + i, dest);
+			const uint32_t dest = arm64_conv(
+					src, pc >> 12, 0x3FFFF, is_encoder);
+
+			instr |= (dest & 3) << 29;
+			instr |= (dest & 0x0003FFFC) << 3;
+			instr |= (0U - (dest & 0x00020000)) & 0x00E00000;
+			write32le(buffer + i, instr);
 		}
 	}
 
@@ -140,81 +140,12 @@ arm64_code(void *simple_ptr, uint32_t now_pos, bool is_encoder,
 }
 
 
-#ifdef HAVE_ENCODER_ARM64
-extern lzma_ret
-lzma_arm64_props_encode(const void *options, uint8_t *out)
-{
-	const lzma_options_arm64 *const opt = options;
-
-	if (opt->width < LZMA_ARM64_WIDTH_MIN
-			|| opt->width > LZMA_ARM64_WIDTH_MAX)
-		return LZMA_OPTIONS_ERROR;
-
-	out[0] = (uint8_t)(opt->width - LZMA_ARM64_WIDTH_MIN);
-	return LZMA_OK;
-}
-#endif
-
-
-#ifdef HAVE_DECODER_ARM64
-extern lzma_ret
-lzma_arm64_props_decode(void **options, const lzma_allocator *allocator,
-		const uint8_t *props, size_t props_size)
-{
-	if (props_size != 1)
-		return LZMA_OPTIONS_ERROR;
-
-	if (props[0] > LZMA_ARM64_WIDTH_MAX - LZMA_ARM64_WIDTH_MIN)
-		return LZMA_OPTIONS_ERROR;
-
-	lzma_options_arm64 *opt = lzma_alloc(sizeof(lzma_options_arm64),
-			allocator);
-	if (opt == NULL)
-		return LZMA_MEM_ERROR;
-
-	opt->width = props[0] + LZMA_ARM64_WIDTH_MIN;
-	*options = opt;
-	return LZMA_OK;
-
-}
-#endif
-
-
 static lzma_ret
 arm64_coder_init(lzma_next_coder *next, const lzma_allocator *allocator,
 		const lzma_filter_info *filters, bool is_encoder)
 {
-	if (filters[0].options == NULL)
-		return LZMA_PROG_ERROR;
-
-	const lzma_options_arm64 *opt = filters[0].options;
-	if (opt->width < LZMA_ARM64_WIDTH_MIN
-			|| opt->width > LZMA_ARM64_WIDTH_MAX)
-		return LZMA_OPTIONS_ERROR;
-
-	const lzma_ret ret = lzma_simple_coder_init(next, allocator, filters,
-			&arm64_code, sizeof(lzma_simple_arm64), 4, 4,
-			is_encoder, false);
-
-	if (ret == LZMA_OK) {
-		lzma_simple_coder *coder = next->coder;
-		lzma_simple_arm64 *simple = coder->simple;
-
-		// This will be used to detect if the value, after
-		// conversion has been done, is negative. The location
-		// of the sign bit depends on the conversion width.
-		simple->sign_bit = UINT32_C(1) << (opt->width - 1);
-
-		// When conversion width isn't the maximum, the highest
-		// bits must all be either zero or one, that is, they
-		// all are copies of the sign bit. This mask is used to
-		// (1) detect if input value is in the range specified
-		// by the conversion width and (2) clearing or setting
-		// the high bits after conversion (integers can wrap around).
-		simple->sign_mask = (UINT32_C(1) << 28) - simple->sign_bit;
-	}
-
-	return ret;
+	return lzma_simple_coder_init(next, allocator, filters,
+			&arm64_code, 0, 4, 4, is_encoder, true);
 }
 
 
diff --git a/src/liblzma/simple/simple_decoder.h b/src/liblzma/simple/simple_decoder.h
index 188d8370..bed8d37a 100644
--- a/src/liblzma/simple/simple_decoder.h
+++ b/src/liblzma/simple/simple_decoder.h
@@ -19,8 +19,4 @@ extern lzma_ret lzma_simple_props_decode(
 		void **options, const lzma_allocator *allocator,
 		const uint8_t *props, size_t props_size);
 
-extern lzma_ret lzma_arm64_props_decode(
-		void **options, const lzma_allocator *allocator,
-		const uint8_t *props, size_t props_size);
-
 #endif
diff --git a/src/liblzma/simple/simple_encoder.h b/src/liblzma/simple/simple_encoder.h
index 10828f8f..1cee4823 100644
--- a/src/liblzma/simple/simple_encoder.h
+++ b/src/liblzma/simple/simple_encoder.h
@@ -20,6 +20,4 @@ extern lzma_ret lzma_simple_props_size(uint32_t *size, const void *options);
 
 extern lzma_ret lzma_simple_props_encode(const void *options, uint8_t *out);
 
-extern lzma_ret lzma_arm64_props_encode(const void *options, uint8_t *out);
-
 #endif
author	Lasse Collin <lasse.collin@tukaani.org>	2022-11-14 23:14:41 +0200
committer	Lasse Collin <lasse.collin@tukaani.org>	2022-11-14 23:16:38 +0200
commit	8370ec8edf9ddf8d1d9fef03d8d1027503ec4c35 (patch)
tree	5a5badf9edc6b6aaf3f8aa4e6cea15940c34b733 /src/liblzma/simple
parent	liblzma: Add fast CRC64 for 32/64-bit x86 using SSSE3 + SSE4.1 + CLMUL. (diff)
download	xz-8370ec8edf9ddf8d1d9fef03d8d1027503ec4c35.tar.xz