aboutsummaryrefslogtreecommitdiff
path: root/src/liblzma/simple/arm64.c
blob: 05b4be297039b4c3c6b412ee846c47b50edaac32 (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
///////////////////////////////////////////////////////////////////////////////
//
/// \file       arm64.c
/// \brief      Filter for ARM64 binaries
///
/// This converts ARM64 relative addresses in the BL and ADRP immediates
/// to absolute values to increase redundancy of ARM64 code.
///
/// Unlike the older BCJ filters, this handles zeros specially. This way
/// the filter won't be counterproductive on Linux kernel modules, object
/// files, and static libraries where the immediates are all zeros (to be
/// filled later by a linker). Usually this has no downsides but with bad
/// luck it can reduce the effectiveness of the filter and trying a different
/// start offset can mitigate the problem.
///
/// Converting B or ADR instructions was also tested but it's not useful.
/// A majority of the jumps for the B instruction are very small (+/- 0xFF).
/// These are typical for loops and if-statements. Encoding them to their
/// absolute address reduces redundancy since many of the small relative
/// jump values are repeated, but very few of the absolute addresses are.
//
//  Authors:    Lasse Collin
//              Jia Tan
//
//  This file has been put into the public domain.
//  You can do whatever you want with this file.
//
///////////////////////////////////////////////////////////////////////////////

#include "simple_private.h"


/// \brief      Convert one immediate between relative and absolute form
///
/// The encoder adds the position to the immediate and the decoder
/// subtracts it. If the converted value has no bits set under the mask,
/// the (possibly negated) position itself is returned instead, so the
/// masked result is zero only when the position is zero under the mask.
///
/// \param      src         Immediate value extracted from the instruction
/// \param      pc          Position of the instruction, already scaled to
///                         the units of the immediate by the caller
/// \param      mask        Bits of the result that the caller will keep
/// \param      is_encoder  true to add pc, false to subtract it
///
/// \return     Converted value; the caller is expected to apply the mask.
static uint32_t
arm64_conv(uint32_t src, uint32_t pc, uint32_t mask, bool is_encoder)
{
	// Decoding is the same addition with the position negated
	// (modulo 2^32).
	const uint32_t offset = is_encoder ? pc : 0U - pc;
	const uint32_t sum = src + offset;

	// A masked sum of zero is replaced by the offset itself.
	return (sum & mask) != 0 ? sum : offset;
}


/// \brief      Convert BL and ADRP immediates in a buffer in place
///
/// The buffer is walked in steps of four bytes. Each 32-bit word is read
/// with read32le() and, if it matches the BL or the ADRP bit pattern and
/// its immediate is non-zero, the immediate is converted with arm64_conv()
/// and the word is written back with write32le(). Words that match
/// neither pattern, and matching words whose immediate is zero, are left
/// untouched.
///
/// \param      simple      Not used by this filter (marked as unused)
/// \param      now_pos     Added to the byte offset within buffer to get
///                         the position (pc) of each word
/// \param      is_encoder  Passed through to arm64_conv(): true adds the
///                         position, false subtracts it
/// \param      buffer      Data to filter; modified in place
/// \param      size        Number of bytes available in buffer
///
/// \return     Number of bytes processed, that is, size rounded down
///             to a multiple of four.
static size_t
arm64_code(void *simple lzma_attribute((__unused__)),
		uint32_t now_pos, bool is_encoder,
		uint8_t *buffer, size_t size)
{
	size_t i;

	// Clang 14.0.6 on x86-64 makes this four times bigger and 60 % slower
	// with auto-vectorization that is enabled by default with -O2.
	// Even -Os, which doesn't use vectorization, produces faster code.
	// Disabling vectorization with -O2 gives good speed (faster than -Os)
	// and reasonable code size.
	//
	// Such vectorization bloat happens with -O2 when targeting ARM64 too
	// but performance hasn't been tested.
	//
	// Clang 14 and 15 won't auto-vectorize this loop if the condition
	// for ADRP is replaced with the commented-out version. However,
	// at least Clang 14.0.6 doesn't generate as fast code with that
	// condition. The commented-out code is also bigger.
	//
	// GCC 12.2 on x86-64 with -O2 produces good code with both versions
	// of the ADRP if-statement although the single-branch version is
	// slightly faster and smaller than the commented-out version.
	// Speed is similar to non-vectorized clang -O2.
#ifdef __clang__
#	pragma clang loop vectorize(disable)
#endif
	// Only complete 4-byte words are processed; up to three trailing
	// bytes are left for the caller (they aren't counted in the
	// return value).
	for (i = 0; i + 4 <= size; i += 4) {
		// The position is truncated to 32 bits.
		const uint32_t pc = (uint32_t)(now_pos + i);
		uint32_t instr = read32le(buffer + i);

		// The six highest bits select BL (0x25 << 26 == 0x94000000).
		if ((instr >> 26) == 0x25) {
			// BL instruction:
			// The full 26-bit immediate is converted.
			// The range is +/-128 MiB.
			//
			// Using the full range helps quite a lot with
			// big executables. Smaller range would reduce false
			// positives in non-code sections of the input though
			// so this is a compromise that slightly favors big
			// files. With the full range only six bits of the 32
			// need to match to trigger a conversion.
			const uint32_t mask26 = 0x03FFFFFF;
			const uint32_t src = instr & mask26;

			// Start from the bare opcode; the converted immediate
			// is ORed into the low 26 bits below.
			instr = 0x94000000;

			// A zero immediate is never converted, so nothing
			// is written back for it.
			if (src == 0)
				continue;

			// pc >> 2 expresses the position in 4-byte units.
			instr |= arm64_conv(src, pc >> 2, mask26, is_encoder)
					& mask26;
			write32le(buffer + i, instr);

/*
		// This is a more readable version of the one below but this
		// has two branches. It results in bigger and slower code.
		} else if ((instr & 0x9FF00000) == 0x90000000
				|| (instr & 0x9FF00000) == 0x90F00000) {
*/
		// This is only a rotation, addition, and testing that
		// none of the bits covered by the bitmask are set.
		//
		// The rotation moves instruction bits 24-31 into bits 0-7
		// and bits 20-23 into bits 28-31 so that both fields can be
		// checked with one addition and one mask.
		} else if (((((instr << 8) | (instr >> 24))
				+ (0x10000000 - 0x90)) & 0xE000009F) == 0) {
			// ADRP instruction:
			// Only values in the range +/-512 MiB are converted.
			//
			// Using less than the full +/-4 GiB range reduces
			// false positives on non-code sections of the input
			// while being excellent for executables up to 512 MiB.
			// The positive effect of ADRP conversion is smaller
			// than that of BL but it also doesn't hurt so much in
			// non-code sections of input because, with +/-512 MiB
			// range, nine bits of 32 need to match to trigger a
			// conversion (two 10-bit match choices = 9 bits).
			//
			// Gather an 18-bit value: instruction bits 29-30
			// become bits 0-1 and instruction bits 5-20 become
			// bits 2-17.
			const uint32_t src = ((instr >> 29) & 3)
					| ((instr >> 3) & 0x0003FFFC);

			// Keep only bits 31, 28, and 0-4; the immediate
			// fields are rebuilt below.
			instr &= 0x9000001F;

			// A zero immediate is never converted, so nothing
			// is written back for it.
			if (src == 0)
				continue;

			// pc >> 12 expresses the position in 4 KiB units.
			const uint32_t dest = arm64_conv(
					src, pc >> 12, 0x3FFFF, is_encoder);

			// Put the low 18 bits of dest back into instruction
			// bits 29-30 and 5-20.
			instr |= (dest & 3) << 29;
			instr |= (dest & 0x0003FFFC) << 3;

			// Replicate bit 17 of dest into instruction bits 21-23:
			// the expression is 0x00E00000 if the bit is set
			// and zero otherwise.
			instr |= (0U - (dest & 0x00020000)) & 0x00E00000;
			write32le(buffer + i, instr);
		}
	}

	return i;
}


/// \brief      Shared initialization for the ARM64 encoder and decoder
///
/// Hands arm64_code() to lzma_simple_coder_init() together with the
/// requested direction.
///
/// NOTE(review): the meaning of the constant arguments (0, 4, 4 and the
/// trailing true) is defined by lzma_simple_coder_init(), which is not
/// visible in this file; confirm against simple_private.h.
///
/// \return     The value returned by lzma_simple_coder_init().
static lzma_ret
arm64_coder_init(lzma_next_coder *next, const lzma_allocator *allocator,
		const lzma_filter_info *filters, bool is_encoder)
{
	const lzma_ret status = lzma_simple_coder_init(
			next, allocator, filters,
			&arm64_code, 0, 4, 4, is_encoder, true);

	return status;
}


#ifdef HAVE_ENCODER_ARM64
/// \brief      Initialize the ARM64 filter in the encoding direction
///
/// \return     The value returned by arm64_coder_init().
extern lzma_ret
lzma_simple_arm64_encoder_init(lzma_next_coder *next,
		const lzma_allocator *allocator,
		const lzma_filter_info *filters)
{
	const bool is_encoder = true;

	return arm64_coder_init(next, allocator, filters, is_encoder);
}
#endif


#ifdef HAVE_DECODER_ARM64
/// \brief      Initialize the ARM64 filter in the decoding direction
///
/// \return     The value returned by arm64_coder_init().
extern lzma_ret
lzma_simple_arm64_decoder_init(lzma_next_coder *next,
		const lzma_allocator *allocator,
		const lzma_filter_info *filters)
{
	const bool is_encoder = false;

	return arm64_coder_init(next, allocator, filters, is_encoder);
}
#endif