1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
|
///////////////////////////////////////////////////////////////////////////////
//
/// \file arm64.c
/// \brief Filter for ARM64 binaries
///
/// This converts ARM64 relative addresses in the BL and ADRP immediates
/// to absolute values to increase redundancy of ARM64 code.
///
/// Converting B or ADR instructions was also tested but it's not useful.
/// A majority of the jumps for the B instruction are very small (+/- 0xFF).
/// These are typical for loops and if-statements. Encoding them to their
/// absolute address reduces redundancy since many of the small relative
/// jump values are repeated, but very few of the absolute addresses are.
//
//  Authors:    Lasse Collin
//              Jia Tan
//              Igor Pavlov
//
// This file has been put into the public domain.
// You can do whatever you want with this file.
//
///////////////////////////////////////////////////////////////////////////////
#include "simple_private.h"
/// \brief      Convert BL and ADRP immediates in place
///
/// The buffer is walked one 32-bit little-endian word at a time. Words
/// that look like a BL instruction, or like an ADRP instruction whose
/// immediate is within the accepted range, get the (shifted) address of
/// the word added to their immediate when encoding and subtracted from
/// it when decoding. All other words are left untouched.
///
/// \param      simple      Unused
/// \param      now_pos     Value added to each buffer offset to form the
///                         address of the word; the sum is truncated
///                         to 32 bits
/// \param      is_encoder  true to add the address to the immediate,
///                         false to subtract it
/// \param      buffer      Data to be converted in place
/// \param      size        Number of bytes available in buffer
///
/// \return     Number of bytes processed. This is the largest multiple
///             of four that is not greater than size.
static size_t
arm64_code(void *simple lzma_attribute((__unused__)),
		uint32_t now_pos, bool is_encoder,
		uint8_t *buffer, size_t size)
{
	size_t pos;

	// Auto-vectorization is switched off for this loop on purpose:
	// Clang 14.0.6 on x86-64 with -O2 (where it is enabled by default)
	// makes the code four times bigger and 40 % slower. The same bloat
	// is seen with -O2 when targeting ARM64 but the speed effect there
	// hasn't been measured.
#ifdef __clang__
#	pragma clang loop vectorize(disable)
#endif
	for (pos = 0; pos + 4 <= size; pos += 4) {
		const uint32_t addr = (uint32_t)(now_pos + pos);
		const uint32_t word = read32le(buffer + pos);

		if ((word >> 26) == 0x25) {
			// BL instruction:
			// The whole 26-bit immediate (+/-128 MiB) is
			// converted.
			//
			// The full range helps quite a lot with big
			// executables. A smaller range would give fewer
			// false positives in non-code sections of the input,
			// so this is a compromise that slightly favors big
			// files. With the full range only six of the 32 bits
			// have to match to trigger a conversion.
			uint32_t delta = addr >> 2;
			if (!is_encoder)
				delta = 0U - delta;

			uint32_t bl = 0x94000000;
			bl |= (word + delta) & 0x03FFFFFF;
			write32le(buffer + pos, bl);
			continue;
		}

		if ((word & 0x9F000000) != 0x90000000)
			continue;

		// ADRP instruction:
		// Only immediates in the range +/-512 MiB are converted.
		//
		// Not using the full +/-4 GiB range reduces false positives
		// in non-code sections of the input while still being
		// excellent for executables up to 512 MiB. ADRP conversion
		// helps less than BL conversion but it also hurts less in
		// non-code sections because, with the +/-512 MiB range,
		// nine bits of 32 have to match to trigger a conversion
		// (two 10-bit match choices = 9 bits).
		const uint32_t imm = ((word >> 29) & 3)
				| ((word >> 3) & 0x001FFFFC);

		// Range check with a single branch thanks to the addition.
		// It is equivalent to testing that (imm & 0x001E0000) is
		// neither 0 nor 0x001E0000. With real ARM64 code the skip
		// is rarely taken so the branch predicts well.
		if ((imm + 0x00020000) & 0x001C0000)
			continue;

		uint32_t page = addr >> 12;
		if (!is_encoder)
			page = 0U - page;

		const uint32_t target = imm + page;

		// Keep the opcode bits and the low five bits, then
		// scatter the new immediate back into its fields. The last
		// step copies bit 17 of the result into bits 21-23.
		uint32_t adrp = word & 0x9000001F;
		adrp |= (target & 3) << 29;
		adrp |= (target & 0x0003FFFC) << 3;
		adrp |= (0U - (target & 0x00020000)) & 0x00E00000;
		write32le(buffer + pos, adrp);
	}

	return pos;
}
/// \brief      Common initialization for the ARM64 encoder and decoder
///
/// Hands arm64_code() and the caller's arguments over to
/// lzma_simple_coder_init() and returns whatever that function returns.
///
/// \param      next        Passed through to lzma_simple_coder_init()
/// \param      allocator   Passed through to lzma_simple_coder_init()
/// \param      filters     Passed through to lzma_simple_coder_init()
/// \param      is_encoder  Passed through to lzma_simple_coder_init();
///                         the public wrappers in this file pass true
///                         for encoding and false for decoding
static lzma_ret
arm64_coder_init(lzma_next_coder *next, const lzma_allocator *allocator,
		const lzma_filter_info *filters, bool is_encoder)
{
	// NOTE(review): the literals 0, 4, 4 are presumably the size of
	// filter-specific state and the 4-byte instruction size/alignment;
	// confirm against the prototype in simple_private.h.
	const lzma_ret ret = lzma_simple_coder_init(next, allocator, filters,
			arm64_code, 0, 4, 4, is_encoder);

	return ret;
}
#ifdef HAVE_ENCODER_ARM64
/// \brief      Initialize the ARM64 filter for encoding
///
/// Only built when HAVE_ENCODER_ARM64 is defined. Calls
/// arm64_coder_init() with is_encoder set to true and returns its result.
extern lzma_ret
lzma_simple_arm64_encoder_init(lzma_next_coder *next,
		const lzma_allocator *allocator,
		const lzma_filter_info *filters)
{
	const bool encoding = true;

	return arm64_coder_init(next, allocator, filters, encoding);
}
#endif
#ifdef HAVE_DECODER_ARM64
/// \brief      Initialize the ARM64 filter for decoding
///
/// Only built when HAVE_DECODER_ARM64 is defined. Calls
/// arm64_coder_init() with is_encoder set to false and returns its result.
extern lzma_ret
lzma_simple_arm64_decoder_init(lzma_next_coder *next,
		const lzma_allocator *allocator,
		const lzma_filter_info *filters)
{
	const bool encoding = false;

	return arm64_coder_init(next, allocator, filters, encoding);
}
#endif
|