path: root/src/liblzma/simple/arm64.c
author     Lasse Collin <lasse.collin@tukaani.org>  2022-12-01 18:51:52 +0200
committer  Lasse Collin <lasse.collin@tukaani.org>  2022-12-01 18:55:00 +0200
commit     f9ca7d45162664ddd9fb70e19335c2426e5d75bb (patch)
tree       914395592d375d3cec303b678ae53fea5b421585 /src/liblzma/simple/arm64.c
parent     xz: Omit the special notes about ARM64 filter on the man page. (diff)
download   xz-f9ca7d45162664ddd9fb70e19335c2426e5d75bb.tar.xz
liblzma: Omit zero-skipping from ARM64 filter.
The zero-skipping has some complicated downsides and its usefulness is more limited than I originally thought. This change is bad for certain very specific situations, but a generic solution that works for other filters too (and is otherwise better) is planned anyway. It also lets 7-Zip use the same compatible filter for the .7z format. The filter is still marked as experimental and uses a new temporary Filter ID.
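
For context, here is a minimal standalone sketch of the BL conversion as it looks after this commit (the ADRP branch works the same way with different shifts and masks). The function and helper names below are illustrative only and not part of liblzma; the real code lives in the simple-filter framework and uses read32le()/write32le() from tuklib_integer.h.

#include <stdbool.h>
#include <stddef.h>
#include <stdint.h>

/* Illustrative little-endian helpers; liblzma uses read32le()/write32le()
   from tuklib_integer.h instead. */
static uint32_t rd32le(const uint8_t *p)
{
	return (uint32_t)p[0] | ((uint32_t)p[1] << 8)
			| ((uint32_t)p[2] << 16) | ((uint32_t)p[3] << 24);
}

static void wr32le(uint8_t *p, uint32_t v)
{
	p[0] = (uint8_t)v;
	p[1] = (uint8_t)(v >> 8);
	p[2] = (uint8_t)(v >> 16);
	p[3] = (uint8_t)(v >> 24);
}

/* BL branch of the filter after this commit: every BL immediate is
   converted; there is no longer a special case for zero immediates. */
static void bl_convert_sketch(uint8_t *buf, size_t size,
		uint32_t now_pos, bool is_encoder)
{
	for (size_t i = 0; i + 4 <= size; i += 4) {
		uint32_t instr = rd32le(buf + i);
		if ((instr >> 26) != 0x25)
			continue; /* not a BL instruction */

		uint32_t pc = (uint32_t)(now_pos + i) >> 2;
		if (!is_encoder)
			pc = 0U - pc; /* decoding subtracts what encoding added */

		const uint32_t src = instr;
		instr = 0x94000000;
		instr |= (src + pc) & 0x03FFFFFF;
		wr32le(buf + i, instr);
	}
}

Because the decoder negates pc modulo 2^32, the same masked addition undoes the encoder's transform; the diff below shows this arithmetic inlined into the filter loop.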
Diffstat (limited to '')
-rw-r--r--  src/liblzma/simple/arm64.c  81
1 file changed, 23 insertions(+), 58 deletions(-)
diff --git a/src/liblzma/simple/arm64.c b/src/liblzma/simple/arm64.c
index f7ad6578..5e7f2656 100644
--- a/src/liblzma/simple/arm64.c
+++ b/src/liblzma/simple/arm64.c
@@ -6,13 +6,6 @@
/// This converts ARM64 relative addresses in the BL and ADRP immediates
/// to absolute values to increase redundancy of ARM64 code.
///
-/// Unlike the older BCJ filters, this handles zeros specially. This way
-/// the filter won't be counterproductive on Linux kernel modules, object
-/// files, and static libraries where the immediates are all zeros (to be
-/// filled later by a linker). Usually this has no downsides but with bad
-/// luck it can reduce the effectiveness of the filter and trying a different
-/// start offset can mitigate the problem.
-///
/// Converting B or ADR instructions was also tested but it's not useful.
/// A majority of the jumps for the B instruction are very small (+/- 0xFF).
/// These are typical for loops and if-statements. Encoding them to their
@@ -30,20 +23,6 @@
#include "simple_private.h"
-static uint32_t
-arm64_conv(uint32_t src, uint32_t pc, uint32_t mask, bool is_encoder)
-{
- if (!is_encoder)
- pc = 0U - pc;
-
- uint32_t dest = src + pc;
- if ((dest & mask) == 0)
- dest = pc;
-
- return dest;
-}
-
-
static size_t
arm64_code(void *simple lzma_attribute((__unused__)),
uint32_t now_pos, bool is_encoder,
@@ -51,29 +30,15 @@ arm64_code(void *simple lzma_attribute((__unused__)),
{
size_t i;
- // Clang 14.0.6 on x86-64 makes this four times bigger and 60 % slower
+ // Clang 14.0.6 on x86-64 makes this four times bigger and 40 % slower
// with auto-vectorization that is enabled by default with -O2.
- // Even -Os, which doesn't use vectorization, produces faster code.
- // Disabling vectorization with -O2 gives good speed (faster than -Os)
- // and reasonable code size.
- //
// Such vectorization bloat happens with -O2 when targeting ARM64 too
// but performance hasn't been tested.
- //
- // Clang 14 and 15 won't auto-vectorize this loop if the condition
- // for ADRP is replaced with the commented-out version. However,
- // at least Clang 14.0.6 doesn't generate as fast code with that
- // condition. The commented-out code is also bigger.
- //
- // GCC 12.2 on x86-64 with -O2 produces good code with both versions
- // of the ADRP if-statement although the single-branch version is
- // slightly faster and smaller than the commented-out version.
- // Speed is similar to non-vectorized clang -O2.
#ifdef __clang__
# pragma clang loop vectorize(disable)
#endif
for (i = 0; i + 4 <= size; i += 4) {
- const uint32_t pc = (uint32_t)(now_pos + i);
+ uint32_t pc = (uint32_t)(now_pos + i);
uint32_t instr = read32le(buffer + i);
if ((instr >> 26) == 0x25) {
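
The #ifdef'd pragma in the hunk above is the standard way to opt a single loop out of Clang's auto-vectorization while keeping the file compilable with other compilers. A minimal, self-contained illustration of that guard pattern on a stand-in loop (nothing below is from the xz sources):

#include <stddef.h>
#include <stdint.h>

/* Stand-in loop (not the filter) showing how a Clang-only loop pragma is
   guarded. Preprocessor directives disappear before the pragma is matched
   to a loop, so the pragma still immediately precedes the for statement. */
void xor_words(uint8_t *buf, size_t size, uint32_t key)
{
#ifdef __clang__
#	pragma clang loop vectorize(disable)
#endif
	for (size_t i = 0; i + 4 <= size; i += 4) {
		buf[i + 0] ^= (uint8_t)key;
		buf[i + 1] ^= (uint8_t)(key >> 8);
		buf[i + 2] ^= (uint8_t)(key >> 16);
		buf[i + 3] ^= (uint8_t)(key >> 24);
	}
}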
@@ -87,27 +52,17 @@ arm64_code(void *simple lzma_attribute((__unused__)),
// so this is a compromise that slightly favors big
// files. With the full range only six bits of the 32
// need to match to trigger a conversion.
- const uint32_t mask26 = 0x03FFFFFF;
- const uint32_t src = instr & mask26;
+ const uint32_t src = instr;
instr = 0x94000000;
- if (src == 0)
- continue;
+ pc >>= 2;
+ if (!is_encoder)
+ pc = 0U - pc;
- instr |= arm64_conv(src, pc >> 2, mask26, is_encoder)
- & mask26;
+ instr |= (src + pc) & 0x03FFFFFF;
write32le(buffer + i, instr);
-/*
- // This is a more readable version of the one below but this
- // has two branches. It results in bigger and slower code.
- } else if ((instr & 0x9FF00000) == 0x90000000
- || (instr & 0x9FF00000) == 0x90F00000) {
-*/
- // This is only a rotation, addition, and testing that
- // none of the bits covered by the bitmask are set.
- } else if (((((instr << 8) | (instr >> 24))
- + (0x10000000 - 0x90)) & 0xE000009F) == 0) {
+ } else if ((instr & 0x9F000000) == 0x90000000) {
// ADRP instruction:
// Only values in the range +/-512 MiB are converted.
//
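
Before the ADRP details continue below, a note on the encoder/decoder symmetry used in the BL hunk above: the decoder negates pc modulo 2^32, so the low 26 bits of (src + pc) + (0U - pc) equal the low 26 bits of src and the same masked addition undoes the conversion. A tiny illustrative self-check (not from the xz sources):

#include <assert.h>
#include <stdbool.h>
#include <stdint.h>

/* Same arithmetic as the BL branch in the hunk above, applied to a
   single instruction word. pc is the byte offset of the instruction. */
static uint32_t bl_conv(uint32_t instr, uint32_t pc, bool is_encoder)
{
	pc >>= 2;
	if (!is_encoder)
		pc = 0U - pc;
	return 0x94000000 | ((instr + pc) & 0x03FFFFFF);
}

int main(void)
{
	const uint32_t orig = 0x94000000 | 0x00ABCDEF; /* BL, some immediate */
	const uint32_t pc = 0x1234;

	const uint32_t enc = bl_conv(orig, pc, true);
	const uint32_t dec = bl_conv(enc, pc, false);
	assert(dec == orig); /* the transform round-trips */
	return 0;
}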
@@ -120,15 +75,25 @@ arm64_code(void *simple lzma_attribute((__unused__)),
// range, nine bits of 32 need to match to trigger a
// conversion (two 10-bit match choices = 9 bits).
const uint32_t src = ((instr >> 29) & 3)
- | ((instr >> 3) & 0x0003FFFC);
- instr &= 0x9000001F;
+ | ((instr >> 3) & 0x001FFFFC);
- if (src == 0)
+ // With the addition only one branch is needed to
+ // check the +/- range. This is usually false when
+ // processing ARM64 code so branch prediction will
+ // handle it well in terms of performance.
+ //
+ //if ((src & 0x001E0000) != 0
+ // && (src & 0x001E0000) != 0x001E0000)
+ if ((src + 0x00020000) & 0x001C0000)
continue;
- const uint32_t dest = arm64_conv(
- src, pc >> 12, 0x3FFFF, is_encoder);
+ instr &= 0x9000001F;
+
+ pc >>= 12;
+ if (!is_encoder)
+ pc = 0U - pc;
+ const uint32_t dest = src + pc;
instr |= (dest & 3) << 29;
instr |= (dest & 0x0003FFFC) << 3;
instr |= (0U - (dest & 0x00020000)) & 0x00E00000;
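
The single-branch range test added in this hunk accepts exactly those 21-bit ADRP immediates whose bits 17-20 are all zeros or all ones, i.e. values that fit in 18 bits as a signed number (+/-2^17 pages of 4 KiB = +/-512 MiB). A brute-force check of its equivalence with the commented-out two-branch version (illustrative code, not from the xz tree):

#include <assert.h>
#include <stdint.h>

int main(void)
{
	/* src is the 21-bit ADRP immediate: immlo in bits 0-1, immhi in
	   bits 2-20. Compare the filter's one-branch skip test with the
	   commented-out two-branch version over all possible values. */
	for (uint32_t src = 0; src < (1U << 21); ++src) {
		const int skip_two_branch = (src & 0x001E0000) != 0
				&& (src & 0x001E0000) != 0x001E0000;
		const int skip_one_branch =
				((src + 0x00020000) & 0x001C0000) != 0;
		assert(skip_two_branch == skip_one_branch);
	}
	return 0;
}

As the comment in the hunk notes, the skip condition is usually false for real ARM64 code, so the single branch keeps the hot loop small and predicts well.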