liblzma: Optimize LZ decoder slightly.

Now extra buffer space is reserved so that repeating bytes for any single match will never need to copy from two places (both the beginning and the end of the buffer). This simplifies dict_repeat() and helps a little with speed. This seems to reduce .lzma decompression time about 2 %, so with .xz and CRC it could be slightly less. The small things add up still.
author: Lasse Collin <lasse.collin@tukaani.org> 2024-02-12 17:09:10 +0200
committer: Lasse Collin <lasse.collin@tukaani.org> 2024-02-14 18:31:16 +0200
commit: f3872a59475456c5d365cad9f1c5be514cfa54b5 (patch)
tree: 38070fa3421729f881bb73d6c6053391482523ea /src
parent: liblzma: LZMA decoder: Get rid of next_state[]. (diff)
download: xz-f3872a59475456c5d365cad9f1c5be514cfa54b5.tar.xz
3 files changed, 88 insertions, 60 deletions
diff --git a/src/liblzma/lz/lz_decoder.c b/src/liblzma/lz/lz_decoder.c
index 73bf20d9..92913f22 100644
--- a/src/liblzma/lz/lz_decoder.c
+++ b/src/liblzma/lz/lz_decoder.c
@@ -53,9 +53,10 @@ typedef struct {
 static void
 lz_decoder_reset(lzma_coder *coder)
 {
-	coder->dict.pos = 0;
+	coder->dict.pos = 2 * LZ_DICT_REPEAT_MAX;
 	coder->dict.full = 0;
-	coder->dict.buf[coder->dict.size - 1] = '\0';
+	coder->dict.buf[2 * LZ_DICT_REPEAT_MAX - 1] = '\0';
+	coder->dict.has_wrapped = false;
 	coder->dict.need_reset = false;
 	return;
 }
@@ -69,8 +70,15 @@ decode_buffer(lzma_coder *coder,
 {
 	while (true) {
 		// Wrap the dictionary if needed.
-		if (coder->dict.pos == coder->dict.size)
-			coder->dict.pos = 0;
+		if (coder->dict.pos == coder->dict.size) {
+			// See the comment of #define LZ_DICT_REPEAT_MAX.
+			coder->dict.pos = LZ_DICT_REPEAT_MAX;
+			coder->dict.has_wrapped = true;
+			memcpy(coder->dict.buf, coder->dict.buf
+						+ coder->dict.size
+						- LZ_DICT_REPEAT_MAX,
+					LZ_DICT_REPEAT_MAX);
+		}
 
 		// Store the current dictionary position. It is needed to know
 		// where to start copying to the out[] buffer.
@@ -252,21 +260,31 @@ lzma_lz_decoder_init(lzma_next_coder *next, const lzma_allocator *allocator,
 	// dictionary to the output buffer, since applications are
 	// recommended to give aligned buffers to liblzma.
 	//
+	// Reserve 2 * LZ_DICT_REPEAT_MAX bytes of extra space which is
+	// needed for alloc_size.
+	//
 	// Avoid integer overflow.
-	if (lz_options.dict_size > SIZE_MAX - 15)
+	if (lz_options.dict_size > SIZE_MAX - 15 - 2 * LZ_DICT_REPEAT_MAX)
 		return LZMA_MEM_ERROR;
 
 	lz_options.dict_size = (lz_options.dict_size + 15) & ~((size_t)(15));
 
+	// Reserve extra space as explained in the comment
+	// of #define LZ_DICT_REPEAT_MAX.
+	const size_t alloc_size
+			= lz_options.dict_size + 2 * LZ_DICT_REPEAT_MAX;
+
 	// Allocate and initialize the dictionary.
-	if (coder->dict.size != lz_options.dict_size) {
+	if (coder->dict.size != alloc_size) {
 		lzma_free(coder->dict.buf, allocator);
-		coder->dict.buf
-				= lzma_alloc(lz_options.dict_size, allocator);
+		coder->dict.buf = lzma_alloc(alloc_size, allocator);
 		if (coder->dict.buf == NULL)
 			return LZMA_MEM_ERROR;
 
-		coder->dict.size = lz_options.dict_size;
+		// NOTE: Yes, alloc_size, not lz_options.dict_size. The way
+		// coder->dict.full is updated will take care that we will
+		// still reject distances larger than lz_options.dict_size.
+		coder->dict.size = alloc_size;
 	}
 
 	lz_decoder_reset(next->coder);
@@ -279,9 +297,12 @@ lzma_lz_decoder_init(lzma_next_coder *next, const lzma_allocator *allocator,
 		const size_t copy_size = my_min(lz_options.preset_dict_size,
 				lz_options.dict_size);
 		const size_t offset = lz_options.preset_dict_size - copy_size;
-		memcpy(coder->dict.buf, lz_options.preset_dict + offset,
+		memcpy(coder->dict.buf + coder->dict.pos,
+				lz_options.preset_dict + offset,
 				copy_size);
-		coder->dict.pos = copy_size;
+
+		// dict.pos isn't zero after lz_decoder_reset().
+		coder->dict.pos += copy_size;
 		coder->dict.full = copy_size;
 	}
 
diff --git a/src/liblzma/lz/lz_decoder.h b/src/liblzma/lz/lz_decoder.h
index 3b41649c..da273480 100644
--- a/src/liblzma/lz/lz_decoder.h
+++ b/src/liblzma/lz/lz_decoder.h
@@ -16,10 +16,28 @@
 #include "common.h"
 
 
+/// Maximum length of a match rounded up to a nice power of 2 which is
+/// a good size for aligned memcpy(). The allocated dictionary buffer will
+/// be 2 * LZ_DICT_REPEAT_MAX bytes larger than the actual dictionary size:
+///
+/// (1) Every time the decoder reaches the end of the dictionary buffer,
+///     the last LZ_DICT_REPEAT_MAX bytes will be copied to the beginning.
+///     This way dict_repeat() will only need to copy from one place,
+///     never from both the end and beginning of the buffer.
+///
+/// (2) The other LZ_DICT_REPEAT_MAX bytes is kept as a buffer between
+///     the oldest byte still in the dictionary and the current write
+///     position. This way dict_repeat(dict, dict->size - 1, &len)
+///     won't need memmove() as the copying cannot overlap.
+///
+/// Note that memcpy() still cannot be used if distance < len.
+///
+/// LZMA's longest match length is 273 so pick a multiple of 16 above that.
+#define LZ_DICT_REPEAT_MAX 288
+
+
 typedef struct {
-	/// Pointer to the dictionary buffer. It can be an allocated buffer
-	/// internal to liblzma, or it can a be a buffer given by the
-	/// application when in single-call mode (not implemented yet).
+	/// Pointer to the dictionary buffer.
 	uint8_t *buf;
 
 	/// Write position in dictionary. The next byte will be written to
@@ -34,9 +52,16 @@ typedef struct {
 	/// Write limit
 	size_t limit;
 
-	/// Size of the dictionary
+	/// Allocated size of buf. This is 2 * LZ_DICT_REPEAT_MAX bytes
+	/// larger than the actual dictionary size. This is enforced by
+	/// how the value for "full" is set; it can be at most
+	/// "size - 2 * LZ_DICT_REPEAT_MAX".
 	size_t size;
 
+	/// True once the dictionary has become full and the writing position
+	/// has been wrapped in decode_buffer() in lz_decoder.c.
+	bool has_wrapped;
+
 	/// True when dictionary should be reset before decoding more data.
 	bool need_reset;
 
@@ -102,7 +127,16 @@ static inline uint8_t
 dict_get(const lzma_dict *const dict, const uint32_t distance)
 {
 	return dict->buf[dict->pos - distance - 1
-			+ (distance < dict->pos ? 0 : dict->size)];
+			+ (distance < dict->pos
+				? 0 : dict->size - LZ_DICT_REPEAT_MAX)];
+}
+
+
+/// Optimized version of dict_get(dict, 0)
+static inline uint8_t
+dict_get0(const lzma_dict *const dict)
+{
+	return dict->buf[dict->pos - 1];
 }
 
 
@@ -131,50 +165,27 @@ dict_repeat(lzma_dict *dict, uint32_t distance, uint32_t *len)
 	uint32_t left = my_min(dict_avail, *len);
 	*len -= left;
 
+	size_t back = dict->pos - distance - 1;
+	if (distance >= dict->pos)
+		back += dict->size - LZ_DICT_REPEAT_MAX;
+
 	// Repeat a block of data from the history. Because memcpy() is faster
 	// than copying byte by byte in a loop, the copying process gets split
-	// into three cases.
+	// into two cases.
 	if (distance < left) {
 		// Source and target areas overlap, thus we can't use
 		// memcpy() nor even memmove() safely.
 		do {
-			dict->buf[dict->pos] = dict_get(dict, distance);
-			++dict->pos;
+			dict->buf[dict->pos++] = dict->buf[back++];
 		} while (--left > 0);
-
-	} else if (distance < dict->pos) {
-		// The easiest and fastest case
-		memcpy(dict->buf + dict->pos,
-				dict->buf + dict->pos - distance - 1,
-				left);
-		dict->pos += left;
-
 	} else {
-		// The bigger the dictionary, the more rare this
-		// case occurs. We need to "wrap" the dict, thus
-		// we might need two memcpy() to copy all the data.
-		assert(dict->full == dict->size);
-		const uint32_t copy_pos
-				= dict->pos - distance - 1 + dict->size;
-		uint32_t copy_size = dict->size - copy_pos;
-
-		if (copy_size < left) {
-			memmove(dict->buf + dict->pos, dict->buf + copy_pos,
-					copy_size);
-			dict->pos += copy_size;
-			copy_size = left - copy_size;
-			memcpy(dict->buf + dict->pos, dict->buf, copy_size);
-			dict->pos += copy_size;
-		} else {
-			memmove(dict->buf + dict->pos, dict->buf + copy_pos,
-					left);
-			dict->pos += left;
-		}
+		memcpy(dict->buf + dict->pos, dict->buf + back, left);
+		dict->pos += left;
 	}
 
 	// Update how full the dictionary is.
-	if (dict->full < dict->pos)
-		dict->full = dict->pos;
+	if (!dict->has_wrapped)
+		dict->full = dict->pos - 2 * LZ_DICT_REPEAT_MAX;
 
 	return unlikely(*len != 0);
 }
@@ -185,8 +196,8 @@ dict_put(lzma_dict *dict, uint8_t byte)
 {
 	dict->buf[dict->pos++] = byte;
 
-	if (dict->pos > dict->full)
-		dict->full = dict->pos;
+	if (!dict->has_wrapped)
+		dict->full = dict->pos - 2 * LZ_DICT_REPEAT_MAX;
 }
 
 
@@ -198,11 +209,7 @@ dict_put_safe(lzma_dict *dict, uint8_t byte)
 	if (dict->pos == dict->limit)
 		return true;
 
-	dict->buf[dict->pos++] = byte;
-
-	if (dict->pos > dict->full)
-		dict->full = dict->pos;
-
+	dict_put(dict, byte);
 	return false;
 }
 
@@ -226,8 +233,8 @@ dict_write(lzma_dict *restrict dict, const uint8_t *restrict in,
 	*left -= lzma_bufcpy(in, in_pos, in_size,
 			dict->buf, &dict->pos, dict->limit);
 
-	if (dict->pos > dict->full)
-		dict->full = dict->pos;
+	if (!dict->has_wrapped)
+		dict->full = dict->pos - 2 * LZ_DICT_REPEAT_MAX;
 
 	return;
 }
diff --git a/src/liblzma/lzma/lzma_decoder.c b/src/liblzma/lzma/lzma_decoder.c
index c5049a48..81149006 100644
--- a/src/liblzma/lzma/lzma_decoder.c
+++ b/src/liblzma/lzma/lzma_decoder.c
@@ -360,7 +360,7 @@ lzma_decode(void *coder_ptr, lzma_dict *restrict dictptr,
 			// lc params.
 			probs = literal_subcoder(coder->literal,
 					literal_context_bits, literal_pos_mask,
-					dict.pos, dict_get(&dict, 0));
+					dict.pos, dict_get0(&dict));
 
 			if (is_literal_state(state)) {
 				update_literal_normal(state);
@@ -685,7 +685,7 @@ slow:
 
 			probs = literal_subcoder(coder->literal,
 					literal_context_bits, literal_pos_mask,
-					dict.pos, dict_get(&dict, 0));
+					dict.pos, dict_get0(&dict));
 			symbol = 1;
 
 			if (is_literal_state(state)) {
author	Lasse Collin <lasse.collin@tukaani.org>	2024-02-12 17:09:10 +0200
committer	Lasse Collin <lasse.collin@tukaani.org>	2024-02-14 18:31:16 +0200
commit	f3872a59475456c5d365cad9f1c5be514cfa54b5 (patch)
tree	38070fa3421729f881bb73d6c6053391482523ea /src
parent	liblzma: LZMA decoder: Get rid of next_state[]. (diff)
download	xz-f3872a59475456c5d365cad9f1c5be514cfa54b5.tar.xz