liblzma: Creates Non-resumable and Resumable modes for lzma_decoder.

The new decoder resumes the first decoder loop in the Resumable mode. Then, the code executes in Non-resumable mode until it detects that it cannot guarantee to have enough input/output to decode another symbol. The Resumable mode is how the decoder has always worked. Before decoding every input bit, it checks if there is enough space and will save its location to be resumed later. When the decoder has more input/output, it jumps back to the correct sequence in the Resumable mode code. When the input/output buffers are large, the Resumable mode is much slower than the Non-resumable because it has more branches and is harder for the compiler to optimize since it is in a large switch block. Early benchmarking shows significant time improvement (8-10% on gcc and clang x86) by using the Non-resumable code as much as possible.
author: Jia Tan <jiat0218@gmail.com> 2024-02-12 17:09:10 +0200
committer: Lasse Collin <lasse.collin@tukaani.org> 2024-02-14 18:31:16 +0200
commit: de5c5e417645ad8906ef914bc059d08c1462fc29 (patch)
tree: 62672e60baf3f24030c402f5b3364671a2b81dee /src
parent: liblzma: Creates separate "safe" range decoder mode. (diff)
download: xz-de5c5e417645ad8906ef914bc059d08c1462fc29.tar.xz
2 files changed, 521 insertions, 213 deletions
diff --git a/src/liblzma/lz/lz_decoder.h b/src/liblzma/lz/lz_decoder.h
index 65acf0b1..3b41649c 100644
--- a/src/liblzma/lz/lz_decoder.h
+++ b/src/liblzma/lz/lz_decoder.h
@@ -180,12 +180,22 @@ dict_repeat(lzma_dict *dict, uint32_t distance, uint32_t *len)
 }
 
 
+static inline void
+dict_put(lzma_dict *dict, uint8_t byte)
+{
+	dict->buf[dict->pos++] = byte;
+
+	if (dict->pos > dict->full)
+		dict->full = dict->pos;
+}
+
+
 /// Puts one byte into the dictionary. Returns true if the dictionary was
 /// already full and the byte couldn't be added.
 static inline bool
-dict_put(lzma_dict *dict, uint8_t byte)
+dict_put_safe(lzma_dict *dict, uint8_t byte)
 {
-	if (unlikely(dict->pos == dict->limit))
+	if (dict->pos == dict->limit)
 		return true;
 
 	dict->buf[dict->pos++] = byte;
diff --git a/src/liblzma/lzma/lzma_decoder.c b/src/liblzma/lzma/lzma_decoder.c
index 2e8393d6..6203bae9 100644
--- a/src/liblzma/lzma/lzma_decoder.c
+++ b/src/liblzma/lzma/lzma_decoder.c
@@ -7,6 +7,7 @@
 ///
 //  Authors:    Igor Pavlov
 //              Lasse Collin
+//              Jia Tan
 //
 ///////////////////////////////////////////////////////////////////////////////
 
@@ -21,8 +22,12 @@
 #	pragma GCC diagnostic ignored "-Wimplicit-fallthrough"
 #endif
 
+// Minimum number of input bytes to safely decode one LZMA symbol.
+// The worst case is that we decode 22 bits using probabilities and 26
+// direct bits. This may decode at maximum 20 bytes of input plus one
+// extra byte after the final EOPM normalization.
+#define LZMA_IN_REQUIRED 21
 
-#ifdef HAVE_SMALL
 
 // Macros for (somewhat) size-optimized code.
 // This is used to decode the match length (how many bytes must be repeated
@@ -193,22 +198,26 @@ typedef struct {
 	enum {
 		SEQ_NORMALIZE,
 		SEQ_IS_MATCH,
-		seq_8(SEQ_LITERAL),
-		seq_8(SEQ_LITERAL_MATCHED),
+		SEQ_LITERAL,
+		SEQ_LITERAL_MATCHED,
 		SEQ_LITERAL_WRITE,
 		SEQ_IS_REP,
-		seq_len(SEQ_MATCH_LEN),
-		seq_6(SEQ_DIST_SLOT),
+		SEQ_MATCH_LEN_CHOICE,
+		SEQ_MATCH_LEN_CHOICE2,
+		SEQ_MATCH_LEN_BITTREE,
+		SEQ_DIST_SLOT,
 		SEQ_DIST_MODEL,
 		SEQ_DIRECT,
-		seq_4(SEQ_ALIGN),
+		SEQ_ALIGN,
 		SEQ_EOPM,
 		SEQ_IS_REP0,
 		SEQ_SHORTREP,
 		SEQ_IS_REP0_LONG,
 		SEQ_IS_REP1,
 		SEQ_IS_REP2,
-		seq_len(SEQ_REP_LEN),
+		SEQ_REP_LEN_CHOICE,
+		SEQ_REP_LEN_CHOICE2,
+		SEQ_REP_LEN_BITTREE,
 		SEQ_COPY,
 	} sequence;
 
@@ -309,8 +318,42 @@ lzma_decode(void *coder_ptr, lzma_dict *restrict dictptr,
 		might_finish_without_eopm = true;
 	}
 
-	// The main decoder loop. The "switch" is used to restart the decoder at
-	// correct location. Once restarted, the "switch" is no longer used.
+	// Lookup table used to update the literal state.
+	// Compared to other state updates, this would need two branches.
+	// The lookup table is used by both Resumable and Non-resumable modes.
+	static const lzma_lzma_state next_state[] = {
+		STATE_LIT_LIT,
+		STATE_LIT_LIT,
+		STATE_LIT_LIT,
+		STATE_LIT_LIT,
+		STATE_MATCH_LIT_LIT,
+		STATE_REP_LIT_LIT,
+		STATE_SHORTREP_LIT_LIT,
+		STATE_MATCH_LIT,
+		STATE_REP_LIT,
+		STATE_SHORTREP_LIT,
+		STATE_MATCH_LIT,
+		STATE_REP_LIT
+	};
+
+	// The main decoder loop. The "switch" is used to resume the decoder at
+	// correct location. Once resumed, the "switch" is no longer used.
+	// The decoder loops is split into two modes:
+	//
+	// 1 - Non-resumable mode (fast). This is used when it is guaranteed
+	//     there is enough input to decode the next symbol. If the output
+	//     limit is reached, then the decoder loop will save the place
+	//     for the resumable mode to continue. This mode is not used if
+	//     HAVE_SMALL is defined. This is faster than Resumable mode
+	//     because it reduces the number of branches needed and allows
+	//     for more compiler optimizations.
+	//
+	// 2 - Resumable mode (slow). This is used when a previous decoder
+	//     loop did not have enough space in the input or output buffers
+	//     to complete. It uses sequence enum values to set remind
+	//     coder->sequence where to resume in the decoder loop. This
+	//     is the only mode used when HAVE_SMALL is defined.
+
 	switch (coder->sequence)
 	while (true) {
 		// Calculate new pos_state. This is skipped on the first loop
@@ -318,40 +361,32 @@ lzma_decode(void *coder_ptr, lzma_dict *restrict dictptr,
 		// variables.
 		pos_state = dict.pos & pos_mask;
 
-	case SEQ_NORMALIZE:
-	case SEQ_IS_MATCH:
-		if (unlikely(might_finish_without_eopm
-				&& dict.pos == dict.limit)) {
-			// In rare cases there is a useless byte that needs
-			// to be read anyway.
-			rc_normalize(SEQ_NORMALIZE);
+#ifndef HAVE_SMALL
 
-			// If the range decoder state is such that we can
-			// be at the end of the LZMA stream, then the
-			// decoding is finished.
-			if (rc_is_finished(rc)) {
-				ret = LZMA_STREAM_END;
-				goto out;
-			}
+		///////////////////////////////
+		// Non-resumable Mode (fast) //
+		///////////////////////////////
 
-			// If the caller hasn't allowed EOPM to be present
-			// together with known uncompressed size, then the
-			// LZMA stream is corrupt.
-			if (!coder->allow_eopm) {
-				ret = LZMA_DATA_ERROR;
-				goto out;
-			}
+		// If there is not enough room for another LZMA symbol
+		// go to Resumable mode.
+		if (rc_in_pos + LZMA_IN_REQUIRED > in_size
+			|| dict.pos == dict.limit)
+			goto slow;
 
-			// Otherwise continue decoding with the expectation
-			// that the next LZMA symbol is EOPM.
-			eopm_is_valid = true;
-		}
+		// Decode the first bit from the next LZMA symbol.
+		// If the bit is a 0, then we handle it as a literal.
+		// If the bit is a 1, then it is a match of previously
+		// decoded data.
+		rc_if_0(coder->is_match[state][pos_state]) {
+			/////////////////////
+			// Decode literal. //
+			/////////////////////
 
-		rc_if_0(coder->is_match[state][pos_state], SEQ_IS_MATCH) {
+			// Update the RC that we have decoded a 0.
 			rc_update_0(coder->is_match[state][pos_state]);
 
-			// It's a literal i.e. a single 8-bit byte.
-
+			// Get the correct probability array from lp and
+			// lc params.
 			probs = literal_subcoder(coder->literal,
 					literal_context_bits, literal_pos_mask,
 					dict.pos, dict_get(&dict, 0));
@@ -359,21 +394,17 @@ lzma_decode(void *coder_ptr, lzma_dict *restrict dictptr,
 
 			if (is_literal_state(state)) {
 				// Decode literal without match byte.
-#ifdef HAVE_SMALL
-	case SEQ_LITERAL:
-				do {
-					rc_bit(probs[symbol], , , SEQ_LITERAL);
-				} while (symbol < (1 << 8));
-#else
-				rc_bit_case(probs[symbol], , , SEQ_LITERAL0);
-				rc_bit_case(probs[symbol], , , SEQ_LITERAL1);
-				rc_bit_case(probs[symbol], , , SEQ_LITERAL2);
-				rc_bit_case(probs[symbol], , , SEQ_LITERAL3);
-				rc_bit_case(probs[symbol], , , SEQ_LITERAL4);
-				rc_bit_case(probs[symbol], , , SEQ_LITERAL5);
-				rc_bit_case(probs[symbol], , , SEQ_LITERAL6);
-				rc_bit_case(probs[symbol], , , SEQ_LITERAL7);
-#endif
+				// We need to decode 8 bits, so instead
+				// of looping from 1 - 8, we unroll the
+				// loop for a speed optimization.
+				rc_bit(probs[symbol], , );
+				rc_bit(probs[symbol], , );
+				rc_bit(probs[symbol], , );
+				rc_bit(probs[symbol], , );
+				rc_bit(probs[symbol], , );
+				rc_bit(probs[symbol], , );
+				rc_bit(probs[symbol], , );
+				rc_bit(probs[symbol], , );
 			} else {
 				// Decode literal with match byte.
 				//
@@ -381,6 +412,7 @@ lzma_decode(void *coder_ptr, lzma_dict *restrict dictptr,
 				// ("match byte") to "len" to minimize the
 				// number of variables we need to store
 				// between decoder calls.
+
 				len = (uint32_t)(dict_get(&dict, rep0)) << 1;
 
 				// The usage of "offset" allows omitting some
@@ -389,99 +421,68 @@ lzma_decode(void *coder_ptr, lzma_dict *restrict dictptr,
 				// set to zero if match_bit didn't match.
 				offset = 0x100;
 
-#ifdef HAVE_SMALL
-	case SEQ_LITERAL_MATCHED:
-				do {
-					const uint32_t match_bit
-							= len & offset;
-					const uint32_t subcoder_index
-							= offset + match_bit
-							+ symbol;
-
-					rc_bit(probs[subcoder_index],
-							offset &= ~match_bit,
-							offset &= match_bit,
-							SEQ_LITERAL_MATCHED);
-
-					// It seems to be faster to do this
-					// here instead of putting it to the
-					// beginning of the loop and then
-					// putting the "case" in the middle
-					// of the loop.
-					len <<= 1;
-
-				} while (symbol < (1 << 8));
-#else
 				// Unroll the loop.
 				uint32_t match_bit;
 				uint32_t subcoder_index;
 
-#	define d(seq) \
-		case seq: \
+#	define decode_with_match_bit \
 			match_bit = len & offset; \
 			subcoder_index = offset + match_bit + symbol; \
 			rc_bit(probs[subcoder_index], \
 					offset &= ~match_bit, \
-					offset &= match_bit, \
-					seq)
+					offset &= match_bit)
 
-				d(SEQ_LITERAL_MATCHED0);
+				decode_with_match_bit;
 				len <<= 1;
-				d(SEQ_LITERAL_MATCHED1);
+				decode_with_match_bit;
 				len <<= 1;
-				d(SEQ_LITERAL_MATCHED2);
+				decode_with_match_bit;
 				len <<= 1;
-				d(SEQ_LITERAL_MATCHED3);
+				decode_with_match_bit;
 				len <<= 1;
-				d(SEQ_LITERAL_MATCHED4);
+				decode_with_match_bit;
 				len <<= 1;
-				d(SEQ_LITERAL_MATCHED5);
+				decode_with_match_bit;
 				len <<= 1;
-				d(SEQ_LITERAL_MATCHED6);
+				decode_with_match_bit;
 				len <<= 1;
-				d(SEQ_LITERAL_MATCHED7);
-#	undef d
-#endif
+				decode_with_match_bit;
+#	undef decode_match_bit
 			}
 
-			//update_literal(state);
-			// Use a lookup table to update to literal state,
-			// since compared to other state updates, this would
-			// need two branches.
-			static const lzma_lzma_state next_state[] = {
-				STATE_LIT_LIT,
-				STATE_LIT_LIT,
-				STATE_LIT_LIT,
-				STATE_LIT_LIT,
-				STATE_MATCH_LIT_LIT,
-				STATE_REP_LIT_LIT,
-				STATE_SHORTREP_LIT_LIT,
-				STATE_MATCH_LIT,
-				STATE_REP_LIT,
-				STATE_SHORTREP_LIT,
-				STATE_MATCH_LIT,
-				STATE_REP_LIT
-			};
 			state = next_state[state];
 
-	case SEQ_LITERAL_WRITE:
-			if (unlikely(dict_put(&dict, symbol))) {
-				coder->sequence = SEQ_LITERAL_WRITE;
-				goto out;
-			}
-
+			// Write decoded literal to dictionary
+			dict_put(&dict, symbol);
 			continue;
 		}
 
-		// Instead of a new byte we are going to get a byte range
-		// (distance and length) which will be repeated from our
-		// output history.
+		///////////////////
+		// Decode match. //
+		///////////////////
+
+		// Instead of a new byte we are going to decode a
+		// distance-length pair. The distance represents how far
+		// back in the dictionary to begin copying. The length
+		// represents how many bytes to copy.
 
 		rc_update_1(coder->is_match[state][pos_state]);
 
-	case SEQ_IS_REP:
-		rc_if_0(coder->is_rep[state], SEQ_IS_REP) {
-			// Not a repeated match
+		rc_if_0(coder->is_rep[state]) {
+			///////////////////
+			// Simple match. //
+			///////////////////
+
+			// Not a repeated match. In this case,
+			// the length (how many bytes to copy) must be
+			// decoded first. Then, the distance (where to
+			// start copying) is decoded.
+			//
+			// This is also how we know when we are done
+			// decoding. If the distance decodes to UINT32_MAX,
+			// then we know to stop decoding (end of payload
+			// marker).
+
 			rc_update_0(coder->is_rep[state]);
 			update_match(state);
 
@@ -492,45 +493,50 @@ lzma_decode(void *coder_ptr, lzma_dict *restrict dictptr,
 			rep1 = rep0;
 
 			// Decode the length of the match.
-			len_decode(len, coder->match_len_decoder,
-					pos_state, SEQ_MATCH_LEN);
+			len_decode_fast(len, coder->match_len_decoder,
+					pos_state);
+
+			// Next, decode the distance into rep0.
 
-			// Prepare to decode the highest two bits of the
-			// match distance.
+			// The next 6 bits determine how to decode the
+			// rest of the distance.
 			probs = coder->dist_slot[get_dist_state(len)];
 			symbol = 1;
 
-#ifdef HAVE_SMALL
-	case SEQ_DIST_SLOT:
-			do {
-				rc_bit(probs[symbol], , , SEQ_DIST_SLOT);
-			} while (symbol < DIST_SLOTS);
-#else
-			rc_bit_case(probs[symbol], , , SEQ_DIST_SLOT0);
-			rc_bit_case(probs[symbol], , , SEQ_DIST_SLOT1);
-			rc_bit_case(probs[symbol], , , SEQ_DIST_SLOT2);
-			rc_bit_case(probs[symbol], , , SEQ_DIST_SLOT3);
-			rc_bit_case(probs[symbol], , , SEQ_DIST_SLOT4);
-			rc_bit_case(probs[symbol], , , SEQ_DIST_SLOT5);
-#endif
+			rc_bit(probs[symbol], , );
+			rc_bit(probs[symbol], , );
+			rc_bit(probs[symbol], , );
+			rc_bit(probs[symbol], , );
+			rc_bit(probs[symbol], , );
+			rc_bit(probs[symbol], , );
+
 			// Get rid of the highest bit that was needed for
 			// indexing of the probability array.
 			symbol -= DIST_SLOTS;
 			assert(symbol <= 63);
 
 			if (symbol < DIST_MODEL_START) {
-				// Match distances [0, 3] have only two bits.
+				// If the decoded symbol is < DIST_MODEL_START
+				// then we use its value directly as the
+				// match distance. No other bits are needed.
+				// The only possible distance values
+				// are [0, 3].
 				rep0 = symbol;
 			} else {
-				// Decode the lowest [1, 29] bits of
-				// the match distance.
+				// Use the first two bits of symbol as the
+				// highest bits of the match distance.
+
+				// "limit" represents the number of low bits
+				// to decode.
 				limit = (symbol >> 1) - 1;
 				assert(limit >= 1 && limit <= 30);
 				rep0 = 2 + (symbol & 1);
 
 				if (symbol < DIST_MODEL_END) {
-					// Prepare to decode the low bits for
-					// a distance of [4, 127].
+					// When symbol is > DIST_MODEL_START,
+					// but symbol < DIST_MODEL_END, then
+					// it can decode distances between
+					// [4, 127].
 					assert(limit <= 5);
 					rep0 <<= limit;
 					assert(rep0 <= 96);
@@ -548,52 +554,39 @@ lzma_decode(void *coder_ptr, lzma_dict *restrict dictptr,
 							- symbol - 1;
 					symbol = 1;
 					offset = 0;
-	case SEQ_DIST_MODEL:
-#ifdef HAVE_SMALL
-					do {
-						rc_bit(probs[symbol], ,
-							rep0 += 1U << offset,
-							SEQ_DIST_MODEL);
-					} while (++offset < limit);
-#else
+
 					switch (limit) {
 					case 5:
 						assert(offset == 0);
 						rc_bit(probs[symbol], ,
-							rep0 += 1U,
-							SEQ_DIST_MODEL);
+							rep0 += 1U);
 						++offset;
 						--limit;
 					case 4:
 						rc_bit(probs[symbol], ,
-							rep0 += 1U << offset,
-							SEQ_DIST_MODEL);
+							rep0 += 1U << offset);
 						++offset;
 						--limit;
 					case 3:
 						rc_bit(probs[symbol], ,
-							rep0 += 1U << offset,
-							SEQ_DIST_MODEL);
+							rep0 += 1U << offset);
 						++offset;
 						--limit;
 					case 2:
 						rc_bit(probs[symbol], ,
-							rep0 += 1U << offset,
-							SEQ_DIST_MODEL);
+							rep0 += 1U << offset);
 						++offset;
 						--limit;
 					case 1:
 						// We need "symbol" only for
 						// indexing the probability
 						// array, thus we can use
-						// rc_bit_last() here to omit
-						// the unneeded updating of
-						// "symbol".
+						// rc_bit_last() here to
+						// omit the unneeded updating
+						// of "symbol".
 						rc_bit_last(probs[symbol], ,
-							rep0 += 1U << offset,
-							SEQ_DIST_MODEL);
+							rep0 += 1U << offset);
 					}
-#endif
 				} else {
 					// The distance is >= 128. Decode the
 					// lower bits without probabilities
@@ -602,44 +595,39 @@ lzma_decode(void *coder_ptr, lzma_dict *restrict dictptr,
 					assert(limit >= 6);
 					limit -= ALIGN_BITS;
 					assert(limit >= 2);
-	case SEQ_DIRECT:
+
 					// Not worth manual unrolling
 					do {
-						rc_direct(rep0, SEQ_DIRECT);
+						rc_direct(rep0);
 					} while (--limit > 0);
 
 					// Decode the lowest four bits using
 					// probabilities.
 					rep0 <<= ALIGN_BITS;
 					symbol = 1;
-#ifdef HAVE_SMALL
-					offset = 0;
-	case SEQ_ALIGN:
-					do {
-						rc_bit(coder->pos_align[
-								symbol], ,
-							rep0 += 1U << offset,
-							SEQ_ALIGN);
-					} while (++offset < ALIGN_BITS);
-#else
-	case SEQ_ALIGN0:
+
 					rc_bit(coder->pos_align[symbol], ,
-							rep0 += 1, SEQ_ALIGN0);
-	case SEQ_ALIGN1:
+							rep0 += 1);
+
 					rc_bit(coder->pos_align[symbol], ,
-							rep0 += 2, SEQ_ALIGN1);
-	case SEQ_ALIGN2:
+							rep0 += 2);
+
 					rc_bit(coder->pos_align[symbol], ,
-							rep0 += 4, SEQ_ALIGN2);
-	case SEQ_ALIGN3:
-					// Like in SEQ_DIST_MODEL, we don't
-					// need "symbol" for anything else
-					// than indexing the probability array.
-					rc_bit_last(coder->pos_align[symbol], ,
-							rep0 += 8, SEQ_ALIGN3);
-#endif
+							rep0 += 4);
+
+					// Like when distance [4, 127], we
+					// don't need "symbol" for anything
+					// other than indexing the probability
+					// array.
+					rc_bit_last(
+						coder->pos_align[symbol], ,
+						rep0 += 8);
 
 					if (rep0 == UINT32_MAX) {
+						///////////////////////////
+						// End of payload marker //
+						///////////////////////////
+
 						// End of payload marker was
 						// found. It may only be
 						// present if
@@ -657,10 +645,9 @@ lzma_decode(void *coder_ptr, lzma_dict *restrict dictptr,
 							goto out;
 						}
 
-	case SEQ_EOPM:
 						// LZMA1 stream with
 						// end-of-payload marker.
-						rc_normalize(SEQ_EOPM);
+						rc_normalize();
 						ret = rc_is_finished(rc)
 							? LZMA_STREAM_END
 							: LZMA_DATA_ERROR;
@@ -678,10 +665,12 @@ lzma_decode(void *coder_ptr, lzma_dict *restrict dictptr,
 		} else {
 			rc_update_1(coder->is_rep[state]);
 
-			// Repeated match
-			//
-			// The match distance is a value that we have had
-			// earlier. The latest four match distances are
+			/////////////////////
+			// Repeated match. //
+			/////////////////////
+
+			// The match distance is a value that we have decoded
+			// recently. The latest four match distances are
 			// available as rep0, rep1, rep2 and rep3. We will
 			// now decode which of them is the new distance.
 			//
@@ -692,13 +681,331 @@ lzma_decode(void *coder_ptr, lzma_dict *restrict dictptr,
 				goto out;
 			}
 
-	case SEQ_IS_REP0:
-			rc_if_0(coder->is_rep0[state], SEQ_IS_REP0) {
+			rc_if_0(coder->is_rep0[state]) {
 				rc_update_0(coder->is_rep0[state]);
 				// The distance is rep0.
 
+				// Decode the next bit to determine if 1 byte
+				// should be copied from rep0 distance or
+				// if the number of bytes needs to be decoded.
+
+				// If the next bit is 0, then it is a
+				// "Short Rep Match" and only 1 bit is copied.
+				// Otherwise, the length of the match is
+				// decoded after the "else" statement.
+				rc_if_0(coder->is_rep0_long[state][pos_state]) {
+					rc_update_0(coder->is_rep0_long[
+							state][pos_state]);
+
+					update_short_rep(state);
+					dict_put(&dict, dict_get(&dict, rep0));
+					continue;
+				}
+
+				// Repeating more than one byte at
+				// distance of rep0.
+				rc_update_1(coder->is_rep0_long[
+						state][pos_state]);
+
+			} else {
+				rc_update_1(coder->is_rep0[state]);
+
+				// The distance is rep1, rep2 or rep3. Once
+				// we find out which one of these three, it
+				// is stored to rep0 and rep1, rep2 and rep3
+				// are updated accordingly. There is no
+				// "Short Rep Match" option, so the length
+				// of the match must always be decoded next.
+				rc_if_0(coder->is_rep1[state]) {
+					// The distance is rep1.
+					rc_update_0(coder->is_rep1[state]);
+
+					const uint32_t distance = rep1;
+					rep1 = rep0;
+					rep0 = distance;
+
+				} else {
+					rc_update_1(coder->is_rep1[state]);
+
+					rc_if_0(coder->is_rep2[state]) {
+						// The distance is rep2.
+						rc_update_0(coder->is_rep2[
+								state]);
+
+						const uint32_t distance = rep2;
+						rep2 = rep1;
+						rep1 = rep0;
+						rep0 = distance;
+
+					} else {
+						// The distance is rep3.
+						rc_update_1(coder->is_rep2[
+								state]);
+
+						const uint32_t distance = rep3;
+						rep3 = rep2;
+						rep2 = rep1;
+						rep1 = rep0;
+						rep0 = distance;
+					}
+				}
+			}
+
+			update_long_rep(state);
+
+			// Decode the length of the repeated match.
+			len_decode_fast(len, coder->rep_len_decoder,
+					pos_state);
+		}
+
+		/////////////////////////////////
+		// Repeat from history buffer. //
+		/////////////////////////////////
+
+		// The length is always between these limits. There is no way
+		// to trigger the algorithm to set len outside this range.
+		assert(len >= MATCH_LEN_MIN);
+		assert(len <= MATCH_LEN_MAX);
+
+		// Repeat len bytes from distance of rep0.
+		if (unlikely(dict_repeat(&dict, rep0, &len))) {
+			coder->sequence = SEQ_COPY;
+			goto out;
+		}
+
+		continue;
+
+slow:
+#endif
+	///////////////////////////
+	// Resumable Mode (slow) //
+	///////////////////////////
+
+	// This is very similar to Non-resumable Mode, so most of the
+	// comments are not repeated. The main differences are:
+	// - case labels are used to resume at the correct location.
+	// - Loops are not unrolled.
+	// - Range coder macros take an extra sequence argument
+	//   so they can save to coder->sequence the location to
+	//   resume in case there is not enough input.
+	case SEQ_NORMALIZE:
+	case SEQ_IS_MATCH:
+		if (unlikely(might_finish_without_eopm
+				&& dict.pos == dict.limit)) {
+			// In rare cases there is a useless byte that needs
+			// to be read anyway.
+			rc_normalize_safe(SEQ_NORMALIZE);
+
+			// If the range decoder state is such that we can
+			// be at the end of the LZMA stream, then the
+			// decoding is finished.
+			if (rc_is_finished(rc)) {
+				ret = LZMA_STREAM_END;
+				goto out;
+			}
+
+			// If the caller hasn't allowed EOPM to be present
+			// together with known uncompressed size, then the
+			// LZMA stream is corrupt.
+			if (!coder->allow_eopm) {
+				ret = LZMA_DATA_ERROR;
+				goto out;
+			}
+
+			// Otherwise continue decoding with the expectation
+			// that the next LZMA symbol is EOPM.
+			eopm_is_valid = true;
+		}
+
+		rc_if_0_safe(coder->is_match[state][pos_state], SEQ_IS_MATCH) {
+			/////////////////////
+			// Decode literal. //
+			/////////////////////
+
+			rc_update_0(coder->is_match[state][pos_state]);
+
+			probs = literal_subcoder(coder->literal,
+					literal_context_bits, literal_pos_mask,
+					dict.pos, dict_get(&dict, 0));
+			symbol = 1;
+
+			if (is_literal_state(state)) {
+				// Decode literal without match byte.
+				// The "slow" version does not unroll
+				// the loop.
+	case SEQ_LITERAL:
+				do {
+					rc_bit_safe(probs[symbol], , ,
+							SEQ_LITERAL);
+				} while (symbol < (1 << 8));
+			} else {
+				// Decode literal with match byte.
+				len = (uint32_t)(dict_get(&dict, rep0)) << 1;
+
+				offset = 0x100;
+
+	case SEQ_LITERAL_MATCHED:
+				do {
+					const uint32_t match_bit
+							= len & offset;
+					const uint32_t subcoder_index
+							= offset + match_bit
+							+ symbol;
+
+					rc_bit_safe(probs[subcoder_index],
+							offset &= ~match_bit,
+							offset &= match_bit,
+							SEQ_LITERAL_MATCHED);
+
+					// It seems to be faster to do this
+					// here instead of putting it to the
+					// beginning of the loop and then
+					// putting the "case" in the middle
+					// of the loop.
+					len <<= 1;
+
+				} while (symbol < (1 << 8));
+			}
+
+			state = next_state[state];
+
+	case SEQ_LITERAL_WRITE:
+			if (dict_put_safe(&dict, symbol)) {
+				coder->sequence = SEQ_LITERAL_WRITE;
+				goto out;
+			}
+
+			continue;
+		}
+
+		///////////////////
+		// Decode match. //
+		///////////////////
+
+		rc_update_1(coder->is_match[state][pos_state]);
+
+	case SEQ_IS_REP:
+		rc_if_0_safe(coder->is_rep[state], SEQ_IS_REP) {
+			///////////////////
+			// Simple match. //
+			///////////////////
+
+			rc_update_0(coder->is_rep[state]);
+			update_match(state);
+
+			rep3 = rep2;
+			rep2 = rep1;
+			rep1 = rep0;
+
+			len_decode(len, coder->match_len_decoder,
+					pos_state, SEQ_MATCH_LEN);
+
+			probs = coder->dist_slot[get_dist_state(len)];
+			symbol = 1;
+
+	case SEQ_DIST_SLOT:
+			do {
+				rc_bit_safe(probs[symbol], , , SEQ_DIST_SLOT);
+			} while (symbol < DIST_SLOTS);
+
+			symbol -= DIST_SLOTS;
+			assert(symbol <= 63);
+
+			if (symbol < DIST_MODEL_START) {
+				rep0 = symbol;
+			} else {
+				limit = (symbol >> 1) - 1;
+				assert(limit >= 1 && limit <= 30);
+				rep0 = 2 + (symbol & 1);
+
+				if (symbol < DIST_MODEL_END) {
+					assert(limit <= 5);
+					rep0 <<= limit;
+					assert(rep0 <= 96);
+					// -1 is fine, because we start
+					// decoding at probs[1], not probs[0].
+					// NOTE: This violates the C standard,
+					// since we are doing pointer
+					// arithmetic past the beginning of
+					// the array.
+					assert((int32_t)(rep0 - symbol - 1)
+							>= -1);
+					assert((int32_t)(rep0 - symbol - 1)
+							<= 82);
+					probs = coder->pos_special + rep0
+							- symbol - 1;
+					symbol = 1;
+					offset = 0;
+	case SEQ_DIST_MODEL:
+					do {
+						rc_bit_safe(probs[symbol], ,
+							rep0 += 1U << offset,
+							SEQ_DIST_MODEL);
+					} while (++offset < limit);
+				} else {
+					assert(symbol >= 14);
+					assert(limit >= 6);
+					limit -= ALIGN_BITS;
+					assert(limit >= 2);
+	case SEQ_DIRECT:
+					do {
+						rc_direct_safe(rep0,
+								SEQ_DIRECT);
+					} while (--limit > 0);
+
+					rep0 <<= ALIGN_BITS;
+					symbol = 1;
+
+					offset = 0;
+	case SEQ_ALIGN:
+					do {
+						rc_bit_safe(coder->pos_align[
+								symbol], ,
+							rep0 += 1U << offset,
+							SEQ_ALIGN);
+					} while (++offset < ALIGN_BITS);
+
+					// End of payload marker
+					if (rep0 == UINT32_MAX) {
+						if (!eopm_is_valid) {
+							ret = LZMA_DATA_ERROR;
+							goto out;
+						}
+
+	case SEQ_EOPM:
+						rc_normalize_safe(SEQ_EOPM);
+						ret = rc_is_finished(rc)
+							? LZMA_STREAM_END
+							: LZMA_DATA_ERROR;
+						goto out;
+					}
+				}
+			}
+
+			if (unlikely(!dict_is_distance_valid(&dict, rep0))) {
+				ret = LZMA_DATA_ERROR;
+				goto out;
+			}
+
+		} else {
+			/////////////////////
+			// Repeated match. //
+			/////////////////////
+
+			rc_update_1(coder->is_rep[state]);
+
+			if (unlikely(!dict_is_distance_valid(&dict, 0))) {
+				ret = LZMA_DATA_ERROR;
+				goto out;
+			}
+
+	case SEQ_IS_REP0:
+			rc_if_0_safe(coder->is_rep0[state], SEQ_IS_REP0) {
+				rc_update_0(coder->is_rep0[state]);
+
 	case SEQ_IS_REP0_LONG:
-				rc_if_0(coder->is_rep0_long[state][pos_state],
+				rc_if_0_safe(coder->is_rep0_long
+						[state][pos_state],
 						SEQ_IS_REP0_LONG) {
 					rc_update_0(coder->is_rep0_long[
 							state][pos_state]);
@@ -706,8 +1013,9 @@ lzma_decode(void *coder_ptr, lzma_dict *restrict dictptr,
 					update_short_rep(state);
 
 	case SEQ_SHORTREP:
-					if (unlikely(dict_put(&dict, dict_get(
-							&dict, rep0)))) {
+					if (dict_put_safe(&dict,
+							dict_get(&dict,
+							rep0))) {
 						coder->sequence = SEQ_SHORTREP;
 						goto out;
 					}
@@ -715,8 +1023,6 @@ lzma_decode(void *coder_ptr, lzma_dict *restrict dictptr,
 					continue;
 				}
 
-				// Repeating more than one byte at
-				// distance of rep0.
 				rc_update_1(coder->is_rep0_long[
 						state][pos_state]);
 
@@ -724,11 +1030,7 @@ lzma_decode(void *coder_ptr, lzma_dict *restrict dictptr,
 				rc_update_1(coder->is_rep0[state]);
 
 	case SEQ_IS_REP1:
-				// The distance is rep1, rep2 or rep3. Once
-				// we find out which one of these three, it
-				// is stored to rep0 and rep1, rep2 and rep3
-				// are updated accordingly.
-				rc_if_0(coder->is_rep1[state], SEQ_IS_REP1) {
+				rc_if_0_safe(coder->is_rep1[state], SEQ_IS_REP1) {
 					rc_update_0(coder->is_rep1[state]);
 
 					const uint32_t distance = rep1;
@@ -738,7 +1040,7 @@ lzma_decode(void *coder_ptr, lzma_dict *restrict dictptr,
 				} else {
 					rc_update_1(coder->is_rep1[state]);
 	case SEQ_IS_REP2:
-					rc_if_0(coder->is_rep2[state],
+					rc_if_0_safe(coder->is_rep2[state],
 							SEQ_IS_REP2) {
 						rc_update_0(coder->is_rep2[
 								state]);
@@ -763,7 +1065,6 @@ lzma_decode(void *coder_ptr, lzma_dict *restrict dictptr,
 
 			update_long_rep(state);
 
-			// Decode the length of the repeated match.
 			len_decode(len, coder->rep_len_decoder,
 					pos_state, SEQ_REP_LEN);
 		}
@@ -772,13 +1073,10 @@ lzma_decode(void *coder_ptr, lzma_dict *restrict dictptr,
 		// Repeat from history buffer. //
 		/////////////////////////////////
 
-		// The length is always between these limits. There is no way
-		// to trigger the algorithm to set len outside this range.
 		assert(len >= MATCH_LEN_MIN);
 		assert(len <= MATCH_LEN_MAX);
 
 	case SEQ_COPY:
-		// Repeat len bytes from distance of rep0.
 		if (unlikely(dict_repeat(&dict, rep0, &len))) {
 			coder->sequence = SEQ_COPY;
 			goto out;
author	Jia Tan <jiat0218@gmail.com>	2024-02-12 17:09:10 +0200
committer	Lasse Collin <lasse.collin@tukaani.org>	2024-02-14 18:31:16 +0200
commit	de5c5e417645ad8906ef914bc059d08c1462fc29 (patch)
tree	62672e60baf3f24030c402f5b3364671a2b81dee /src
parent	liblzma: Creates separate "safe" range decoder mode. (diff)
download	xz-de5c5e417645ad8906ef914bc059d08c1462fc29.tar.xz