2 files changed, 521 insertions, 213 deletions
diff --git a/src/liblzma/lz/lz_decoder.h b/src/liblzma/lz/lz_decoder.h
index 65acf0b1..3b41649c 100644
--- a/src/liblzma/lz/lz_decoder.h
+++ b/src/liblzma/lz/lz_decoder.h
@@ -180,12 +180,22 @@ dict_repeat(lzma_dict *dict, uint32_t distance, uint32_t *len)
 }
 
 
+static inline void
+dict_put(lzma_dict *dict, uint8_t byte)
+{
+	dict->buf[dict->pos++] = byte;
+
+	if (dict->pos > dict->full)
+		dict->full = dict->pos;
+}
+
+
 /// Puts one byte into the dictionary. Returns true if the dictionary was
 /// already full and the byte couldn't be added.
 static inline bool
-dict_put(lzma_dict *dict, uint8_t byte)
+dict_put_safe(lzma_dict *dict, uint8_t byte)
 {
-	if (unlikely(dict->pos == dict->limit))
+	if (dict->pos == dict->limit)
 		return true;
 
 	dict->buf[dict->pos++] = byte;
diff --git a/src/liblzma/lzma/lzma_decoder.c b/src/liblzma/lzma/lzma_decoder.c
index 2e8393d6..6203bae9 100644
--- a/src/liblzma/lzma/lzma_decoder.c
+++ b/src/liblzma/lzma/lzma_decoder.c
@@ -7,6 +7,7 @@
 ///
 //  Authors:    Igor Pavlov
 //              Lasse Collin
+//              Jia Tan
 //
 ///////////////////////////////////////////////////////////////////////////////
 
@@ -21,8 +22,12 @@
 #	pragma GCC diagnostic ignored "-Wimplicit-fallthrough"
 #endif
 
+// Minimum number of input bytes to safely decode one LZMA symbol.
+// The worst case is that we decode 22 bits using probabilities and 26
+// direct bits. This may decode at maximum 20 bytes of input plus one
+// extra byte after the final EOPM normalization.
+#define LZMA_IN_REQUIRED 21
 
-#ifdef HAVE_SMALL
 
 // Macros for (somewhat) size-optimized code.
 // This is used to decode the match length (how many bytes must be repeated
@@ -193,22 +198,26 @@ typedef struct {
 	enum {
 		SEQ_NORMALIZE,
 		SEQ_IS_MATCH,
-		seq_8(SEQ_LITERAL),
-		seq_8(SEQ_LITERAL_MATCHED),
+		SEQ_LITERAL,
+		SEQ_LITERAL_MATCHED,
 		SEQ_LITERAL_WRITE,
 		SEQ_IS_REP,
-		seq_len(SEQ_MATCH_LEN),
-		seq_6(SEQ_DIST_SLOT),
+		SEQ_MATCH_LEN_CHOICE,
+		SEQ_MATCH_LEN_CHOICE2,
+		SEQ_MATCH_LEN_BITTREE,
+		SEQ_DIST_SLOT,
 		SEQ_DIST_MODEL,
 		SEQ_DIRECT,
-		seq_4(SEQ_ALIGN),
+		SEQ_ALIGN,
 		SEQ_EOPM,
 		SEQ_IS_REP0,
 		SEQ_SHORTREP,
 		SEQ_IS_REP0_LONG,
 		SEQ_IS_REP1,
 		SEQ_IS_REP2,
-		seq_len(SEQ_REP_LEN),
+		SEQ_REP_LEN_CHOICE,
+		SEQ_REP_LEN_CHOICE2,
+		SEQ_REP_LEN_BITTREE,
 		SEQ_COPY,
 	} sequence;
 
@@ -309,8 +318,42 @@ lzma_decode(void *coder_ptr, lzma_dict *restrict dictptr,
 		might_finish_without_eopm = true;
 	}
 
-	// The main decoder loop. The "switch" is used to restart the decoder at
-	// correct location. Once restarted, the "switch" is no longer used.
+	// Lookup table used to update the literal state.
+	// Compared to other state updates, this would need two branches.
+	// The lookup table is used by both Resumable and Non-resumable modes.
+	static const lzma_lzma_state next_state[] = {
+		STATE_LIT_LIT,
+		STATE_LIT_LIT,
+		STATE_LIT_LIT,
+		STATE_LIT_LIT,
+		STATE_MATCH_LIT_LIT,
+		STATE_REP_LIT_LIT,
+		STATE_SHORTREP_LIT_LIT,
+		STATE_MATCH_LIT,
+		STATE_REP_LIT,
+		STATE_SHORTREP_LIT,
+		STATE_MATCH_LIT,
+		STATE_REP_LIT
+	};
+
+	// The main decoder loop. The "switch" is used to resume the decoder at
+	// correct location. Once resumed, the "switch" is no longer used.
+	// The decoder loops is split into two modes:
+	//
+	// 1 - Non-resumable mode (fast). This is used when it is guaranteed
+	//     there is enough input to decode the next symbol. If the output
+	//     limit is reached, then the decoder loop will save the place
+	//     for the resumable mode to continue. This mode is not used if
+	//     HAVE_SMALL is defined. This is faster than Resumable mode
+	//     because it reduces the number of branches needed and allows
+	//     for more compiler optimizations.
+	//
+	// 2 - Resumable mode (slow). This is used when a previous decoder
+	//     loop did not have enough space in the input or output buffers
+	//     to complete. It uses sequence enum values to set remind
+	//     coder->sequence where to resume in the decoder loop. This
+	//     is the only mode used when HAVE_SMALL is defined.
+
 	switch (coder->sequence)
 	while (true) {
 		// Calculate new pos_state. This is skipped on the first loop
@@ -318,40 +361,32 @@ lzma_decode(void *coder_ptr, lzma_dict *restrict dictptr,
 		// variables.
 		pos_state = dict.pos & pos_mask;
 
-	case SEQ_NORMALIZE:
-	case SEQ_IS_MATCH:
-		if (unlikely(might_finish_without_eopm
-				&& dict.pos == dict.limit)) {
-			// In rare cases there is a useless byte that needs
-			// to be read anyway.
-			rc_normalize(SEQ_NORMALIZE);
+#ifndef HAVE_SMALL
 
-			// If the range decoder state is such that we can
-			// be at the end of the LZMA stream, then the
-			// decoding is finished.
-			if (rc_is_finished(rc)) {
-				ret = LZMA_STREAM_END;
-				goto out;
-			}
+		///////////////////////////////
+		// Non-resumable Mode (fast) //
+		///////////////////////////////
 
-			// If the caller hasn't allowed EOPM to be present
-			// together with known uncompressed size, then the
-			// LZMA stream is corrupt.
-			if (!coder->allow_eopm) {
-				ret = LZMA_DATA_ERROR;
-				goto out;
-			}
+		// If there is not enough room for another LZMA symbol
+		// go to Resumable mode.
+		if (rc_in_pos + LZMA_IN_REQUIRED > in_size
+			|| dict.pos == dict.limit)
+			goto slow;
 
-			// Otherwise continue decoding with the expectation
-			// that the next LZMA symbol is EOPM.
-			eopm_is_valid = true;
-		}
+		// Decode the first bit from the next LZMA symbol.
+		// If the bit is a 0, then we handle it as a literal.
+		// If the bit is a 1, then it is a match of previously
+		// decoded data.
+		rc_if_0(coder->is_match[state][pos_state]) {
+			/////////////////////
+			// Decode literal. //
+			/////////////////////
 
-		rc_if_0(coder->is_match[state][pos_state], SEQ_IS_MATCH) {
+			// Update the RC that we have decoded a 0.
 			rc_update_0(coder->is_match[state][pos_state]);
 
-			// It's a literal i.e. a single 8-bit byte.
-
+			// Get the correct probability array from lp and
+			// lc params.
 			probs = literal_subcoder(coder->literal,
 					literal_context_bits, literal_pos_mask,
 					dict.pos, dict_get(&dict, 0));
@@ -359,21 +394,17 @@ lzma_decode(void *coder_ptr, lzma_dict *restrict dictptr,
 
 			if (is_literal_state(state)) {
 				// Decode literal without match byte.
-#ifdef HAVE_SMALL
-	case SEQ_LITERAL:
-				do {
-					rc_bit(probs[symbol], , , SEQ_LITERAL);
-				} while (symbol < (1 << 8));
-#else
-				rc_bit_case(probs[symbol], , , SEQ_LITERAL0);
-				rc_bit_case(probs[symbol], , , SEQ_LITERAL1);
-				rc_bit_case(probs[symbol], , , SEQ_LITERAL2);
-				rc_bit_case(probs[symbol], , , SEQ_LITERAL3);
-				rc_bit_case(probs[symbol], , , SEQ_LITERAL4);
-				rc_bit_case(probs[symbol], , , SEQ_LITERAL5);
-				rc_bit_case(probs[symbol], , , SEQ_LITERAL6);
-				rc_bit_case(probs[symbol], , , SEQ_LITERAL7);
-#endif
+				// We need to decode 8 bits, so instead
+				// of looping from 1 - 8, we unroll the
+				// loop for a speed optimization.
+				rc_bit(probs[symbol], , );
+				rc_bit(probs[symbol], , );
+				rc_bit(probs[symbol], , );
+				rc_bit(probs[symbol], , );
+				rc_bit(probs[symbol], , );
+				rc_bit(probs[symbol], , );
+				rc_bit(probs[symbol], , );
+				rc_bit(probs[symbol], , );
 			} else {
 				// Decode literal with match byte.
 				//
@@ -381,6 +412,7 @@ lzma_decode(void *coder_ptr, lzma_dict *restrict dictptr,
 				// ("match byte") to "len" to minimize the
 				// number of variables we need to store
 				// between decoder calls.
+
 				len = (uint32_t)(dict_get(&dict, rep0)) << 1;
 
 				// The usage of "offset" allows omitting some
@@ -389,99 +421,68 @@ lzma_decode(void *coder_ptr, lzma_dict *restrict dictptr,
 				// set to zero if match_bit didn't match.
 				offset = 0x100;
 
-#ifdef HAVE_SMALL
-	case SEQ_LITERAL_MATCHED:
-				do {
-					const uint32_t match_bit
-							= len & offset;
-					const uint32_t subcoder_index
-							= offset + match_bit
-							+ symbol;
-
-					rc_bit(probs[subcoder_index],
-							offset &= ~match_bit,
-							offset &= match_bit,
-							SEQ_LITERAL_MATCHED);
-
-					// It seems to be faster to do this
-					// here instead of putting it to the
-					// beginning of the loop and then
-					// putting the "case" in the middle
-					// of the loop.
-					len <<= 1;
-
-				} while (symbol < (1 << 8));
-#else
 				// Unroll the loop.
 				uint32_t match_bit;
 				uint32_t subcoder_index;
 
-#	define d(seq) \
-		case seq: \
+#	define decode_with_match_bit \
 			match_bit = len & offset; \
 			subcoder_index = offset + match_bit + symbol; \
 			rc_bit(probs[subcoder_index], \
 					offset &= ~match_bit, \
-					offset &= match_bit, \
-					seq)
+					offset &= match_bit)
 
-				d(SEQ_LITERAL_MATCHED0);
+				decode_with_match_bit;
 				len <<= 1;
-				d(SEQ_LITERAL_MATCHED1);
+				decode_with_match_bit;
 				len <<= 1;
-				d(SEQ_LITERAL_MATCHED2);
+				decode_with_match_bit;
 				len <<= 1;
-				d(SEQ_LITERAL_MATCHED3);
+				decode_with_match_bit;
 				len <<= 1;
-				d(SEQ_LITERAL_MATCHED4);
+				decode_with_match_bit;
 				len <<= 1;
-				d(SEQ_LITERAL_MATCHED5);
+				decode_with_match_bit;
 				len <<= 1;
-				d(SEQ_LITERAL_MATCHED6);
+				decode_with_match_bit;
 				len <<= 1;
-				d(SEQ_LITERAL_MATCHED7);
-#	undef d
-#endif
+				decode_with_match_bit;
+#	undef decode_match_bit
 			}
 
-			//update_literal(state);
-			// Use a lookup table to update to literal state,
-			// since compared to other state updates, this would
-			// need two branches.
-			static const lzma_lzma_state next_state[] = {
-				STATE_LIT_LIT,
-				STATE_LIT_LIT,
-				STATE_LIT_LIT,
-				STATE_LIT_LIT,
-				STATE_MATCH_LIT_LIT,
-				STATE_REP_LIT_LIT,
-				STATE_SHORTREP_LIT_LIT,
-				STATE_MATCH_LIT,
-				STATE_REP_LIT,
-				STATE_SHORTREP_LIT,
-				STATE_MATCH_LIT,
-				STATE_REP_LIT
-			};
 			state = next_state[state];
 
-	case SEQ_LITERAL_WRITE:
-			if (unlikely(dict_put(&dict, symbol))) {
-				coder->sequence = SEQ_LITERAL_WRITE;
-				goto out;
-			}
-
+			// Write decoded literal to dictionary
+			dict_put(&dict, symbol);
 			continue;
 		}
 
-		// Instead of a new byte we are going to get a byte range
-		// (distance and length) which will be repeated from our
-		// output history.
+		///////////////////
+		// Decode match. //
+		///////////////////
+
+		// Instead of a new byte we are going to decode a
+		// distance-length pair. The distance represents how far
+		// back in the dictionary to begin copying. The length
+		// represents how many bytes to copy.
 
 		rc_update_1(coder->is_match[state][pos_state]);
 
-	case SEQ_IS_REP:
-		rc_if_0(coder->is_rep[state], SEQ_IS_REP) {
-			// Not a repeated match
+		rc_if_0(coder->is_rep[state]) {
+			///////////////////
+			// Simple match. //
+			///////////////////
+
+			// Not a repeated match. In this case,
+			// the length (how many bytes to copy) must be
+			// decoded first. Then, the distance (where to
+			// start copying) is decoded.
+			//
+			// This is also how we know when we are done
+			// decoding. If the distance decodes to UINT32_MAX,
+			// then we know to stop decoding (end of payload
+			// marker).
+
 			rc_update_0(coder->is_rep[state]);
 			update_match(state);
 
@@ -492,45 +493,50 @@ lzma_decode(void *coder_ptr, lzma_dict *restrict dictptr,
 			rep1 = rep0;
 
 			// Decode the length of the match.
-			len_decode(len, coder->match_len_decoder,
-					pos_state, SEQ_MATCH_LEN);
+			len_decode_fast(len, coder->match_len_decoder,
+					pos_state);
+
+			// Next, decode the distance into rep0.
 
-			// Prepare to decode the highest two bits of the
-			// match distance.
+			// The next 6 bits determine how to decode the
+			// rest of the distance.
 			probs = coder->dist_slot[get_dist_state(len)];
 			symbol = 1;
 
-#ifdef HAVE_SMALL
-	case SEQ_DIST_SLOT:
-			do {
-				rc_bit(probs[symbol], , , SEQ_DIST_SLOT);
-			} while (symbol < DIST_SLOTS);
-#else
-			rc_bit_case(probs[symbol], , , SEQ_DIST_SLOT0);
-			rc_bit_case(probs[symbol], , , SEQ_DIST_SLOT1);
-			rc_bit_case(probs[symbol], , , SEQ_DIST_SLOT2);
-			rc_bit_case(probs[symbol], , , SEQ_DIST_SLOT3);
-			rc_bit_case(probs[symbol], , , SEQ_DIST_SLOT4);
-			rc_bit_case(probs[symbol], , , SEQ_DIST_SLOT5);
-#endif
+			rc_bit(probs[symbol], , );
+			rc_bit(probs[symbol], , );
+			rc_bit(probs[symbol], , );
+			rc_bit(probs[symbol], , );
+			rc_bit(probs[symbol], , );
+			rc_bit(probs[symbol], , );
+
 			// Get rid of the highest bit that was needed for
 			// indexing of the probability array.
 			symbol -= DIST_SLOTS;
 			assert(symbol <= 63);
 
 			if (symbol < DIST_MODEL_START) {
-				// Match distances [0, 3] have only two bits.
+				// If the decoded symbol is < DIST_MODEL_START
+				// then we use its value directly as the
+				// match distance. No other bits are needed.
+				// The only possible distance values
+				// are [0, 3].
 				rep0 = symbol;
 			} else {
-				// Decode the lowest [1, 29] bits of
-				// the match distance.
+				// Use the first two bits of symbol as the
+				// highest bits of the match distance.
+
+				// "limit" represents the number of low bits
+				// to decode.
 				limit = (symbol >> 1) - 1;
 				assert(limit >= 1 && limit <= 30);
 				rep0 = 2 + (symbol & 1);
 
 				if (symbol < DIST_MODEL_END) {
-					// Prepare to decode the low bits for
-					// a distance of [4, 127].
+					// When symbol is > DIST_MODEL_START,
+					// but symbol < DIST_MODEL_END, then
+					// it can decode distances between
+					// [4, 127].
 					assert(limit <= 5);
 					rep0 <<= limit;
 					assert(rep0 <= 96);
@@ -548,52 +554,39 @@ lzma_decode(void *coder_ptr, lzma_dict *restrict dictptr,
 							- symbol - 1;
 					symbol = 1;
 					offset = 0;
-	case SEQ_DIST_MODEL:
-#ifdef HAVE_SMALL
-					do {
-						rc_bit(probs[symbol], ,
-							rep0 += 1U << offset,
-							SEQ_DIST_MODEL);
-					} while (++offset < limit);
-#else
+
 					switch (limit) {
 					case 5:
 						assert(offset == 0);
 						rc_bit(probs[symbol], ,
-							rep0 += 1U,
-							SEQ_DIST_MODEL);
+							rep0 += 1U);
 						++offset;
 						--limit;
 					case 4:
 						rc_bit(probs[symbol], ,
-							rep0 += 1U << offset,
-							SEQ_DIST_MODEL);
+							rep0 += 1U << offset);
 						++offset;
 						--limit;
 					case 3:
 						rc_bit(probs[symbol], ,
-							rep0 += 1U << offset,
-							SEQ_DIST_MODEL);
+							rep0 += 1U << offset);
 						++offset;
 						--limit;
 					case 2:
 						rc_bit(probs[symbol], ,
-							rep0 += 1U << offset,
-							SEQ_DIST_MODEL);
+							rep0 += 1U << offset);
 						++offset;
 						--limit;
 					case 1:
 						// We need "symbol" only for
 						// indexing the probability
 						// array, thus we can use
-						// rc_bit_last() here to omit
-						// the unneeded updating of
-						// "symbol".
+						// rc_bit_last() here to
+						// omit the unneeded updating
+						// of "symbol".
 						rc_bit_last(probs[symbol], ,
-							rep0 += 1U << offset,
-							SEQ_DIST_MODEL);
+							rep0 += 1U << offset);
 					}
-#endif
 				} else {
 					// The distance is >= 128. Decode the
 					// lower bits without probabilities
@@ -602,44 +595,39 @@ lzma_decode(void *coder_ptr, lzma_dict *restrict dictptr,
 					assert(limit >= 6);
 					limit -= ALIGN_BITS;
 					assert(limit >= 2);
-	case SEQ_DIRECT:
+
 					// Not worth manual unrolling
 					do {
-						rc_direct(rep0, SEQ_DIRECT);
+						rc_direct(rep0);
 					} while (--limit > 0);
 
 					// Decode the lowest four bits using
 					// probabilities.
 					rep0 <<= ALIGN_BITS;
 					symbol = 1;
-#ifdef HAVE_SMALL
-					offset = 0;
-	case SEQ_ALIGN:
-					do {
-						rc_bit(coder->pos_align[
-								symbol], ,
-							rep0 += 1U << offset,
-							SEQ_ALIGN);
-					} while (++offset < ALIGN_BITS);
-#else
-	case SEQ_ALIGN0:
+
 					rc_bit(coder->pos_align[symbol], ,
-							rep0 += 1, SEQ_ALIGN0);
-	case SEQ_ALIGN1:
+							rep0 += 1);
+
 					rc_bit(coder->pos_align[symbol], ,
-							rep0 += 2, SEQ_ALIGN1);
-	case SEQ_ALIGN2:
+							rep0 += 2);
+
 					rc_bit(coder->pos_align[symbol], ,
-							rep0 += 4, SEQ_ALIGN2);
-	case SEQ_ALIGN3:
-					// Like in SEQ_DIST_MODEL, we don't
-					// need "symbol" for anything else
-					// than indexing the probability array.
-					rc_bit_last(coder->pos_align[symbol], ,
-							rep0 += 8, SEQ_ALIGN3);
-#endif
+							rep0 += 4);
+
+					// Like when distance [4, 127], we
+					// don't need "symbol" for anything
+					// other than indexing the probability
+					// array.
+					rc_bit_last(
+						coder->pos_align[symbol], ,
+						rep0 += 8);
 
 					if (rep0 == UINT32_MAX) {
+						///////////////////////////
+						// End of payload marker //
+						///////////////////////////
+
 						// End of payload marker was
 						// found. It may only be
 						// present if
@@ -657,10 +645,9 @@ lzma_decode(void *coder_ptr, lzma_dict *restrict dictptr,
 							goto out;
 						}
 
-	case SEQ_EOPM:
 						// LZMA1 stream with
 						// end-of-payload marker.
-						rc_normalize(SEQ_EOPM);
+						rc_normalize();
 						ret = rc_is_finished(rc)
 							? LZMA_STREAM_END
 							: LZMA_DATA_ERROR;
@@ -678,10 +665,12 @@ lzma_decode(void *coder_ptr, lzma_dict *restrict dictptr,
 		} else {
 			rc_update_1(coder->is_rep[state]);
 
-			// Repeated match
-			//
-			// The match distance is a value that we have had
-			// earlier. The latest four match distances are
+			/////////////////////
+			// Repeated match. //
+			/////////////////////
+
+			// The match distance is a value that we have decoded
+			// recently. The latest four match distances are
 			// available as rep0, rep1, rep2 and rep3. We will
 			// now decode which of them is the new distance.
 			//
@@ -692,13 +681,331 @@ lzma_decode(void *coder_ptr, lzma_dict *restrict dictptr,
 				goto out;
 			}
 
-	case SEQ_IS_REP0:
-			rc_if_0(coder->is_rep0[state], SEQ_IS_REP0) {
+			rc_if_0(coder->is_rep0[state]) {
 				rc_update_0(coder->is_rep0[state]);
 				// The distance is rep0.
 
+				// Decode the next bit to determine if 1 byte
+				// should be copied from rep0 distance or
+				// if the number of bytes needs to be decoded.
+
+				// If the next bit is 0, then it is a
+				// "Short Rep Match" and only 1 bit is copied.
+				// Otherwise, the length of the match is
+				// decoded after the "else" statement.
+				rc_if_0(coder->is_rep0_long[state][pos_state]) {
+					rc_update_0(coder->is_rep0_long[
+							state][pos_state]);
+
+					update_short_rep(state);
+					dict_put(&dict, dict_get(&dict, rep0));
+					continue;
+				}
+
+				// Repeating more than one byte at
+				// distance of rep0.
+				rc_update_1(coder->is_rep0_long[
+						state][pos_state]);
+
+			} else {
+				rc_update_1(coder->is_rep0[state]);
+
+				// The distance is rep1, rep2 or rep3. Once
+				// we find out which one of these three, it
+				// is stored to rep0 and rep1, rep2 and rep3
+				// are updated accordingly. There is no
+				// "Short Rep Match" option, so the length
+				// of the match must always be decoded next.
+				rc_if_0(coder->is_rep1[state]) {
+					// The distance is rep1.
+					rc_update_0(coder->is_rep1[state]);
+
+					const uint32_t distance = rep1;
+					rep1 = rep0;
+					rep0 = distance;
+
+				} else {
+					rc_update_1(coder->is_rep1[state]);
+
+					rc_if_0(coder->is_rep2[state]) {
+						// The distance is rep2.
+						rc_update_0(coder->is_rep2[
+								state]);
+
+						const uint32_t distance = rep2;
+						rep2 = rep1;
+						rep1 = rep0;
+						rep0 = distance;
+
+					} else {
+						// The distance is rep3.
+						rc_update_1(coder->is_rep2[
+								state]);
+
+						const uint32_t distance = rep3;
+						rep3 = rep2;
+						rep2 = rep1;
+						rep1 = rep0;
+						rep0 = distance;
+					}
+				}
+			}
+
+			update_long_rep(state);
+
+			// Decode the length of the repeated match.
+			len_decode_fast(len, coder->rep_len_decoder,
+					pos_state);
+		}
+
+		/////////////////////////////////
+		// Repeat from history buffer. //
+		/////////////////////////////////
+
+		// The length is always between these limits. There is no way
+		// to trigger the algorithm to set len outside this range.
+		assert(len >= MATCH_LEN_MIN);
+		assert(len <= MATCH_LEN_MAX);
+
+		// Repeat len bytes from distance of rep0.
+		if (unlikely(dict_repeat(&dict, rep0, &len))) {
+			coder->sequence = SEQ_COPY;
+			goto out;
+		}
+
+		continue;
+
+slow:
+#endif
+	///////////////////////////
+	// Resumable Mode (slow) //
+	///////////////////////////
+
+	// This is very similar to Non-resumable Mode, so most of the
+	// comments are not repeated. The main differences are:
+	// - case labels are used to resume at the correct location.
+	// - Loops are not unrolled.
+	// - Range coder macros take an extra sequence argument
+	//   so they can save to coder->sequence the location to
+	//   resume in case there is not enough input.
+	case SEQ_NORMALIZE:
+	case SEQ_IS_MATCH:
+		if (unlikely(might_finish_without_eopm
+				&& dict.pos == dict.limit)) {
+			// In rare cases there is a useless byte that needs
+			// to be read anyway.
+			rc_normalize_safe(SEQ_NORMALIZE);
+
+			// If the range decoder state is such that we can
+			// be at the end of the LZMA stream, then the
+			// decoding is finished.
+			if (rc_is_finished(rc)) {
+				ret = LZMA_STREAM_END;
+				goto out;
+			}
+
+			// If the caller hasn't allowed EOPM to be present
+			// together with known uncompressed size, then the
+			// LZMA stream is corrupt.
+			if (!coder->allow_eopm) {
+				ret = LZMA_DATA_ERROR;
+				goto out;
+			}
+
+			// Otherwise continue decoding with the expectation
+			// that the next LZMA symbol is EOPM.
+			eopm_is_valid = true;
+		}
+
+		rc_if_0_safe(coder->is_match[state][pos_state], SEQ_IS_MATCH) {
+			/////////////////////
+			// Decode literal. //
+			/////////////////////
+
+			rc_update_0(coder->is_match[state][pos_state]);
+
+			probs = literal_subcoder(coder->literal,
+					literal_context_bits, literal_pos_mask,
+					dict.pos, dict_get(&dict, 0));
+			symbol = 1;
+
+			if (is_literal_state(state)) {
+				// Decode literal without match byte.
+				// The "slow" version does not unroll
+				// the loop.
+	case SEQ_LITERAL:
+				do {
+					rc_bit_safe(probs[symbol], , ,
+							SEQ_LITERAL);
+				} while (symbol < (1 << 8));
+			} else {
+				// Decode literal with match byte.
+				len = (uint32_t)(dict_get(&dict, rep0)) << 1;
+
+				offset = 0x100;
+
+	case SEQ_LITERAL_MATCHED:
+				do {
+					const uint32_t match_bit
+							= len & offset;
+					const uint32_t subcoder_index
+							= offset + match_bit
+							+ symbol;
+
+					rc_bit_safe(probs[subcoder_index],
+							offset &= ~match_bit,
+							offset &= match_bit,
+							SEQ_LITERAL_MATCHED);
+
+					// It seems to be faster to do this
+					// here instead of putting it to the
+					// beginning of the loop and then
+					// putting the "case" in the middle
+					// of the loop.
+					len <<= 1;
+
+				} while (symbol < (1 << 8));
+			}
+
+			state = next_state[state];
+
+	case SEQ_LITERAL_WRITE:
+			if (dict_put_safe(&dict, symbol)) {
+				coder->sequence = SEQ_LITERAL_WRITE;
+				goto out;
+			}
+
+			continue;
+		}
+
+		///////////////////
+		// Decode match. //
+		///////////////////
+
+		rc_update_1(coder->is_match[state][pos_state]);
+
+	case SEQ_IS_REP:
+		rc_if_0_safe(coder->is_rep[state], SEQ_IS_REP) {
+			///////////////////
+			// Simple match. //
+			///////////////////
+
+			rc_update_0(coder->is_rep[state]);
+			update_match(state);
+
+			rep3 = rep2;
+			rep2 = rep1;
+			rep1 = rep0;
+
+			len_decode(len, coder->match_len_decoder,
+					pos_state, SEQ_MATCH_LEN);
+
+			probs = coder->dist_slot[get_dist_state(len)];
+			symbol = 1;
+
+	case SEQ_DIST_SLOT:
+			do {
+				rc_bit_safe(probs[symbol], , , SEQ_DIST_SLOT);
+			} while (symbol < DIST_SLOTS);
+
+			symbol -= DIST_SLOTS;
+			assert(symbol <= 63);
+
+			if (symbol < DIST_MODEL_START) {
+				rep0 = symbol;
+			} else {
+				limit = (symbol >> 1) - 1;
+				assert(limit >= 1 && limit <= 30);
+				rep0 = 2 + (symbol & 1);
+
+				if (symbol < DIST_MODEL_END) {
+					assert(limit <= 5);
+					rep0 <<= limit;
+					assert(rep0 <= 96);
+					// -1 is fine, because we start
+					// decoding at probs[1], not probs[0].
+					// NOTE: This violates the C standard,
+					// since we are doing pointer
+					// arithmetic past the beginning of
+					// the array.
+					assert((int32_t)(rep0 - symbol - 1)
+							>= -1);
+					assert((int32_t)(rep0 - symbol - 1)
+							<= 82);
+					probs = coder->pos_special + rep0
+							- symbol - 1;
+					symbol = 1;
+					offset = 0;
+	case SEQ_DIST_MODEL:
+					do {
+						rc_bit_safe(probs[symbol], ,
+							rep0 += 1U << offset,
+							SEQ_DIST_MODEL);
+					} while (++offset < limit);
+				} else {
+					assert(symbol >= 14);
+					assert(limit >= 6);
+					limit -= ALIGN_BITS;
+					assert(limit >= 2);
+	case SEQ_DIRECT:
+					do {
+						rc_direct_safe(rep0,
+								SEQ_DIRECT);
+					} while (--limit > 0);
+
+					rep0 <<= ALIGN_BITS;
+					symbol = 1;
+
+					offset = 0;
+	case SEQ_ALIGN:
+					do {
+						rc_bit_safe(coder->pos_align[
+								symbol], ,
+							rep0 += 1U << offset,
+							SEQ_ALIGN);
+					} while (++offset < ALIGN_BITS);
+
+					// End of payload marker
+					if (rep0 == UINT32_MAX) {
+						if (!eopm_is_valid) {
+							ret = LZMA_DATA_ERROR;
+							goto out;
+						}
+
+	case SEQ_EOPM:
+						rc_normalize_safe(SEQ_EOPM);
+						ret = rc_is_finished(rc)
+							? LZMA_STREAM_END
+							: LZMA_DATA_ERROR;
+						goto out;
+					}
+				}
+			}
+
+			if (unlikely(!dict_is_distance_valid(&dict, rep0))) {
+				ret = LZMA_DATA_ERROR;
+				goto out;
+			}
+
+		} else {
+			/////////////////////
+			// Repeated match. //
+			/////////////////////
+
+			rc_update_1(coder->is_rep[state]);
+
+			if (unlikely(!dict_is_distance_valid(&dict, 0))) {
+				ret = LZMA_DATA_ERROR;
+				goto out;
+			}
+
+	case SEQ_IS_REP0:
+			rc_if_0_safe(coder->is_rep0[state], SEQ_IS_REP0) {
+				rc_update_0(coder->is_rep0[state]);
+
 	case SEQ_IS_REP0_LONG:
-				rc_if_0(coder->is_rep0_long[state][pos_state],
+				rc_if_0_safe(coder->is_rep0_long
+						[state][pos_state],
 						SEQ_IS_REP0_LONG) {
 					rc_update_0(coder->is_rep0_long[
 							state][pos_state]);
@@ -706,8 +1013,9 @@ lzma_decode(void *coder_ptr, lzma_dict *restrict dictptr,
 					update_short_rep(state);
 
 	case SEQ_SHORTREP:
-					if (unlikely(dict_put(&dict, dict_get(
-							&dict, rep0)))) {
+					if (dict_put_safe(&dict,
+							dict_get(&dict,
+							rep0))) {
 						coder->sequence = SEQ_SHORTREP;
 						goto out;
 					}
@@ -715,8 +1023,6 @@ lzma_decode(void *coder_ptr, lzma_dict *restrict dictptr,
 					continue;
 				}
 
-				// Repeating more than one byte at
-				// distance of rep0.
 				rc_update_1(coder->is_rep0_long[
 						state][pos_state]);
 
@@ -724,11 +1030,7 @@ lzma_decode(void *coder_ptr, lzma_dict *restrict dictptr,
 				rc_update_1(coder->is_rep0[state]);
 
 	case SEQ_IS_REP1:
-				// The distance is rep1, rep2 or rep3. Once
-				// we find out which one of these three, it
-				// is stored to rep0 and rep1, rep2 and rep3
-				// are updated accordingly.
-				rc_if_0(coder->is_rep1[state], SEQ_IS_REP1) {
+				rc_if_0_safe(coder->is_rep1[state], SEQ_IS_REP1) {
 					rc_update_0(coder->is_rep1[state]);
 
 					const uint32_t distance = rep1;
@@ -738,7 +1040,7 @@ lzma_decode(void *coder_ptr, lzma_dict *restrict dictptr,
 				} else {
 					rc_update_1(coder->is_rep1[state]);
 	case SEQ_IS_REP2:
-					rc_if_0(coder->is_rep2[state],
+					rc_if_0_safe(coder->is_rep2[state],
 							SEQ_IS_REP2) {
 						rc_update_0(coder->is_rep2[
 								state]);
@@ -763,7 +1065,6 @@ lzma_decode(void *coder_ptr, lzma_dict *restrict dictptr,
 
 			update_long_rep(state);
 
-			// Decode the length of the repeated match.
 			len_decode(len, coder->rep_len_decoder,
 					pos_state, SEQ_REP_LEN);
 		}
@@ -772,13 +1073,10 @@ lzma_decode(void *coder_ptr, lzma_dict *restrict dictptr,
 		// Repeat from history buffer. //
 		/////////////////////////////////
 
-		// The length is always between these limits. There is no way
-		// to trigger the algorithm to set len outside this range.
 		assert(len >= MATCH_LEN_MIN);
 		assert(len <= MATCH_LEN_MAX);
 
 	case SEQ_COPY:
-		// Repeat len bytes from distance of rep0.
 		if (unlikely(dict_repeat(&dict, rep0, &len))) {
 			coder->sequence = SEQ_COPY;
 			goto out;