Diffstat (limited to 'src/liblzma/lzma/lzma_decoder.c')
-rw-r--r--  src/liblzma/lzma/lzma_decoder.c | 1306
1 file changed, 791 insertions, 515 deletions
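
Editor's note: the patch below replaces the old two-pass decoder (decode_dummy() plus decode_real()) with a single resumable decoder. Every probability-coded bit gets a SEQ_* label, the current label is stored in coder->sequence together with the partially decoded symbol, and a switch at the top of lzma_decode() jumps back to that label on the next call. The following minimal sketch only illustrates that switch-into-a-loop technique; the names (struct toy_decoder, toy_run, SEQ_FIRST, SEQ_SECOND) are hypothetical and not part of liblzma.

/* A minimal sketch of the "switch into a loop" coroutine pattern used by
 * the new decoder. The switch jumps to the case label recorded in
 * d->sequence, resuming exactly where the previous call ran out of input;
 * once inside the loop, the case labels have no further effect. */
#include <stddef.h>
#include <stdint.h>

struct toy_decoder {
	/* Where to continue on the next call */
	enum { SEQ_FIRST, SEQ_SECOND } sequence;

	/* Partially decoded symbol kept across calls */
	uint32_t partial;
};

/* Decode 16-bit big-endian symbols from "in". Returns the number of input
 * bytes consumed; stops when either the input or the output space runs out
 * and records the resume point in d->sequence. */
static size_t
toy_run(struct toy_decoder *d, const uint8_t *in, size_t in_size,
		uint32_t *out, size_t out_max)
{
	size_t in_pos = 0;
	size_t out_pos = 0;

	switch (d->sequence)
	while (out_pos < out_max) {
	case SEQ_FIRST:
		if (in_pos == in_size) {
			d->sequence = SEQ_FIRST;
			goto out;
		}

		d->partial = (uint32_t)(in[in_pos++]) << 8;

	/* Fall through: the second byte is decoded right after the first. */
	case SEQ_SECOND:
		if (in_pos == in_size) {
			d->sequence = SEQ_SECOND;
			goto out;
		}

		out[out_pos++] = d->partial | in[in_pos++];
	}

	d->sequence = SEQ_FIRST;
out:
	return in_pos;
}

Because the resume point is saved whenever input runs out, the decoder loop no longer needs a guaranteed minimum of readable input bytes, which is why the old REQUIRED_IN_BUFFER_SIZE logic and decode_dummy() disappear in this commit.
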
diff --git a/src/liblzma/lzma/lzma_decoder.c b/src/liblzma/lzma/lzma_decoder.c index 68941021..e9d047d3 100644 --- a/src/liblzma/lzma/lzma_decoder.c +++ b/src/liblzma/lzma/lzma_decoder.c @@ -4,7 +4,7 @@ /// \brief LZMA decoder // // Copyright (C) 1999-2006 Igor Pavlov -// Copyright (C) 2007 Lasse Collin +// Copyright (C) 2007-2008 Lasse Collin // // This library is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public @@ -18,74 +18,147 @@ // /////////////////////////////////////////////////////////////////////////////// -// NOTE: If you want to keep the line length in 80 characters, set -// tab width to 4 or less in your editor when editing this file. - +#include "lz_decoder.h" #include "lzma_common.h" #include "lzma_decoder.h" -#include "lz_decoder.h" #include "range_decoder.h" -/// REQUIRED_IN_BUFFER_SIZE is the number of required input bytes -/// for the worst case: longest match with longest distance. -/// LZMA_IN_BUFFER_SIZE must be larger than REQUIRED_IN_BUFFER_SIZE. -/// 23 bits = 2 (match select) + 10 (len) + 6 (distance) + 4 (align) -/// + 1 (rc_normalize) -/// -/// \todo Is this correct for sure? -/// -#define REQUIRED_IN_BUFFER_SIZE \ - ((23 * (BIT_MODEL_TOTAL_BITS - MOVE_BITS + 1) + 26 + 9) / 8) +#ifdef HAVE_SMALL +// Macros for (somewhat) size-optimized code. +#define seq_4(seq) seq -// Length decoders are easiest to have as macros, because they use range -// decoder macros, which use local variables rc_range and rc_code. +#define seq_6(seq) seq -#define length_decode(target, len_decoder, pos_state) \ +#define seq_8(seq) seq + +#define seq_len(seq) \ + seq ## _CHOICE, \ + seq ## _CHOICE2, \ + seq ## _BITTREE + +#define len_decode(target, ld, pos_state, seq) \ do { \ - if_bit_0(len_decoder.choice) { \ - update_bit_0(len_decoder.choice); \ - target = MATCH_MIN_LEN; \ - bittree_decode(target, len_decoder.low[pos_state], LEN_LOW_BITS); \ +case seq ## _CHOICE: \ + rc_if_0(ld.choice, seq ## _CHOICE) { \ + rc_update_0(ld.choice); \ + probs = ld.low[pos_state];\ + limit = LEN_LOW_SYMBOLS; \ + target = MATCH_LEN_MIN; \ } else { \ - update_bit_1(len_decoder.choice); \ - if_bit_0(len_decoder.choice2) { \ - update_bit_0(len_decoder.choice2); \ - target = MATCH_MIN_LEN + LEN_LOW_SYMBOLS; \ - bittree_decode(target, len_decoder.mid[pos_state], LEN_MID_BITS); \ + rc_update_1(ld.choice); \ +case seq ## _CHOICE2: \ + rc_if_0(ld.choice2, seq ## _CHOICE2) { \ + rc_update_0(ld.choice2); \ + probs = ld.mid[pos_state]; \ + limit = LEN_MID_SYMBOLS; \ + target = MATCH_LEN_MIN + LEN_LOW_SYMBOLS; \ } else { \ - update_bit_1(len_decoder.choice2); \ - target = MATCH_MIN_LEN + LEN_LOW_SYMBOLS + LEN_MID_SYMBOLS; \ - bittree_decode(target, len_decoder.high, LEN_HIGH_BITS); \ + rc_update_1(ld.choice2); \ + probs = ld.high; \ + limit = LEN_HIGH_SYMBOLS; \ + target = MATCH_LEN_MIN + LEN_LOW_SYMBOLS \ + + LEN_MID_SYMBOLS; \ } \ } \ + symbol = 1; \ +case seq ## _BITTREE: \ + do { \ + rc_bit(probs[symbol], , , seq ## _BITTREE); \ + } while (symbol < limit); \ + target += symbol - limit; \ } while (0) - -#define length_decode_dummy(target, len_decoder, pos_state) \ +#else // HAVE_SMALL + +// Unrolled versions +#define seq_4(seq) \ + seq ## 0, \ + seq ## 1, \ + seq ## 2, \ + seq ## 3 + +#define seq_6(seq) \ + seq ## 0, \ + seq ## 1, \ + seq ## 2, \ + seq ## 3, \ + seq ## 4, \ + seq ## 5 + +#define seq_8(seq) \ + seq ## 0, \ + seq ## 1, \ + seq ## 2, \ + seq ## 3, \ + seq ## 4, \ + seq ## 5, \ + seq ## 6, \ + seq ## 7 + +#define seq_len(seq) \ + seq ## _CHOICE, \ 
+ seq ## _LOW0, \ + seq ## _LOW1, \ + seq ## _LOW2, \ + seq ## _CHOICE2, \ + seq ## _MID0, \ + seq ## _MID1, \ + seq ## _MID2, \ + seq ## _HIGH0, \ + seq ## _HIGH1, \ + seq ## _HIGH2, \ + seq ## _HIGH3, \ + seq ## _HIGH4, \ + seq ## _HIGH5, \ + seq ## _HIGH6, \ + seq ## _HIGH7 + +#define len_decode(target, ld, pos_state, seq) \ do { \ - if_bit_0(len_decoder.choice) { \ - update_bit_0_dummy(); \ - target = MATCH_MIN_LEN; \ - bittree_decode_dummy(target, \ - len_decoder.low[pos_state], LEN_LOW_BITS); \ + symbol = 1; \ +case seq ## _CHOICE: \ + rc_if_0(ld.choice, seq ## _CHOICE) { \ + rc_update_0(ld.choice); \ + rc_bit_case(ld.low[pos_state][symbol], , , seq ## _LOW0); \ + rc_bit_case(ld.low[pos_state][symbol], , , seq ## _LOW1); \ + rc_bit_case(ld.low[pos_state][symbol], , , seq ## _LOW2); \ + target = symbol - LEN_LOW_SYMBOLS + MATCH_LEN_MIN; \ } else { \ - update_bit_1_dummy(); \ - if_bit_0(len_decoder.choice2) { \ - update_bit_0_dummy(); \ - target = MATCH_MIN_LEN + LEN_LOW_SYMBOLS; \ - bittree_decode_dummy(target, len_decoder.mid[pos_state], \ - LEN_MID_BITS); \ + rc_update_1(ld.choice); \ +case seq ## _CHOICE2: \ + rc_if_0(ld.choice2, seq ## _CHOICE2) { \ + rc_update_0(ld.choice2); \ + rc_bit_case(ld.mid[pos_state][symbol], , , \ + seq ## _MID0); \ + rc_bit_case(ld.mid[pos_state][symbol], , , \ + seq ## _MID1); \ + rc_bit_case(ld.mid[pos_state][symbol], , , \ + seq ## _MID2); \ + target = symbol - LEN_MID_SYMBOLS \ + + MATCH_LEN_MIN + LEN_LOW_SYMBOLS; \ } else { \ - update_bit_1_dummy(); \ - target = MATCH_MIN_LEN + LEN_LOW_SYMBOLS + LEN_MID_SYMBOLS; \ - bittree_decode_dummy(target, len_decoder.high, LEN_HIGH_BITS); \ + rc_update_1(ld.choice2); \ + rc_bit_case(ld.high[symbol], , , seq ## _HIGH0); \ + rc_bit_case(ld.high[symbol], , , seq ## _HIGH1); \ + rc_bit_case(ld.high[symbol], , , seq ## _HIGH2); \ + rc_bit_case(ld.high[symbol], , , seq ## _HIGH3); \ + rc_bit_case(ld.high[symbol], , , seq ## _HIGH4); \ + rc_bit_case(ld.high[symbol], , , seq ## _HIGH5); \ + rc_bit_case(ld.high[symbol], , , seq ## _HIGH6); \ + rc_bit_case(ld.high[symbol], , , seq ## _HIGH7); \ + target = symbol - LEN_HIGH_SYMBOLS \ + + MATCH_LEN_MIN \ + + LEN_LOW_SYMBOLS + LEN_MID_SYMBOLS; \ } \ } \ } while (0) +#endif // HAVE_SMALL + +/// Length decoder probabilities; see comments in lzma_common.h. typedef struct { probability choice; probability choice2; @@ -96,26 +169,12 @@ typedef struct { struct lzma_coder_s { - /// Data of the next coder, if any. - lzma_next_coder next; - - /// Sliding output window a.k.a. dictionary a.k.a. history buffer. - lzma_lz_decoder lz; - - // Range coder - lzma_range_decoder rc; - - // State - lzma_lzma_state state; - uint32_t rep0; ///< Distance of the latest match - uint32_t rep1; ///< Distance of second latest match - uint32_t rep2; ///< Distance of third latest match - uint32_t rep3; ///< Distance of fourth latest match - uint32_t pos_bits; - uint32_t pos_mask; - uint32_t now_pos; // Lowest 32-bits are enough here. + /////////////////// + // Probabilities // + /////////////////// - lzma_literal_coder literal_coder; + /// Literals; see comments in lzma_common.h. + probability literal[LITERAL_CODERS_MAX][LITERAL_CODER_SIZE]; /// If 1, it's a match. Otherwise it's a single 8-bit literal. probability is_match[STATES][POS_STATES_MAX]; @@ -138,178 +197,107 @@ struct lzma_coder_s { /// the length is decoded from rep_len_decoder. 
probability is_rep0_long[STATES][POS_STATES_MAX]; - probability pos_slot_decoder[LEN_TO_POS_STATES][1 << POS_SLOT_BITS]; - probability pos_decoders[FULL_DISTANCES - END_POS_MODEL_INDEX]; - probability pos_align_decoder[1 << ALIGN_BITS]; - - /// Length of a match - lzma_length_decoder match_len_decoder; - - /// Length of a repeated match. - lzma_length_decoder rep_len_decoder; - - /// True when we have produced at least one byte of output since the - /// beginning of the stream or the latest flush marker. - bool has_produced_output; -}; - - -/// \brief Check if the next iteration of the decoder loop can be run. -/// -/// \note There must always be REQUIRED_IN_BUFFER_SIZE bytes -/// readable space! -/// -static bool lzma_attribute((pure)) -decode_dummy(const lzma_coder *restrict coder, - const uint8_t *restrict in, size_t in_pos_local, - const size_t in_size, lzma_range_decoder rc, - uint32_t state, uint32_t rep0, const uint32_t now_pos) -{ - uint32_t rc_bound; - - do { - const uint32_t pos_state = now_pos & coder->pos_mask; - - if_bit_0(coder->is_match[state][pos_state]) { - // It's a literal i.e. a single 8-bit byte. - - update_bit_0_dummy(); - - const probability *subcoder = literal_get_subcoder( - coder->literal_coder, now_pos, lz_get_byte(coder->lz, 0)); - uint32_t symbol = 1; - - if (is_literal_state(state)) { - // Decode literal without match byte. - do { - if_bit_0(subcoder[symbol]) { - update_bit_0_dummy(); - symbol <<= 1; - } else { - update_bit_1_dummy(); - symbol = (symbol << 1) | 1; - } - } while (symbol < 0x100); - - } else { - // Decode literal with match byte. - uint32_t match_byte = lz_get_byte(coder->lz, rep0); - uint32_t subcoder_offset = 0x100; - - do { - match_byte <<= 1; - const uint32_t match_bit = match_byte & subcoder_offset; - const uint32_t subcoder_index - = subcoder_offset + match_bit + symbol; - - if_bit_0(subcoder[subcoder_index]) { - update_bit_0_dummy(); - symbol <<= 1; - subcoder_offset &= ~match_bit; - } else { - update_bit_1_dummy(); - symbol = (symbol << 1) | 1; - subcoder_offset &= match_bit; - } - } while (symbol < 0x100); - } - - break; - } - - update_bit_1_dummy(); - uint32_t len; - - if_bit_0(coder->is_rep[state]) { - update_bit_0_dummy(); - length_decode_dummy(len, coder->match_len_decoder, pos_state); - - const uint32_t len_to_pos_state = get_len_to_pos_state(len); - uint32_t pos_slot = 0; - bittree_decode_dummy(pos_slot, - coder->pos_slot_decoder[len_to_pos_state], POS_SLOT_BITS); - assert(pos_slot <= 63); - - if (pos_slot >= START_POS_MODEL_INDEX) { - uint32_t direct_bits = (pos_slot >> 1) - 1; - assert(direct_bits >= 1 && direct_bits <= 31); - rep0 = 2 | (pos_slot & 1); - - if (pos_slot < END_POS_MODEL_INDEX) { - assert(direct_bits <= 5); - rep0 <<= direct_bits; - assert(rep0 <= 96); - // -1 is fine, because bittree_reverse_decode() - // starts from table index [1] (not [0]). - assert((int32_t)(rep0 - pos_slot - 1) >= -1); - assert((int32_t)(rep0 - pos_slot - 1) <= 82); - // We add the result to rep0, so rep0 - // must not be part of second argument - // of the macro. - const int32_t offset = rep0 - pos_slot - 1; - bittree_reverse_decode_dummy(coder->pos_decoders + offset, - direct_bits); - } else { - assert(pos_slot >= 14); - assert(direct_bits >= 6); - direct_bits -= ALIGN_BITS; - assert(direct_bits >= 2); - rc_decode_direct_dummy(direct_bits); - - bittree_reverse_decode_dummy(coder->pos_align_decoder, - ALIGN_BITS); - } - } + /// Probability tree for the highest two bits of the match distance. 
+ /// There is a separate probability tree for match lengths of + /// 2 (i.e. MATCH_LEN_MIN), 3, 4, and [5, 273]. + probability pos_slot[LEN_TO_POS_STATES][POS_SLOTS]; - } else { - update_bit_1_dummy(); + /// Probility trees for additional bits for match distance when the + /// distance is in the range [4, 127]. + probability pos_special[FULL_DISTANCES - END_POS_MODEL_INDEX]; - if_bit_0(coder->is_rep0[state]) { - update_bit_0_dummy(); + /// Probability tree for the lowest four bits of a match distance + /// that is equal to or greater than 128. + probability pos_align[ALIGN_TABLE_SIZE]; - if_bit_0(coder->is_rep0_long[state][pos_state]) { - update_bit_0_dummy(); - break; - } else { - update_bit_1_dummy(); - } + /// Length of a normal match + lzma_length_decoder match_len_decoder; - } else { - update_bit_1_dummy(); + /// Length of a repeated match + lzma_length_decoder rep_len_decoder; - if_bit_0(coder->is_rep1[state]) { - update_bit_0_dummy(); - } else { - update_bit_1_dummy(); + /////////////////// + // Decoder state // + /////////////////// - if_bit_0(coder->is_rep2[state]) { - update_bit_0_dummy(); - } else { - update_bit_1_dummy(); - } - } - } + // Range coder + lzma_range_decoder rc; - length_decode_dummy(len, coder->rep_len_decoder, pos_state); - } - } while (0); + // Types of the most recently seen LZMA symbols + lzma_lzma_state state; - rc_normalize(); + uint32_t rep0; ///< Distance of the latest match + uint32_t rep1; ///< Distance of second latest match + uint32_t rep2; ///< Distance of third latest match + uint32_t rep3; ///< Distance of fourth latest match - return in_pos_local <= in_size; -} + uint32_t pos_mask; // (1U << pos_bits) - 1 + uint32_t literal_context_bits; + uint32_t literal_pos_mask; + + /// Uncompressed size as bytes, or LZMA_VLI_VALUE_UNKNOWN if end of + /// payload marker is expected. + lzma_vli uncompressed_size; + + //////////////////////////////// + // State of incomplete symbol // + //////////////////////////////// + + /// Position where to continue the decoder loop + enum { + SEQ_NORMALIZE, + SEQ_IS_MATCH, + seq_8(SEQ_LITERAL), + seq_8(SEQ_LITERAL_MATCHED), + SEQ_LITERAL_WRITE, + SEQ_IS_REP, + seq_len(SEQ_MATCH_LEN), + seq_6(SEQ_POS_SLOT), + SEQ_POS_MODEL, + SEQ_DIRECT, + seq_4(SEQ_ALIGN), + SEQ_EOPM, + SEQ_IS_REP0, + SEQ_SHORTREP, + SEQ_IS_REP0_LONG, + SEQ_IS_REP1, + SEQ_IS_REP2, + seq_len(SEQ_REP_LEN), + SEQ_COPY, + } sequence; + + /// Base of the current probability tree + probability *probs; + + /// Symbol being decoded. This is also used as an index variable in + /// bittree decoders: probs[symbol] + uint32_t symbol; + + /// Used as a loop termination condition on bittree decoders and + /// direct bits decoder. + uint32_t limit; + + /// Matched literal decoder: 0x100 or 0 to help avoiding branches. + /// Bittree reverse decoders: Offset of the next bit: 1 << offset + uint32_t offset; + + /// If decoding a literal: match byte. + /// If decoding a match: length of the match. 
+ uint32_t len; +}; -static bool -decode_real(lzma_coder *restrict coder, const uint8_t *restrict in, - size_t *restrict in_pos, size_t in_size, bool has_safe_buffer) +static lzma_ret +lzma_decode(lzma_coder *restrict coder, lzma_dict *restrict dictptr, + const uint8_t *restrict in, + size_t *restrict in_pos, size_t in_size) { //////////////////// // Initialization // //////////////////// if (!rc_read_init(&coder->rc, in, in_pos, in_size)) - return false; + return LZMA_OK; /////////////// // Variables // @@ -318,8 +306,12 @@ decode_real(lzma_coder *restrict coder, const uint8_t *restrict in, // Making local copies of often-used variables improves both // speed and readability. + lzma_dict dict = *dictptr; + + const size_t dict_start = dict.pos; + // Range decoder - rc_to_local(coder->rc); + rc_to_local(coder->rc, *in_pos); // State uint32_t state = coder->state; @@ -328,87 +320,168 @@ decode_real(lzma_coder *restrict coder, const uint8_t *restrict in, uint32_t rep2 = coder->rep2; uint32_t rep3 = coder->rep3; - // Misc - uint32_t now_pos = coder->now_pos; - bool has_produced_output = coder->has_produced_output; - - // Variables derived from decoder settings const uint32_t pos_mask = coder->pos_mask; - size_t in_pos_local = *in_pos; // Local copy - size_t in_limit; - if (in_size <= REQUIRED_IN_BUFFER_SIZE) - in_limit = 0; - else - in_limit = in_size - REQUIRED_IN_BUFFER_SIZE; - - - while (coder->lz.pos < coder->lz.limit - && (in_pos_local < in_limit || (has_safe_buffer - && decode_dummy(coder, in, in_pos_local, in_size, - rc, state, rep0, now_pos)))) { - - ///////////////////// - // Actual decoding // - ///////////////////// - - const uint32_t pos_state = now_pos & pos_mask; + // These variables are actually needed only if we last time ran + // out of input in the middle of the decoder loop. + probability *probs = coder->probs; + uint32_t symbol = coder->symbol; + uint32_t limit = coder->limit; + uint32_t offset = coder->offset; + uint32_t len = coder->len; + + const uint32_t literal_pos_mask = coder->literal_pos_mask; + const uint32_t literal_context_bits = coder->literal_context_bits; + + // Temporary variables + uint32_t pos_state = dict.pos & pos_mask; + + lzma_ret ret = LZMA_OK; + + // If uncompressed size is known, there must be no end of payload + // marker. + const bool no_eopm = coder->uncompressed_size + != LZMA_VLI_VALUE_UNKNOWN; + if (no_eopm && coder->uncompressed_size < dict.limit - dict.pos) + dict.limit = dict.pos + (size_t)(coder->uncompressed_size); + + // The main decoder loop. The "switch" is used to restart the decoder at + // correct location. Once restarted, the "switch" is no longer used. + switch (coder->sequence) + while (true) { + // Calculate new pos_state. This is skipped on the first loop + // since we already calculated it when setting up the local + // variables. + pos_state = dict.pos & pos_mask; + + case SEQ_NORMALIZE: + case SEQ_IS_MATCH: + if (unlikely(no_eopm && dict.pos == dict.limit)) + break; - if_bit_0(coder->is_match[state][pos_state]) { - update_bit_0(coder->is_match[state][pos_state]); + rc_if_0(coder->is_match[state][pos_state], SEQ_IS_MATCH) { + rc_update_0(coder->is_match[state][pos_state]); // It's a literal i.e. a single 8-bit byte. 
- probability *subcoder = literal_get_subcoder(coder->literal_coder, - now_pos, lz_get_byte(coder->lz, 0)); - uint32_t symbol = 1; + probs = literal_subcoder(coder->literal, + literal_context_bits, literal_pos_mask, + dict.pos, dict_get(&dict, 0)); + symbol = 1; if (is_literal_state(state)) { // Decode literal without match byte. +#ifdef HAVE_SMALL + case SEQ_LITERAL: do { - if_bit_0(subcoder[symbol]) { - update_bit_0(subcoder[symbol]); - symbol <<= 1; - } else { - update_bit_1(subcoder[symbol]); - symbol = (symbol << 1) | 1; - } - } while (symbol < 0x100); - + rc_bit(probs[symbol], , , SEQ_LITERAL); + } while (symbol < (1 << 8)); +#else + rc_bit_case(probs[symbol], , , SEQ_LITERAL0); + rc_bit_case(probs[symbol], , , SEQ_LITERAL1); + rc_bit_case(probs[symbol], , , SEQ_LITERAL2); + rc_bit_case(probs[symbol], , , SEQ_LITERAL3); + rc_bit_case(probs[symbol], , , SEQ_LITERAL4); + rc_bit_case(probs[symbol], , , SEQ_LITERAL5); + rc_bit_case(probs[symbol], , , SEQ_LITERAL6); + rc_bit_case(probs[symbol], , , SEQ_LITERAL7); +#endif } else { // Decode literal with match byte. // - // The usage of subcoder_offset allows omitting some - // branches, which should give tiny speed improvement on - // some CPUs. subcoder_offset gets set to zero if match_bit - // didn't match. - uint32_t match_byte = lz_get_byte(coder->lz, rep0); - uint32_t subcoder_offset = 0x100; - + // We store the byte we compare against + // ("match byte") to "len" to minimize the + // number of variables we need to store + // between decoder calls. + len = dict_get(&dict, rep0) << 1; + + // The usage of "offset" allows omitting some + // branches, which should give tiny speed + // improvement on some CPUs. "offset" gets + // set to zero if match_bit didn't match. + offset = 0x100; + +#ifdef HAVE_SMALL + case SEQ_LITERAL_MATCHED: do { - match_byte <<= 1; - const uint32_t match_bit = match_byte & subcoder_offset; + const uint32_t match_bit + = len & offset; const uint32_t subcoder_index - = subcoder_offset + match_bit + symbol; + = offset + match_bit + + symbol; + + rc_bit(probs[subcoder_index], + offset &= ~match_bit, + offset &= match_bit, + SEQ_LITERAL_MATCHED); + + // It seems to be faster to do this + // here instead of putting it to the + // beginning of the loop and then + // putting the "case" in the middle + // of the loop. + len <<= 1; + + } while (symbol < (1 << 8)); +#else + // Unroll the loop. + uint32_t match_bit; + uint32_t subcoder_index; + +# define d(seq) \ + case seq: \ + match_bit = len & offset; \ + subcoder_index = offset + match_bit + symbol; \ + rc_bit(probs[subcoder_index], \ + offset &= ~match_bit, \ + offset &= match_bit, \ + seq) + + d(SEQ_LITERAL_MATCHED0); + len <<= 1; + d(SEQ_LITERAL_MATCHED1); + len <<= 1; + d(SEQ_LITERAL_MATCHED2); + len <<= 1; + d(SEQ_LITERAL_MATCHED3); + len <<= 1; + d(SEQ_LITERAL_MATCHED4); + len <<= 1; + d(SEQ_LITERAL_MATCHED5); + len <<= 1; + d(SEQ_LITERAL_MATCHED6); + len <<= 1; + d(SEQ_LITERAL_MATCHED7); +# undef d +#endif + } - if_bit_0(subcoder[subcoder_index]) { - update_bit_0(subcoder[subcoder_index]); - symbol <<= 1; - subcoder_offset &= ~match_bit; - } else { - update_bit_1(subcoder[subcoder_index]); - symbol = (symbol << 1) | 1; - subcoder_offset &= match_bit; - } - } while (symbol < 0x100); + //update_literal(state); + // Use a lookup table to update to literal state, + // since compared to other state updates, this would + // need two branches. 
+ static const lzma_lzma_state next_state[] = { + STATE_LIT_LIT, + STATE_LIT_LIT, + STATE_LIT_LIT, + STATE_LIT_LIT, + STATE_MATCH_LIT_LIT, + STATE_REP_LIT_LIT, + STATE_SHORTREP_LIT_LIT, + STATE_MATCH_LIT, + STATE_REP_LIT, + STATE_SHORTREP_LIT, + STATE_MATCH_LIT, + STATE_REP_LIT + }; + state = next_state[state]; + + case SEQ_LITERAL_WRITE: + if (unlikely(dict_put(&dict, symbol))) { + coder->sequence = SEQ_LITERAL_WRITE; + goto out; } - // Put the decoded byte to the dictionary, update the - // decoder state, and start a new decoding loop. - coder->lz.dict[coder->lz.pos++] = (uint8_t)(symbol); - ++now_pos; - update_literal(state); - has_produced_output = true; continue; } @@ -416,115 +489,196 @@ decode_real(lzma_coder *restrict coder, const uint8_t *restrict in, // (distance and length) which will be repeated from our // output history. - update_bit_1(coder->is_match[state][pos_state]); - uint32_t len; - - if_bit_0(coder->is_rep[state]) { - update_bit_0(coder->is_rep[state]); + rc_update_1(coder->is_match[state][pos_state]); + case SEQ_IS_REP: + rc_if_0(coder->is_rep[state], SEQ_IS_REP) { // Not a repeated match - // - // We will decode a new distance and store - // the value to distance. - - // Decode the length of the match. - length_decode(len, coder->match_len_decoder, pos_state); - + rc_update_0(coder->is_rep[state]); update_match(state); - const uint32_t len_to_pos_state = get_len_to_pos_state(len); - uint32_t pos_slot = 0; - bittree_decode(pos_slot, - coder->pos_slot_decoder[len_to_pos_state], POS_SLOT_BITS); - assert(pos_slot <= 63); - - if (pos_slot >= START_POS_MODEL_INDEX) { - uint32_t direct_bits = (pos_slot >> 1) - 1; - assert(direct_bits >= 1 && direct_bits <= 30); - uint32_t distance = 2 | (pos_slot & 1); - - if (pos_slot < END_POS_MODEL_INDEX) { - assert(direct_bits <= 5); - distance <<= direct_bits; - assert(distance <= 96); - // -1 is fine, because - // bittree_reverse_decode() - // starts from table index [1] - // (not [0]). - assert((int32_t)(distance - pos_slot - 1) >= -1); - assert((int32_t)(distance - pos_slot - 1) <= 82); - // We add the result to distance, so distance - // must not be part of second argument - // of the macro. - const int32_t offset = distance - pos_slot - 1; - bittree_reverse_decode(distance, coder->pos_decoders + offset, - direct_bits); + // The latest three match distances are kept in + // memory in case there are repeated matches. + rep3 = rep2; + rep2 = rep1; + rep1 = rep0; + + // Decode the length of the match. + len_decode(len, coder->match_len_decoder, + pos_state, SEQ_MATCH_LEN); + + // Prepare to decode the highest two bits of the + // match distance. + probs = coder->pos_slot[get_len_to_pos_state(len)]; + symbol = 1; + +#ifdef HAVE_SMALL + case SEQ_POS_SLOT: + do { + rc_bit(probs[symbol], , , SEQ_POS_SLOT); + } while (symbol < POS_SLOTS); +#else + rc_bit_case(probs[symbol], , , SEQ_POS_SLOT0); + rc_bit_case(probs[symbol], , , SEQ_POS_SLOT1); + rc_bit_case(probs[symbol], , , SEQ_POS_SLOT2); + rc_bit_case(probs[symbol], , , SEQ_POS_SLOT3); + rc_bit_case(probs[symbol], , , SEQ_POS_SLOT4); + rc_bit_case(probs[symbol], , , SEQ_POS_SLOT5); +#endif + // Get rid of the highest bit that was needed for + // indexing of the probability array. + symbol -= POS_SLOTS; + assert(symbol <= 63); + + if (symbol < START_POS_MODEL_INDEX) { + // Match distances [0, 3] have only two bits. + rep0 = symbol; + } else { + // Decode the lowest [1, 29] bits of + // the match distance. 
+ limit = (symbol >> 1) - 1; + assert(limit >= 1 && limit <= 30); + rep0 = 2 + (symbol & 1); + + if (symbol < END_POS_MODEL_INDEX) { + // Prepare to decode the low bits for + // a distance of [4, 127]. + assert(limit <= 5); + rep0 <<= limit; + assert(rep0 <= 96); + // -1 is fine, because we start + // decoding at probs[1], not probs[0]. + // NOTE: This violates the C standard, + // since we are doing pointer + // arithmetic past the beginning of + // the array. + assert((int32_t)(rep0 - symbol - 1) + >= -1); + assert((int32_t)(rep0 - symbol - 1) + <= 82); + probs = coder->pos_special + rep0 + - symbol - 1; + symbol = 1; + offset = 0; + case SEQ_POS_MODEL: +#ifdef HAVE_SMALL + do { + rc_bit(probs[symbol], , + rep0 += 1 << offset, + SEQ_POS_MODEL); + } while (++offset < limit); +#else + switch (limit) { + case 5: + assert(offset == 0); + rc_bit(probs[symbol], , + rep0 += 1, + SEQ_POS_MODEL); + ++offset; + --limit; + case 4: + rc_bit(probs[symbol], , + rep0 += 1 << offset, + SEQ_POS_MODEL); + ++offset; + --limit; + case 3: + rc_bit(probs[symbol], , + rep0 += 1 << offset, + SEQ_POS_MODEL); + ++offset; + --limit; + case 2: + rc_bit(probs[symbol], , + rep0 += 1 << offset, + SEQ_POS_MODEL); + ++offset; + --limit; + case 1: + // We need "symbol" only for + // indexing the probability + // array, thus we can use + // rc_bit_last() here to omit + // the unneeded updating of + // "symbol". + rc_bit_last(probs[symbol], , + rep0 += 1 << offset, + SEQ_POS_MODEL); + } +#endif } else { - assert(pos_slot >= 14); - assert(direct_bits >= 6); - direct_bits -= ALIGN_BITS; - assert(direct_bits >= 2); - rc_decode_direct(distance, direct_bits); - distance <<= ALIGN_BITS; - - bittree_reverse_decode(distance, coder->pos_align_decoder, - ALIGN_BITS); - - if (distance == UINT32_MAX) { - if (len == LEN_SPECIAL_EOPM) { - // End of Payload Marker found. - coder->lz.eopm_detected = true; - break; - - } else if (len == LEN_SPECIAL_FLUSH) { - // Flush marker detected. We must have produced - // at least one byte of output since the previous - // flush marker or the beginning of the stream. - // This is to prevent hanging the decoder with - // malicious input files. - if (!has_produced_output) - return true; - - has_produced_output = false; - - // We know that we have enough input to call - // this macro, because it is tested at the - // end of decode_dummy(). - rc_normalize(); - - rc_reset(rc); - - // If we don't have enough input here, we jump - // out of the loop. Note that while there is a - // useless call to rc_normalize(), it does nothing - // since we have just reset the range decoder. - if (!rc_read_init(&rc, in, &in_pos_local, in_size)) - break; - - continue; - - } else { - return true; + // The distace is >= 128. Decode the + // lower bits without probabilities + // except the lowest four bits. + assert(symbol >= 14); + assert(limit >= 6); + limit -= ALIGN_BITS; + assert(limit >= 2); + case SEQ_DIRECT: + // Not worth manual unrolling + do { + rc_direct(rep0, SEQ_DIRECT); + } while (--limit > 0); + + // Decode the lowest four bits using + // probabilities. 
+ rep0 <<= ALIGN_BITS; + symbol = 1; +#ifdef HAVE_SMALL + offset = 0; + case SEQ_ALIGN: + do { + rc_bit(coder->pos_align[ + symbol], , + rep0 += 1 << offset, + SEQ_ALIGN); + } while (++offset < ALIGN_BITS); +#else + case SEQ_ALIGN0: + rc_bit(coder->pos_align[symbol], , + rep0 += 1, SEQ_ALIGN0); + case SEQ_ALIGN1: + rc_bit(coder->pos_align[symbol], , + rep0 += 2, SEQ_ALIGN1); + case SEQ_ALIGN2: + rc_bit(coder->pos_align[symbol], , + rep0 += 4, SEQ_ALIGN2); + case SEQ_ALIGN3: + // Like in SEQ_POS_MODEL, we don't + // need "symbol" for anything else + // than indexing the probability array. + rc_bit_last(coder->pos_align[symbol], , + rep0 += 8, SEQ_ALIGN3); +#endif + + if (rep0 == UINT32_MAX) { + // End of payload marker was + // found. It must not be + // present if uncompressed + // size is known. + if (coder->uncompressed_size + != LZMA_VLI_VALUE_UNKNOWN) { + ret = LZMA_DATA_ERROR; + goto out; } + + case SEQ_EOPM: + // TODO Comment + rc_normalize(SEQ_EOPM); + ret = LZMA_STREAM_END; + goto out; } } + } - // The latest three match distances are kept in - // memory in case there are repeated matches. - rep3 = rep2; - rep2 = rep1; - rep1 = rep0; - rep0 = distance; - - } else { - rep3 = rep2; - rep2 = rep1; - rep1 = rep0; - rep0 = pos_slot; + // Validate the distance we just decoded. + if (unlikely(!dict_is_distance_valid(&dict, rep0))) { + ret = LZMA_DATA_ERROR; + goto out; } } else { - update_bit_1(coder->is_rep[state]); + rc_update_1(coder->is_rep[state]); // Repeated match // @@ -532,242 +686,318 @@ decode_real(lzma_coder *restrict coder, const uint8_t *restrict in, // earlier. The latest four match distances are // available as rep0, rep1, rep2 and rep3. We will // now decode which of them is the new distance. + // + // There cannot be a match if we haven't produced + // any output, so check that first. + if (unlikely(!dict_is_distance_valid(&dict, 0))) { + ret = LZMA_DATA_ERROR; + goto out; + } - if_bit_0(coder->is_rep0[state]) { - update_bit_0(coder->is_rep0[state]); - + case SEQ_IS_REP0: + rc_if_0(coder->is_rep0[state], SEQ_IS_REP0) { + rc_update_0(coder->is_rep0[state]); // The distance is rep0. - if_bit_0(coder->is_rep0_long[state][pos_state]) { - update_bit_0(coder->is_rep0_long[state][pos_state]); + case SEQ_IS_REP0_LONG: + rc_if_0(coder->is_rep0_long[state][pos_state], + SEQ_IS_REP0_LONG) { + rc_update_0(coder->is_rep0_long[ + state][pos_state]); update_short_rep(state); - // Repeat exactly one byte and start a new decoding loop. - // Note that rep0 is known to have a safe value, thus we - // don't need to check if we are wrapping the dictionary - // when it isn't full yet. - if (unlikely(lz_is_empty(coder->lz))) - return true; - - coder->lz.dict[coder->lz.pos] - = lz_get_byte(coder->lz, rep0); - ++coder->lz.pos; - ++now_pos; - has_produced_output = true; - continue; - - } else { - update_bit_1(coder->is_rep0_long[state][pos_state]); + case SEQ_SHORTREP: + if (unlikely(dict_put(&dict, dict_get( + &dict, rep0)))) { + coder->sequence = SEQ_SHORTREP; + goto out; + } - // Repeating more than one byte at - // distance of rep0. + continue; } + // Repeating more than one byte at + // distance of rep0. + rc_update_1(coder->is_rep0_long[ + state][pos_state]); + } else { - update_bit_1(coder->is_rep0[state]); + rc_update_1(coder->is_rep0[state]); + case SEQ_IS_REP1: // The distance is rep1, rep2 or rep3. Once // we find out which one of these three, it // is stored to rep0 and rep1, rep2 and rep3 // are updated accordingly. 
+ rc_if_0(coder->is_rep1[state], SEQ_IS_REP1) { + rc_update_0(coder->is_rep1[state]); - uint32_t distance; + const uint32_t distance = rep1; + rep1 = rep0; + rep0 = distance; - if_bit_0(coder->is_rep1[state]) { - update_bit_0(coder->is_rep1[state]); - distance = rep1; } else { - update_bit_1(coder->is_rep1[state]); + rc_update_1(coder->is_rep1[state]); + case SEQ_IS_REP2: + rc_if_0(coder->is_rep2[state], + SEQ_IS_REP2) { + rc_update_0(coder->is_rep2[ + state]); + + const uint32_t distance = rep2; + rep2 = rep1; + rep1 = rep0; + rep0 = distance; - if_bit_0(coder->is_rep2[state]) { - update_bit_0(coder->is_rep2[state]); - distance = rep2; } else { - update_bit_1(coder->is_rep2[state]); - distance = rep3; + rc_update_1(coder->is_rep2[ + state]); + + const uint32_t distance = rep3; rep3 = rep2; + rep2 = rep1; + rep1 = rep0; + rep0 = distance; } - - rep2 = rep1; } - - rep1 = rep0; - rep0 = distance; } update_long_rep(state); // Decode the length of the repeated match. - length_decode(len, coder->rep_len_decoder, pos_state); + len_decode(len, coder->rep_len_decoder, + pos_state, SEQ_REP_LEN); } - ///////////////////////////////// // Repeat from history buffer. // ///////////////////////////////// // The length is always between these limits. There is no way // to trigger the algorithm to set len outside this range. - assert(len >= MATCH_MIN_LEN); - assert(len <= MATCH_MAX_LEN); - - now_pos += len; - has_produced_output = true; + assert(len >= MATCH_LEN_MIN); + assert(len <= MATCH_LEN_MAX); + case SEQ_COPY: // Repeat len bytes from distance of rep0. - if (!lzma_lz_out_repeat(&coder->lz, rep0, len)) - return true; + if (unlikely(dict_repeat(&dict, rep0, &len))) { + coder->sequence = SEQ_COPY; + goto out; + } } - rc_normalize(); + rc_normalize(SEQ_NORMALIZE); + coder->sequence = SEQ_IS_MATCH; +out: + // Save state - ///////////////////////////////// - // Update the *data structure. // - ///////////////////////////////// + // NOTE: Must not copy dict.limit. + dictptr->pos = dict.pos; + dictptr->full = dict.full; - // Range decoder - rc_from_local(coder->rc); + rc_from_local(coder->rc, *in_pos); - // State coder->state = state; coder->rep0 = rep0; coder->rep1 = rep1; coder->rep2 = rep2; coder->rep3 = rep3; - // Misc - coder->now_pos = now_pos; - coder->has_produced_output = has_produced_output; - *in_pos = in_pos_local; + coder->probs = probs; + coder->symbol = symbol; + coder->limit = limit; + coder->offset = offset; + coder->len = len; + + // Update the remaining amount of uncompressed data if uncompressed + // size was known. + if (coder->uncompressed_size != LZMA_VLI_VALUE_UNKNOWN) { + coder->uncompressed_size -= dict.pos - dict_start; + + // Since there cannot be end of payload marker if the + // uncompressed size was known, we check here if we + // finished decoding. + if (coder->uncompressed_size == 0 && ret == LZMA_OK + && coder->sequence != SEQ_NORMALIZE) + ret = coder->sequence == SEQ_IS_MATCH + ? LZMA_STREAM_END : LZMA_DATA_ERROR; + } + + // We can do an additional check in the range decoder to catch some + // corrupted files. + if (ret == LZMA_STREAM_END) { + if (!rc_is_finished(coder->rc)) + ret = LZMA_DATA_ERROR; - return false; + // Reset the range decoder so that it is ready to reinitialize + // for a new LZMA2 chunk. 
+ rc_reset(coder->rc); + } + + return ret; } + static void -lzma_decoder_end(lzma_coder *coder, lzma_allocator *allocator) +lzma_decoder_uncompressed(lzma_coder *coder, lzma_vli uncompressed_size) { - lzma_next_coder_end(&coder->next, allocator); - lzma_lz_decoder_end(&coder->lz, allocator); - lzma_free(coder, allocator); - return; + coder->uncompressed_size = uncompressed_size; } - -extern lzma_ret -lzma_lzma_decoder_init(lzma_next_coder *next, lzma_allocator *allocator, - const lzma_filter_info *filters) +/* +extern void +lzma_lzma_decoder_uncompressed(void *coder_ptr, lzma_vli uncompressed_size) { - // LZMA can only be the last filter in the chain. - assert(filters[1].init == NULL); - - // Validate pos_bits. Other options are validated by the - // respective initialization functions. - const lzma_options_lzma *options = filters[0].options; - if (options->pos_bits > LZMA_POS_BITS_MAX) - return LZMA_HEADER_ERROR; + // This is hack. + (*(lzma_coder **)(coder))->uncompressed_size = uncompressed_size; +} +*/ - // Allocate memory for the decoder if needed. - if (next->coder == NULL) { - next->coder = lzma_alloc(sizeof(lzma_coder), allocator); - if (next->coder == NULL) - return LZMA_MEM_ERROR; +static void +lzma_decoder_reset(lzma_coder *coder, const void *opt) +{ + const lzma_options_lzma *options = opt; - next->code = &lzma_lz_decode; - next->end = &lzma_decoder_end; - next->coder->next = LZMA_NEXT_CODER_INIT; - next->coder->lz = LZMA_LZ_DECODER_INIT; - } + // NOTE: We assume that lc/lp/pb are valid since they were + // successfully decoded with lzma_lzma_decode_properties(). + // FIXME? - // Store the pos_bits and calculate pos_mask. - next->coder->pos_bits = options->pos_bits; - next->coder->pos_mask = (1U << next->coder->pos_bits) - 1; + // Calculate pos_mask. We don't need pos_bits as is for anything. + coder->pos_mask = (1U << options->pos_bits) - 1; // Initialize the literal decoder. - return_if_error(lzma_literal_init(&next->coder->literal_coder, - options->literal_context_bits, - options->literal_pos_bits)); + literal_init(coder->literal, options->literal_context_bits, + options->literal_pos_bits); - // Allocate and initialize the LZ decoder. 
- return_if_error(lzma_lz_decoder_reset(&next->coder->lz, allocator, - &decode_real, options->dictionary_size, - MATCH_MAX_LEN)); + coder->literal_context_bits = options->literal_context_bits; + coder->literal_pos_mask = (1 << options->literal_pos_bits) - 1; // State - next->coder->state = 0; - next->coder->rep0 = 0; - next->coder->rep1 = 0; - next->coder->rep2 = 0; - next->coder->rep3 = 0; - next->coder->pos_bits = options->pos_bits; - next->coder->pos_mask = (1 << next->coder->pos_bits) - 1; - next->coder->now_pos = 0; + coder->state = STATE_LIT_LIT; + coder->rep0 = 0; + coder->rep1 = 0; + coder->rep2 = 0; + coder->rep3 = 0; + coder->pos_mask = (1 << options->pos_bits) - 1; // Range decoder - rc_reset(next->coder->rc); + rc_reset(coder->rc); // Bit and bittree decoders for (uint32_t i = 0; i < STATES; ++i) { - for (uint32_t j = 0; j <= next->coder->pos_mask; ++j) { - bit_reset(next->coder->is_match[i][j]); - bit_reset(next->coder->is_rep0_long[i][j]); + for (uint32_t j = 0; j <= coder->pos_mask; ++j) { + bit_reset(coder->is_match[i][j]); + bit_reset(coder->is_rep0_long[i][j]); } - bit_reset(next->coder->is_rep[i]); - bit_reset(next->coder->is_rep0[i]); - bit_reset(next->coder->is_rep1[i]); - bit_reset(next->coder->is_rep2[i]); + bit_reset(coder->is_rep[i]); + bit_reset(coder->is_rep0[i]); + bit_reset(coder->is_rep1[i]); + bit_reset(coder->is_rep2[i]); } for (uint32_t i = 0; i < LEN_TO_POS_STATES; ++i) - bittree_reset(next->coder->pos_slot_decoder[i], POS_SLOT_BITS); + bittree_reset(coder->pos_slot[i], POS_SLOT_BITS); for (uint32_t i = 0; i < FULL_DISTANCES - END_POS_MODEL_INDEX; ++i) - bit_reset(next->coder->pos_decoders[i]); + bit_reset(coder->pos_special[i]); - bittree_reset(next->coder->pos_align_decoder, ALIGN_BITS); + bittree_reset(coder->pos_align, ALIGN_BITS); // Len decoders (also bit/bittree) - const uint32_t num_pos_states = 1 << next->coder->pos_bits; - bit_reset(next->coder->match_len_decoder.choice); - bit_reset(next->coder->match_len_decoder.choice2); - bit_reset(next->coder->rep_len_decoder.choice); - bit_reset(next->coder->rep_len_decoder.choice2); + const uint32_t num_pos_states = 1 << options->pos_bits; + bit_reset(coder->match_len_decoder.choice); + bit_reset(coder->match_len_decoder.choice2); + bit_reset(coder->rep_len_decoder.choice); + bit_reset(coder->rep_len_decoder.choice2); for (uint32_t pos_state = 0; pos_state < num_pos_states; ++pos_state) { - bittree_reset(next->coder->match_len_decoder.low[pos_state], + bittree_reset(coder->match_len_decoder.low[pos_state], LEN_LOW_BITS); - bittree_reset(next->coder->match_len_decoder.mid[pos_state], + bittree_reset(coder->match_len_decoder.mid[pos_state], LEN_MID_BITS); - bittree_reset(next->coder->rep_len_decoder.low[pos_state], + bittree_reset(coder->rep_len_decoder.low[pos_state], LEN_LOW_BITS); - bittree_reset(next->coder->rep_len_decoder.mid[pos_state], + bittree_reset(coder->rep_len_decoder.mid[pos_state], LEN_MID_BITS); } - bittree_reset(next->coder->match_len_decoder.high, LEN_HIGH_BITS); - bittree_reset(next->coder->rep_len_decoder.high, LEN_HIGH_BITS); + bittree_reset(coder->match_len_decoder.high, LEN_HIGH_BITS); + bittree_reset(coder->rep_len_decoder.high, LEN_HIGH_BITS); + + coder->sequence = SEQ_IS_MATCH; + coder->probs = NULL; + coder->symbol = 0; + coder->limit = 0; + coder->offset = 0; + coder->len = 0; + + return; +} + + +extern lzma_ret +lzma_lzma_decoder_create(lzma_lz_decoder *lz, lzma_allocator *allocator, + const void *opt, size_t *dict_size) +{ + if (lz->coder == NULL) { + lz->coder = 
lzma_alloc(sizeof(lzma_coder), allocator); + if (lz->coder == NULL) + return LZMA_MEM_ERROR; + + lz->code = &lzma_decode; + lz->reset = &lzma_decoder_reset; + lz->set_uncompressed = &lzma_decoder_uncompressed; + } - next->coder->has_produced_output = false; + // All dictionary sizes are OK here. LZ decoder will take care of + // the special cases. + const lzma_options_lzma *options = opt; + *dict_size = options->dictionary_size; return LZMA_OK; } -extern void -lzma_lzma_decoder_uncompressed_size( - lzma_next_coder *next, lzma_vli uncompressed_size) +/// Allocate and initialize LZMA decoder. This is used only via LZ +/// initialization (lzma_lzma_decoder_init() passes function pointer to +/// the LZ initialization). +static lzma_ret +lzma_decoder_init(lzma_lz_decoder *lz, lzma_allocator *allocator, + const void *options, size_t *dict_size) { - next->coder->lz.uncompressed_size = uncompressed_size; - return; + if (!is_lclppb_valid(options)) + return LZMA_PROG_ERROR; + + return_if_error(lzma_lzma_decoder_create( + lz, allocator, options, dict_size)); + + lzma_decoder_reset(lz->coder, options); + lzma_decoder_uncompressed(lz->coder, LZMA_VLI_VALUE_UNKNOWN); + + return LZMA_OK; +} + + +extern lzma_ret +lzma_lzma_decoder_init(lzma_next_coder *next, lzma_allocator *allocator, + const lzma_filter_info *filters) +{ + // LZMA can only be the last filter in the chain. This is enforced + // by the raw_decoder initialization. + assert(filters[1].init == NULL); + + return lzma_lz_decoder_init(next, allocator, filters, + &lzma_decoder_init); } extern bool -lzma_lzma_decode_properties(lzma_options_lzma *options, uint8_t byte) +lzma_lzma_lclppb_decode(lzma_options_lzma *options, uint8_t byte) { if (byte > (4 * 5 + 4) * 9 + 8) return true; @@ -781,3 +1011,49 @@ lzma_lzma_decode_properties(lzma_options_lzma *options, uint8_t byte) return options->literal_context_bits + options->literal_pos_bits > LZMA_LITERAL_BITS_MAX; } + + +extern uint64_t +lzma_lzma_decoder_memusage(const void *options) +{ + const lzma_options_lzma *const opt = options; + const uint64_t lz_memusage + = lzma_lz_decoder_memusage(opt->dictionary_size); + if (lz_memusage == UINT64_MAX) + return UINT64_MAX; + + return sizeof(lzma_coder) + lz_memusage; +} + + +extern lzma_ret +lzma_lzma_props_decode(void **options, lzma_allocator *allocator, + const uint8_t *props, size_t props_size) +{ + if (props_size != 5) + return LZMA_HEADER_ERROR; + + lzma_options_lzma *opt + = lzma_alloc(sizeof(lzma_options_lzma), allocator); + if (opt == NULL) + return LZMA_MEM_ERROR; + + if (lzma_lzma_lclppb_decode(opt, props[0])) + goto error; + + // All dictionary sizes are accepted, including zero. LZ decoder + // will automatically use a dictionary at least a few KiB even if + // a smaller dictionary is requested. + opt->dictionary_size = integer_read_32(props + 1); + + opt->preset_dictionary = NULL; + opt->preset_dictionary_size = 0; + + *options = opt; + + return LZMA_OK; + +error: + lzma_free(opt, allocator); + return LZMA_HEADER_ERROR; +} |
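
For reference, the properties byte validated at the top of lzma_lzma_lclppb_decode() packs the three literal/position parameters as (pb * 5 + lp) * 9 + lc, which is why (4 * 5 + 4) * 9 + 8, i.e. 224, is the largest legal value. The middle of that function is elided from the hunk above, so the following is only a hedged sketch of such an unpacking with hypothetical names (toy_lclppb, toy_lclppb_decode), not the lzma_options_lzma fields themselves.

/* Hypothetical illustration only: unpacking an LZMA properties byte of the
 * form (pb * 5 + lp) * 9 + lc. The range check matches the one visible in
 * lzma_lzma_lclppb_decode() above. */
#include <stdbool.h>
#include <stdint.h>

struct toy_lclppb {
	uint32_t lc;	/* literal_context_bits, 0..8 */
	uint32_t lp;	/* literal_pos_bits,     0..4 */
	uint32_t pb;	/* pos_bits,             0..4 */
};

/* Returns true on error, matching the convention used in the patch. */
static bool
toy_lclppb_decode(struct toy_lclppb *opt, uint8_t byte)
{
	if (byte > (4 * 5 + 4) * 9 + 8)
		return true;

	opt->lc = byte % 9;
	byte /= 9;
	opt->lp = byte % 5;
	opt->pb = byte / 5;

	return false;
}

As the hunk shows, the real function additionally rejects combinations where literal_context_bits + literal_pos_bits exceed LZMA_LITERAL_BITS_MAX, and lzma_lzma_props_decode() reads the remaining four bytes of the 5-byte properties blob as the dictionary size via integer_read_32().
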