aboutsummaryrefslogtreecommitdiff
path: root/src/liblzma/lzma/lzma_encoder.c
diff options
context:
space:
mode:
Diffstat (limited to 'src/liblzma/lzma/lzma_encoder.c')
-rw-r--r--src/liblzma/lzma/lzma_encoder.c551
1 files changed, 261 insertions, 290 deletions
diff --git a/src/liblzma/lzma/lzma_encoder.c b/src/liblzma/lzma/lzma_encoder.c
index 4eee84f3..b0052182 100644
--- a/src/liblzma/lzma/lzma_encoder.c
+++ b/src/liblzma/lzma/lzma_encoder.c
@@ -26,113 +26,248 @@
#include "fastpos.h"
-////////////
-// Macros //
-////////////
-
-// These are as macros mostly because they use local range encoder variables.
-
-#define literal_encode(subcoder, symbol) \
-do { \
- uint32_t context = 1; \
- int i = 8; \
- do { \
- --i; \
- const uint32_t bit = ((symbol) >> i) & 1; \
- bit_encode(subcoder[context], bit); \
- context = (context << 1) | bit; \
- } while (i != 0); \
-} while (0)
-
-
-#define literal_encode_matched(subcoder, match_byte, symbol) \
-do { \
- uint32_t context = 1; \
- int i = 8; \
- do { \
- --i; \
- uint32_t bit = ((symbol) >> i) & 1; \
- const uint32_t match_bit = ((match_byte) >> i) & 1; \
- const uint32_t subcoder_index = 0x100 + (match_bit << 8) + context; \
- bit_encode(subcoder[subcoder_index], bit); \
- context = (context << 1) | bit; \
- if (match_bit != bit) { \
- while (i != 0) { \
- --i; \
- bit = ((symbol) >> i) & 1; \
- bit_encode(subcoder[context], bit); \
- context = (context << 1) | bit; \
- } \
- break; \
- } \
- } while (i != 0); \
-} while (0)
-
-
-#define length_encode(length_encoder, symbol, pos_state, update_price) \
-do { \
- assert((symbol) <= MATCH_MAX_LEN); \
- if ((symbol) < LEN_LOW_SYMBOLS) { \
- bit_encode_0((length_encoder).choice); \
- bittree_encode((length_encoder).low[pos_state], \
- LEN_LOW_BITS, symbol); \
- } else { \
- bit_encode_1((length_encoder).choice); \
- if ((symbol) < LEN_LOW_SYMBOLS + LEN_MID_SYMBOLS) { \
- bit_encode_0((length_encoder).choice2); \
- bittree_encode((length_encoder).mid[pos_state], \
- LEN_MID_BITS, \
- (symbol) - LEN_LOW_SYMBOLS); \
- } else { \
- bit_encode_1((length_encoder).choice2); \
- bittree_encode((length_encoder).high, LEN_HIGH_BITS, \
- (symbol) - LEN_LOW_SYMBOLS \
- - LEN_MID_SYMBOLS); \
- } \
- } \
- if (update_price) \
- if (--(length_encoder).counters[pos_state] == 0) \
- lzma_length_encoder_update_table(&(length_encoder), pos_state); \
-} while (0)
-
-
-///////////////
-// Functions //
-///////////////
-
-/// \brief Updates price table of the length encoder
-///
-/// Like all the other prices in LZMA, these are used by lzma_get_optimum().
-///
-extern void
-lzma_length_encoder_update_table(lzma_length_encoder *lencoder,
- const uint32_t pos_state)
+/////////////
+// Literal //
+/////////////
+
+static inline void
+literal_normal(lzma_range_encoder *rc, probability *subcoder, uint32_t symbol)
+{
+ uint32_t context = 1;
+ uint32_t bit_count = 8; // Bits per byte
+ do {
+ const uint32_t bit = (symbol >> --bit_count) & 1;
+ rc_bit(rc, &subcoder[context], bit);
+ context = (context << 1) | bit;
+ } while (bit_count != 0);
+}
+
+
+static inline void
+literal_matched(lzma_range_encoder *rc, probability *subcoder,
+ uint32_t match_byte, uint32_t symbol)
+{
+ uint32_t context = 1;
+ uint32_t bit_count = 8;
+
+ do {
+ uint32_t bit = (symbol >> --bit_count) & 1;
+ const uint32_t match_bit = (match_byte >> bit_count) & 1;
+ rc_bit(rc, &subcoder[(0x100 << match_bit) + context], bit);
+ context = (context << 1) | bit;
+
+ if (match_bit != bit) {
+ // The bit from the literal being encoded and the bit
+ // from the previous match differ. Finish encoding
+ // as a normal literal.
+ while (bit_count != 0) {
+ bit = (symbol >> --bit_count) & 1;
+ rc_bit(rc, &subcoder[context], bit);
+ context = (context << 1) | bit;
+ }
+
+ break;
+ }
+
+ } while (bit_count != 0);
+}
+
+
+static inline void
+literal(lzma_coder *coder)
+{
+ // Locate the literal byte to be encoded and the subcoder.
+ const uint8_t cur_byte = coder->lz.buffer[
+ coder->lz.read_pos - coder->additional_offset];
+ probability *subcoder = literal_get_subcoder(coder->literal_coder,
+ coder->now_pos, coder->previous_byte);
+
+ if (is_literal_state(coder->state)) {
+ // Previous LZMA-symbol was a literal. Encode a normal
+ // literal without a match byte.
+ literal_normal(&coder->rc, subcoder, cur_byte);
+ } else {
+ // Previous LZMA-symbol was a match. Use the last byte of
+ // the match as a "match byte". That is, compare the bits
+ // of the current literal and the match byte.
+ const uint8_t match_byte = coder->lz.buffer[
+ coder->lz.read_pos - coder->reps[0] - 1
+ - coder->additional_offset];
+ literal_matched(&coder->rc, subcoder, match_byte, cur_byte);
+ }
+
+ update_literal(coder->state);
+ coder->previous_byte = cur_byte;
+}
+
+
+//////////////////
+// Match length //
+//////////////////
+
+static inline void
+length(lzma_range_encoder *rc, lzma_length_encoder *lc,
+ const uint32_t pos_state, uint32_t len)
+{
+ assert(len <= MATCH_MAX_LEN);
+ len -= MATCH_MIN_LEN;
+
+ if (len < LEN_LOW_SYMBOLS) {
+ rc_bit(rc, &lc->choice, 0);
+ rc_bittree(rc, lc->low[pos_state], LEN_LOW_BITS, len);
+ } else {
+ rc_bit(rc, &lc->choice, 1);
+ len -= LEN_LOW_SYMBOLS;
+
+ if (len < LEN_MID_SYMBOLS) {
+ rc_bit(rc, &lc->choice2, 0);
+ rc_bittree(rc, lc->mid[pos_state], LEN_MID_BITS, len);
+ } else {
+ rc_bit(rc, &lc->choice2, 1);
+ len -= LEN_MID_SYMBOLS;
+ rc_bittree(rc, lc->high, LEN_HIGH_BITS, len);
+ }
+ }
+}
+
+
+///////////
+// Match //
+///////////
+
+static inline void
+match(lzma_coder *coder, const uint32_t pos_state,
+ const uint32_t distance, const uint32_t len)
+{
+ update_match(coder->state);
+
+ length(&coder->rc, &coder->match_len_encoder, pos_state, len);
+ coder->prev_len_encoder = &coder->match_len_encoder;
+
+ const uint32_t pos_slot = get_pos_slot(distance);
+ const uint32_t len_to_pos_state = get_len_to_pos_state(len);
+ rc_bittree(&coder->rc, coder->pos_slot_encoder[len_to_pos_state],
+ POS_SLOT_BITS, pos_slot);
+
+ if (pos_slot >= START_POS_MODEL_INDEX) {
+ const uint32_t footer_bits = (pos_slot >> 1) - 1;
+ const uint32_t base = (2 | (pos_slot & 1)) << footer_bits;
+ const uint32_t pos_reduced = distance - base;
+
+ if (pos_slot < END_POS_MODEL_INDEX) {
+ rc_bittree_reverse(&coder->rc,
+ &coder->pos_encoders[base - pos_slot - 1],
+ footer_bits, pos_reduced);
+ } else {
+ rc_direct(&coder->rc, pos_reduced >> ALIGN_BITS,
+ footer_bits - ALIGN_BITS);
+ rc_bittree_reverse(
+ &coder->rc, coder->pos_align_encoder,
+ ALIGN_BITS, pos_reduced & ALIGN_MASK);
+ ++coder->align_price_count;
+ }
+ }
+
+ coder->reps[3] = coder->reps[2];
+ coder->reps[2] = coder->reps[1];
+ coder->reps[1] = coder->reps[0];
+ coder->reps[0] = distance;
+ ++coder->match_price_count;
+}
+
+
+////////////////////
+// Repeated match //
+////////////////////
+
+static inline void
+rep_match(lzma_coder *coder, const uint32_t pos_state,
+ const uint32_t rep, const uint32_t len)
{
- const uint32_t num_symbols = lencoder->table_size;
- const uint32_t a0 = bit_get_price_0(lencoder->choice);
- const uint32_t a1 = bit_get_price_1(lencoder->choice);
- const uint32_t b0 = a1 + bit_get_price_0(lencoder->choice2);
- const uint32_t b1 = a1 + bit_get_price_1(lencoder->choice2);
+ if (rep == 0) {
+ rc_bit(&coder->rc, &coder->is_rep0[coder->state], 0);
+ rc_bit(&coder->rc,
+ &coder->is_rep0_long[coder->state][pos_state],
+ len != 1);
+ } else {
+ const uint32_t distance = coder->reps[rep];
+ rc_bit(&coder->rc, &coder->is_rep0[coder->state], 1);
+
+ if (rep == 1) {
+ rc_bit(&coder->rc, &coder->is_rep1[coder->state], 0);
+ } else {
+ rc_bit(&coder->rc, &coder->is_rep1[coder->state], 1);
+ rc_bit(&coder->rc, &coder->is_rep2[coder->state],
+ rep - 2);
+
+ if (rep == 3)
+ coder->reps[3] = coder->reps[2];
+
+ coder->reps[2] = coder->reps[1];
+ }
+
+ coder->reps[1] = coder->reps[0];
+ coder->reps[0] = distance;
+ }
+
+ if (len == 1) {
+ update_short_rep(coder->state);
+ } else {
+ length(&coder->rc, &coder->rep_len_encoder, pos_state, len);
+ coder->prev_len_encoder = &coder->rep_len_encoder;
+ update_long_rep(coder->state);
+ }
+}
+
- uint32_t *prices = lencoder->prices[pos_state];
- uint32_t i = 0;
+//////////
+// Main //
+//////////
- for (i = 0; i < num_symbols && i < LEN_LOW_SYMBOLS; ++i)
- prices[i] = a0 + bittree_get_price(lencoder->low[pos_state],
- LEN_LOW_BITS, i);
+static void
+encode_symbol(lzma_coder *coder, uint32_t pos, uint32_t len)
+{
+ const uint32_t pos_state = coder->now_pos & coder->pos_mask;
+
+ if (len == 1 && pos == UINT32_MAX) {
+ // Literal i.e. eight-bit byte
+ rc_bit(&coder->rc,
+ &coder->is_match[coder->state][pos_state], 0);
+ literal(coder);
+ } else {
+ // Some type of match
+ rc_bit(&coder->rc,
+ &coder->is_match[coder->state][pos_state], 1);
+
+ if (pos < REP_DISTANCES) {
+ // It's a repeated match i.e. the same distance
+ // has been used earlier.
+ rc_bit(&coder->rc, &coder->is_rep[coder->state], 1);
+ rep_match(coder, pos_state, pos, len);
+ } else {
+ // Normal match
+ rc_bit(&coder->rc, &coder->is_rep[coder->state], 0);
+ match(coder, pos_state, pos - REP_DISTANCES, len);
+ }
- for (; i < num_symbols && i < LEN_LOW_SYMBOLS + LEN_MID_SYMBOLS; ++i)
- prices[i] = b0 + bittree_get_price(lencoder->mid[pos_state],
- LEN_MID_BITS, i - LEN_LOW_SYMBOLS);
+ coder->previous_byte = coder->lz.buffer[
+ coder->lz.read_pos + len - 1
+ - coder->additional_offset];
+ }
- for (; i < num_symbols; ++i)
- prices[i] = b1 + bittree_get_price(
- lencoder->high, LEN_HIGH_BITS,
- i - LEN_LOW_SYMBOLS - LEN_MID_SYMBOLS);
+ assert(coder->additional_offset >= len);
+ coder->additional_offset -= len;
+ coder->now_pos += len;
+}
- lencoder->counters[pos_state] = num_symbols;
- return;
+static void
+encode_eopm(lzma_coder *coder)
+{
+ const uint32_t pos_state = coder->now_pos & coder->pos_mask;
+ rc_bit(&coder->rc, &coder->is_match[coder->state][pos_state], 1);
+ rc_bit(&coder->rc, &coder->is_rep[coder->state], 0);
+ match(coder, pos_state, UINT32_MAX, MATCH_MIN_LEN);
}
@@ -145,15 +280,6 @@ extern bool
lzma_lzma_encode(lzma_coder *coder, uint8_t *restrict out,
size_t *restrict out_pos, size_t out_size)
{
-#define rc_buffer coder->lz.temp
-#define rc_buffer_size coder->lz.temp_size
-
- // Local copies
- lzma_range_encoder rc = coder->rc;
- size_t out_pos_local = *out_pos;
- const uint32_t pos_mask = coder->pos_mask;
- const bool best_compression = coder->best_compression;
-
// Initialize the stream if no data has been encoded yet.
if (!coder->is_initialized) {
if (coder->lz.read_pos == coder->lz.read_limit) {
@@ -167,20 +293,10 @@ lzma_lzma_encode(lzma_coder *coder, uint8_t *restrict out,
// Do the actual initialization.
uint32_t len;
uint32_t num_distance_pairs;
- lzma_read_match_distances(coder, &len, &num_distance_pairs);
-
- bit_encode_0(coder->is_match[coder->state][0]);
- update_literal(coder->state);
-
- const uint8_t cur_byte = coder->lz.buffer[
- coder->lz.read_pos - coder->additional_offset];
- probability *subcoder = literal_get_subcoder(coder->literal_coder,
- coder->now_pos, coder->previous_byte);
- literal_encode(subcoder, cur_byte);
+ lzma_read_match_distances(coder,
+ &len, &num_distance_pairs);
- coder->previous_byte = cur_byte;
- --coder->additional_offset;
- ++coder->now_pos;
+ encode_symbol(coder, UINT32_MAX, 1);
assert(coder->additional_offset == 0);
}
@@ -191,19 +307,19 @@ lzma_lzma_encode(lzma_coder *coder, uint8_t *restrict out,
// Encoding loop
while (true) {
- // Check that there is free output space.
- if (out_pos_local == out_size)
- break;
-
- assert(rc_buffer_size == 0);
+ // Encode pending bits, if any.
+ if (rc_encode(&coder->rc, out, out_pos, out_size))
+ return false;
// Check that there is some input to process.
if (coder->lz.read_pos >= coder->lz.read_limit) {
// If flushing or finishing, we must keep encoding
// until additional_offset becomes zero to make
// all the input available at output.
- if (coder->lz.sequence == SEQ_RUN
- || coder->additional_offset == 0)
+ if (coder->lz.sequence == SEQ_RUN)
+ return false;
+
+ if (coder->additional_offset == 0)
break;
}
@@ -218,8 +334,6 @@ lzma_lzma_encode(lzma_coder *coder, uint8_t *restrict out,
}
#endif
- const uint32_t pos_state = coder->now_pos & pos_mask;
-
uint32_t pos;
uint32_t len;
@@ -230,175 +344,32 @@ lzma_lzma_encode(lzma_coder *coder, uint8_t *restrict out,
// - UINT32_MAX: not a match but a literal
// Value ranges for len:
// - [MATCH_MIN_LEN, MATCH_MAX_LEN]
- if (best_compression)
+ if (coder->best_compression)
lzma_get_optimum(coder, &pos, &len);
else
lzma_get_optimum_fast(coder, &pos, &len);
- if (len == 1 && pos == UINT32_MAX) {
- // It's a literal.
- bit_encode_0(coder->is_match[coder->state][pos_state]);
-
- const uint8_t cur_byte = coder->lz.buffer[
- coder->lz.read_pos - coder->additional_offset];
- probability *subcoder = literal_get_subcoder(coder->literal_coder,
- coder->now_pos, coder->previous_byte);
-
- if (is_literal_state(coder->state)) {
- literal_encode(subcoder, cur_byte);
- } else {
- const uint8_t match_byte = coder->lz.buffer[
- coder->lz.read_pos
- - coder->rep_distances[0] - 1
- - coder->additional_offset];
- literal_encode_matched(subcoder, match_byte, cur_byte);
- }
-
- update_literal(coder->state);
- coder->previous_byte = cur_byte;
-
- } else {
- // It's a match.
- bit_encode_1(coder->is_match[coder->state][pos_state]);
-
- if (pos < REP_DISTANCES) {
- // It's a repeated match i.e. the same distance
- // has been used earlier.
- bit_encode_1(coder->is_rep[coder->state]);
-
- if (pos == 0) {
- bit_encode_0(coder->is_rep0[coder->state]);
- const uint32_t symbol = (len == 1) ? 0 : 1;
- bit_encode(coder->is_rep0_long[coder->state][pos_state],
- symbol);
- } else {
- const uint32_t distance = coder->rep_distances[pos];
- bit_encode_1(coder->is_rep0[coder->state]);
-
- if (pos == 1) {
- bit_encode_0(coder->is_rep1[coder->state]);
- } else {
- bit_encode_1(coder->is_rep1[coder->state]);
- bit_encode(coder->is_rep2[coder->state], pos - 2);
-
- if (pos == 3)
- coder->rep_distances[3] = coder->rep_distances[2];
-
- coder->rep_distances[2] = coder->rep_distances[1];
- }
-
- coder->rep_distances[1] = coder->rep_distances[0];
- coder->rep_distances[0] = distance;
- }
-
- if (len == 1) {
- update_short_rep(coder->state);
- } else {
- length_encode(coder->rep_len_encoder,
- len - MATCH_MIN_LEN, pos_state,
- best_compression);
- update_long_rep(coder->state);
- }
-
- } else {
- bit_encode_0(coder->is_rep[coder->state]);
- update_match(coder->state);
- length_encode(coder->match_len_encoder, len - MATCH_MIN_LEN,
- pos_state, best_compression);
- pos -= REP_DISTANCES;
-
- const uint32_t pos_slot = get_pos_slot(pos);
- const uint32_t len_to_pos_state = get_len_to_pos_state(len);
- bittree_encode(coder->pos_slot_encoder[len_to_pos_state],
- POS_SLOT_BITS, pos_slot);
-
- if (pos_slot >= START_POS_MODEL_INDEX) {
- const uint32_t footer_bits = (pos_slot >> 1) - 1;
- const uint32_t base = (2 | (pos_slot & 1)) << footer_bits;
- const uint32_t pos_reduced = pos - base;
-
- if (pos_slot < END_POS_MODEL_INDEX) {
- bittree_reverse_encode(
- coder->pos_encoders + base - pos_slot - 1,
- footer_bits, pos_reduced);
- } else {
- rc_encode_direct_bits(pos_reduced >> ALIGN_BITS,
- footer_bits - ALIGN_BITS);
- bittree_reverse_encode(coder->pos_align_encoder,
- ALIGN_BITS, pos_reduced & ALIGN_MASK);
- ++coder->align_price_count;
- }
- }
-
- coder->rep_distances[3] = coder->rep_distances[2];
- coder->rep_distances[2] = coder->rep_distances[1];
- coder->rep_distances[1] = coder->rep_distances[0];
- coder->rep_distances[0] = pos;
- ++coder->match_price_count;
- }
-
- coder->previous_byte = coder->lz.buffer[
- coder->lz.read_pos + len - 1
- - coder->additional_offset];
- }
-
- assert(coder->additional_offset >= len);
- coder->additional_offset -= len;
- coder->now_pos += len;
+ encode_symbol(coder, pos, len);
}
- // Check if everything is done.
- bool all_done = false;
- if (coder->lz.sequence != SEQ_RUN
- && coder->lz.read_pos == coder->lz.write_pos
- && coder->additional_offset == 0) {
- assert(coder->longest_match_was_found == false);
-
- if (coder->lz.uncompressed_size == LZMA_VLI_VALUE_UNKNOWN
- || coder->lz.sequence == SEQ_FLUSH) {
- // Write special marker: flush marker or end of payload
- // marker. Both are encoded as a match with distance of
- // UINT32_MAX. The match length codes the type of the marker.
- const uint32_t pos_state = coder->now_pos & pos_mask;
- bit_encode_1(coder->is_match[coder->state][pos_state]);
- bit_encode_0(coder->is_rep[coder->state]);
- update_match(coder->state);
-
- const uint32_t len = coder->lz.sequence == SEQ_FLUSH
- ? LEN_SPECIAL_FLUSH : LEN_SPECIAL_EOPM;
- length_encode(coder->match_len_encoder, len - MATCH_MIN_LEN,
- pos_state, best_compression);
-
- const uint32_t pos_slot = (1 << POS_SLOT_BITS) - 1;
- const uint32_t len_to_pos_state = get_len_to_pos_state(len);
- bittree_encode(coder->pos_slot_encoder[len_to_pos_state],
- POS_SLOT_BITS, pos_slot);
-
- const uint32_t footer_bits = 30;
- const uint32_t pos_reduced
- = (UINT32_C(1) << footer_bits) - 1;
- rc_encode_direct_bits(pos_reduced >> ALIGN_BITS,
- footer_bits - ALIGN_BITS);
+ assert(!coder->longest_match_was_found);
- bittree_reverse_encode(coder->pos_align_encoder, ALIGN_BITS,
- pos_reduced & ALIGN_MASK);
- }
+ if (coder->is_flushed) {
+ coder->is_flushed = false;
+ return true;
+ }
- // Flush the last bytes of compressed data from
- // the range coder to the output buffer.
- rc_flush();
+ // We don't support encoding old LZMA streams without EOPM, and LZMA2
+ // doesn't use EOPM at LZMA level.
+ if (coder->write_eopm)
+ encode_eopm(coder);
- rc_reset(rc);
+ rc_flush(&coder->rc);
- // All done. Note that some output bytes might be
- // pending in coder->lz.temp. lzma_lz_encode() will
- // take care of those bytes.
- all_done = true;
+ if (rc_encode(&coder->rc, out, out_pos, out_size)) {
+ coder->is_flushed = true;
+ return false;
}
- // Store local variables back to *coder.
- coder->rc = rc;
- *out_pos = out_pos_local;
-
- return all_done;
+ return true;
}