From 3b34851de1eaf358cf9268922fa0eeed8278d680 Mon Sep 17 00:00:00 2001 From: Lasse Collin Date: Thu, 28 Aug 2008 22:53:15 +0300 Subject: Sort of garbage collection commit. :-| Many things are still broken. API has changed a lot and it will still change a little more here and there. The command line tool doesn't have all the required changes to reflect the API changes, so it's easy to get "internal error" or trigger assertions. --- src/liblzma/lz/lz_encoder.h | 334 +++++++++++++++++++++++++++++++++----------- 1 file changed, 254 insertions(+), 80 deletions(-) (limited to 'src/liblzma/lz/lz_encoder.h') diff --git a/src/liblzma/lz/lz_encoder.h b/src/liblzma/lz/lz_encoder.h index da0e0804..45bb8462 100644 --- a/src/liblzma/lz/lz_encoder.h +++ b/src/liblzma/lz/lz_encoder.h @@ -3,8 +3,8 @@ /// \file lz_encoder.h /// \brief LZ in window and match finder API // -// Copyright (C) 1999-2006 Igor Pavlov -// Copyright (C) 2007 Lasse Collin +// Copyright (C) 1999-2008 Igor Pavlov +// Copyright (C) 2008 Lasse Collin // // This library is free software; you can redistribute it and/or // modify it under the terms of the GNU Lesser General Public @@ -24,19 +24,16 @@ #include "common.h" -typedef struct lzma_lz_encoder_s lzma_lz_encoder; -struct lzma_lz_encoder_s { - enum { - SEQ_RUN, - SEQ_FLUSH, - SEQ_FINISH, - } sequence; +/// A table of these is used by the LZ-based encoder to hold +/// the length-distance pairs found by the match finder. +typedef struct { + uint32_t len; + uint32_t dist; +} lzma_match; - /// Function to do the actual encoding from the sliding input window - /// to the output stream. - bool (*process)(lzma_coder *coder, uint8_t *restrict out, - size_t *restrict out_pos, size_t out_size); +typedef struct lzma_mf_s lzma_mf; +struct lzma_mf_s { /////////////// // In Window // /////////////// @@ -46,17 +43,33 @@ struct lzma_lz_encoder_s { /// Total size of the allocated buffer (that is, including all /// the extra space) - size_t size; + uint32_t size; + + /// Number of bytes that must be kept available in our input history. + /// That is, once keep_size_before bytes have been processed, + /// buffer[read_pos - keep_size_before] is the oldest byte that + /// must be available for reading. + uint32_t keep_size_before; + + /// Number of bytes that must be kept in buffer after read_pos. + /// That is, read_pos <= write_pos - keep_size_after as long as + /// stream_end_was_reached is false (once it is true, read_pos + /// is allowed to reach write_pos). + uint32_t keep_size_after; /// Match finders store locations of matches using 32-bit integers. /// To avoid adjusting several megabytes of integers every time the /// input window is moved with move_window(), we only adjust the /// offset of the buffer. Thus, buffer[match_finder_pos - offset] /// is the byte pointed by match_finder_pos. - size_t offset; + uint32_t offset; /// buffer[read_pos] is the current byte. - size_t read_pos; + uint32_t read_pos; + + /// Number of bytes that have been ran through the match finder, but + /// which haven't been encoded by the LZ-based encoder yet. + uint32_t read_ahead; /// As long as read_pos is less than read_limit, there is enough /// input available in buffer for at least one encoding loop. @@ -64,92 +77,253 @@ struct lzma_lz_encoder_s { /// Because of the stateful API, read_limit may and will get greater /// than read_pos quite often. This is taken into account when /// calculating the value for keep_size_after. - size_t read_limit; + uint32_t read_limit; /// buffer[write_pos] is the first byte that doesn't contain valid /// uncompressed data; that is, the next input byte will be copied /// to buffer[write_pos]. - size_t write_pos; + uint32_t write_pos; /// Number of bytes not hashed before read_pos. This is needed to /// restart the match finder after LZMA_SYNC_FLUSH. - size_t pending; - - /// Number of bytes that must be kept available in our input history. - /// That is, once keep_size_before bytes have been processed, - /// buffer[read_pos - keep_size_before] is the oldest byte that - /// must be available for reading. - size_t keep_size_before; - - /// Number of bytes that must be kept in buffer after read_pos. - /// That is, read_pos <= write_pos - keep_size_after as long as - /// stream_end_was_reached is false (once it is true, read_pos - /// is allowed to reach write_pos). - size_t keep_size_after; + uint32_t pending; ////////////////// // Match Finder // ////////////////// - // Pointers to match finder functions - void (*get_matches)(lzma_lz_encoder *restrict lz, - uint32_t *restrict distances); - void (*skip)(lzma_lz_encoder *restrict lz, uint32_t num); + /// Find matches. Returns the number of distance-length pairs written + /// to the matches array. This is called only via lzma_mf_find. + uint32_t (*find)(lzma_mf *mf, lzma_match *matches); + + /// Skips num bytes. This is like find() but doesn't make the + /// distance-length pairs available, thus being a little faster. + /// This is called only via mf_skip function. + void (*skip)(lzma_mf *mf, uint32_t num); - // Match finder data - uint32_t *hash; // TODO: Check if hash aliases son - uint32_t *son; // and add 'restrict' if possible. + uint32_t *hash; + uint32_t *son; uint32_t cyclic_buffer_pos; uint32_t cyclic_buffer_size; // Must be dictionary_size + 1. uint32_t hash_mask; - uint32_t cut_value; + + /// Maximum number of loops in the match finder + uint32_t loops; + + /// Maximum length of a match that the match finder will try to find. + uint32_t find_len_max; + + /// Maximum length of a match supported by the LZ-based encoder. + /// If the longest match found by the match finder is find_len_max, + /// lz_dict_find() tries to expand it up to match_len_max bytes. + uint32_t match_len_max; + + /// When running out of input, binary tree match finders need to know + /// if it is due to flushing or finishing. The action is used also + /// by the LZ-based encoders themselves. + lzma_action action; + + /// Number of elements in hash[] uint32_t hash_size_sum; - uint32_t num_items; - uint32_t match_max_len; + + /// Number of elements in son[] + uint32_t sons_count; }; -#define LZMA_LZ_ENCODER_INIT \ - (lzma_lz_encoder){ \ - .buffer = NULL, \ - .size = 0, \ - .hash = NULL, \ - .num_items = 0, \ +typedef struct { + /// Extra amount of data to keep available before the "actual" + /// dictionary. + size_t before_size; + + /// Size of the history buffer + size_t dictionary_size; + + /// Extra amount of data to keep available after the "actual" + /// dictionary. + size_t after_size; + + /// Maximum length of a match that the LZ-based encoder can accept. + /// This is used to extend matches of length find_len_max to the + /// maximum possible length. + size_t match_len_max; + + /// Match finder will search matches of at maximum of this length. + /// This must be less than or equal to match_len_max. + size_t find_len_max; + + /// Type of the match finder to use + lzma_match_finder match_finder; + + /// TODO: Comment + uint32_t match_finder_cycles; + + /// TODO: Comment + const uint8_t *preset_dictionary; + + uint32_t preset_dictionary_size; + +} lzma_lz_options; + + +// The total usable buffer space at any moment outside the match finder: +// before_size + dictionary_size + after_size + match_len_max +// +// In reality, there's some extra space allocated to prevent the number of +// memmove() calls reasonable. The bigger the dictionary_size is, the bigger +// this extra buffer will be since with bigger dictionaries memmove() would +// also take longer. +// +// A single encoder loop in the LZ-based encoder may call the match finder +// (lz_dict_find() or lz_dict_skip()) at maximum of after_size times. +// In other words, a single encoder loop may advance lz_dict.read_pos at +// maximum of after_size times. Since matches are looked up to +// lz_dict.buffer[lz_dict.read_pos + match_len_max - 1], the total +// amount of extra buffer needed after dictionary_size becomes +// after_size + match_len_max. +// +// before_size has two uses. The first one is to keep literals available +// in cases when the LZ-based encoder has made some read ahead. +// TODO: Maybe this could be changed by making the LZ-based encoders to +// store the actual literals as they do with length-distance pairs. +// +// Alrogithms such as LZMA2 first try to compress a chunk, and then check +// if the encoded result is smaller than the uncompressed one. If the chunk +// was uncompressible, it is better to store it in uncompressed form in +// the output stream. To do this, the whole uncompressed chunk has to be +// still available in the history buffer. before_size achieves that. + + +typedef struct { + /// Data specific to the LZ-based encoder + lzma_coder *coder; + + /// Function to encode from *dict to out[] + lzma_ret (*code)(lzma_coder *restrict coder, + lzma_mf *restrict mf, uint8_t *restrict out, + size_t *restrict out_pos, size_t out_size); + + /// Free allocated resources + void (*end)(lzma_coder *coder, lzma_allocator *allocator); + +} lzma_lz_encoder; + + +// Basic steps: +// 1. Input gets copied into the dictionary. +// 2. Data in dictionary gets run through the match finder byte by byte. +// 3. The literals and matches are encoded using e.g. LZMA. +// +// The bytes that have been ran through the match finder, but not encoded yet, +// are called `read ahead'. + + +/// Get pointer to the first byte not ran through the match finder +static inline const uint8_t * +mf_ptr(const lzma_mf *mf) +{ + return mf->buffer + mf->read_pos; +} + + +/// Get the number of bytes that haven't been ran through the match finder yet. +static inline uint32_t +mf_avail(const lzma_mf *mf) +{ + return mf->write_pos - mf->read_pos; +} + + +/// Get the number of bytes that haven't been encoded yet (some of these +/// bytes may have been ran through the match finder though). +static inline uint32_t +mf_unencoded(const lzma_mf *mf) +{ + return mf->write_pos - mf->read_pos - mf->read_ahead; +} + + +/// Calculate the absolute offset from the beginning of the most recent +/// dictionary reset. Only the lowest four bits are important, so there's no +/// problem that we don't know the 64-bit size of the data encoded so far. +/// +/// NOTE: When moving the input window, we need to do it so that the lowest +/// bits of dict->read_pos are not modified to keep this macro working +/// as intended. +static inline uint32_t +mf_position(const lzma_mf *mf) +{ + return mf->read_pos - mf->read_ahead; +} + + +/// Since everything else begins with mf_, use it also for lzma_mf_find(). +#define mf_find lzma_mf_find + + +/// Skip the given number of bytes. This is used when a good match was found. +/// For example, if mf_find() finds a match of 200 bytes long, the first byte +/// of that match was already consumed by mf_find(), and the rest 199 bytes +/// have to be skipped with mf_skip(mf, 199). +static inline void +mf_skip(lzma_mf *mf, uint32_t amount) +{ + if (amount != 0) { + mf->skip(mf, amount); + mf->read_ahead += amount; } +} + + +/// Copies at maximum of *left amount of bytes from the history buffer +/// to out[]. This is needed by LZMA2 to encode uncompressed chunks. +static inline void +mf_read(lzma_mf *mf, uint8_t *out, size_t *out_pos, size_t out_size, + size_t *left) +{ + const size_t out_avail = out_size - *out_pos; + const size_t copy_size = MIN(out_avail, *left); + + assert(mf->read_ahead == 0); + assert(mf->read_pos >= *left); + + memcpy(out + *out_pos, mf->buffer + mf->read_pos - *left, + copy_size); + + *out_pos += copy_size; + *left -= copy_size; + return; +} + + +extern lzma_ret lzma_lz_encoder_init( + lzma_next_coder *next, lzma_allocator *allocator, + const lzma_filter_info *filters, + lzma_ret (*lz_init)(lzma_lz_encoder *lz, + lzma_allocator *allocator, const void *options, + lzma_lz_options *lz_options)); + + +extern uint64_t lzma_lz_encoder_memusage(const lzma_lz_options *lz_options); + + +// These are only for LZ encoder's internal use. +extern uint32_t lzma_mf_find( + lzma_mf *mf, uint32_t *count, lzma_match *matches); + +extern uint32_t lzma_mf_hc3_find(lzma_mf *dict, lzma_match *matches); +extern void lzma_mf_hc3_skip(lzma_mf *dict, uint32_t amount); + +extern uint32_t lzma_mf_hc4_find(lzma_mf *dict, lzma_match *matches); +extern void lzma_mf_hc4_skip(lzma_mf *dict, uint32_t amount); + +extern uint32_t lzma_mf_bt2_find(lzma_mf *dict, lzma_match *matches); +extern void lzma_mf_bt2_skip(lzma_mf *dict, uint32_t amount); +extern uint32_t lzma_mf_bt3_find(lzma_mf *dict, lzma_match *matches); +extern void lzma_mf_bt3_skip(lzma_mf *dict, uint32_t amount); -/// Calculates -extern bool lzma_lz_encoder_hash_properties(lzma_match_finder match_finder, - uint32_t history_size, uint32_t *restrict hash_mask, - uint32_t *restrict hash_size_sum, - uint32_t *restrict num_items); - -// NOTE: liblzma doesn't use callback API like LZMA SDK does. The caller -// must make sure that keep_size_after is big enough for single encoding pass -// i.e. keep_size_after >= maximum number of bytes possibly needed after -// the current position between calls to lzma_lz_read(). -extern lzma_ret lzma_lz_encoder_reset(lzma_lz_encoder *lz, - lzma_allocator *allocator, - bool (*process)(lzma_coder *coder, uint8_t *restrict out, - size_t *restrict out_pos, size_t out_size), - size_t history_size, size_t additional_buffer_before, - size_t match_max_len, size_t additional_buffer_after, - lzma_match_finder match_finder, uint32_t match_finder_cycles, - const uint8_t *preset_dictionary, - size_t preset_dictionary_size); - -/// Frees memory allocated for in window and match finder buffers. -extern void lzma_lz_encoder_end( - lzma_lz_encoder *lz, lzma_allocator *allocator); - -extern lzma_ret lzma_lz_encode(lzma_coder *coder, - lzma_allocator *allocator lzma_attribute((unused)), - const uint8_t *restrict in, size_t *restrict in_pos, - size_t in_size, uint8_t *restrict out, - size_t *restrict out_pos, size_t out_size, - lzma_action action); - -/// This should not be called directly, but only via move_pos() macro. -extern void lzma_lz_encoder_normalize(lzma_lz_encoder *lz); +extern uint32_t lzma_mf_bt4_find(lzma_mf *dict, lzma_match *matches); +extern void lzma_mf_bt4_skip(lzma_mf *dict, uint32_t amount); #endif -- cgit v1.2.3