Major changes to LZ encoder, LZMA encoder, and range encoder.

These changes implement support for LZMA_SYNC_FLUSH in LZMA encoder, and move the temporary buffer needed by range encoder from lzma_range_encoder structure to lzma_lz_encoder.
author: Lasse Collin <lasse.collin@tukaani.org> 2008-01-14 13:39:54 +0200
committer: Lasse Collin <lasse.collin@tukaani.org> 2008-01-14 13:39:54 +0200
commit: e22b37968d153683fec61ad37b6b160cb7ca4ddc (patch)
tree: d9631e988ead9de0fcac67a9abc803a37324e3a6 /src/liblzma/lz
parent: Added one assert() to process.c of the command line tool. (diff)
download: xz-e22b37968d153683fec61ad37b6b160cb7ca4ddc.tar.xz
2 files changed, 130 insertions, 25 deletions
diff --git a/src/liblzma/lz/lz_encoder.c b/src/liblzma/lz/lz_encoder.c
index 629f9df2..8d2277ec 100644
--- a/src/liblzma/lz/lz_encoder.c
+++ b/src/liblzma/lz/lz_encoder.c
@@ -141,8 +141,9 @@ lzma_lz_encoder_reset(lzma_lz_encoder *lz, lzma_allocator *allocator,
 		const uint8_t *preset_dictionary,
 		size_t preset_dictionary_size)
 {
-	// Set uncompressed size.
+	lz->sequence = SEQ_RUN;
 	lz->uncompressed_size = uncompressed_size;
+	lz->temp_size = 0;
 
 	///////////////
 	// In Window //
@@ -187,7 +188,6 @@ lzma_lz_encoder_reset(lzma_lz_encoder *lz, lzma_allocator *allocator,
 	lz->read_pos = 0;
 	lz->read_limit = 0;
 	lz->write_pos = 0;
-	lz->stream_end_was_reached = false;
 
 
 	//////////////////
@@ -368,35 +368,59 @@ fill_window(lzma_coder *coder, lzma_allocator *allocator, const uint8_t *in,
 		size_t *in_pos, size_t in_size, lzma_action action)
 {
 	assert(coder->lz.read_pos <= coder->lz.write_pos);
-	lzma_ret ret;
 
 	// Move the sliding window if needed.
 	if (coder->lz.read_pos >= coder->lz.size - coder->lz.keep_size_after)
 		move_window(&coder->lz);
 
+	size_t in_used;
+	lzma_ret ret;
 	if (coder->next.code == NULL) {
 		// Not using a filter, simply memcpy() as much as possible.
-		bufcpy(in, in_pos, in_size, coder->lz.buffer,
+		in_used = bufcpy(in, in_pos, in_size, coder->lz.buffer,
 				&coder->lz.write_pos, coder->lz.size);
 
-		if (action == LZMA_FINISH && *in_pos == in_size)
+		if (action != LZMA_RUN && *in_pos == in_size)
 			ret = LZMA_STREAM_END;
 		else
 			ret = LZMA_OK;
 
 	} else {
+		const size_t in_start = *in_pos;
 		ret = coder->next.code(coder->next.coder, allocator,
 				in, in_pos, in_size,
 				coder->lz.buffer, &coder->lz.write_pos,
 				coder->lz.size, action);
+		in_used = *in_pos - in_start;
 	}
 
-	// If end of stream has been reached, we allow the encoder to process
-	// all the input (that is, read_pos is allowed to reach write_pos).
-	// Otherwise we keep keep_size_after bytes available as prebuffer.
+	assert(coder->lz.uncompressed_size >= in_used);
+	if (coder->lz.uncompressed_size != LZMA_VLI_VALUE_UNKNOWN)
+		coder->lz.uncompressed_size -= in_used;
+
+	// If end of stream has been reached or flushing completed, we allow
+	// the encoder to process all the input (that is, read_pos is allowed
+	// to reach write_pos). Otherwise we keep keep_size_after bytes
+	// available as prebuffer.
 	if (ret == LZMA_STREAM_END) {
-		coder->lz.stream_end_was_reached = true;
+		assert(*in_pos == in_size);
 		coder->lz.read_limit = coder->lz.write_pos;
+		ret = LZMA_OK;
+
+		switch (action) {
+		case LZMA_SYNC_FLUSH:
+			coder->lz.sequence = SEQ_FLUSH;
+			break;
+
+		case LZMA_FINISH:
+			coder->lz.sequence = SEQ_FINISH;
+			break;
+
+		default:
+			assert(0);
+			ret = LZMA_PROG_ERROR;
+			break;
+		}
 
 	} else if (coder->lz.write_pos > coder->lz.keep_size_after) {
 		// This needs to be done conditionally, because if we got
@@ -406,6 +430,19 @@ fill_window(lzma_coder *coder, lzma_allocator *allocator, const uint8_t *in,
 				- coder->lz.keep_size_after;
 	}
 
+	// Switch to finishing mode if we have got all the input data.
+	// lzma_lz_encode() won't return LZMA_STREAM_END until LZMA_FINISH
+	// is used.
+	//
+	// NOTE: When LZMA is used together with other filters, it is possible
+	// that coder->lz.sequence gets set to SEQ_FINISH before the next
+	// encoder has returned LZMA_STREAM_END. This is somewhat ugly, but
+	// works correctly, because the next encoder cannot have any more
+	// output left to be produced. If it had, then our known Uncompressed
+	// Size would be invalid, which would mean that we have a bad bug.
+	if (ret == LZMA_OK && coder->lz.uncompressed_size == 0)
+		coder->lz.sequence = SEQ_FINISH;
+
 	return ret;
 }
 
@@ -417,20 +454,81 @@ lzma_lz_encode(lzma_coder *coder, lzma_allocator *allocator,
 		uint8_t *restrict out, size_t *restrict out_pos,
 		size_t out_size, lzma_action action)
 {
-	while (*out_pos < out_size
-			&& (*in_pos < in_size || action == LZMA_FINISH)) {
-		// Fill the input window if there is no more usable data.
-		if (!coder->lz.stream_end_was_reached && coder->lz.read_pos
-				>= coder->lz.read_limit) {
-			const lzma_ret ret = fill_window(coder, allocator,
-					in, in_pos, in_size, action);
-			if (ret != LZMA_OK && ret != LZMA_STREAM_END)
-				return ret;
+	// Flush the temporary output buffer, which may be used when the
+	// encoder runs out of space in the primary output buffer (the out,
+	// *out_pos, and out_size variables).
+	if (coder->lz.temp_size > 0) {
+		const size_t out_avail = out_size - *out_pos;
+		if (out_avail < coder->lz.temp_size) {
+			// Cannot copy everything. Copy as much as possible
+			// and move the data in lz.temp to the beginning of
+			// that buffer.
+			memcpy(out + *out_pos, coder->lz.temp, out_avail);
+			*out_pos += out_avail;
+			memmove(coder->lz.temp, coder->lz.temp + out_avail,
+					coder->lz.temp_size - out_avail);
+			coder->lz.temp_size -= out_avail;
+			return LZMA_OK;
 		}
 
+		// We can copy everything from coder->lz.temp to out.
+		memcpy(out + *out_pos, coder->lz.temp, coder->lz.temp_size);
+		*out_pos += coder->lz.temp_size;
+		coder->lz.temp_size = 0;
+	}
+
+	if (coder->lz.sequence == SEQ_FLUSH_END) {
+		// During an earlier call to this function, flushing was
+		// finished except that some data was left pending in
+		// coder->lz.temp. Now we have copied all that data to
+		// the output buffer and can return LZMA_STREAM_END.
+		coder->lz.sequence = SEQ_RUN;
+		assert(action == LZMA_SYNC_FLUSH);
+		return LZMA_STREAM_END;
+	}
+
+	if (coder->lz.sequence == SEQ_END) {
+		// This is like the above flushing case, but for finishing
+		// the encoding.
+		//
+		// NOTE: action is not necessarily LZMA_FINISH; it can
+		// be LZMA_SYNC_FLUSH too in case it is used at the
+		// end of the stream with known Uncompressed Size.
+		return action != LZMA_RUN ? LZMA_STREAM_END : LZMA_OK;
+	}
+
+	while (*out_pos < out_size
+			&& (*in_pos < in_size || action != LZMA_RUN)) {
+		// Read more data to coder->lz.buffer if needed.
+		if (coder->lz.sequence == SEQ_RUN
+				&& coder->lz.read_pos >= coder->lz.read_limit)
+			return_if_error(fill_window(coder, allocator,
+					in, in_pos, in_size, action));
+
 		// Encode
-		if (coder->lz.process(coder, out, out_pos, out_size))
-			return LZMA_STREAM_END;
+		if (coder->lz.process(coder, out, out_pos, out_size)) {
+			if (coder->lz.sequence == SEQ_FLUSH) {
+				assert(action == LZMA_SYNC_FLUSH);
+				if (coder->lz.temp_size == 0) {
+					// Flushing was finished successfully.
+					coder->lz.sequence = SEQ_RUN;
+				} else {
+					// Flushing was otherwise finished,
+					// except that some data was left
+					// in coder->lz.temp.
+					coder->lz.sequence = SEQ_FLUSH_END;
+				}
+			} else {
+				// NOTE: action may be LZMA_RUN here in case
+				// Uncompressed Size is known and we have
+				// processed all the data already.
+				assert(coder->lz.sequence == SEQ_FINISH);
+				coder->lz.sequence = SEQ_END;
+			}
+
+			return action != LZMA_RUN && coder->lz.temp_size == 0
+					? LZMA_STREAM_END : LZMA_OK;
+		}
 	}
 
 	return LZMA_OK;
diff --git a/src/liblzma/lz/lz_encoder.h b/src/liblzma/lz/lz_encoder.h
index fe94618b..11d12722 100644
--- a/src/liblzma/lz/lz_encoder.h
+++ b/src/liblzma/lz/lz_encoder.h
@@ -24,11 +24,15 @@
 #include "common.h"
 
 
+#define LZMA_LZ_TEMP_SIZE 64
+
+
 typedef struct lzma_lz_encoder_s lzma_lz_encoder;
 struct lzma_lz_encoder_s {
 	enum {
-		SEQ_INIT,
 		SEQ_RUN,
+		SEQ_FLUSH,
+		SEQ_FLUSH_END,
 		SEQ_FINISH,
 		SEQ_END
 	} sequence;
@@ -36,8 +40,15 @@ struct lzma_lz_encoder_s {
 	bool (*process)(lzma_coder *coder, uint8_t *restrict out,
 			size_t *restrict out_pos, size_t out_size);
 
+	/// Uncompressed Size or LZMA_VLI_VALUE_UNKNOWN if using EOPM. We need
+	/// to track Uncompressed Size to prevent writing flush marker to the
+	/// very end of stream that doesn't use EOPM.
 	lzma_vli uncompressed_size;
 
+	/// Temporary buffer for range encoder.
+	uint8_t temp[LZMA_LZ_TEMP_SIZE];
+	size_t temp_size;
+
 	///////////////
 	// In Window //
 	///////////////
@@ -84,10 +95,6 @@ struct lzma_lz_encoder_s {
 	/// is allowed to reach write_pos).
 	size_t keep_size_after;
 
-	/// This is set to true once the last byte of the input data has
-	/// been copied to buffer.
-	bool stream_end_was_reached;
-
 	//////////////////
 	// Match Finder //
 	//////////////////
author	Lasse Collin <lasse.collin@tukaani.org>	2008-01-14 13:39:54 +0200
committer	Lasse Collin <lasse.collin@tukaani.org>	2008-01-14 13:39:54 +0200
commit	e22b37968d153683fec61ad37b6b160cb7ca4ddc (patch)
tree	d9631e988ead9de0fcac67a9abc803a37324e3a6 /src/liblzma/lz
parent	Added one assert() to process.c of the command line tool. (diff)
download	xz-e22b37968d153683fec61ad37b6b160cb7ca4ddc.tar.xz