From 18c10c30d2833f394cd7bce0e6a821044b15832f Mon Sep 17 00:00:00 2001
From: Lasse Collin <lasse.collin@tukaani.org>
Date: Sat, 4 Jul 2009 00:40:44 +0300
Subject: Make "xz --decompress --stdout --force" copy unrecognized files as is
 to standard output.

This feature is needed to be more compatible with gzip's
behavior. This was more complicated to implement than it
sounds, because the way liblzma is able to return errors with
files of only a few bytes in size. xz now has its own file
type detection code and no longer uses lzma_auto_decoder().
---
 src/xz/coder.c | 213 +++++++++++++++++++++++++++++++++++++++++++++++----------
 1 file changed, 178 insertions(+), 35 deletions(-)

(limited to 'src/xz')

diff --git a/src/xz/coder.c b/src/xz/coder.c
index 5584cac4..90c99271 100644
--- a/src/xz/coder.c
+++ b/src/xz/coder.c
@@ -13,6 +13,14 @@
 #include "private.h"
 
 
+/// Return value type for coder_init().
+enum coder_init_ret {
+	CODER_INIT_NORMAL,
+	CODER_INIT_PASSTHRU,
+	CODER_INIT_ERROR,
+};
+
+
 enum operation_mode opt_mode = MODE_COMPRESS;
 
 enum format_type opt_format = FORMAT_AUTO;
@@ -24,6 +32,10 @@ static lzma_stream strm = LZMA_STREAM_INIT;
 /// Filters needed for all encoding all formats, and also decoding in raw data
 static lzma_filter filters[LZMA_FILTERS_MAX + 1];
 
+/// Input and output buffers
+static uint8_t in_buf[IO_BUFFER_SIZE];
+static uint8_t out_buf[IO_BUFFER_SIZE];
+
 /// Number of filters. Zero indicates that we are using a preset.
 static size_t filters_count = 0;
 
@@ -251,8 +263,69 @@ coder_set_compression_settings(void)
 }
 
 
+/// Return true if the data in in_buf seems to be in the .xz format.
 static bool
-coder_init(void)
+is_format_xz(void)
+{
+	return strm.avail_in >= 6 && memcmp(in_buf, "\3757zXZ", 6) == 0;
+}
+
+
+/// Return true if the data in in_buf seems to be in the .lzma format.
+static bool
+is_format_lzma(void)
+{
+	// The .lzma header is 13 bytes.
+	if (strm.avail_in < 13)
+		return false;
+
+	// Decode the LZMA1 properties.
+	lzma_filter filter = { .id = LZMA_FILTER_LZMA1 };
+	if (lzma_properties_decode(&filter, NULL, in_buf, 5) != LZMA_OK)
+		return false;
+
+	// A hack to ditch tons of false positives: We allow only dictionary
+	// sizes that are 2^n or 2^n + 2^(n-1) or UINT32_MAX. LZMA_Alone
+	// created only files with 2^n, but accepts any dictionary size.
+	// If someone complains, this will be reconsidered.
+	lzma_options_lzma *opt = filter.options;
+	const uint32_t dict_size = opt->dict_size;
+	free(opt);
+
+	if (dict_size != UINT32_MAX) {
+		uint32_t d = dict_size - 1;
+		d |= d >> 2;
+		d |= d >> 3;
+		d |= d >> 4;
+		d |= d >> 8;
+		d |= d >> 16;
+		++d;
+		if (d != dict_size || dict_size == 0)
+			return false;
+	}
+
+	// Another hack to ditch false positives: Assume that if the
+	// uncompressed size is known, it must be less than 256 GiB.
+	// Again, if someone complains, this will be reconsidered.
+	uint64_t uncompressed_size = 0;
+	for (size_t i = 0; i < 8; ++i)
+		uncompressed_size |= (uint64_t)(in_buf[5 + i]) << (i * 8);
+
+	if (uncompressed_size != UINT64_MAX
+			&& uncompressed_size > (UINT64_C(1) << 38))
+		return false;
+
+	return true;
+}
+
+
+/// Detect the input file type (for now, this done only when decompressing),
+/// and initialize an appropriate coder. Return value indicates if a normal
+/// liblzma-based coder was initialized (CODER_INIT_NORMAL), if passthru
+/// mode should be used (CODER_INIT_PASSTHRU), or if an error occurred
+/// (CODER_INIT_ERROR).
+static enum coder_init_ret
+coder_init(file_pair *pair)
 {
 	lzma_ret ret = LZMA_PROG_ERROR;
 
@@ -279,10 +352,45 @@ coder_init(void)
 		const uint32_t flags = LZMA_TELL_UNSUPPORTED_CHECK
 				| LZMA_CONCATENATED;
 
+		// We abuse FORMAT_AUTO to indicate unknown file format,
+		// for which we may consider passthru mode.
+		enum format_type init_format = FORMAT_AUTO;
+
 		switch (opt_format) {
 		case FORMAT_AUTO:
-			ret = lzma_auto_decoder(&strm,
-					hardware_memlimit_get(), flags);
+			if (is_format_xz())
+				init_format = FORMAT_XZ;
+			else if (is_format_lzma())
+				init_format = FORMAT_LZMA;
+			break;
+
+		case FORMAT_XZ:
+			if (is_format_xz())
+				init_format = FORMAT_XZ;
+			break;
+
+		case FORMAT_LZMA:
+			if (is_format_lzma())
+				init_format = FORMAT_LZMA;
+			break;
+
+		case FORMAT_RAW:
+			init_format = FORMAT_RAW;
+			break;
+		}
+
+		switch (init_format) {
+		case FORMAT_AUTO:
+			// Uknown file format. If --decompress --stdout
+			// --force have been given, then we copy the input
+			// as is to stdout. Checking for MODE_DECOMPRESS
+			// is needed, because we don't want to do use
+			// passthru mode with --test.
+			if (opt_mode == MODE_DECOMPRESS
+					&& opt_stdout && opt_force)
+				return CODER_INIT_PASSTHRU;
+
+			ret = LZMA_FORMAT_ERROR;
 			break;
 
 		case FORMAT_XZ:
@@ -304,35 +412,30 @@ coder_init(void)
 	}
 
 	if (ret != LZMA_OK) {
-		if (ret == LZMA_MEM_ERROR)
-			message_error("%s", message_strm(LZMA_MEM_ERROR));
-		else
-			message_bug();
-
-		return true;
+		message_error("%s: %s", pair->src_name, message_strm(ret));
+		return CODER_INIT_ERROR;
 	}
 
-	return false;
+	return CODER_INIT_NORMAL;
 }
 
 
+/// Compress or decompress using liblzma.
 static bool
-coder_main(file_pair *pair)
+coder_normal(file_pair *pair)
 {
-	// Buffers to hold input and output data.
-	uint8_t in_buf[IO_BUFFER_SIZE];
-	uint8_t out_buf[IO_BUFFER_SIZE];
+	// Encoder needs to know when we have given all the input to it.
+	// The decoders need to know it too when we are using
+	// LZMA_CONCATENATED. We need to check for src_eof here, because
+	// the first input chunk has been already read, and that may
+	// have been the only chunk we will read.
+	lzma_action action = pair->src_eof ? LZMA_FINISH : LZMA_RUN;
 
-	// Initialize the progress indicator.
-	const uint64_t in_size = pair->src_st.st_size <= (off_t)(0)
-			? 0 : (uint64_t)(pair->src_st.st_size);
-	message_progress_start(&strm, pair->src_name, in_size);
-
-	lzma_action action = LZMA_RUN;
 	lzma_ret ret;
-	bool success = false; // Assume that something goes wrong.
 
-	strm.avail_in = 0;
+	// Assume that something goes wrong.
+	bool success = false;
+
 	strm.next_out = out_buf;
 	strm.avail_out = IO_BUFFER_SIZE;
 
@@ -346,9 +449,6 @@ coder_main(file_pair *pair)
 			if (strm.avail_in == SIZE_MAX)
 				break;
 
-			// Encoder needs to know when we have given all the
-			// input to it. The decoders need to know it too when
-			// we are using LZMA_CONCATENATED.
 			if (pair->src_eof)
 				action = LZMA_FINISH;
 		}
@@ -457,28 +557,71 @@ coder_main(file_pair *pair)
 		message_progress_update();
 	}
 
-	message_progress_end(success);
-
 	return success;
 }
 
 
+/// Copy from input file to output file without processing the data in any
+/// way. This is used only when trying to decompress unrecognized files
+/// with --decompress --stdout --force, so the output is always stdout.
+static bool
+coder_passthru(file_pair *pair)
+{
+	while (strm.avail_in != 0) {
+		if (user_abort)
+			return false;
+
+		if (io_write(pair, in_buf, strm.avail_in))
+			return false;
+
+		strm.total_in += strm.avail_in;
+		strm.total_out = strm.total_in;
+		message_progress_update();
+
+		strm.avail_in = io_read(pair, in_buf, IO_BUFFER_SIZE);
+		if (strm.avail_in == SIZE_MAX)
+			return false;
+	}
+
+	return true;
+}
+
+
 extern void
 coder_run(const char *filename)
 {
-	// First try initializing the coder. If it fails, it's useless to try
-	// opening the file. Check also for user_abort just in case if we had
-	// got a signal while initializing the coder.
-	if (coder_init() || user_abort)
-		return;
-
 	// Try to open the input and output files.
 	file_pair *pair = io_open(filename);
 	if (pair == NULL)
 		return;
 
-	// Do the actual coding.
-	const bool success = coder_main(pair);
+	// Initialize the progress indicator.
+	const uint64_t in_size = pair->src_st.st_size <= (off_t)(0)
+			? 0 : (uint64_t)(pair->src_st.st_size);
+	message_progress_start(&strm, pair->src_name, in_size);
+
+	// Assume that something goes wrong.
+	bool success = false;
+
+	// Read the first chunk of input data. This is needed to detect
+	// the input file type (for now, only for decompression).
+	strm.next_in = in_buf;
+	strm.avail_in = io_read(pair, in_buf, IO_BUFFER_SIZE);
+
+	switch (coder_init(pair)) {
+	case CODER_INIT_NORMAL:
+		success = coder_normal(pair);
+		break;
+
+	case CODER_INIT_PASSTHRU:
+		success = coder_passthru(pair);
+		break;
+
+	case CODER_INIT_ERROR:
+		break;
+	}
+
+	message_progress_end(success);
 
 	// Close the file pair. It needs to know if coding was successful to
 	// know if the source or target file should be unlinked.
-- 
cgit v1.2.3