1 files changed, 178 insertions, 35 deletions
diff --git a/src/xz/coder.c b/src/xz/coder.c
index 5584cac4..90c99271 100644
--- a/src/xz/coder.c
+++ b/src/xz/coder.c
@@ -13,6 +13,14 @@
 #include "private.h"
 
 
+/// Return value type for coder_init().
+enum coder_init_ret {
+	CODER_INIT_NORMAL,
+	CODER_INIT_PASSTHRU,
+	CODER_INIT_ERROR,
+};
+
+
 enum operation_mode opt_mode = MODE_COMPRESS;
 
 enum format_type opt_format = FORMAT_AUTO;
@@ -24,6 +32,10 @@ static lzma_stream strm = LZMA_STREAM_INIT;
 /// Filters needed for all encoding all formats, and also decoding in raw data
 static lzma_filter filters[LZMA_FILTERS_MAX + 1];
 
+/// Input and output buffers
+static uint8_t in_buf[IO_BUFFER_SIZE];
+static uint8_t out_buf[IO_BUFFER_SIZE];
+
 /// Number of filters. Zero indicates that we are using a preset.
 static size_t filters_count = 0;
 
@@ -251,8 +263,69 @@ coder_set_compression_settings(void)
 }
 
 
+/// Return true if the data in in_buf seems to be in the .xz format.
 static bool
-coder_init(void)
+is_format_xz(void)
+{
+	return strm.avail_in >= 6 && memcmp(in_buf, "\3757zXZ", 6) == 0;
+}
+
+
+/// Return true if the data in in_buf seems to be in the .lzma format.
+static bool
+is_format_lzma(void)
+{
+	// The .lzma header is 13 bytes.
+	if (strm.avail_in < 13)
+		return false;
+
+	// Decode the LZMA1 properties.
+	lzma_filter filter = { .id = LZMA_FILTER_LZMA1 };
+	if (lzma_properties_decode(&filter, NULL, in_buf, 5) != LZMA_OK)
+		return false;
+
+	// A hack to ditch tons of false positives: We allow only dictionary
+	// sizes that are 2^n or 2^n + 2^(n-1) or UINT32_MAX. LZMA_Alone
+	// created only files with 2^n, but accepts any dictionary size.
+	// If someone complains, this will be reconsidered.
+	lzma_options_lzma *opt = filter.options;
+	const uint32_t dict_size = opt->dict_size;
+	free(opt);
+
+	if (dict_size != UINT32_MAX) {
+		uint32_t d = dict_size - 1;
+		d |= d >> 2;
+		d |= d >> 3;
+		d |= d >> 4;
+		d |= d >> 8;
+		d |= d >> 16;
+		++d;
+		if (d != dict_size || dict_size == 0)
+			return false;
+	}
+
+	// Another hack to ditch false positives: Assume that if the
+	// uncompressed size is known, it must be less than 256 GiB.
+	// Again, if someone complains, this will be reconsidered.
+	uint64_t uncompressed_size = 0;
+	for (size_t i = 0; i < 8; ++i)
+		uncompressed_size |= (uint64_t)(in_buf[5 + i]) << (i * 8);
+
+	if (uncompressed_size != UINT64_MAX
+			&& uncompressed_size > (UINT64_C(1) << 38))
+		return false;
+
+	return true;
+}
+
+
+/// Detect the input file type (for now, this done only when decompressing),
+/// and initialize an appropriate coder. Return value indicates if a normal
+/// liblzma-based coder was initialized (CODER_INIT_NORMAL), if passthru
+/// mode should be used (CODER_INIT_PASSTHRU), or if an error occurred
+/// (CODER_INIT_ERROR).
+static enum coder_init_ret
+coder_init(file_pair *pair)
 {
 	lzma_ret ret = LZMA_PROG_ERROR;
 
@@ -279,10 +352,45 @@ coder_init(void)
 		const uint32_t flags = LZMA_TELL_UNSUPPORTED_CHECK
 				| LZMA_CONCATENATED;
 
+		// We abuse FORMAT_AUTO to indicate unknown file format,
+		// for which we may consider passthru mode.
+		enum format_type init_format = FORMAT_AUTO;
+
 		switch (opt_format) {
 		case FORMAT_AUTO:
-			ret = lzma_auto_decoder(&strm,
-					hardware_memlimit_get(), flags);
+			if (is_format_xz())
+				init_format = FORMAT_XZ;
+			else if (is_format_lzma())
+				init_format = FORMAT_LZMA;
+			break;
+
+		case FORMAT_XZ:
+			if (is_format_xz())
+				init_format = FORMAT_XZ;
+			break;
+
+		case FORMAT_LZMA:
+			if (is_format_lzma())
+				init_format = FORMAT_LZMA;
+			break;
+
+		case FORMAT_RAW:
+			init_format = FORMAT_RAW;
+			break;
+		}
+
+		switch (init_format) {
+		case FORMAT_AUTO:
+			// Uknown file format. If --decompress --stdout
+			// --force have been given, then we copy the input
+			// as is to stdout. Checking for MODE_DECOMPRESS
+			// is needed, because we don't want to do use
+			// passthru mode with --test.
+			if (opt_mode == MODE_DECOMPRESS
+					&& opt_stdout && opt_force)
+				return CODER_INIT_PASSTHRU;
+
+			ret = LZMA_FORMAT_ERROR;
 			break;
 
 		case FORMAT_XZ:
@@ -304,35 +412,30 @@ coder_init(void)
 	}
 
 	if (ret != LZMA_OK) {
-		if (ret == LZMA_MEM_ERROR)
-			message_error("%s", message_strm(LZMA_MEM_ERROR));
-		else
-			message_bug();
-
-		return true;
+		message_error("%s: %s", pair->src_name, message_strm(ret));
+		return CODER_INIT_ERROR;
 	}
 
-	return false;
+	return CODER_INIT_NORMAL;
 }
 
 
+/// Compress or decompress using liblzma.
 static bool
-coder_main(file_pair *pair)
+coder_normal(file_pair *pair)
 {
-	// Buffers to hold input and output data.
-	uint8_t in_buf[IO_BUFFER_SIZE];
-	uint8_t out_buf[IO_BUFFER_SIZE];
+	// Encoder needs to know when we have given all the input to it.
+	// The decoders need to know it too when we are using
+	// LZMA_CONCATENATED. We need to check for src_eof here, because
+	// the first input chunk has been already read, and that may
+	// have been the only chunk we will read.
+	lzma_action action = pair->src_eof ? LZMA_FINISH : LZMA_RUN;
 
-	// Initialize the progress indicator.
-	const uint64_t in_size = pair->src_st.st_size <= (off_t)(0)
-			? 0 : (uint64_t)(pair->src_st.st_size);
-	message_progress_start(&strm, pair->src_name, in_size);
-
-	lzma_action action = LZMA_RUN;
 	lzma_ret ret;
-	bool success = false; // Assume that something goes wrong.
 
-	strm.avail_in = 0;
+	// Assume that something goes wrong.
+	bool success = false;
+
 	strm.next_out = out_buf;
 	strm.avail_out = IO_BUFFER_SIZE;
 
@@ -346,9 +449,6 @@ coder_main(file_pair *pair)
 			if (strm.avail_in == SIZE_MAX)
 				break;
 
-			// Encoder needs to know when we have given all the
-			// input to it. The decoders need to know it too when
-			// we are using LZMA_CONCATENATED.
 			if (pair->src_eof)
 				action = LZMA_FINISH;
 		}
@@ -457,28 +557,71 @@ coder_main(file_pair *pair)
 		message_progress_update();
 	}
 
-	message_progress_end(success);
-
 	return success;
 }
 
 
+/// Copy from input file to output file without processing the data in any
+/// way. This is used only when trying to decompress unrecognized files
+/// with --decompress --stdout --force, so the output is always stdout.
+static bool
+coder_passthru(file_pair *pair)
+{
+	while (strm.avail_in != 0) {
+		if (user_abort)
+			return false;
+
+		if (io_write(pair, in_buf, strm.avail_in))
+			return false;
+
+		strm.total_in += strm.avail_in;
+		strm.total_out = strm.total_in;
+		message_progress_update();
+
+		strm.avail_in = io_read(pair, in_buf, IO_BUFFER_SIZE);
+		if (strm.avail_in == SIZE_MAX)
+			return false;
+	}
+
+	return true;
+}
+
+
 extern void
 coder_run(const char *filename)
 {
-	// First try initializing the coder. If it fails, it's useless to try
-	// opening the file. Check also for user_abort just in case if we had
-	// got a signal while initializing the coder.
-	if (coder_init() || user_abort)
-		return;
-
 	// Try to open the input and output files.
 	file_pair *pair = io_open(filename);
 	if (pair == NULL)
 		return;
 
-	// Do the actual coding.
-	const bool success = coder_main(pair);
+	// Initialize the progress indicator.
+	const uint64_t in_size = pair->src_st.st_size <= (off_t)(0)
+			? 0 : (uint64_t)(pair->src_st.st_size);
+	message_progress_start(&strm, pair->src_name, in_size);
+
+	// Assume that something goes wrong.
+	bool success = false;
+
+	// Read the first chunk of input data. This is needed to detect
+	// the input file type (for now, only for decompression).
+	strm.next_in = in_buf;
+	strm.avail_in = io_read(pair, in_buf, IO_BUFFER_SIZE);
+
+	switch (coder_init(pair)) {
+	case CODER_INIT_NORMAL:
+		success = coder_normal(pair);
+		break;
+
+	case CODER_INIT_PASSTHRU:
+		success = coder_passthru(pair);
+		break;
+
+	case CODER_INIT_ERROR:
+		break;
+	}
+
+	message_progress_end(success);
 
 	// Close the file pair. It needs to know if coding was successful to
 	// know if the source or target file should be unlinked.