aboutsummaryrefslogtreecommitdiff
path: root/src/xz/coder.c
diff options
context:
space:
mode:
Diffstat (limited to 'src/xz/coder.c')
-rw-r--r--src/xz/coder.c213
1 files changed, 178 insertions, 35 deletions
diff --git a/src/xz/coder.c b/src/xz/coder.c
index 5584cac4..90c99271 100644
--- a/src/xz/coder.c
+++ b/src/xz/coder.c
@@ -13,6 +13,14 @@
#include "private.h"
+/// Return value type for coder_init(); coder_run() uses it to pick the coding loop.
+enum coder_init_ret {
+ CODER_INIT_NORMAL, // A liblzma-based coder was initialized.
+ CODER_INIT_PASSTHRU, // Unrecognized input is to be copied to the output as is.
+ CODER_INIT_ERROR, // Initialization failed; nothing is to be coded.
+};
+
+
enum operation_mode opt_mode = MODE_COMPRESS;
enum format_type opt_format = FORMAT_AUTO;
@@ -24,6 +32,10 @@ static lzma_stream strm = LZMA_STREAM_INIT;
/// Filters needed for all encoding all formats, and also decoding in raw data
static lzma_filter filters[LZMA_FILTERS_MAX + 1];
+/// Input and output buffers
+static uint8_t in_buf[IO_BUFFER_SIZE];
+static uint8_t out_buf[IO_BUFFER_SIZE];
+
/// Number of filters. Zero indicates that we are using a preset.
static size_t filters_count = 0;
@@ -251,8 +263,69 @@ coder_set_compression_settings(void)
}
+/// Return true if at least six bytes are available and in_buf starts with the .xz magic bytes.
static bool
-coder_init(void)
+is_format_xz(void)
+{
+ return strm.avail_in >= 6 && memcmp(in_buf, "\3757zXZ", 6) == 0; // "\375" is an octal escape (0xFD); the literal's terminating NUL is the sixth byte
+}
+
+
+/// Return true if the data in in_buf passes the heuristic checks for a .lzma header.
+static bool
+is_format_lzma(void)
+{
+ // The .lzma header is 13 bytes: 5 property bytes, then an 8-byte size.
+ if (strm.avail_in < 13)
+ return false;
+
+ // Decode the LZMA1 properties from the first five bytes.
+ lzma_filter filter = { .id = LZMA_FILTER_LZMA1 };
+ if (lzma_properties_decode(&filter, NULL, in_buf, 5) != LZMA_OK)
+ return false;
+
+ // A hack to ditch tons of false positives: We allow only dictionary
+ // sizes that are 2^n or 2^n + 2^(n-1) or UINT32_MAX. LZMA_Alone
+ // created only files with 2^n, but accepts any dictionary size.
+ // If someone complains, this will be reconsidered.
+ lzma_options_lzma *opt = filter.options;
+ const uint32_t dict_size = opt->dict_size;
+ free(opt); // Only dict_size is needed from the decoded options.
+
+ if (dict_size != UINT32_MAX) {
+ uint32_t d = dict_size - 1; // dict_size == 0 wraps to UINT32_MAX here.
+ d |= d >> 2; // Set every bit below the highest set bit, except
+ d |= d >> 3; // the one directly beneath it: there is no ">> 1"
+ d |= d >> 4; // step, so that bit keeps whatever value it had.
+ d |= d >> 8;
+ d |= d >> 16;
+ ++d; // Equals dict_size again only for 2^n and 2^n + 2^(n-1).
+ if (d != dict_size || dict_size == 0) // 0 needs its own test: d wrapped back to 0.
+ return false;
+ }
+
+ // Another hack to ditch false positives: Assume that if the
+ // uncompressed size is known, it must be at most 256 GiB.
+ // Again, if someone complains, this will be reconsidered.
+ uint64_t uncompressed_size = 0;
+ for (size_t i = 0; i < 8; ++i)
+ uncompressed_size |= (uint64_t)(in_buf[5 + i]) << (i * 8); // Bytes 5..12, least significant byte first.
+
+ if (uncompressed_size != UINT64_MAX // UINT64_MAX is exempt: treated as "size not known".
+ && uncompressed_size > (UINT64_C(1) << 38))
+ return false;
+
+ return true;
+}
+
+
+/// Detect the input file type (for now, this is done only when decompressing),
+/// and initialize an appropriate coder. Return value indicates if a normal
+/// liblzma-based coder was initialized (CODER_INIT_NORMAL), if passthru
+/// mode should be used (CODER_INIT_PASSTHRU), or if an error occurred
+/// (CODER_INIT_ERROR).
+static enum coder_init_ret
+coder_init(file_pair *pair)
{
lzma_ret ret = LZMA_PROG_ERROR;
@@ -279,10 +352,45 @@ coder_init(void)
const uint32_t flags = LZMA_TELL_UNSUPPORTED_CHECK
| LZMA_CONCATENATED;
+ // We abuse FORMAT_AUTO to indicate unknown file format,
+ // for which we may consider passthru mode.
+ enum format_type init_format = FORMAT_AUTO;
+
switch (opt_format) {
case FORMAT_AUTO:
- ret = lzma_auto_decoder(&strm,
- hardware_memlimit_get(), flags);
+ if (is_format_xz())
+ init_format = FORMAT_XZ;
+ else if (is_format_lzma())
+ init_format = FORMAT_LZMA;
+ break;
+
+ case FORMAT_XZ:
+ if (is_format_xz())
+ init_format = FORMAT_XZ;
+ break;
+
+ case FORMAT_LZMA:
+ if (is_format_lzma())
+ init_format = FORMAT_LZMA;
+ break;
+
+ case FORMAT_RAW:
+ init_format = FORMAT_RAW;
+ break;
+ }
+
+ switch (init_format) {
+ case FORMAT_AUTO:
+ // Unknown file format. If --decompress --stdout
+ // --force have been given, then we copy the input
+ // as is to stdout. Checking for MODE_DECOMPRESS
+ // is needed, because we don't want to use
+ // passthru mode with --test.
+ if (opt_mode == MODE_DECOMPRESS
+ && opt_stdout && opt_force)
+ return CODER_INIT_PASSTHRU;
+
+ ret = LZMA_FORMAT_ERROR;
break;
case FORMAT_XZ:
@@ -304,35 +412,30 @@ coder_init(void)
}
if (ret != LZMA_OK) {
- if (ret == LZMA_MEM_ERROR)
- message_error("%s", message_strm(LZMA_MEM_ERROR));
- else
- message_bug();
-
- return true;
+ message_error("%s: %s", pair->src_name, message_strm(ret));
+ return CODER_INIT_ERROR;
}
- return false;
+ return CODER_INIT_NORMAL;
}
+/// Compress or decompress using liblzma.
static bool
-coder_main(file_pair *pair)
+coder_normal(file_pair *pair)
{
- // Buffers to hold input and output data.
- uint8_t in_buf[IO_BUFFER_SIZE];
- uint8_t out_buf[IO_BUFFER_SIZE];
+ // Encoder needs to know when we have given all the input to it.
+ // The decoders need to know it too when we are using
+ // LZMA_CONCATENATED. We need to check for src_eof here, because
+ // the first input chunk has been already read, and that may
+ // have been the only chunk we will read.
+ lzma_action action = pair->src_eof ? LZMA_FINISH : LZMA_RUN;
- // Initialize the progress indicator.
- const uint64_t in_size = pair->src_st.st_size <= (off_t)(0)
- ? 0 : (uint64_t)(pair->src_st.st_size);
- message_progress_start(&strm, pair->src_name, in_size);
-
- lzma_action action = LZMA_RUN;
lzma_ret ret;
- bool success = false; // Assume that something goes wrong.
- strm.avail_in = 0;
+ // Assume that something goes wrong.
+ bool success = false;
+
strm.next_out = out_buf;
strm.avail_out = IO_BUFFER_SIZE;
@@ -346,9 +449,6 @@ coder_main(file_pair *pair)
if (strm.avail_in == SIZE_MAX)
break;
- // Encoder needs to know when we have given all the
- // input to it. The decoders need to know it too when
- // we are using LZMA_CONCATENATED.
if (pair->src_eof)
action = LZMA_FINISH;
}
@@ -457,28 +557,71 @@ coder_main(file_pair *pair)
message_progress_update();
}
- message_progress_end(success);
-
return success;
}
+/// Copy the input to the output unmodified, starting with the chunk already in
+/// in_buf. Used only for unrecognized files with --decompress --stdout --force,
+/// so the output is always stdout. Returns true on success, false otherwise.
+static bool
+coder_passthru(file_pair *pair)
+{
+ while (strm.avail_in != 0) { // NOTE(review): assumes avail_in is not SIZE_MAX on entry - confirm in caller.
+ if (user_abort)
+ return false;
+
+ if (io_write(pair, in_buf, strm.avail_in)) // A true return is treated as a write failure.
+ return false;
+
+ strm.total_in += strm.avail_in;
+ strm.total_out = strm.total_in; // The same bytes go out as came in, so the counters match.
+ message_progress_update();
+
+ strm.avail_in = io_read(pair, in_buf, IO_BUFFER_SIZE); // Zero ends the loop.
+ if (strm.avail_in == SIZE_MAX) // SIZE_MAX is treated as a failed read.
+ return false;
+ }
+
+ return true;
+}
+
+
extern void
coder_run(const char *filename)
{
- // First try initializing the coder. If it fails, it's useless to try
- // opening the file. Check also for user_abort just in case if we had
- // got a signal while initializing the coder.
- if (coder_init() || user_abort)
- return;
-
// Try to open the input and output files.
file_pair *pair = io_open(filename);
if (pair == NULL)
return;
- // Do the actual coding.
- const bool success = coder_main(pair);
+ // Initialize the progress indicator.
+ const uint64_t in_size = pair->src_st.st_size <= (off_t)(0)
+ ? 0 : (uint64_t)(pair->src_st.st_size);
+ message_progress_start(&strm, pair->src_name, in_size);
+
+ // Assume that something goes wrong.
+ bool success = false;
+
+ // Read the first chunk of input data. This is needed to detect
+ // the input file type (for now, only for decompression).
+ strm.next_in = in_buf;
+ strm.avail_in = io_read(pair, in_buf, IO_BUFFER_SIZE);
+
+ switch (coder_init(pair)) {
+ case CODER_INIT_NORMAL:
+ success = coder_normal(pair);
+ break;
+
+ case CODER_INIT_PASSTHRU:
+ success = coder_passthru(pair);
+ break;
+
+ case CODER_INIT_ERROR:
+ break;
+ }
+
+ message_progress_end(success);
// Close the file pair. It needs to know if coding was successful to
// know if the source or target file should be unlinked.