9 files changed, 204 insertions, 15 deletions
diff --git a/src/liblzma/api/lzma/lzma12.h b/src/liblzma/api/lzma/lzma12.h
index df5f23b6..d34e7839 100644
--- a/src/liblzma/api/lzma/lzma12.h
+++ b/src/liblzma/api/lzma/lzma12.h
@@ -18,18 +18,41 @@
 
 
 /**
- * \brief       LZMA1 Filter ID
+ * \brief       LZMA1 Filter ID (for raw encoder/decoder only, not in .xz)
  *
  * LZMA1 is the very same thing as what was called just LZMA in LZMA Utils,
  * 7-Zip, and LZMA SDK. It's called LZMA1 here to prevent developers from
  * accidentally using LZMA when they actually want LZMA2.
- *
- * LZMA1 shouldn't be used for new applications unless you _really_ know
- * what you are doing. LZMA2 is almost always a better choice.
  */
 #define LZMA_FILTER_LZMA1       LZMA_VLI_C(0x4000000000000001)
 
 /**
+ * \brief       LZMA1 Filter ID with extended options (for raw encoder/decoder)
+ *
+ * This is like LZMA_FILTER_LZMA1 but with this ID a few extra options
+ * are supported in the lzma_options_lzma structure:
+ *
+ *   - A flag to tell the encoder if the end of payload marker (EOPM) alias
+ *     end of stream (EOS) marker must be written at the end of the stream.
+ *     In contrast, LZMA_FILTER_LZMA1 always writes the end marker.
+ *
+ *   - Decoder needs to be told the uncompressed size of the stream
+ *     or that it is unknown (using the special value UINT64_MAX).
+ *     If the size is known, a flag can be set to allow the presence of
+ *     the end marker anyway. In contrast, LZMA_FILTER_LZMA1 always
+ *     behaves as if the uncompressed size was unknown.
+ *
+ * This allows handling file formats where LZMA1 streams are used but where
+ * the end marker isn't allowed or where it might not (always) be present.
+ * This extended LZMA1 functionality is provided as a Filter ID for raw
+ * encoder and decoder instead of adding new encoder and decoder initialization
+ * functions because this way it is possible to also use extra filters,
+ * for example, LZMA_FILTER_X86 in a filter chain with LZMA_FILTER_LZMA1EXT,
+ * which might be needed to handle some file formats.
+ */
+#define LZMA_FILTER_LZMA1EXT    LZMA_VLI_C(0x4000000000000002)
+
+/**
  * \brief       LZMA2 Filter ID
  *
  * Usually you want this instead of LZMA1. Compared to LZMA1, LZMA2 adds
@@ -374,6 +397,82 @@ typedef struct {
 	 */
 	uint32_t depth;
 
+	/**
+	 * \brief       For LZMA_FILTER_LZMA1EXT: Extended flags
+	 *
+	 * This is used only with LZMA_FILTER_LZMA1EXT.
+	 *
+	 * Currently only one flag is supported, LZMA_LZMA1EXT_ALLOW_EOPM:
+	 *
+	 *   - Encoder: If the flag is set, then end marker is written just
+	 *     like it is with LZMA_FILTER_LZMA1. Without this flag the
+	 *     end marker isn't written and the application has to store
+	 *     the uncompressed size somewhere outside the compressed stream.
+	 *     To decompress streams without the end marker, the appliation
+	 *     has to set the correct uncompressed size in ext_size_low and
+	 *     ext_size_high.
+	 *
+	 *   - Decoder: If the uncompressed size in ext_size_low and
+	 *     ext_size_high is set to the special value UINT64_MAX
+	 *     (indicating unknown uncompressed size) then this flag is
+	 *     ignored and the end marker must always be present, that is,
+	 *     the behavior is identical to LZMA_FILTER_LZMA1.
+	 *
+	 *     Otherwise, if this flag isn't set, then the input stream
+	 *     must not have the end marker; if the end marker is detected
+	 *     then it will result in LZMA_DATA_ERROR. This is useful when
+	 *     it is known that the stream must not have the end marker and
+	 *     strict validation is wanted.
+	 *
+	 *     If this flag is set, then it is autodetected if the end marker
+	 *     is present after the specified number of uncompressed bytes
+	 *     has been decompressed (ext_size_low and ext_size_high). The
+	 *     end marker isn't allowed in any other position. This behavior
+	 *     is useful when uncompressed size is known but the end marker
+	 *     may or may not be present. This is the case, for example,
+	 *     in .7z files (valid .7z files that have the end marker in
+	 *     LZMA1 streams are rare but they do exist).
+	 */
+	uint32_t ext_flags;
+#	define LZMA_LZMA1EXT_ALLOW_EOPM   UINT32_C(0x01)
+
+	/**
+	 * \brief       For LZMA_FILTER_LZMA1EXT: Uncompressed size (low bits)
+	 *
+	 * The 64-bit uncompressed size is needed for decompression with
+	 * LZMA_FILTER_LZMA1EXT. The size is ignored by the encoder.
+	 *
+	 * The special value UINT64_MAX indicates that the uncompressed size
+	 * is unknown and that the end of payload marker (also known as
+	 * end of stream marker) must be present to indicate the end of
+	 * the LZMA1 stream. Any other value indicates the expected
+	 * uncompressed size of the LZMA1 stream. (If LZMA1 was used together
+	 * with filters that change the size of the data then the uncompressed
+	 * size of the LZMA1 stream could be different than the final
+	 * uncompressed size of the filtered stream.)
+	 *
+	 * ext_size_low holds the least significant 32 bits of the
+	 * uncompressed size. The most significant 32 bits must be set
+	 * in ext_size_high. The macro lzma_ext_size_set(opt_lzma, u64size)
+	 * can be used to set these members.
+	 *
+	 * The 64-bit uncompressed size is split into two uint32_t variables
+	 * because there were no reserved uint64_t members and using the
+	 * same options structure for LZMA_FILTER_LZMA1, LZMA_FILTER_LZMA1EXT,
+	 * and LZMA_FILTER_LZMA2 was otherwise more convenient than having
+	 * a new options structure for LZMA_FILTER_LZMA1EXT. (Replacing two
+	 * uint32_t members with one uint64_t changes the ABI on some systems
+	 * as the alignment of this struct can increase from 4 bytes to 8.)
+	 */
+	uint32_t ext_size_low;
+
+	/**
+	 * \brief       For LZMA_FILTER_LZMA1EXT: Uncompressed size (high bits)
+	 *
+	 * This holds the most significant 32 bits of the uncompressed size.
+	 */
+	uint32_t ext_size_high;
+
 	/*
 	 * Reserved space to allow possible future extensions without
 	 * breaking the ABI. You should not touch these, because the names
@@ -381,9 +480,6 @@ typedef struct {
 	 * with the currently supported options, so it is safe to leave these
 	 * uninitialized.
 	 */
-	uint32_t reserved_int1;
-	uint32_t reserved_int2;
-	uint32_t reserved_int3;
 	uint32_t reserved_int4;
 	uint32_t reserved_int5;
 	uint32_t reserved_int6;
@@ -400,6 +496,19 @@ typedef struct {
 
 
 /**
+ * \brief       Macro to set the 64-bit uncompressed size in ext_size_*
+ *
+ * This might be convenient when decoding using LZMA_FILTER_LZMA1EXT.
+ * This isn't used with LZMA_FILTER_LZMA1 or LZMA_FILTER_LZMA2.
+ */
+#define lzma_set_ext_size(opt_lzma2, u64size) \
+do { \
+	(opt_lzma2).ext_size_low = (uint32_t)(u64size); \
+	(opt_lzma2).ext_size_high = (uint32_t)((uint64_t)(u64size) >> 32); \
+} while (0)
+
+
+/**
  * \brief       Set a compression preset to lzma_options_lzma structure
  *
  * 0 is the fastest and 9 is the slowest. These match the switches -0 .. -9
diff --git a/src/liblzma/common/filter_common.c b/src/liblzma/common/filter_common.c
index a803b4c2..c8694e2c 100644
--- a/src/liblzma/common/filter_common.c
+++ b/src/liblzma/common/filter_common.c
@@ -42,6 +42,13 @@ static const struct {
 		.last_ok = true,
 		.changes_size = true,
 	},
+	{
+		.id = LZMA_FILTER_LZMA1EXT,
+		.options_size = sizeof(lzma_options_lzma),
+		.non_last_ok = false,
+		.last_ok = true,
+		.changes_size = true,
+	},
 #endif
 #if defined(HAVE_ENCODER_LZMA2) || defined(HAVE_DECODER_LZMA2)
 	{
diff --git a/src/liblzma/common/filter_decoder.c b/src/liblzma/common/filter_decoder.c
index b031ac62..fa53f5bd 100644
--- a/src/liblzma/common/filter_decoder.c
+++ b/src/liblzma/common/filter_decoder.c
@@ -50,6 +50,12 @@ static const lzma_filter_decoder decoders[] = {
 		.memusage = &lzma_lzma_decoder_memusage,
 		.props_decode = &lzma_lzma_props_decode,
 	},
+	{
+		.id = LZMA_FILTER_LZMA1EXT,
+		.init = &lzma_lzma_decoder_init,
+		.memusage = &lzma_lzma_decoder_memusage,
+		.props_decode = &lzma_lzma_props_decode,
+	},
 #endif
 #ifdef HAVE_DECODER_LZMA2
 	{
diff --git a/src/liblzma/common/filter_encoder.c b/src/liblzma/common/filter_encoder.c
index 5d6c1a7e..978b7a6b 100644
--- a/src/liblzma/common/filter_encoder.c
+++ b/src/liblzma/common/filter_encoder.c
@@ -64,6 +64,15 @@ static const lzma_filter_encoder encoders[] = {
 		.props_size_fixed = 5,
 		.props_encode = &lzma_lzma_props_encode,
 	},
+	{
+		.id = LZMA_FILTER_LZMA1EXT,
+		.init = &lzma_lzma_encoder_init,
+		.memusage = &lzma_lzma_encoder_memusage,
+		.block_size = NULL, // Not needed for LZMA1
+		.props_size_get = NULL,
+		.props_size_fixed = 5,
+		.props_encode = &lzma_lzma_props_encode,
+	},
 #endif
 #ifdef HAVE_ENCODER_LZMA2
 	{
diff --git a/src/liblzma/lzma/lzma2_encoder.c b/src/liblzma/lzma/lzma2_encoder.c
index f1252c57..4b6b2311 100644
--- a/src/liblzma/lzma/lzma2_encoder.c
+++ b/src/liblzma/lzma/lzma2_encoder.c
@@ -341,7 +341,7 @@ lzma2_encoder_init(lzma_lz_encoder *lz, const lzma_allocator *allocator,
 
 	// Initialize LZMA encoder
 	return_if_error(lzma_lzma_encoder_create(&coder->lzma, allocator,
-			&coder->opt_cur, lz_options));
+			LZMA_FILTER_LZMA2, &coder->opt_cur, lz_options));
 
 	// Make sure that we will always have enough history available in
 	// case we need to use uncompressed chunks. They are used when the
diff --git a/src/liblzma/lzma/lzma_decoder.c b/src/liblzma/lzma/lzma_decoder.c
index 550963d1..26c148a9 100644
--- a/src/liblzma/lzma/lzma_decoder.c
+++ b/src/liblzma/lzma/lzma_decoder.c
@@ -1018,11 +1018,35 @@ lzma_decoder_init(lzma_lz_decoder *lz, const lzma_allocator *allocator,
 	if (!is_lclppb_valid(options))
 		return LZMA_PROG_ERROR;
 
+	lzma_vli uncomp_size = LZMA_VLI_UNKNOWN;
+	bool allow_eopm = true;
+
+	if (id == LZMA_FILTER_LZMA1EXT) {
+		const lzma_options_lzma *opt = options;
+
+		// Only one flag is supported.
+		if (opt->ext_flags & ~LZMA_LZMA1EXT_ALLOW_EOPM)
+			return LZMA_OPTIONS_ERROR;
+
+		// FIXME? Using lzma_vli instead of uint64_t is weird because
+		// this has nothing to do with .xz headers and variable-length
+		// integer encoding. On the other hand, using LZMA_VLI_UNKNOWN
+		// instead of UINT64_MAX is clearer when unknown size is
+		// meant. A problem with using lzma_vli is that now we
+		// allow > LZMA_VLI_MAX which is fine in this file but
+		// it's still confusing. Note that alone_decoder.c also
+		// allows > LZMA_VLI_MAX when setting uncompressed size.
+		uncomp_size = opt->ext_size_low
+				+ ((uint64_t)(opt->ext_size_high) << 32);
+		allow_eopm = (opt->ext_flags & LZMA_LZMA1EXT_ALLOW_EOPM) != 0
+				|| uncomp_size == LZMA_VLI_UNKNOWN;
+	}
+
 	return_if_error(lzma_lzma_decoder_create(
 			lz, allocator, options, lz_options));
 
 	lzma_decoder_reset(lz->coder, options);
-	lzma_decoder_uncompressed(lz->coder, LZMA_VLI_UNKNOWN, true);
+	lzma_decoder_uncompressed(lz->coder, uncomp_size, allow_eopm);
 
 	return LZMA_OK;
 }
diff --git a/src/liblzma/lzma/lzma_encoder.c b/src/liblzma/lzma/lzma_encoder.c
index e2dbbc03..dc62f44f 100644
--- a/src/liblzma/lzma/lzma_encoder.c
+++ b/src/liblzma/lzma/lzma_encoder.c
@@ -416,7 +416,7 @@ lzma_lzma_encode(lzma_lzma1_encoder *restrict coder, lzma_mf *restrict mf,
 	//
 	// Plain LZMA streams without EOPM aren't supported except when
 	// output size limiting is enabled.
-	if (limit == UINT32_MAX && coder->out_limit == 0)
+	if (coder->use_eopm)
 		encode_eopm(coder, (uint32_t)(coder->uncomp_size));
 
 	// Flush the remaining bytes from the range encoder.
@@ -462,6 +462,7 @@ lzma_lzma_set_out_limit(
 	lzma_lzma1_encoder *coder = coder_ptr;
 	coder->out_limit = out_limit;
 	coder->uncomp_size_ptr = uncomp_size;
+	coder->use_eopm = false;
 	return LZMA_OK;
 }
 
@@ -599,10 +600,13 @@ lzma_lzma_encoder_reset(lzma_lzma1_encoder *coder,
 
 
 extern lzma_ret
-lzma_lzma_encoder_create(void **coder_ptr,
-		const lzma_allocator *allocator,
-		const lzma_options_lzma *options, lzma_lz_options *lz_options)
+lzma_lzma_encoder_create(void **coder_ptr, const lzma_allocator *allocator,
+		lzma_vli id, const lzma_options_lzma *options,
+		lzma_lz_options *lz_options)
 {
+	assert(id == LZMA_FILTER_LZMA1 || id == LZMA_FILTER_LZMA1EXT
+			|| id == LZMA_FILTER_LZMA2);
+
 	// Allocate lzma_lzma1_encoder if it wasn't already allocated.
 	if (*coder_ptr == NULL) {
 		*coder_ptr = lzma_alloc(sizeof(lzma_lzma1_encoder), allocator);
@@ -672,6 +676,32 @@ lzma_lzma_encoder_create(void **coder_ptr,
 	// Output size limitting is disabled by default.
 	coder->out_limit = 0;
 
+	// Determine if end marker is wanted:
+	//   - It is never used with LZMA2.
+	//   - It is always used with LZMA_FILTER_LZMA1 (unless
+	//     lzma_lzma_set_out_limit() is called later).
+	//   - LZMA_FILTER_LZMA1EXT has a flag for it in the options.
+	coder->use_eopm = (id == LZMA_FILTER_LZMA1);
+	if (id == LZMA_FILTER_LZMA1EXT) {
+		// Check if unsupported flags are present.
+		if (options->ext_flags & ~LZMA_LZMA1EXT_ALLOW_EOPM)
+			return LZMA_OPTIONS_ERROR;
+
+		coder->use_eopm = (options->ext_flags
+				& LZMA_LZMA1EXT_ALLOW_EOPM) != 0;
+
+		// TODO? As long as there are no filters that change the size
+		// of the data, it is enough to look at lzma_stream.total_in
+		// after encoding has been finished to know the uncompressed
+		// size of the LZMA1 stream. But in the future there could be
+		// filters that change the size of the data and then total_in
+		// doesn't work as the LZMA1 stream size might be different
+		// due to another filter in the chain. The problem is simple
+		// to solve: Add another flag to ext_flags and then set
+		// coder->uncomp_size_ptr to the address stored in
+		// lzma_options_lzma.reserved_ptr2 (or _ptr1).
+	}
+
 	set_lz_options(lz_options, options);
 
 	return lzma_lzma_encoder_reset(coder, options);
@@ -685,7 +715,7 @@ lzma_encoder_init(lzma_lz_encoder *lz, const lzma_allocator *allocator,
 	lz->code = &lzma_encode;
 	lz->set_out_limit = &lzma_lzma_set_out_limit;
 	return lzma_lzma_encoder_create(
-			&lz->coder, allocator, options, lz_options);
+			&lz->coder, allocator, id, options, lz_options);
 }
 
 
diff --git a/src/liblzma/lzma/lzma_encoder.h b/src/liblzma/lzma/lzma_encoder.h
index 6cfdf228..84d8c916 100644
--- a/src/liblzma/lzma/lzma_encoder.h
+++ b/src/liblzma/lzma/lzma_encoder.h
@@ -40,7 +40,8 @@ extern bool lzma_lzma_lclppb_encode(
 /// Initializes raw LZMA encoder; this is used by LZMA2.
 extern lzma_ret lzma_lzma_encoder_create(
 		void **coder_ptr, const lzma_allocator *allocator,
-		const lzma_options_lzma *options, lzma_lz_options *lz_options);
+		lzma_vli id, const lzma_options_lzma *options,
+		lzma_lz_options *lz_options);
 
 
 /// Resets an already initialized LZMA encoder; this is used by LZMA2.
diff --git a/src/liblzma/lzma/lzma_encoder_private.h b/src/liblzma/lzma/lzma_encoder_private.h
index 8960c52c..b228c577 100644
--- a/src/liblzma/lzma/lzma_encoder_private.h
+++ b/src/liblzma/lzma/lzma_encoder_private.h
@@ -111,6 +111,9 @@ struct lzma_lzma1_encoder_s {
 	/// have been written to the output buffer yet.
 	bool is_flushed;
 
+	/// True if end of payload marker will be written.
+	bool use_eopm;
+
 	uint32_t pos_mask;         ///< (1 << pos_bits) - 1
 	uint32_t literal_context_bits;
 	uint32_t literal_pos_mask;