7 files changed, 95 insertions, 39 deletions
diff --git a/doc/lzma-file-format.txt b/doc/lzma-file-format.txt
index 015b0fae..4865defd 100644
--- a/doc/lzma-file-format.txt
+++ b/doc/lzma-file-format.txt
@@ -40,7 +40,11 @@ The .lzma File Format
 
 0.2. Changes
 
-        Last modified: 2011-04-12 11:55+0300
+        Last modified: 2022-07-13 21:00+0300
+
+        Compared to the previous version (2011-04-12 11:55+0300)
+        the section 1.1.3 was modified to allow End of Payload Marker
+        with a known Uncompressed Size.
 
 
 1. File Format
@@ -129,7 +133,10 @@ The .lzma File Format
         Uncompressed Size is stored as unsigned 64-bit little endian
         integer. A special value of 0xFFFF_FFFF_FFFF_FFFF indicates
         that Uncompressed Size is unknown. End of Payload Marker (*)
-        is used if and only if Uncompressed Size is unknown.
+        is used if Uncompressed Size is unknown. End of Payload Marker
+        is allowed but rarely used if Uncompressed Size is known.
+        XZ Utils 5.2.5 and older don't support .lzma files that have
+        End of Payload Marker together with a known Uncompressed Size.
 
         XZ Utils rejects files whose Uncompressed Size field specifies
         a known size that is 256 GiB or more. This is to reject false
diff --git a/src/liblzma/common/alone_decoder.c b/src/liblzma/common/alone_decoder.c
index 239b230e..a3ea20a2 100644
--- a/src/liblzma/common/alone_decoder.c
+++ b/src/liblzma/common/alone_decoder.c
@@ -146,7 +146,7 @@ alone_decode(void *coder_ptr, const lzma_allocator *allocator,
 
 		// Use a hack to set the uncompressed size.
 		lzma_lz_decoder_uncompressed(coder->next.coder,
-				coder->uncompressed_size);
+				coder->uncompressed_size, true);
 
 		coder->sequence = SEQ_CODE;
 		break;
diff --git a/src/liblzma/common/microlzma_decoder.c b/src/liblzma/common/microlzma_decoder.c
index 37907109..d6337816 100644
--- a/src/liblzma/common/microlzma_decoder.c
+++ b/src/liblzma/common/microlzma_decoder.c
@@ -108,7 +108,7 @@ microlzma_decode(void *coder_ptr, const lzma_allocator *allocator,
 		// Use a hack to set the uncompressed size.
 		if (coder->uncomp_size_is_exact)
 			lzma_lz_decoder_uncompressed(coder->lzma.coder,
-					coder->uncomp_size);
+					coder->uncomp_size, false);
 
 		// Pass one dummy 0x00 byte to the LZMA decoder since that
 		// is what it expects the first byte to be.
diff --git a/src/liblzma/lz/lz_decoder.c b/src/liblzma/lz/lz_decoder.c
index 09b57438..ab6af0dd 100644
--- a/src/liblzma/lz/lz_decoder.c
+++ b/src/liblzma/lz/lz_decoder.c
@@ -304,8 +304,14 @@ lzma_lz_decoder_memusage(size_t dictionary_size)
 
 
 extern void
-lzma_lz_decoder_uncompressed(void *coder_ptr, lzma_vli uncompressed_size)
+lzma_lz_decoder_uncompressed(void *coder_ptr, lzma_vli uncompressed_size,
+		bool allow_eopm)
 {
 	lzma_coder *coder = coder_ptr;
-	coder->lz.set_uncompressed(coder->lz.coder, uncompressed_size);
+
+	if (uncompressed_size == LZMA_VLI_UNKNOWN)
+		allow_eopm = true;
+
+	coder->lz.set_uncompressed(coder->lz.coder, uncompressed_size,
+			allow_eopm);
 }
diff --git a/src/liblzma/lz/lz_decoder.h b/src/liblzma/lz/lz_decoder.h
index 754ccf37..e6d7ab2a 100644
--- a/src/liblzma/lz/lz_decoder.h
+++ b/src/liblzma/lz/lz_decoder.h
@@ -62,8 +62,10 @@ typedef struct {
 
 	void (*reset)(void *coder, const void *options);
 
-	/// Set the uncompressed size
-	void (*set_uncompressed)(void *coder, lzma_vli uncompressed_size);
+	/// Set the uncompressed size. If uncompressed_size == LZMA_VLI_UNKNOWN
+	/// then allow_eopm will always be true.
+	void (*set_uncompressed)(void *coder, lzma_vli uncompressed_size,
+			bool allow_eopm);
 
 	/// Free allocated resources
 	void (*end)(void *coder, const lzma_allocator *allocator);
@@ -91,7 +93,7 @@ extern lzma_ret lzma_lz_decoder_init(lzma_next_coder *next,
 extern uint64_t lzma_lz_decoder_memusage(size_t dictionary_size);
 
 extern void lzma_lz_decoder_uncompressed(
-		void *coder, lzma_vli uncompressed_size);
+		void *coder, lzma_vli uncompressed_size, bool allow_eopm);
 
 
 //////////////////////
diff --git a/src/liblzma/lzma/lzma2_decoder.c b/src/liblzma/lzma/lzma2_decoder.c
index cf1b5110..105a28dc 100644
--- a/src/liblzma/lzma/lzma2_decoder.c
+++ b/src/liblzma/lzma/lzma2_decoder.c
@@ -139,7 +139,7 @@ lzma2_decode(void *coder_ptr, lzma_dict *restrict dict,
 		coder->uncompressed_size += in[(*in_pos)++] + 1U;
 		coder->sequence = SEQ_COMPRESSED_0;
 		coder->lzma.set_uncompressed(coder->lzma.coder,
-				coder->uncompressed_size);
+				coder->uncompressed_size, false);
 		break;
 
 	case SEQ_COMPRESSED_0:
diff --git a/src/liblzma/lzma/lzma_decoder.c b/src/liblzma/lzma/lzma_decoder.c
index e605a0a9..140a44f1 100644
--- a/src/liblzma/lzma/lzma_decoder.c
+++ b/src/liblzma/lzma/lzma_decoder.c
@@ -238,6 +238,11 @@ typedef struct {
 	/// payload marker is expected.
 	lzma_vli uncompressed_size;
 
+	/// True if end of payload marker (EOPM) is allowed even when
+	/// uncompressed_size is known; false if EOPM must not be present.
+	/// This is ignored if uncompressed_size == LZMA_VLI_UNKNOWN.
+	bool allow_eopm;
+
 	////////////////////////////////
 	// State of incomplete symbol //
 	////////////////////////////////
@@ -343,12 +348,19 @@ lzma_decode(void *coder_ptr, lzma_dict *restrict dictptr,
 
 	lzma_ret ret = LZMA_OK;
 
-	// If uncompressed size is known, there must be no end of payload
-	// marker.
-	const bool no_eopm = coder->uncompressed_size
-			!= LZMA_VLI_UNKNOWN;
-	if (no_eopm && coder->uncompressed_size < dict.limit - dict.pos)
+	// EOPM is always required (not just allowed) when
+	// the uncompressed size isn't known.
+	bool eopm_allowed = coder->uncompressed_size == LZMA_VLI_UNKNOWN;
+
+	// If uncompressed size is known and there is enough output space
+	// to decode all the data, limit the available buffer space so that
+	// the main loop won't try to decode past the end of the stream.
+	bool might_finish_without_eopm = false;
+	if (coder->uncompressed_size != LZMA_VLI_UNKNOWN
+			&& coder->uncompressed_size <= dict.limit - dict.pos) {
 		dict.limit = dict.pos + (size_t)(coder->uncompressed_size);
+		might_finish_without_eopm = true;
+	}
 
 	// The main decoder loop. The "switch" is used to restart the decoder at
 	// correct location. Once restarted, the "switch" is no longer used.
@@ -361,8 +373,32 @@ lzma_decode(void *coder_ptr, lzma_dict *restrict dictptr,
 
 	case SEQ_NORMALIZE:
 	case SEQ_IS_MATCH:
-		if (unlikely(no_eopm && dict.pos == dict.limit))
-			break;
+		if (unlikely(might_finish_without_eopm
+				&& dict.pos == dict.limit)) {
+			// In rare cases there is a useless byte that needs
+			// to be read anyway.
+			rc_normalize(SEQ_NORMALIZE);
+
+			// If the range decoder state is such that we can
+			// be at the end of the LZMA stream, then the
+			// decoding is finished.
+			if (rc_is_finished(rc)) {
+				ret = LZMA_STREAM_END;
+				goto out;
+			}
+
+			// If the caller hasn't allowed EOPM to be present
+			// together with known uncompressed size, then the
+			// LZMA stream is corrupt.
+			if (!coder->allow_eopm) {
+				ret = LZMA_DATA_ERROR;
+				goto out;
+			}
+
+			// Otherwise continue decoding with the expectation
+			// that the next LZMA symbol is EOPM.
+			eopm_allowed = true;
+		}
 
 		rc_if_0(coder->is_match[state][pos_state], SEQ_IS_MATCH) {
 			rc_update_0(coder->is_match[state][pos_state]);
@@ -658,11 +694,18 @@ lzma_decode(void *coder_ptr, lzma_dict *restrict dictptr,
 
 					if (rep0 == UINT32_MAX) {
 						// End of payload marker was
-						// found. It must not be
-						// present if uncompressed
-						// size is known.
-						if (coder->uncompressed_size
-						!= LZMA_VLI_UNKNOWN) {
+						// found. It may only be
+						// present if
+						//   - uncompressed size is
+						//     unknown or
+						//   - after known uncompressed
+						//     size amount of bytes has
+						//     been decompressed and
+						//     caller has indicated
+						//     that EOPM might be used
+						//     (it's not allowed in
+						//     LZMA2).
+						if (!eopm_allowed) {
 							ret = LZMA_DATA_ERROR;
 							goto out;
 						}
@@ -671,7 +714,9 @@ lzma_decode(void *coder_ptr, lzma_dict *restrict dictptr,
 						// LZMA1 stream with
 						// end-of-payload marker.
 						rc_normalize(SEQ_EOPM);
-						ret = LZMA_STREAM_END;
+						ret = rc_is_finished(rc)
+							? LZMA_STREAM_END
+							: LZMA_DATA_ERROR;
 						goto out;
 					}
 				}
@@ -793,9 +838,6 @@ lzma_decode(void *coder_ptr, lzma_dict *restrict dictptr,
 		}
 	}
 
-	rc_normalize(SEQ_NORMALIZE);
-	coder->sequence = SEQ_IS_MATCH;
-
 out:
 	// Save state
 
@@ -822,24 +864,21 @@ out:
 	if (coder->uncompressed_size != LZMA_VLI_UNKNOWN) {
 		coder->uncompressed_size -= dict.pos - dict_start;
 
-		// Since there cannot be end of payload marker if the
-		// uncompressed size was known, we check here if we
-		// finished decoding.
+		// If we have gotten all the output but the decoder wants
+		// to write more output, the file is corrupt. There are
+		// three SEQ values where output is produced.
 		if (coder->uncompressed_size == 0 && ret == LZMA_OK
-				&& coder->sequence != SEQ_NORMALIZE)
-			ret = coder->sequence == SEQ_IS_MATCH
-					? LZMA_STREAM_END : LZMA_DATA_ERROR;
+				&& (coder->sequence == SEQ_LITERAL_WRITE
+					|| coder->sequence == SEQ_SHORTREP
+					|| coder->sequence == SEQ_COPY))
+			ret = LZMA_DATA_ERROR;
 	}
 
-	// We can do an additional check in the range decoder to catch some
-	// corrupted files.
 	if (ret == LZMA_STREAM_END) {
-		if (!rc_is_finished(coder->rc))
-			ret = LZMA_DATA_ERROR;
-
 		// Reset the range decoder so that it is ready to reinitialize
 		// for a new LZMA2 chunk.
 		rc_reset(coder->rc);
+		coder->sequence = SEQ_IS_MATCH;
 	}
 
 	return ret;
@@ -848,10 +887,12 @@ out:
 
 
 static void
-lzma_decoder_uncompressed(void *coder_ptr, lzma_vli uncompressed_size)
+lzma_decoder_uncompressed(void *coder_ptr, lzma_vli uncompressed_size,
+		bool allow_eopm)
 {
 	lzma_lzma1_decoder *coder = coder_ptr;
 	coder->uncompressed_size = uncompressed_size;
+	coder->allow_eopm = allow_eopm;
 }
 
 
@@ -977,7 +1018,7 @@ lzma_decoder_init(lzma_lz_decoder *lz, const lzma_allocator *allocator,
 			lz, allocator, options, lz_options));
 
 	lzma_decoder_reset(lz->coder, options);
-	lzma_decoder_uncompressed(lz->coder, LZMA_VLI_UNKNOWN);
+	lzma_decoder_uncompressed(lz->coder, LZMA_VLI_UNKNOWN, true);
 
 	return LZMA_OK;
 }