aboutsummaryrefslogtreecommitdiff
path: root/doc/file-format.txt
blob: 2c8cd48648ec160b438d6b57e118a406896ff7a4 (plain) (blame)
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
496
497
498
499
500
501
502
503
504
505
506
507
508
509
510
511
512
513
514
515
516
517
518
519
520
521
522
523
524
525
526
527
528
529
530
531
532
533
534
535
536
537
538
539
540
541
542
543
544
545
546
547
548
549
550
551
552
553
554
555
556
557
558
559
560
561
562
563
564
565
566
567
568
569
570
571
572
573
574
575
576
577
578
579
580
581
582
583
584
585
586
587
588
589
590
591
592
593
594
595
596
597
598
599
600
601
602
603
604
605
606
607
608
609
610
611
612
613
614
615
616
617
618
619
620
621
622
623
624
625
626
627
628
629
630
631
632
633
634
635
636
637
638
639
640
641
642
643
644
645
646
647
648
649
650
651
652
653
654
655
656
657
658
659
660
661
662
663
664
665
666
667
668
669
670
671
672
673
674
675
676
677
678
679
680
681
682
683
684
685
686
687
688
689
690
691
692
693
694
695
696
697
698
699
700
701
702
703
704
705
706
707
708
709
710
711
712
713
714
715
716
717
718
719
720
721
722
723
724
725
726
727
728
729
730
731
732
733
734
735
736
737
738
739
740
741
742
743
744
745
746
747
748
749
750
751
752
753
754
755
756
757
758
759
760
761
762
763
764
765
766
767
768
769
770
771
772
773
774
775
776
777
778
779
780
781
782
783
784
785
786
787
788
789
790
791
792
793
794
795
796
797
798
799
800
801
802
803
804
805
806
807
808
809
810
811
812
813
814
815
816
817
818
819
820
821
822
823
824
825
826
827
828
829
830
831
832
833
834
835
836
837
838
839
840
841
842
843
844
845
846
847
848
849
850
851
852
853
854
855
856
857
858
859
860
861
862
863
864
865
866
867
868
869
870
871
872
873
874
875
876
877
878
879
880
881
882
883
884
885
886
887
888
889
890
891
892
893
894
895
896
897
898
899
900
901
902
903
904
905
906
907
908
909
910
911
912
913
914
915
916
917
918
919
920
921
922
923
924
925
926
927
928
929
930
931
932
933
934
935
936
937
938
939
940
941
942
943
944
945
946
947
948
949
950
951
952
953
954
955
956
957
958
959
960
961
962
963
964
965
966
967
968
969
970
971
972
973
974
975
976
977
978
979
980
981
982
983
984
985
986
987
988
989
990
991
992
993
994
995
996
997
998
999
1000
1001
1002
1003
1004
1005
1006
1007
1008
1009
1010
1011
1012
1013
1014
1015
1016
1017
1018
1019
1020
1021
1022
1023
1024
1025
1026
1027
1028
1029
1030
1031
1032
1033
1034
1035
1036
1037
1038
1039
1040
1041
1042
1043
1044
1045
1046
1047
1048
1049
1050
1051
1052
1053
1054
1055
1056
1057
1058
1059
1060
1061
1062
1063
1064
1065
1066
1067
1068
1069
1070
1071
1072
1073
1074
1075
1076
1077
1078
1079
1080
1081
1082
1083
1084
1085
1086
1087
1088
1089
1090
1091
1092
1093
1094
1095
1096
1097
1098
1099
1100
1101
1102
1103
1104
1105
1106
1107
1108
1109
1110
1111
1112
1113
1114
1115
1116
1117
1118
1119
1120
1121
1122
1123
1124
1125
1126
1127
1128
1129
1130
1131
1132
1133
1134
1135
1136
1137
1138
1139
1140
1141
1142
1143
1144
1145
1146
1147
1148
1149
1150
1151
1152
1153
1154
1155
1156
1157
1158
1159
1160
1161
1162
1163
1164
1165
1166
1167
1168
1169
1170
1171
1172
1173
1174
1175
1176
1177
1178
1179
1180
1181
1182
1183
1184
1185
1186
1187
1188
1189
1190
1191
1192
1193
1194
1195
1196
1197
1198
1199
1200
1201
1202
1203
1204
1205
1206
1207
1208
1209
1210
1211
1212
1213
1214
1215
1216
1217
1218
1219
1220
1221
1222
1223
1224
1225
1226
1227
1228
1229
1230
1231
1232
1233
1234
1235
1236
1237
1238
1239
1240
1241
1242
1243
1244
1245
1246
1247
1248
1249
1250
1251
1252
1253
1254
1255
1256
1257
1258
1259
1260
1261
1262
1263
1264
1265
1266
1267
1268
1269
1270
1271
1272
1273
1274
1275
1276
1277
1278
1279
1280
1281
1282
1283
1284
1285
1286
1287
1288
1289
1290
1291
1292
1293
1294
1295
1296
1297
1298
1299
1300
1301
1302
1303
1304
1305
1306
1307
1308
1309
1310
1311
1312
1313
1314
1315
1316
1317
1318
1319
1320
1321
1322
1323
1324
1325
1326
1327
1328
1329
1330
1331
1332
1333
1334
1335
1336
1337
1338
1339
1340
1341
1342
1343
1344
1345
1346
1347
1348
1349
1350
1351
1352
1353
1354
1355
1356
1357
1358
1359
1360
1361
1362
1363
1364
1365
1366
1367
1368
1369
1370
1371
1372
1373
1374
1375
1376
1377
1378
1379
1380
1381
1382
1383
1384
1385
1386
1387
1388
1389
1390
1391
1392
1393
1394
1395
1396
1397
1398
1399
1400
1401
1402
1403
1404
1405
1406
1407
1408
1409
1410
1411
1412
1413
1414
1415
1416
1417
1418
1419
1420
1421
1422
1423
1424
1425
1426
1427
1428
1429
1430
1431
1432
1433
1434
1435
1436
1437
1438
1439
1440
1441
1442
1443
1444
1445
1446
1447
1448
1449
1450
1451
1452
1453
1454
1455
1456
1457
1458
1459
1460
1461
1462
1463
1464
1465
1466
1467
1468
1469
1470
1471
1472
1473
1474
1475
1476
1477
1478
1479
1480
1481
1482
1483
1484
1485
1486
1487
1488
1489
1490
1491
1492
1493
1494
1495
1496
1497
1498
1499
1500
1501
1502
1503
1504
1505
1506
1507
1508
1509
1510
1511
1512
1513
1514
1515
1516
1517
1518
1519
1520
1521
1522
1523
1524
1525
1526
1527
1528
1529
1530
1531
1532
1533
1534
1535
1536
1537
1538
1539
1540
1541
1542
1543
1544
1545
1546
1547
1548
1549
1550
1551
1552
1553
1554
1555
1556
1557
1558
1559
1560
1561
1562
1563
1564
1565
1566
1567
1568
1569
1570
1571
1572
1573
1574
1575
1576
1577
1578
1579
1580
1581
1582
1583
1584
1585
1586
1587
1588
1589
1590
1591
1592
1593
1594
1595
1596
1597
1598
1599
1600
1601
1602
1603
1604
1605
1606
1607
1608
1609
1610
1611
1612
1613
1614
1615
1616
1617
1618
1619
1620
1621
1622
1623
1624
1625
1626
1627
1628
1629
1630
1631
1632
1633
1634
1635
1636
1637
1638
1639
1640
1641
1642
1643
1644
1645
1646
1647
1648
1649
1650
1651
1652
1653
1654
1655
1656
1657
1658
1659
1660
1661
1662
1663
1664
1665
1666
1667
1668
1669
1670
1671
1672
1673
1674
1675
1676
1677
1678
1679
1680
1681
1682
1683
1684
1685
1686
1687
1688
1689
1690
1691
1692
1693
1694
1695
1696
1697
1698
1699
1700
1701
1702
1703
1704
1705
1706
1707
1708
1709
1710
1711
1712
1713
1714
1715
1716
1717
1718
1719
1720
1721
1722
1723
1724
1725
1726
1727
1728
1729
1730
1731
1732
1733
1734
1735
1736
1737
1738
1739
1740
1741
1742
1743
1744
1745
1746
1747
1748
1749
1750
1751
1752
1753
1754
1755
1756
1757
1758
1759
1760
1761
1762
1763
1764
1765
1766
1767
1768
1769
1770
1771
1772
1773
1774
1775
1776
1777
1778
1779
1780
1781
1782
1783
1784
1785
1786
1787
1788
1789
1790
1791
1792
1793
1794
1795
1796
1797
1798
1799
1800
1801
1802
1803
1804
1805
1806
1807
1808
1809
1810
1811
1812
1813
1814
1815
1816
1817
1818
1819
1820
1821
1822
1823
1824
1825
1826
1827
1828
1829
1830
1831
1832
1833
1834
1835
1836
1837
1838
1839
1840
1841
1842
1843
1844
1845
1846
1847
1848
1849
1850
1851
1852
1853
1854
1855
1856
1857
1858
1859
1860
1861
The .lzma File Format
---------------------

        0. Preface
          0.1. Copyright Notices
          0.2. Changes
        1. Conventions
          1.1. Byte and Its Representation
          1.2. Multibyte Integers
        2. Stream
          2.1. Stream Types
            2.1.1. Single-Block Stream
            2.1.2. Multi-Block Stream
          2.2. Stream Header
            2.2.1. Header Magic Bytes
            2.2.2. Stream Flags
            2.2.3. CRC32
        3. Block
          3.1. Block Header
            3.1.1. Block Flags
            3.1.2. Compressed Size
            3.1.3. Uncompressed Size
            3.1.4. List of Filter Flags
              3.1.4.1. Misc
              3.1.4.2. External ID
              3.1.4.3. External Size of Properties
              3.1.4.4. Filter Properties
            3.1.5. CRC32
            3.1.6. Header Padding
          3.2. Compressed Data
          3.3. Block Footer
            3.3.1. Check
            3.3.2. Stream Footer
              3.3.2.1. Uncompressed Size
              3.3.2.2. Backward Size
              3.3.2.3. Stream Flags
              3.3.2.4. Footer Magic Bytes
            3.3.3. Footer Padding
        4. Filters
          4.1. Detecting when All Data Has Been Decoded
            4.1.1. With Uncompressed Size
            4.1.2. With End of Input
            4.1.3. With End of Payload Marker
          4.2. Alignment
          4.3. Filters
            4.3.1. Copy
            4.3.2. Subblock
              4.3.2.1. Format of the Encoded Output
            4.3.3. Delta
              4.3.3.1. Format of the Encoded Output
            4.3.4. LZMA
              4.3.4.1. LZMA Properties
              4.3.4.2. Dictionary Flags
            4.3.5. Branch/Call/Jump Filters for Executables
        5. Metadata
          5.1. Metadata Flags
          5.2. Size of Header Metadata Block
          5.3. Total Size
          5.4. Uncompressed Size
          5.5. Index
            5.5.1. Number of Data Blocks
            5.5.2. Total Sizes
            5.5.3. Uncompressed Sizes
          5.6. Extra
            5.6.1. 0x00: Dummy/Padding
            5.6.2. 0x01: OpenPGP Signature
            5.6.3. 0x02: Filter Information
            5.6.4. 0x03: Comment
            5.6.5. 0x04: List of Checks
            5.6.6. 0x05: Original Filename
            5.6.7. 0x07: Modification Time
            5.6.8. 0x09: High-Resolution Modification Time
            5.6.9. 0x0B: MIME Type
            5.6.10. 0x0D: Homepage URL
        6. Custom Filter and Extra Record IDs
          6.1. Reserved Custom Filter ID Ranges
        7. Cyclic Redundancy Checks
        8. References
          8.1. Normative References
          8.2. Informative References


0. Preface

        This document describes the .lzma file format (filename suffix
        `.lzma', MIME type `application/x-lzma'). It is intended that
        this format replace the format used by the LZMA_Alone tool
        included in LZMA SDK up to and including version 4.57.

        IMPORTANT:  The version described in this document is a
                    draft, NOT a final, official version. Changes
                    are possible.


0.1. Copyright Notices

        Copyright (C) 2006, 2007 Lasse Collin <lasse.collin@tukaani.org>
        Copyright (C) 2006 Ville Koskinen <w-ber@iki.fi>

        Copying and distribution of this file, with or without
        modification, are permitted in any medium without royalty
        provided the copyright notice and this notice are preserved.
        Modified versions must be marked as such.

        All source code examples given in this document are put into
        the public domain by the authors of this document.

        Thanks for helping with this document goes to Igor Pavlov,
        Mark Adler and Mikko Pouru.


0.2. Changes

        Last modified: 2008-02-01 19:25+0200

        (A changelog will be kept once the first official version
        is made.)


1. Conventions

        The keywords `must', `must not', `required', `should',
        `should not', `recommended', `may', and `optional' in this
        document are to be interpreted as described in [RFC-2119].
        These words are not capitalized in this document.

        Indicating a warning means displaying a message, returning
        appropriate exit status, or something else to let the user
        know that something worth warning occurred. The operation
        should still finish if a warning is indicated.

        Indicating an error means displaying a message, returning
        appropriate exit status, or something else to let the user
        know that something prevented successfully finishing the
        operation. The operation must be aborted once an error has
        been indicated.


1.1. Byte and Its Representation

        In this document, byte is always 8 bits.

        A `nul byte' has all bits unset. That is, the value of a nul
        byte is 0x00.

        To represent byte blocks, this document uses notation that
        is similar to the notation used in [RFC-1952]:

            +-------+
            |  Foo  |   One byte.
            +-------+

            +---+---+
            |  Foo  |   Two bytes; that is, some of the vertical bars
            +---+---+   can be missing.

            +=======+
            |  Foo  |   Zero or more bytes.
            +=======+

        In this document, a boxed byte or a byte sequence declared
        using this notation is called `a field'. The example field
        above would be called called `the Foo field' or plain `Foo'.


1.2. Multibyte Integers

        Multibyte integers of static length, such as CRC values,
        are stored in little endian byte order (least significant
        byte first).

        When smaller values are more likely than bigger values (e.g.
        file sizes), multibyte integers are encoded in a simple
        variable-length representation:
          - Numbers in the range [0, 127] are copied as is, and take
            one byte of space.
          - Bigger numbers will occupy two or more bytes. The lowest
            seven bits of every byte are used for data; the highest
            (eighth) bit indicates either that
              0) the byte is in the middle of the byte sequence, or
              1) the byte is the first or the last byte.

        For now, the value of the variable-length integers is limited
        to 63 bits, which limits the encoded size of the integer to
        nine bytes. These limits may be increased in future if needed.

        Note that the encoding is not as optimal as it could be. For
        example, it is possible to encode the number 42 using any
        number of bytes between one and nine. This is convenient
        for non-streamed encoders, that write Compressed Size or
        Uncompressed Size fields to the Block Header (see Section 3.1)
        after the Compressed Data field is written to the disk.

        In several situations, the decoder needs to compare that two
        fields contain identical information. When comparing fields
        using the encoding described in this Section, the decoder must
        consider two fields identical if their decoded values are
        identical; it does not matter if the encoded variable-length
        representations differ.

        The following C code illustrates encoding and decoding 63-bit
        variables; the highest bit of uint64_t must be unset. The
        functions return the number of bytes occupied by the integer
        (1-9), or zero on error.

            #include <sys/types.h>
            #include <inttypes.h>

            size_t
            encode(uint8_t buf[static 9], uint64_t num)
            {
                if (num >= (UINT64_C(1) << (9 * 7)))
                    return 0;
                if (num <= 0x7F) {
                    buf[0] = num;
                    return 1;
                }
                buf[0] = (num & 0x7F) | 0x80;
                num >>= 7;
                size_t i = 1;
                while (num >= 0x80) {
                    buf[i++] = num & 0x7F;
                    num >>= 7;
                }
                buf[i++] = num | 0x80;
                return i;
            }

            size_t
            decode(const uint8_t buf[], size_t size_max, uint64_t *num)
            {
                if (size_max == 0)
                    return 0;
                if (size_max > 9)
                    size_max = 9;
                *num = buf[0] & 0x7F;
                if (!(buf[0] & 0x80))
                    return 1;
                size_t i = 1;
                do {
                    if (i == size_max)
                        return 0;
                    *num |= (uint64_t)(buf[i] & 0x7F) << (7 * i);
                } while (!(buf[i++] & 0x80));
                return i;
            }

            size_t
            decode_reverse(const uint8_t buf[], size_t size_max,
                    uint64_t *num)
            {
                if (size_max == 0)
                    return 0;
                const size_t end = size_max > 9 ? size_max - 9 : 0;
                size_t i = size_max - 1;
                *num = buf[i] & 0x7F;
                if (!(buf[i] & 0x80))
                    return 1;
                do {
                    if (i-- == end)
                        return 0;
                    *num <<= 7;
                    *num |= buf[i] & 0x7F;
                } while (!(buf[i] & 0x80));
                return size_max - i;
            }


2. Stream

        +========+========+========+
        | Stream | Stream | Stream | ...
        +========+========+========+

        A file contains usually only one Stream. However, it is
        possible to concatenate multiple Streams together with no
        additional processing. It is up to the implementation to
        decide if the decoder will continue decoding from the next
        Stream once the end of the first Stream has been reached.


2.1. Stream Types

        There are two types of Streams: Single-Block Streams and
        Multi-Block Streams. Decoders conforming to this specification
        must support at least Single-Block Streams. Supporting
        Multi-Block Streams is optional. If the decoder supports only
        Single-Block Streams, the documentation of the decoder should
        mention this fact clearly.


2.1.1. Single-Block Stream

        +===============+============+
        | Stream Header | Data Block |
        +===============+============+

        As the name says, a Single-Block Stream has exactly one Block.
        The Block must be a Data Block; Metadata Blocks are not allowed
        in Single-Block Streams.


2.1.2. Multi-Block Stream

        +===============+=======================+
        | Stream Header | Header Metadata Block |
        +===============+=======================+

             +============+     +============+=======================+
        ---> | Data Block | ... | Data Block | Footer Metadata Block |
             +============+     +============+=======================+

        Notes:
          - Stream Header is mandatory.
          - Header Metadata Block is optional.
          - Each Multi-Block Stream has at least one Data Block. The
            maximum number of Data Blocks is not limited.
          - Footer Metadata Block is mandatory.


2.2. Stream Header

        +---+---+---+---+---+---+--------------+--+--+--+--+
        |  Header Magic Bytes   | Stream Flags |   CRC32   |
        +---+---+---+---+---+---+--------------+--+--+--+--+


2.2.1. Header Magic Bytes

        The first six (6) bytes of the Stream are so called Header
        Magic Bytes. They can be used to identify the file type.

            Using a C array and ASCII:
            const uint8_t HEADER_MAGIC[6]
                    = { 0xFF, 'L', 'Z', 'M', 'A', 0x00 };

            In plain hexadecimal:
            FF 4C 5A 4D 41 00

        Notes:
          - The first byte (0xFF) was chosen so that the files cannot
            be erroneously detected as being in LZMA_Alone format, in
            which the first byte is in the the range [0x00, 0xE0].
          - The sixth byte (0x00) was chosen to prevent applications
            from misdetecting the file as a text file.


2.2.2. Stream Flags

        Bit(s)  Mask  Description
         0-2    0x07  Type of Check (see Section 3.3.1):
                          ID    Size      Check name
                          0x00   0 bytes  None
                          0x01   4 bytes  CRC32
                          0x02   4 bytes  (Reserved)
                          0x03   8 bytes  CRC64
                          0x04  16 bytes  (Reserved)
                          0x05  32 bytes  SHA-256
                          0x06  32 bytes  (Reserved)
                          0x07  64 bytes  (Reserved)
          3     0x08  The CRC32 field is present in Block Headers.
          4     0x10  If unset, this is a Single-Block Stream; if set,
                      this is a Multi-Block Stream.
         5-7    0xE0  Reserved for future use; must be zero for now.

        Implementations must support at least the Check IDs 0x00 (None)
        and 0x01 (CRC32). Supporting other Check IDs is optional. If an
        unsupported Check is used, the decoder must indicate a warning
        or error.

        If any reserved bit is set, the decoder must indicate an error.
        It is possible that there is a new field present which the
        decoder is not aware of, and can thus parse the Stream Header
        incorrectly.


2.2.3. CRC32

        The CRC32 is calculated from the Stream Flags field. It is
        stored as an unsigned 32-bit little endian integer. If the
        calculated value does not match the stored one, the decoder
        must indicate an error.

        Note that this field is always present; the bit in Stream Flags
        controls only presence of CRC32 in Block Headers.


3. Block

        +==============+=================+==============+
        | Block Header | Compressed Data | Block Footer |
        +==============+=================+==============+

        There are two types of Blocks:
          - Data Blocks hold the actual compressed data.
          - Metadata Blocks hold the Index, Extra, and a few other
            non-data fields (see Section 5).

        The type of the Block is indicated by the corresponding bit
        in the Block Flags field (see Section 3.1.1).


3.1. Block Header

        +------+------+=================+===================+
        | Block Flags | Compressed Size | Uncompressed Size |
        +------+------+=================+===================+

             +======================+--+--+--+--+================+
        ---> | List of Filter Flags |   CRC32   | Header Padding |
             +======================+--+--+--+--+================+


3.1.1. Block Flags

        The first byte of the Block Flags field is a bit field:

            Bit(s)  Mask  Description
             0-2    0x07  Number of filters (0-7)
              3     0x08  Use End of Payload Marker (even if
                          Uncompressed Size is stored to Block Header).
              4     0x10  The Compressed Size field is present.
              5     0x20  The Uncompressed Size field is present.
              6     0x40  Reserved for future use; must be zero for now.
              7     0x80  This is a Metadata Block.

        The second byte of the Block Flags field is also a bit field:

            Bit(s)  Mask  Description
             0-4    0x1F  Size of the Header Padding field (0-31 bytes)
             5-7    0xE0  Reserved for future use; must be zero for now.

        The decoder must indicate an error if End of Payload Marker
        is not used and Uncompressed Size is not stored to the Block
        Header. Because of this, the first byte of Block Flags can
        never be a nul byte. This is useful when detecting beginning
        of the Block after Footer Padding (see Section 3.3.3).

        If any reserved bit is set, the decoder must indicate an error.
        It is possible that there is a new field present which the
        decoder is not aware of, and can thus parse the Block Header
        incorrectly.


3.1.2. Compressed Size

        This field is present only if the appropriate bit is set in
        the Block Flags field (see Section 3.1.1).

        This field contains the size of the Compressed Data field.
        The size is stored using the encoding described in Section 1.2.
        If the Compressed Size does not match the real size of the
        Compressed Data field, the decoder must indicate an error.

        Having the Compressed Size field in the Block Header can be
        useful for multithreaded decoding when seeking is not possible.
        If the Blocks are small enough, the decoder can read multiple
        Blocks into its internal buffer, and decode the Blocks in
        parallel.

        Compressed Size can also be useful when seeking forwards to
        a specific location in streamed mode: the decoder can quickly
        skip over irrelevant Blocks, without decoding them.


3.1.3. Uncompressed Size

        This field is present only if the appropriate bit is set in
        the Block Flags field (see Section 3.1.1).

        The Uncompressed Size field contains the size of the Block
        after uncompressing.

        Storing Uncompressed Size serves several purposes:
          - The decoder will know when all of the data has been
            decoded without an explicit End of Payload Marker.
          - The decoder knows how much memory it needs to allocate
            for a temporary buffer in multithreaded mode.
          - Simple error detection: wrong size indicates a broken file.
          - Sometimes it is useful to know the file size without
            uncompressing the file.

        It should be noted that the only reliable way to find out what
        the real uncompressed size is is to uncompress the Block,
        because the Block Header and Metadata Block fields may contain
        (intentionally or unintentionally) invalid information.

        Uncompressed Size is stored using the encoding described in
        Section 1.2. If the Uncompressed Size does not match the
        real uncompressed size, the decoder must indicate an error.


3.1.4. List of Filter Flags

        +================+================+     +================+
        | Filter 0 Flags | Filter 1 Flags | ... | Filter n Flags |
        +================+================+     +================+

        The number of Filter Flags fields is stored in the Block Flags
        field (see Section 3.1.1). As a special case, if the number of
        Filter Flags fields is zero, it is equivalent to having the
        Copy filter as the only filter.

        The format of each Filter Flags field is as follows:

            +------+=============+=============================+
            | Misc | External ID | External Size of Properties |
            +------+=============+=============================+

                 +===================+
            ---> | Filter Properties |
                 +===================+

        The list of officially defined Filter IDs and the formats of
        their Filter Properties are described in Section 4.3.


3.1.4.1. Misc

        To save space, the most commonly used Filter IDs and the
        Size of Filter Properties are encoded in a single byte.
        Depending on the contents of the Misc field, Filter ID is
        the value of the Misc or External ID field.

            Value          Filter ID        Size of Filter Properties
            0x00 - 0x1F    Misc             0 bytes
            0x20 - 0x3F    Misc             1 byte
            0x40 - 0x5F    Misc             2 bytes
            0x60 - 0x7F    Misc             3 bytes
            0x80 - 0x9F    Misc             4 bytes
            0xA0 - 0xBF    Misc             5 bytes
            0xC0 - 0xDF    Misc             6 bytes
            0xE0 - 0xFE    External ID      0-30 bytes
            0xFF           External ID      External Size of Properties

        The following code demonstrates parsing the Misc field and,
        when needed, the External ID and External Size of Properties
        fields.

            uint64_t id;
            uint64_t properties_size;
            uint8_t misc = read_byte();

            if (misc >= 0xE0) {
                id = read_variable_length_integer();

                if (misc == 0xFF)
                    properties_size = read_variable_length_integer();
                else
                    properties_size = misc - 0xE0;

            } else {
                id = misc;
                properties_size = misc / 0x20;
            }


3.1.4.2. External ID

        This field is present only if the Misc field contains a value
        that indicates usage of External ID. The External ID is stored
        using the encoding described in Section 1.2.


3.1.4.3. External Size of Properties

        This field is present only if the Misc field contains a value
        that indicates usage of External Size of Properties. The size
        of Filter Properties is stored using the encoding described in
        Section 1.2.


3.1.4.4. Filter Properties

        Size of this field depends on the Misc field (Section 3.1.4.1)
        and, if present, External Size of Properties field (Section
        3.1.4.3). The format of this field is depends on the selected
        filter; see Section 4.3 for details.


3.1.5. CRC32

        This field is present only if the appropriate bit is set in
        the Stream Flags field (see Section 2.2.2).

        The CRC32 is calculated over everything in the Block Header
        field except the Header Padding field and the CRC32 field
        itself. It is stored as an unsigned 32-bit little endian
        integer. If the calculated value does not match the stored
        one, the decoder must indicate an error.


3.1.6. Header Padding

        This field contains as many nul bytes as indicated by the value
        stored in the Header Flags field. If the Header Padding field
        contains any non-nul bytes, the decoder must indicate an error.

        The intent of the Header Padding field is to allow alignment
        of Compressed Data. The usefulness of alignment is described
        in Section 4.3.


3.2. Compressed Data

        The format of Compressed Data depends on Block Flags and List
        of Filter Flags. Excluding the descriptions of the simplest
        filters in Section 4, the format of the filter-specific encoded
        data is out of scope of this document.

        Note a special case: if End of Payload Marker (see Section
        3.1.1) is not used and Uncompressed Size is zero, the size
        of the Compressed Data field is always zero.


3.3. Block Footer

        +=======+===============+================+
        | Check | Stream Footer | Footer Padding |
        +=======+===============+================+


3.3.1. Check

        The type and size of the Check field depends on which bits
        are set in the Stream Flags field (see Section 2.2.2).

        The Check, when used, is calculated from the original
        uncompressed data. If the calculated Check does not match the
        stored one, the decoder must indicate an error. If the selected
        type of Check is not supported by the decoder, it must indicate
        a warning or error.


3.3.2. Stream Footer

        +===================+===============+--------------+
        | Uncompressed Size | Backward Size | Stream Flags |
        +===================+===============+--------------+

             +----------+---------+
        ---> | Footer Magic Bytes |
             +----------+---------+

        Stream Footer is present only in
          - Data Block of a Single-Block Stream; and
          - Footer Metadata Block of a Multi-Block Stream.

        The Stream Footer field is placed inside Block Footer, because
        no padding is allowed between Check and Stream Footer.


3.3.2.1. Uncompressed Size

        This field is present only in the Data Block of a Single-Block
        Stream if Uncompressed Size is not stored to the Block Header
        (see Section 3.1.1). Without the Uncompressed Size field in
        Stream Footer it would not be possible to quickly find out
        the Uncompressed Size of the Stream in all cases.

        Uncompressed Size is stored using the encoding described in
        Section 1.2. If the stored value does not match the real
        uncompressed size of the Single-Block Stream, the decoder must
        indicate an error.


3.3.2.2. Backward Size

        This field contains the total size of the Block Header,
        Compressed Data, Check, and Uncompressed Size fields. The
        value is stored using the encoding described in Section 1.2.
        If the Backward Size does not match the real total size of
        the appropriate fields, the decoder must indicate an error.

        Implementations reading the Stream backwards should notice
        that the value in this field can never be zero.


3.3.2.3. Stream Flags

        This is a copy of the Stream Flags field from the Stream
        Header. The information stored to Stream Flags is needed
        when parsing the Stream backwards.


3.3.2.4. Footer Magic Bytes

        As the last step of the decoding process, the decoder must
        verify the existence of Footer Magic Bytes. If they are not
        found, an error must be indicated.

            Using a C array and ASCII:
            const uint8_t FOOTER_MAGIC[2] = { 'Y', 'Z' };

            In hexadecimal:
            59 5A

        The primary reason to have Footer Magic Bytes is to make
        it easier to detect incomplete files quickly, without
        uncompressing. If the file does not end with Footer Magic Bytes
        (excluding Footer Padding described in Section 3.3.3), it
        cannot be undamaged, unless someone has intentionally appended
        garbage after the end of the Stream. (Appending garbage at the
        end of the file does not prevent uncompressing the file, but
        may give a warning or error depending on the decoder
        implementation.)


3.3.3. Footer Padding

        In certain situations it is convenient to be able to pad
        Blocks or Streams to be multiples of, for example, 512 bytes.
        Footer Padding makes this possible. Note that this is in no
        way required to enforce alignment in the way described in
        Section 4.3; the Header Padding field is enough for that.

        When Footer Padding is used, it must contain only nul bytes.
        Any non-nul byte should be considered as the beginning of
        a new Block or Stream.

        The possibility of Padding should be taken into account when
        designing an application that wants to find out information
        about a Stream by parsing Footer Metadata Block.

        Support for Padding was inspired by a related note in
        [GNU-tar].


4. Filters

        The Block Flags field defines how many filters are used. When
        more than one filter is used, the filters are chained; that is,
        the output of one filter is the input of another filter. The
        following figure illustrates the direction of data flow.

                    v   Uncompressed Data   ^
                    |       Filter 0        |
            Encoder |       Filter 1        | Decoder
                    |         ...           |
                    |       Filter n        |
                    v    Compressed Data    ^

        The filters are independent from each other, except that they
        must cooperate a little to make it possible, in all cases, to
        detect when all of the data has been decoded. In addition, the
        filters should cooperate in the encoder to keep the alignment
        optimal.


4.1. Detecting when All Data Has Been Decoded

        There must be a way for the decoder to detect when all of the
        Compressed Data has been decoded. This is simple when only
        one filter is used, but a bit more complex when multiple
        filters are chained.

        This file format supports three methods to detect when all of
        the data has been decoded:
          - Uncompressed size
          - End of Input
          - End of Payload Marker

        In both encoder and decoder, filters are initialized starting
        from the first filter in the chain. For each filter, one of
        these three methods is used.


4.1.1. With Uncompressed Size

        This method is the only method supported by all filters.
        It must be used when uncompressed size is known by the
        filter-specific encoder or decoder. In practice this means
        that Uncompressed Size has been stored to the Block Header.

        In case of the first filter in the chain, the uncompressed size
        given to the filter-specific encoder or decoder equals the
        Uncompressed Size stored in the Block Header. For the rest of
        the filters in the chain, uncompressed size is the size of the
        output data of the previous filter in the chain.

        Note that when Use End of Payload Marker bit is set in Block
        Flags, Uncompressed Size is considered to be unknown even if
        it was present in the Block Header. Thus, if End of Payload
        Marker is used, uncompressed size of all of the filters in
        the chain is unknown, and can never be used to detect when
        all of the data has been decoded.

        Once the correct number of bytes has been written out, the
        filter-specific decoder indicates to its caller that all of
        the data has been decoded. If the filter-specific decoder
        detects End of Input or End of Payload Marker before the
        correct number of bytes is decoded, the decoder must indicate
        an error.


4.1.2. With End of Input

        Most filters will know that all of the data has been decoded
        when the End of Input data has been reached. Once the filter
        knows that it has received the input data in its entirety,
        it finishes its job, and indicates to its caller that all of
        the data has been decoded. The filter-specific decoder must
        indicate an error if it detects End of Payload Marker.

        Note that this method can work only when the filter is not
        the last filter in the chain, because only another filter
        can indicate the End of Input data. In practice this means,
        that a filter later in the chain must support embedding
        End of Payload Marker.

        When a filter that cannot embed End of Payload Marker is the
        last filter in the chain, Subblock filter is appended to the
        chain as an implicit filter. In the simplest case, this occurs
        when no filters are specified, and the End of Payload Marker
        bit is set in Block Flags.


4.1.3. With End of Payload Marker

        End of Payload Marker is a filter-specific bit sequence that
        indicates the end of data. It is supported by only a few
        filters. It is used when uncompressed size is unknown, and
        the filter
          - doesn't support End of Input; or
          - is the last filter in the chain.

        End of Payload Marker is embedded at the end of the encoded
        data by the filter-specific encoder. When the filter-specific
        decoder detects the embedded End of Payload Marker, the decoder
        knows that all of the data has been decoded. Then it finishes
        its job, and indicates to its caller that all of the data has
        been decoded. If the filter-specific decoder detects End of
        Input before End of Payload Marker, the decoder must indicate
        an error.

        If the filter supports both End of Input and End of Payload
        Marker, the former is used, unless the filter is the last
        filter in the chain.


4.2. Alignment

        Some filters give better compression ratio or are faster
        when the input or output data is aligned. For optimal results,
        the encoder should try to enforce proper alignment when
        possible. Not enforcing alignment in the encoder is not
        an error. Thus, the decoder must be able to handle files with
        suboptimal alignment.

        Alignment of uncompressed input data is usually the job of
        the application producing the data. For example, to get the
        best results, an archiver tool should make sure that all
        PowerPC executable files in the archive stream start at
        offsets that are multiples of four bytes.

        Some filters, for example LZMA, can be configured to take
        advantage of specified alignment of input data. Note that
        taking advantage of aligned input can be benefical also when
        a filter is not the first filter in the chain. For example,
        if you compress PowerPC executables, you may want to use the
        PowerPC filter and chain that with the LZMA filter. Because not
        only the input but also the output alignment of the PowerPC
        filter is four bytes, it is now benefical to set LZMA settings
        so that the LZMA encoder can take advantage of its
        four-byte-aligned input data.

        The output of the last filter in the chain is stored to the
        Compressed Data field. Aligning Compressed Data appropriately
        can increase
          - speed, if the filtered data is handled multiple bytes at
            a time by the filter-specific encoder and decoder,
            because accessing aligned data in computer memory is
            usually faster; and
          - compression ratio, if the output data is later compressed
            with an external compression tool.

        Compressed Data in a Stream can be aligned by using the Header
        Padding field in the Block Header.


4.3. Filters

4.3.1. Copy

        This is a dummy filter that simply copies all data from input
        to output unmodified.

            Filter ID:                  0x00
            Size of Filter Properties:  0 bytes
            Changes size of data:       No

            Detecting when all of the data has been decoded:
                Uncompressed size:      Yes
                End of Payload Marker:  No
                End of Input:           Yes

            Preferred alignment:
                Input data:             1 byte
                Output data:            1 byte


4.3.2. Subblock

        The Subblock filter can be used to
          - embed End of Payload Marker when the otherwise last
            filter in the chain does not support embedding it; and
          - apply additional filters in the middle of a Block.

            Filter ID:                  0x01
            Size of Filter Properties:  0 bytes
            Changes size of data:       Yes, unpredictably

            Detecting when all of the data has been decoded:
                Uncompressed size:      Yes
                End of Payload Marker:  Yes
                End of Input:           Yes

            Preferred alignment:
                Input data:             1 byte
                Output data:            Freely adjustable


4.3.2.1. Format of the Encoded Output

        The encoded data from the Subblock filter consist of zero or
        more Subblocks:

            +==========+==========+
            | Subblock | Subblock | ...
            +==========+==========+

        Each Subblock contains two fields:

            +----------------+===============+
            | Subblock Flags | Subblock Data |
            +----------------+===============+

        Subblock Flags is a bitfield:

            Bits   Mask   Description
            0-3    0x0F   The interpretation of these bits depend on
                          the Subblock Type:
                            - 0x20    Bits 0-3 for Size
                            - 0x30    Bits 0-3 for Repeat Count
                            - Other   These bits must be zero.
            4-7    0xF0   Subblock Type:
                            - 0x00: Padding
                            - 0x10: End of Payload Marker
                            - 0x20: Data
                            - 0x30: Repeating Data
                            - 0x40: Set Subfilter
                            - 0x50: Unset Subfilter
                          If some other value is detected, the decoder
                          must indicate an error.

        The format of the Subblock Data field depends on Subblock Type.

        Subblocks with the Subblock Type 0x00 (Padding) don't have a
        Subblock Data field. These Subblocks can be useful for fixing
        alignment. There can be at maximum of 31 consecutive Subblocks
        with this Subblock Type; if there are more, the decoder must
        indicate an error.

        Subblock with the Subblock Type 0x10 (End of Payload Marker)
        doesn't have a Subblock Data field. The decoder must indicate
        an error if this Subblock Type is detected when Subfilter is
        enabled, or when the Subblock filter is not supposed to embed
        the End of Payload Marker.

        Subblocks with the Subblock Type 0x20 (Data) contain the rest
        of the Size, which is followed by Size + 1 bytes in the Data
        field (that is, Data can never be empty):

            +------+------+------+======+
            | Bits 4-27 for Size | Data |
            +------+------+------+======+

        Subblocks with the Subblock Type 0x30 (Repeating Data) contain
        the rest of the Repeat Count, the Size of the Data, and finally
        the actual Data to be repeated:

            +---------+---------+--------+------+======+
            | Bits 4-27 for Repeat Count | Size | Data |
            +---------+---------+--------+------+======+

        The size of the Data field is Size + 1. It is repeated Repeat
        Count + 1 times. That is, the minimum size of Data is one byte;
        the maximum size of Data is 256 bytes. The minimum number of
        repeats is one; the maximum number of repeats is 2^28.

        If Subfilter is not used, the Data field of Subblock Types 0x20
        and 0x30 is the output of the decoded Subblock filter. If
        Subfilter is used, Data is the input of the Subfilter, and the
        decoded output of the Subfilter is the decoded output of the
        Subblock filter.

        Subblocks with the Subblock Type 0x40 (Set Subfilter) contain
        a Filter Flags field in Subblock Data:

            +==============+
            | Filter Flags |
            +==============+

        It is an error to set the Subfilter to Filter ID 0x00 (Copy)
        or 0x01 (Subblock). All the other Filter IDs are allowed.
        The decoder must indicate an error if this Subblock Type is
        detected when a Subfilter is already enabled.

        Subblocks with the Subblock Type 0x50 (Unset Subfilter) don't
        have a Subblock Data field. There must be at least one Subblock
        with Subblock Type 0x20 or 0x30 between Subblocks with Subblock
        Type 0x40 and 0x50; if there isn't, the decoder must indicate
        an error.

        Subblock Types 0x40 and 0x50 are always used as a pair: If the
        Subblock filter has been enabled with Subblock Type 0x40, it
        must always be disabled later with Subblock Type 0x50.
        Disabling must be done even if the Subfilter used End of
        Payload Marker; after the Subfilter has detected End of Payload
        Marker, the next Subblock that is not Padding must unset the
        Subfilter.

        When the Subblock filter is used as an implicit filter to embed
        End of Payload marker, the Subblock Types 0x40 and 0x50 (Set or
        Unset Subfilter) must not be used. The decoder must indicate an
        error if it detects any of these Subblock Types in an implicit
        Subblock filter.

        The following code illustrates the basic structure of a
        Subblock decoder.

            uint32_t consecutive_padding = 0;
            bool got_output_with_subfilter = false;

            while (true) {
                uint32_t size;
                uint32_t repeat;
                uint8_t flags = read_byte();

                if (flags != 0)
                    consecutive_padding = 0;

                switch (flags >> 4) {
                    case 0:
                        // Padding
                        if (flags & 0x0F)
                            return DATA_ERROR;
                        if (++consecutive_padding == 32)
                            return DATA_ERROR;
                        break;

                    case 1:
                        // End of Payload Marker
                        if (flags & 0x0F)
                            return DATA_ERROR;
                        if (subfilter_enabled || !allow_eopm)
                            return DATA_ERROR;
                        break;

                    case 2:
                        // Data
                        size = flags & 0x0F;
                        for (size_t i = 4; i < 28; i += 8)
                            size |= (uint32_t)(read_byte()) << i;

                        // If any output is produced, this will
                        // set got_output_with_subfilter to true.
                        copy_data(size);
                        break;

                    case 3:
                        // Repeating Data
                        repeat = flags & 0x0F;
                        for (size_t i = 4; i < 28; i += 8)
                            repeat |= (uint32_t)(read_byte()) << i;
                        size = read_byte();

                        // If any output is produced, this will
                        // set got_output_with_subfilter to true.
                        copy_repeating_data(size, repeat);
                        break;

                    case 4:
                        // Set Subfilter
                        if (flags & 0x0F)
                            return DATA_ERROR;
                        if (subfilter_enabled)
                            return DATA_ERROR;
                        got_output_with_subfilter = false;
                        set_subfilter();
                        break;

                    case 5:
                        // Unset Subfilter
                        if (flags & 0x0F)
                            return DATA_ERROR;
                        if (!subfilter_enabled)
                            return DATA_ERROR;
                        if (!got_output_with_subfilter)
                            return DATA_ERROR;
                        unset_subfilter();
                        break;

                    default:
                        return DATA_ERROR;
                }
            }


4.3.3. Delta

        The Delta filter may increase compression ratio when the value
        of the next byte correlates with the value of an earlier byte
        at specified distance.

            Filter ID:                  0x20
            Size of Filter Properties:  1 byte
            Changes size of data:       No

            Detecting when all of the data has been decoded:
                Uncompressed size:      Yes
                End of Payload Marker:  No
                End of Input:           Yes

            Preferred alignment:
                Input data:             1 byte
                Output data:            Same as the original input data

        The Properties byte indicates the delta distance, which can be
        1-256 bytes backwards from the current byte: 0x00 indicates
        distance of 1 byte and 0xFF distance of 256 bytes.


4.3.3.1. Format of the Encoded Output

        The code below illustrates both encoding and decoding with
        the Delta filter.

            // Distance is in the range [1, 256].
            const unsigned int distance = get_properties_byte() + 1;
            uint8_t pos = 0;
            uint8_t delta[256];

            memset(delta, 0, sizeof(delta));

            while (1) {
                const int byte = read_byte();
                if (byte == EOF)
                    break;

                uint8_t tmp = delta[(uint8_t)(distance + pos)];
                if (is_encoder) {
                    tmp = (uint8_t)(byte) - tmp;
                    delta[pos] = (uint8_t)(byte);
                } else {
                    tmp = (uint8_t)(byte) + tmp;
                    delta[pos] = tmp;
                }

                write_byte(tmp);
                --pos;
            }


4.3.4. LZMA

        LZMA (Lempel-Ziv-Markov chain-Algorithm) is a general-purporse
        compression algorithm with high compression ratio and fast
        decompression. LZMA based on LZ77 and range coding algorithms.

            Filter ID:                  0x40
            Size of Filter Properties:  2 bytes
            Changes size of data:       Yes, unpredictably

            Detecting when all of the data has been decoded:
                Uncompressed size:      Yes
                End of Payload Marker:  Yes
                End of Input:           No

            Preferred alignment:
                Input data:             Adjustable to 1/2/4/8/16 byte(s)
                Output data:            1 byte

        At the time of writing, there is no other documentation about
        how LZMA works than the source code in LZMA SDK. Once such
        documentation gets written, it will probably be published as
        a separate document, because including the documentation here
        would lengthen this document considerably.

        The format of the Filter Properties field is as follows:

            +-----------------+------------------+
            | LZMA Properties | Dictionary Flags |
            +-----------------+------------------+


4.3.4.1. LZMA Properties

        The LZMA Properties field contains three properties. An
        abbreviation is given in parentheses, followed by the value
        range of the property. The field consists of

            1) the number of literal context bits (lc, [0, 8]);
            2) the number of literal position bits (lp, [0, 4]); and
            3) the number of position bits (pb, [0, 4]).

        They are encoded using the following formula:

            LZMA Properties = (pb * 5 + lp) * 9 + lc

        The following C code illustrates a straightforward way to
        decode the properties:

            uint8_t lc, lp, pb;
            uint8_t prop = get_lzma_properties() & 0xFF;
            if (prop > (4 * 5 + 4) * 9 + 8)
                return LZMA_PROPERTIES_ERROR;

            pb = prop / (9 * 5);
            prop -= pb * 9 * 5;
            lp = prop / 9;
            lc = prop - lp * 9;


4.3.4.2. Dictionary Flags

        Currently the lowest six bits of the Dictionary Flags field
        are in use:

            Bits   Mask   Description
            0-5    0x3F   Dictionary Size
            6-7    0xC0   Reserved for future use; must be zero for now.

        Dictionary Size is encoded with one-bit mantissa and five-bit
        exponent. To avoid wasting space, one-byte dictionary has its
        own special value.

            Raw value   Mantissa   Exponent   Dictionary size
                0           1          0      1 byte
                1           2          0      2 bytes
                2           3          0      3 bytes
                3           2          1      4 bytes
                4           3          1      6 bytes
                5           2          2      8 bytes
                6           3          2      12 bytes
                7           2          3      16 bytes
                8           3          3      24 bytes
                9           2          4      32 bytes
              ...         ...        ...      ...
               61           2         30      2 GiB
               62           3         30      3 GiB
               63           2         31      4 GiB (*)

            (*) The real maximum size of the dictionary is one byte
                less than 4 GiB, because the distance of 4 GiB is
                reserved for End of Payload Marker.

        Instead of having a table in the decoder, the dictionary size
        can be decoded using the following C code:

            uint64_t dictionary_size;
            const uint8_t bits = get_dictionary_flags() & 0x3F;
            if (bits == 0) {
                dictionary_size = 1;
            } else {
                dictionary_size = 2 | ((bits + 1) & 1);
                dictionary_size = dictionary_size << ((bits - 1)  / 2);
            }


4.3.5. Branch/Call/Jump Filters for Executables

        These filters convert relative branch, call, and jump
        instructions to their absolute counterparts in executable
        files. This conversion increases redundancy and thus
        compression ratio.

            Size of Filter Properties:  0 or 4 bytes
            Changes size of data:       No

            Detecting when all of the data has been decoded:
                Uncompressed size:      Yes
                End of Payload Marker:  No
                End of Input:           Yes

        Below is the list of filters in this category. The alignment
        is the same for both input and output data.

            Filter ID   Alignment   Description
              0x04       1 byte     x86 filter (BCJ)
              0x05       4 bytes    PowerPC (big endian) filter
              0x06      16 bytes    IA64 filter
              0x07       4 bytes    ARM (little endian) filter
              0x08       2 bytes    ARM Thumb (little endian) filter
              0x09       4 bytes    SPARC filter

        If the size of Filter Properties is four bytes, the Filter
        Properties field contains the start offset used for address
        conversions. It is stored as an unsigned 32-bit little endian
        integer. If the size of Filter Properties is zero, the start
        offset is zero.

        Setting the start offset may be useful if an executable has
        multiple sections, and there are many cross-section calls.
        Taking advantage of this feature usually requires usage of
        the Subblock filter.


5. Metadata

        Metadata is stored in Metadata Blocks, which can be in the
        beginning or at the end of a Multi-Block Stream. Because of
        Blocks, it is possible to compress Metadata in the same way
        as the actual data is compressed. This Section describes the
        format of the data stored in Metadata Blocks.

            +----------------+===============================+
            | Metadata Flags | Size of Header Metadata Block |
            +----------------+===============================+

                 +============+===================+=======+=======+
            ---> | Total Size | Uncompressed Size | Index | Extra |
                 +============+===================+=======+=======+

        Stream must be parseable backwards. That is, there must be
        a way to locate the beginning of the Stream by starting from
        the end of the Stream. Thus, the Footer Metadata Block must
        contain the Total Size field or the Index field. If the Stream
        has Header Metadata Block, also the Size of Header Metadata
        Block field must be present in Footer Metadata Block.

        It must be possible to quickly locate the Blocks in
        non-streamed mode. Thus, the Index field must be present
        at least in one Metadata Block.

        If the above conditions are not met, the decoder must indicate
        an error.

        There should be no additional data after the last field. If
        there is, the the decoder should indicate an error.


5.1. Metadata Flags

        This field describes which fields are present in a Metadata
        Block:

            Bit(s)  Mask   Desription
              0     0x01   Size of Header Metadata Block is present.
              1     0x02   Total Size is present.
              2     0x04   Uncompressed Size is present.
              3     0x08   Index is present.
             4-6    0x70   Reserve for future use; must be zero for now.
              7     0x80   Extra is present.

        If any reserved bit is set, the decoder must indicate an error.
        It is possible that there is a new field present which the
        decoder is not aware of, and can thus parse the Metadata
        incorrectly.


5.2. Size of Header Metadata Block

        This field is present only if the appropriate bit is set in
        the Metadata Flags field (see Section 5.1).

        Size of Header Metadata Block is needed to make it possible to
        parse the Stream backwards. The size is stored using the
        encoding described in Section 1.2. The decoder must verify that
        that the value stored in this field is non-zero. In Footer
        Metadata Block, the decoder must also verify that the stored
        size matches the real size of Header Metadata Block. In the
        Header Meatadata Block, the value of this field is ignored as
        long as it is not zero.


5.3. Total Size

        This field is present only if the appropriate bit is set in the
        Metadata Flags field (see Section 5.1).

        This field contains the total size of the Data Blocks in the
        Stream. Total Size is stored using the encoding described in
        Section 1.2. If the stored value does not match the real total
        size of the Data Blocks, the decoder must indicate an error.
        The value of this field must be non-zero.

        Total Size can be used to quickly locate the beginning or end
        of the Stream. This can be useful for example when doing
        random-access reading, and the Index field is not in the
        Metadata Block currently being read.

        It is useless to have both Total Size and Index in the same
        Metadata Block, because Total Size can be calculated from the
        Index field.


5.4. Uncompressed Size

        This field is present only if the appropriate bit is set in the
        Metadata Flags field (see Section 5.1).

        This field contains the total uncompressed size of the Data
        Blocks in the Stream. Uncompresssed Size is stored using the
        encoding described in Section 1.2. If the stored value does not
        match the real uncompressed size of the Data Blocks, the
        decoder must indicate an error.

        It is useless to have both Uncompressed Size and Index in
        the same Metadata Block, because Uncompressed Size can be
        calculated from the Index field.


5.5. Index

        +=======================+=============+====================+
        | Number of Data Blocks | Total Sizes | Uncompressed Sizes |
        +=======================+=============+====================+

        Index serves several purporses. Using it, one can
          - verify that all Blocks in a Stream have been processed;
          - find out the Uncompressed Size of a Stream; and
          - quickly access the beginning of any Block (random access).


5.5.1. Number of Data Blocks

        This field contains the number of Data Blocks in the Stream.
        The value is stored using the encoding described in Section
        1.2. If the decoder has decoded all the Data Blocks of the
        Stream, and then notices that the Number of Records doesn't
        match the real number of Data Blocks, the decoder must
        indicate an error. The value of this field must be non-zero.


5.5.2. Total Sizes

        +============+============+
        | Total Size | Total Size | ...
        +============+============+

        This field lists the Total Sizes of every Data Block in the
        Stream. There are as many Total Size fields as indicated by
        the Number of Data Blocks field.

        Total Size is the size of Block Header, Compressed Data, and
        Block Footer. It is stored using the encoding described in
        Section 1.2. If the Total Sizes do not match the real sizes
        of respective Blocks, the decoder should indicate an error.
        All the Total Size fields must have a non-zero value.


5.5.3. Uncompressed Sizes

        +===================+===================+
        | Uncompressed Size | Uncompressed Size | ...
        +===================+===================+

        This field lists the Uncompressed Sizes of every Data Block
        in the Stream. There are as many Uncompressed Size fields as
        indicated by the Number of Records field.

        Uncompressed Sizes are stored using the encoding described
        in Section 1.2. If the Uncompressed Sizes do not match the
        real sizes of respective Blocks, the decoder shoud indicate
        an error.


5.6. Extra

        This field is present only if the appropriate bit is set in the
        Metadata Flags field (see Section 5.1). Note that the bit does
        not indicate that there is any data in the Extra field; it only
        indicates that Extra may be non-empty.

        The Extra field contains only information that is not required
        to properly uncompress the Stream or to do random-access
        reading. Supporting the Extra field is optional. In case the
        decoder doesn't support the Extra field, it should silently
        ignore it.

        Extra consists of zero or more Records:

            +========+========+
            | Record | Record | ...
            +========+========+

        Excluding Records with Record ID 0x00, each Record contains
        three fields:

            +==========+==============+======+
            | Reord ID | Size of Data | Data |
            +==========+==============+======+

        The Record ID and Size of Data are stored using the encoding
        described in Section 1.2. Data can be binary or UTF-8
        [RFC-3629] strings. Non-UTF-8 strings should be avoided.
        Because the Size of Data is known, there is no need to
        terminate strings with a nul byte, although doing so should
        not be considered an error.

        The Record IDs are divided in two categories:
          - Safe-to-Copy Records may be preserved as is when the
            Stream is modified in ways that don't change the actual
            uncompressed data. Examples of such operatings include
            recompressing and adding, modifying, or deleting unrelated
            Extra Records.
          - Unsafe-to-Copy Records should be removed (and possibly
            recreated) when any kind of changes are made to the Stream.

        When the actual uncompressed data is modified, all Records
        should be removed (and possibly recreated), unless the
        application knows that the Data stored to the Record(s) is
        still valid.

        The following subsections describe the standard Record IDs and
        the format of their Data fields. Safe-to-Copy Records have an
        odd ID, while Unsafe-to-Copy Records have an even ID.


5.6.1. 0x00: Dummy/Padding

        This Record is special, because it doesn't have the Size of
        Data or Data fields.

        Dummy Records can be used, for example, to fill Metadata Block
        when a few bytes of extra space has been reserved for it. There
        can be any number of Dummy Records.


5.6.2. 0x01: OpenPGP Signature

        OpenPGP signature is computed from uncompressed data. The
        signature can be used to verify that the contents of a Stream
        has been created by a trustworthy source.

        If the decoder supports decoding concatenated Streams, it
        must indicate an error when verifying OpenPGP signatures if
        there is more than one Stream.

        OpenPGP format is documented in [RFC-2440].


5.6.3. 0x02: Filter Information

        The Filter Information Record contains information about the
        filters used in the Stream. This field can be used to quickly
          - display which filters are used in each Block;
          - check if all the required filters are supported by the
            current decoder version; and
          - check how much memory is required to decode each Block.

        The format of the Filter Information field is as follows:

            +=================+=================+
            | Block 0 Filters | Block 1 Filters | ...
            +=================+=================+

        There can be at maximum of as many Block Filters fields as
        there are Data Blocks in the Stream. The format of the Block
        Filters field is as follows:

            +------------------+======================+============+
            | Block Properties | List of Filter Flags | Subfilters |
            +------------------+======================+============+

        Block Properties is a bitfield:

            Bit(s)  Mask  Description
             0-2    0x07  Number of filters (0-7)
              3     0x08  End of Payload Marker is used.
              4     0x10  The Subfilters field is present.
             5-7    0xE0  Reserved for future use; must be zero for now.

        The contents of the List of Filter Flags field must match the
        List of Filter Flags field in the respective Block Header.

        The Subfilters field may be present only if the List of Filter
        Flags contains a Filter Flags field for a Subblock filter. The
        format of the Subfilters field is as follows:

            +======================+=========================+
            | Number of Subfilters | List of Subfilter Flags |
            +======================+=========================+

        The value stored in the Number of Subfilters field is stored
        using the encoding described in Section 1.2. The List of
        Subfilter Flags field contains as many Filter Flags fields
        as indicated by the Number of Subfilters field. These Filter
        Flags fields list some or all the Subfilters used via the
        Subblock filter. The order of the listed Subfilters is not
        significant.

        Decoders supporting this Record should indicate a warning or
        error if this Record contains Filter Flags that are not
        actually used by the respective Blocks.


5.6.4. 0x03: Comment

        Free-form comment is stored in UTF-8 [RFC-3629] encoding.

        The beginning of a new line should be indicated using the
        ASCII Line Feed character (0x0A). When the Line Feed character
        is not the native way to indicate new line in the underlying
        operating system, the encoder and decoder should convert the
        newline characters to and from Line Feeds.


5.6.5. 0x04: List of Checks

        +=======+=======+
        | Check | Check | ...
        +=======+=======+

        There are as many Check fields as there are Blocks in the
        Stream. The size of Check fields depend on Stream Flags
        (see Section 2.2.2).

        Decoders supporting this Record should indicate a warning or
        error if the Checks don't match the respective Blocks.


5.6.6. 0x05: Original Filename

        Original filename is stored in UTF-8 [RFC-3629] encoding.

        The filename must not include any path, only the filename
        itself. Special care must be taken to prevent directory
        traversal vulnerabilities.

        When files are moved between different operating systems, it
        is possible that filename valid in the source system is not
        valid in the target system. It is implementation defined how
        the decoder handles this kind of situations.


5.6.7. 0x07: Modification Time

        Modification time is stored as POSIX time, as an unsigned
        little endian integer. The number of bits depends on the
        Size of Data field. Note that the usage of unsigned integer
        limits the earliest representable time to 1970-01-01T00:00:00.


5.6.8. 0x09: High-Resolution Modification Time

        This Record extends the `0x04: Modification time' Record with
        a subsecond time information. There are two supported formats
        of this field, which can be distinguished by looking at the
        Size of Data field.

        Size  Data
          3   [0; 9,999,999] times 100 nanoseconds
          4   [0; 999,999,999] nanoseconds

        The value is stored as an unsigned 24-bit or 32-bit little
        endian integer.


5.6.9. 0x0B: MIME Type

        MIME type of the uncompressed Stream. This can be used to
        detect the content type. [IANA-MIME]


5.6.10. 0x0D: Homepage URL

        This field can be used, for example, when distributing software
        packages (sources or binaries). The field would indicate the
        homepage of the program.

        For details on how to encode URLs, see [RFC-1738].


6. Custom Filter and Extra Record IDs

        If a developer wants to use custom Filter or Extra Record IDs,
        he has two choices. The first choice is to contact Lasse Collin
        and ask him to allocate a range of IDs for the developer.

        The second choice is to generate a 40-bit random integer,
        which the developer can use as his personal Developer ID.
        To minimalize the risk of collisions, Developer ID has to be
        a randomly generated integer, not manually selected "hex word".
        The following command, which works on many free operating
        systems, can be used to generate Developer ID:

            dd if=/dev/urandom bs=5 count=1 | hexdump

        The developer can then use his Developer ID to create unique
        (well, hopefully unique) Filter and Extra Record IDs.

            Bits    Mask                    Description
             0-15   0x0000_0000_0000_FFFF   Filter or Extra Record ID
            16-55   0x00FF_FFFF_FFFF_0000   Developer ID
            56-62   0x7F00_0000_0000_0000   Static prefix: 0x7F

        The resulting 63-bit integer will use 9 bytes of space when
        stored using the encoding described in Section 1.2. To get
        a shorter ID, see the beginning of this Section how to
        request a custom ID range.

        Note that Filter and Metadata Record IDs are in their own
        namespaces. That is, you can use the same ID value as Filter ID
        and Metadata Record ID, and the meanings of the IDs do not need
        to be related to each other.


6.1. Reserved Custom Filter ID Ranges

        Range                       Description
        0x0000_0000 - 0x0000_00DF   IDs fitting into the Misc field
        0x0002_0000 - 0x0007_FFFF   Reserved to ease .7z compatibility
        0x0200_0000 - 0x07FF_FFFF   Reserved to ease .7z compatibility


7. Cyclic Redundancy Checks

        There are several incompatible variations to calculate CRC32
        and CRC64. For simplicity and clarity, complete examples are
        provided to calculate the checks as they are used in this file
        format. Implementations may use different code as long as it
        gives identical results.

        The program below reads data from standard input, calculates
        the CRC32 and CRC64 values, and prints the calculated values
        as big endian hexadecimal strings to standard output.

            #include <sys/types.h>
            #include <inttypes.h>
            #include <stdio.h>

            uint32_t crc32_table[256];
            uint64_t crc64_table[256];

            void
            init(void)
            {
                static const uint32_t poly32 = UINT32_C(0xEDB88320);
                static const uint64_t poly64
                        = UINT64_C(0xC96C5795D7870F42);

                for (size_t i = 0; i < 256; ++i) {
                    uint32_t crc32 = i;
                    uint64_t crc64 = i;

                    for (size_t j = 0; j < 8; ++j) {
                        if (crc32 & 1)
                            crc32 = (crc32 >> 1) ^ poly32;
                        else
                            crc32 >>= 1;

                        if (crc64 & 1)
                            crc64 = (crc64 >> 1) ^ poly64;
                        else
                            crc64 >>= 1;
                    }

                    crc32_table[i] = crc32;
                    crc64_table[i] = crc64;
                }
            }

            uint32_t
            crc32(const uint8_t *buf, size_t size, uint32_t crc)
            {
                crc = ~crc;
                for (size_t i = 0; i < size; ++i)
                    crc = crc32_table[buf[i] ^ (crc & 0xFF)]
                            ^ (crc >> 8);
                return ~crc;
            }

            uint64_t
            crc64(const uint8_t *buf, size_t size, uint64_t crc)
            {
                crc = ~crc;
                for (size_t i = 0; i < size; ++i)
                    crc = crc64_table[buf[i] ^ (crc & 0xFF)]
                            ^ (crc >> 8);
                return ~crc;
            }

            int
            main()
            {
                init();

                uint32_t value32 = 0;
                uint64_t value64 = 0;
                uint64_t total_size = 0;
                uint8_t buf[8192];

                while (1) {
                    const size_t buf_size = fread(buf, 1, 8192, stdin);
                    if (buf_size == 0)
                        break;

                    total_size += buf_size;
                    value32 = crc32(buf, buf_size, value32);
                    value64 = crc64(buf, buf_size, value64);
                }

                printf("Bytes:  %" PRIu64 "\n", total_size);
                printf("CRC-32: 0x%08" PRIX32 "\n", value32);
                printf("CRC-64: 0x%016" PRIX64 "\n", value64);

                return 0;
            }


8. References

8.1. Normative References

        [RFC-1738]
        Uniform Resource Locators (URL)
        http://www.ietf.org/rfc/rfc1738.txt

        [RFC-2119]
        Key words for use in RFCs to Indicate Requirement Levels
        http://www.ietf.org/rfc/rfc2119.txt

        [RFC-2440]
        OpenPGP Message Format
        http://www.ietf.org/rfc/rfc2440.txt

        [RFC-3629]
        UTF-8, a transformation format of ISO 10646
        http://www.ietf.org/rfc/rfc3629.txt

        [IANA-MIME]
        MIME Media Types
        http://www.iana.org/assignments/media-types/


8.2. Informative References

        LZMA SDK - The original LZMA implementation
        http://7-zip.org/sdk.html

        LZMA Utils - LZMA adapted to POSIX-like systems
        http://tukaani.org/lzma/

        [RFC-1952]
        GZIP file format specification version 4.3
        http://www.ietf.org/rfc/rfc1952.txt
          - Notation of byte boxes in section `2.1. Overall conventions'

        [GNU-tar]
        GNU tar 1.16.1 manual
        http://www.gnu.org/software/tar/manual/html_node/Blocking-Factor.html
          - Node 9.4.2 `Blocking Factor', paragraph that begins
            `gzip will complain about trailing garbage'
          - Note that this URL points to the latest version of the
            manual, and may some day not contain the note which is in
            1.16.1. For the exact version of the manual, download GNU
            tar 1.16.1: ftp://ftp.gnu.org/pub/gnu/tar/tar-1.16.1.tar.gz