liblzma: Add lzma_file_info_decoder().

author Lasse Collin <lasse.collin@tukaani.org>

Mon, 24 Apr 2017 16:35:50 +0000 (19:35 +0300)

committer Lasse Collin <lasse.collin@tukaani.org>

Mon, 24 Apr 2017 16:48:04 +0000 (19:48 +0300)
author Lasse Collin <lasse.collin@tukaani.org>
Mon, 24 Apr 2017 16:35:50 +0000 (19:35 +0300)
committer Lasse Collin <lasse.collin@tukaani.org>
Mon, 24 Apr 2017 16:48:04 +0000 (19:48 +0300)
diff --git a/src/liblzma/api/lzma/index.h b/src/liblzma/api/lzma/index.h

index 3dac6fb85ccb9b6191817fd9a45764e10cb07e0e..144d416615a496b3bbcad04534425435b344c56d 100644 (file)
--- a/src/liblzma/api/lzma/index.h
+++ b/src/liblzma/api/lzma/index.h
@@ -684,3 +684,69 @@ extern LZMA_API(lzma_ret) lzma_index_buffer_decode(lzma_index **i,
                 uint64_t *memlimit, const lzma_allocator *allocator,
                 const uint8_t *in, size_t *in_pos, size_t in_size)
                 lzma_nothrow;
+
+
+/**
+ * \brief       Initialize a .xz file information decoder
+ *
+ * \param       strm        Pointer to a properly prepared lzma_stream
+ * \param       dest_index  Pointer to a pointer where the decoder will put
+ *                          the decoded lzma_index. The old value
+ *                          of *dest_index is ignored (not freed).
+ * \param       memlimit    How much memory the resulting lzma_index is
+ *                          allowed to require. Use UINT64_MAX to
+ *                          effectively disable the limiter.
+ * \param       file_size   Size of the input .xz file
+ *
+ * This decoder decodes the Stream Header, Stream Footer, Index, and
+ * Stream Padding field(s) from the input .xz file and stores the resulting
+ * combined index in *dest_index. This information can be used to get the
+ * uncompressed file size with lzma_index_uncompressed_size(*dest_index) or,
+ * for example, to implement random access reading by locating the Blocks
+ * in the Streams.
+ *
+ * To get the required information from the .xz file, lzma_code() may ask
+ * the application to seek in the input file by returning LZMA_SEEK_NEEDED
+ * and having the target file position specified in lzma_stream.seek_pos.
+ * The number of seeks required depends on the input file and how big buffers
+ * the application provides. When possible, the decoder will seek backward
+ * and forward in the given buffer to avoid useless seek requests. Thus, if
+ * the application provides the whole file at once, no external seeking will
+ * be required (that is, lzma_code() won't return LZMA_SEEK_NEEDED).
+ *
+ * The value in lzma_stream.total_in can be used to estimate how much data
+ * liblzma had to read to get the file information. However, due to seeking
+ * and the way total_in is updated, the value of total_in will be somewhat
+ * inaccurate (a little too big). Thus, total_in is a good estimate but don't
+ * expect to see the same exact value for the same file if you change the
+ * input buffer size or switch to a different liblzma version.
+ *
+ * Valid `action' arguments to lzma_code() are LZMA_RUN and LZMA_FINISH.
+ * You only need to use LZMA_RUN; LZMA_FINISH is only supported because it
+ * might be convenient for some applications. If you use LZMA_FINISH and if
+ * lzma_code() asks the application to seek, remember to reset `action' back
+ * to LZMA_RUN unless you hit the end of the file again.
+ *
+ * Possible return values from lzma_code():
+ *   - LZMA_OK: All OK so far, more input needed
+ *   - LZMA_SEEK_NEEDED: Provide more input starting from the absolute
+ *     file position strm->seek_pos
+ *   - LZMA_STREAM_END: Decoding was successful, *dest_index has been set
+ *   - LZMA_FORMAT_ERROR: The input file is not in the .xz format (the
+ *     expected magic bytes were not found from the beginning of the file)
+ *   - LZMA_OPTIONS_ERROR: File looks valid but contains headers that aren't
+ *     supported by this version of liblzma
+ *   - LZMA_DATA_ERROR: File is corrupt
+ *   - LZMA_BUF_ERROR
+ *   - LZMA_MEM_ERROR
+ *   - LZMA_MEMLIMIT_ERROR
+ *   - LZMA_PROG_ERROR
+ *
+ * \return      - LZMA_OK
+ *              - LZMA_MEM_ERROR
+ *              - LZMA_PROG_ERROR
+ */
+extern LZMA_API(lzma_ret) lzma_file_info_decoder(
+               lzma_stream *strm, lzma_index **dest_index,
+               uint64_t memlimit, uint64_t file_size)
+               lzma_nothrow;
diff --git a/src/liblzma/common/Makefile.inc b/src/liblzma/common/Makefile.inc

index 67c8e48ca366555bf6aedb1aaf4a72bb5f6f850c..0408f9a48c4db58254bed742f726a2039ad41bc0 100644 (file)
--- a/src/liblzma/common/Makefile.inc
+++ b/src/liblzma/common/Makefile.inc
@@ -65,6 +65,7 @@ liblzma_la_SOURCES += \
         common/block_decoder.h \
         common/block_header_decoder.c \
         common/easy_decoder_memusage.c \
+       common/file_info.c \
         common/filter_buffer_decoder.c \
         common/filter_decoder.c \
         common/filter_decoder.h \
diff --git a/src/liblzma/common/file_info.c b/src/liblzma/common/file_info.c

new file mode 100644 (file)

index 0000000..deb644f
--- /dev/null
+++ b/src/liblzma/common/file_info.c
@@ -0,0 +1,855 @@
+///////////////////////////////////////////////////////////////////////////////
+//
+/// \file       file_info.c
+/// \brief      Decode .xz file information into a lzma_index structure
+//
+//  Author:     Lasse Collin
+//
+//  This file has been put into the public domain.
+//  You can do whatever you want with this file.
+//
+///////////////////////////////////////////////////////////////////////////////
+
+#include "index_decoder.h"
+
+
+typedef struct {
+       enum {
+               SEQ_MAGIC_BYTES,
+               SEQ_PADDING_SEEK,
+               SEQ_PADDING_DECODE,
+               SEQ_FOOTER,
+               SEQ_INDEX_INIT,
+               SEQ_INDEX_DECODE,
+               SEQ_HEADER_DECODE,
+               SEQ_HEADER_COMPARE,
+       } sequence;
+
+       /// Absolute position of in[*in_pos] in the file. All code that
+       /// modifies *in_pos also updates this. seek_to_pos() needs this
+       /// to determine if we need to request the application to seek for
+       /// us or if we can do the seeking internally by adjusting *in_pos.
+       uint64_t file_cur_pos;
+
+       /// This refers to absolute positions of interesting parts of the
+       /// input file. Sometimes it points to the *beginning* of a specific
+       /// field and sometimes to the *end* of a field. The current target
+       /// position at each moment is explained in the comments.
+       uint64_t file_target_pos;
+
+       /// Size of the .xz file (from the application).
+       uint64_t file_size;
+
+       /// Index decoder
+       lzma_next_coder index_decoder;
+
+       /// Number of bytes remaining in the Index field that is currently
+       /// being decoded.
+       lzma_vli index_remaining;
+
+       /// The Index decoder will store the decoded Index in this pointer.
+       lzma_index *this_index;
+
+       /// Amount of Stream Padding in the current Stream.
+       lzma_vli stream_padding;
+
+       /// The final combined index is collected here.
+       lzma_index *combined_index;
+
+       /// Pointer from the application where to store the index information
+       /// after successful decoding.
+       lzma_index **dest_index;
+
+       /// Pointer to lzma_stream.seek_pos to be used when returning
+       /// LZMA_SEEK_NEEDED. This is set by seek_to_pos() when needed.
+       uint64_t *external_seek_pos;
+
+       /// Memory usage limit
+       uint64_t memlimit;
+
+       /// Stream Flags from the very beginning of the file.
+       lzma_stream_flags first_header_flags;
+
+       /// Stream Flags from Stream Header of the current Stream.
+       lzma_stream_flags header_flags;
+
+       /// Stream Flags from Stream Footer of the current Stream.
+       lzma_stream_flags footer_flags;
+
+       size_t temp_pos;
+       size_t temp_size;
+       uint8_t temp[8192];
+
+} lzma_file_info_coder;
+
+
+/// Copies data from in[*in_pos] into coder->temp until
+/// coder->temp_pos == coder->temp_size. This also keeps coder->file_cur_pos
+/// in sync with *in_pos. Returns true if more input is needed.
+static bool
+fill_temp(lzma_file_info_coder *coder, const uint8_t *restrict in,
+               size_t *restrict in_pos, size_t in_size)
+{
+       coder->file_cur_pos += lzma_bufcpy(in, in_pos, in_size,
+                       coder->temp, &coder->temp_pos, coder->temp_size);
+       return coder->temp_pos < coder->temp_size;
+}
+
+
+/// Seeks to the absolute file position specified by target_pos.
+/// This tries to do the seeking by only modifying *in_pos, if possible.
+/// The main benefit of this is that if one passes the whole file at once
+/// to lzma_code(), the decoder will never need to return LZMA_SEEK_NEEDED
+/// as all the seeking can be done by adjusting *in_pos in this function.
+///
+/// Returns true if an external seek is needed and the caller must return
+/// LZMA_SEEK_NEEDED.
+static bool
+seek_to_pos(lzma_file_info_coder *coder, uint64_t target_pos,
+               size_t in_start, size_t *in_pos, size_t in_size)
+{
+       // The input buffer doesn't extend beyond the end of the file.
+       // This has been checked by file_info_decode() already.
+       assert(coder->file_size - coder->file_cur_pos >= in_size - *in_pos);
+
+       const uint64_t pos_min = coder->file_cur_pos - (*in_pos - in_start);
+       const uint64_t pos_max = coder->file_cur_pos + (in_size - *in_pos);
+
+       bool external_seek_needed;
+
+       if (target_pos >= pos_min && target_pos <= pos_max) {
+               // The requested position is available in the current input
+               // buffer or right after it. That is, in a corner case we
+               // end up setting *in_pos == in_size and thus will immediately
+               // need new input bytes from the application.
+               *in_pos += (size_t)(target_pos - coder->file_cur_pos);
+               external_seek_needed = false;
+       } else {
+               // Ask the application to seek the input file.
+               *coder->external_seek_pos = target_pos;
+               external_seek_needed = true;
+
+               // Mark the whole input buffer as used. This way
+               // lzma_stream.total_in will have a better estimate
+               // of the amount of data read. It still won't be perfect
+               // as the value will depend on the input buffer size that
+               // the application uses, but it should be good enough for
+               // those few who want an estimate.
+               *in_pos = in_size;
+       }
+
+       // After seeking (internal or external) the current position
+       // will match the requested target position.
+       coder->file_cur_pos = target_pos;
+
+       return external_seek_needed;
+}
+
+
+/// The caller sets coder->file_target_pos so that it points to the *end*
+/// of the desired file position. This function then determines how far
+/// backwards from that position we can seek. After seeking fill_temp()
+/// can be used to read data into coder->temp. When fill_temp() has finished,
+/// coder->temp[coder->temp_size] will match coder->file_target_pos.
+///
+/// This also validates that coder->file_target_pos is sane in sense that
+/// we aren't trying to seek too far backwards (too close or beyond the
+/// beginning of the file).
+static lzma_ret
+reverse_seek(lzma_file_info_coder *coder,
+               size_t in_start, size_t *in_pos, size_t in_size)
+{
+       // Check that there is enough data before the target position
+       // to contain at least Stream Header and Stream Footer. If there
+       // isn't, the file cannot be valid.
+       if (coder->file_target_pos < 2 * LZMA_STREAM_HEADER_SIZE)
+               return LZMA_DATA_ERROR;
+
+       coder->temp_pos = 0;
+
+       // The Stream Header at the very beginning of the file gets handled
+       // specially in SEQ_MAGIC_BYTES and thus we will never need to seek
+       // there. By not seeking to the first LZMA_STREAM_HEADER_SIZE bytes
+       // we avoid a useless external seek after SEQ_MAGIC_BYTES if the
+       // application uses an extremely small input buffer and the input
+       // file is very small.
+       if (coder->file_target_pos - LZMA_STREAM_HEADER_SIZE
+                       < sizeof(coder->temp))
+               coder->temp_size = (size_t)(coder->file_target_pos
+                               - LZMA_STREAM_HEADER_SIZE);
+       else
+               coder->temp_size = sizeof(coder->temp);
+
+       // The above if-statements guarantee this. This is important because
+       // the Stream Header/Footer decoders assume that there's at least
+       // LZMA_STREAM_HEADER_SIZE bytes in coder->temp.
+       assert(coder->temp_size >= LZMA_STREAM_HEADER_SIZE);
+
+       if (seek_to_pos(coder, coder->file_target_pos - coder->temp_size,
+                       in_start, in_pos, in_size))
+               return LZMA_SEEK_NEEDED;
+
+       return LZMA_OK;
+}
+
+
+/// Gets the number of zero-bytes at the end of the buffer.
+static size_t
+get_padding_size(const uint8_t *buf, size_t buf_size)
+{
+       size_t padding = 0;
+       while (buf_size > 0 && buf[--buf_size] == 0x00)
+               ++padding;
+
+       return padding;
+}
+
+
+/// With the Stream Header at the very beginning of the file, LZMA_FORMAT_ERROR
+/// is used to tell the application that Magic Bytes didn't match. In other
+/// Stream Header/Footer fields (in the middle/end of the file) it could be
+/// a bit confusing to return LZMA_FORMAT_ERROR as we already know that there
+/// is a valid Stream Header at the beginning of the file. For those cases
+/// this function is used to convert LZMA_FORMAT_ERROR to LZMA_DATA_ERROR.
+static lzma_ret
+hide_format_error(lzma_ret ret)
+{
+       if (ret == LZMA_FORMAT_ERROR)
+               ret = LZMA_DATA_ERROR;
+
+       return ret;
+}
+
+
+/// Calls the Index decoder and updates coder->index_remaining.
+/// This is a separate function because the input can be either directly
+/// from the application or from coder->temp.
+static lzma_ret
+decode_index(lzma_file_info_coder *coder, const lzma_allocator *allocator,
+               const uint8_t *restrict in, size_t *restrict in_pos,
+               size_t in_size, bool update_file_cur_pos)
+{
+       const size_t in_start = *in_pos;
+
+       const lzma_ret ret = coder->index_decoder.code(
+                       coder->index_decoder.coder,
+                       allocator, in, in_pos, in_size,
+                       NULL, NULL, 0, LZMA_RUN);
+
+       coder->index_remaining -= *in_pos - in_start;
+
+       if (update_file_cur_pos)
+               coder->file_cur_pos += *in_pos - in_start;
+
+       return ret;
+}
+
+
+static lzma_ret
+file_info_decode(void *coder_ptr, const lzma_allocator *allocator,
+               const uint8_t *restrict in, size_t *restrict in_pos,
+               size_t in_size,
+               uint8_t *restrict out lzma_attribute((__unused__)),
+               size_t *restrict out_pos lzma_attribute((__unused__)),
+               size_t out_size lzma_attribute((__unused__)),
+               lzma_action action lzma_attribute((__unused__)))
+{
+       lzma_file_info_coder *coder = coder_ptr;
+       const size_t in_start = *in_pos;
+
+       // If the caller provides input past the end of the file, trim
+       // the extra bytes from the buffer so that we won't read too far.
+       assert(coder->file_size >= coder->file_cur_pos);
+       if (coder->file_size - coder->file_cur_pos < in_size - in_start)
+               in_size = in_start
+                       + (size_t)(coder->file_size - coder->file_cur_pos);
+
+       while (true)
+       switch (coder->sequence) {
+       case SEQ_MAGIC_BYTES:
+               // Decode the Stream Header at the beginning of the file
+               // first to check if the Magic Bytes match. The flags
+               // are stored in coder->first_header_flags so that we
+               // don't need to seek to it again.
+               //
+               // Check that the file is big enough to contain at least
+               // Stream Header.
+               if (coder->file_size < LZMA_STREAM_HEADER_SIZE)
+                       return LZMA_FORMAT_ERROR;
+
+               // Read the Stream Header field into coder->temp.
+               if (fill_temp(coder, in, in_pos, in_size))
+                       return LZMA_OK;
+
+               // This is the only Stream Header/Footer decoding where we
+               // want to return LZMA_FORMAT_ERROR if the Magic Bytes don't
+               // match. Elsewhere it will be converted to LZMA_DATA_ERROR.
+               return_if_error(lzma_stream_header_decode(
+                               &coder->first_header_flags, coder->temp));
+
+               // Now that we know that the Magic Bytes match, check the
+               // file size. It's better to do this here after checking the
+               // Magic Bytes since this way we can give LZMA_FORMAT_ERROR
+               // instead of LZMA_DATA_ERROR when the Magic Bytes don't
+               // match in a file that is too big or isn't a multiple of
+               // four bytes.
+               if (coder->file_size > LZMA_VLI_MAX || (coder->file_size & 3))
+                       return LZMA_DATA_ERROR;
+
+               // Start looking for Stream Padding and Stream Footer
+               // at the end of the file.
+               coder->file_target_pos = coder->file_size;
+
+       // Fall through
+
+       case SEQ_PADDING_SEEK:
+               coder->sequence = SEQ_PADDING_DECODE;
+               return_if_error(reverse_seek(
+                               coder, in_start, in_pos, in_size));
+
+       // Fall through
+
+       case SEQ_PADDING_DECODE: {
+               // Copy to coder->temp first. This keeps the code simpler if
+               // the application only provides input a few bytes at a time.
+               if (fill_temp(coder, in, in_pos, in_size))
+                       return LZMA_OK;
+
+               // Scan the buffer backwards to get the size of the
+               // Stream Padding field (if any).
+               const size_t new_padding = get_padding_size(
+                               coder->temp, coder->temp_size);
+               coder->stream_padding += new_padding;
+
+               // Set the target position to the beginning of Stream Padding
+               // that has been observed so far. If all Stream Padding has
+               // been seen, then the target position will be at the end
+               // of the Stream Footer field.
+               coder->file_target_pos -= new_padding;
+
+               if (new_padding == coder->temp_size) {
+                       // The whole buffer was padding. Seek backwards in
+                       // the file to get more input.
+                       coder->sequence = SEQ_PADDING_SEEK;
+                       break;
+               }
+
+               // Size of Stream Padding must be a multiple of 4 bytes.
+               if (coder->stream_padding & 3)
+                       return LZMA_DATA_ERROR;
+
+               coder->sequence = SEQ_FOOTER;
+
+               // Calculate the amount of non-padding data in coder->temp.
+               coder->temp_size -= new_padding;
+               coder->temp_pos = coder->temp_size;
+
+               // We can avoid an external seek if the whole Stream Footer
+               // is already in coder->temp. In that case SEQ_FOOTER won't
+               // read more input and will find the Stream Footer from
+               // coder->temp[coder->temp_size - LZMA_STREAM_HEADER_SIZE].
+               //
+               // Otherwise we will need to seek. The seeking is done so
+               // that Stream Footer will be at the end of coder->temp.
+               // This way it's likely that we also get a complete Index
+               // field into coder->temp without needing a separate seek
+               // for that (unless the Index field is big).
+               if (coder->temp_size < LZMA_STREAM_HEADER_SIZE)
+                       return_if_error(reverse_seek(
+                                       coder, in_start, in_pos, in_size));
+       }
+
+       // Fall through
+
+       case SEQ_FOOTER:
+               // Copy the Stream Footer field into coder->temp.
+               // If Stream Footer was already available in coder->temp
+               // in SEQ_PADDING_DECODE, then this does nothing.
+               if (fill_temp(coder, in, in_pos, in_size))
+                       return LZMA_OK;
+
+               // Make coder->file_target_pos and coder->temp_size point
+               // to the beginning of Stream Footer and thus to the end
+               // of the Index field. coder->temp_pos will be updated
+               // a bit later.
+               coder->file_target_pos -= LZMA_STREAM_HEADER_SIZE;
+               coder->temp_size -= LZMA_STREAM_HEADER_SIZE;
+
+               // Decode Stream Footer.
+               return_if_error(hide_format_error(lzma_stream_footer_decode(
+                               &coder->footer_flags,
+                               coder->temp + coder->temp_size)));
+
+               // Check that we won't seek past the beginning of the file.
+               //
+               // LZMA_STREAM_HEADER_SIZE is added because there must be
+               // space for Stream Header too even though we won't seek
+               // there before decoding the Index field.
+               //
+               // There's no risk of integer overflow here because
+               // Backward Size cannot be greater than 2^34.
+               if (coder->file_target_pos < coder->footer_flags.backward_size
+                               + LZMA_STREAM_HEADER_SIZE)
+                       return LZMA_DATA_ERROR;
+
+               // Set the target position to the beginning of the Index field.
+               coder->file_target_pos -= coder->footer_flags.backward_size;
+               coder->sequence = SEQ_INDEX_INIT;
+
+               // We can avoid an external seek if the whole Index field is
+               // already available in coder->temp.
+               if (coder->temp_size >= coder->footer_flags.backward_size) {
+                       // Set coder->temp_pos to point to the beginning
+                       // of the Index.
+                       coder->temp_pos = coder->temp_size
+                                       - coder->footer_flags.backward_size;
+               } else {
+                       // These are set to zero to indicate that there's no
+                       // useful data (Index or anything else) in coder->temp.
+                       coder->temp_pos = 0;
+                       coder->temp_size = 0;
+
+                       // Seek to the beginning of the Index field.
+                       if (seek_to_pos(coder, coder->file_target_pos,
+                                       in_start, in_pos, in_size))
+                               return LZMA_SEEK_NEEDED;
+               }
+
+       // Fall through
+
+       case SEQ_INDEX_INIT: {
+               // Calculate the amount of memory already used by the earlier
+               // Indexes so that we know how big memory limit to pass to
+               // the Index decoder.
+               //
+               // NOTE: When there are multiple Streams, the separate
+               // lzma_index structures can use more RAM (as measured by
+               // lzma_index_memused()) than the final combined lzma_index.
+               // Thus memlimit may need to be slightly higher than the final
+               // calculated memory usage will be. This is perhaps a bit
+               // confusing to the application, but I think it shouldn't
+               // cause problems in practice.
+               uint64_t memused = 0;
+               if (coder->combined_index != NULL) {
+                       memused = lzma_index_memused(coder->combined_index);
+                       assert(memused <= coder->memlimit);
+                       if (memused > coder->memlimit) // Extra sanity check
+                               return LZMA_PROG_ERROR;
+               }
+
+               // Initialize the Index decoder.
+               return_if_error(lzma_index_decoder_init(
+                               &coder->index_decoder, allocator,
+                               &coder->this_index,
+                               coder->memlimit - memused));
+
+               coder->index_remaining = coder->footer_flags.backward_size;
+               coder->sequence = SEQ_INDEX_DECODE;
+       }
+
+       // Fall through
+
+       case SEQ_INDEX_DECODE: {
+               // Decode (a part of) the Index. If the whole Index is already
+               // in coder->temp, read it from there. Otherwise read from
+               // in[*in_pos] onwards. Note that decode_index() updates
+               // coder->index_remaining and optionally coder->file_cur_pos.
+               lzma_ret ret;
+               if (coder->temp_size != 0) {
+                       assert(coder->temp_size - coder->temp_pos
+                                       == coder->index_remaining);
+                       ret = decode_index(coder, allocator, coder->temp,
+                                       &coder->temp_pos, coder->temp_size,
+                                       false);
+               } else {
+                       // Don't give the decoder more input than the known
+                       // remaining size of the Index field.
+                       size_t in_stop = in_size;
+                       if (in_size - *in_pos > coder->index_remaining)
+                               in_stop = *in_pos
+                                       + (size_t)(coder->index_remaining);
+
+                       ret = decode_index(coder, allocator,
+                                       in, in_pos, in_stop, true);
+               }
+
+               switch (ret) {
+               case LZMA_OK:
+                       // If the Index decoder asks for more input when we
+                       // have already given it as much input as Backward Size
+                       // indicated, the file is invalid.
+                       if (coder->index_remaining == 0)
+                               return LZMA_DATA_ERROR;
+
+                       // We cannot get here if we were reading Index from
+                       // coder->temp because when reading from coder->temp
+                       // we give the Index decoder exactly
+                       // coder->index_remaining bytes of input.
+                       assert(coder->temp_size == 0);
+
+                       return LZMA_OK;
+
+               case LZMA_STREAM_END:
+                       // If the decoding seems to be successful, check also
+                       // that the Index decoder consumed as much input as
+                       // indicated by the Backward Size field.
+                       if (coder->index_remaining != 0)
+                               return LZMA_DATA_ERROR;
+
+                       break;
+
+               default:
+                       return ret;
+               }
+
+               // Calculate how much the Index tells us to seek backwards
+               // (relative to the beginning of the Index): Total size of
+               // all Blocks plus the size of the Stream Header field.
+               // No integer overflow here because lzma_index_total_size()
+               // cannot return a value greater than LZMA_VLI_MAX.
+               const uint64_t seek_amount
+                               = lzma_index_total_size(coder->this_index)
+                                       + LZMA_STREAM_HEADER_SIZE;
+
+               // Check that Index is sane in sense that seek_amount won't
+               // make us seek past the beginning of the file when locating
+               // the Stream Header.
+               //
+               // coder->file_target_pos still points to the beginning of
+               // the Index field.
+               if (coder->file_target_pos < seek_amount)
+                       return LZMA_DATA_ERROR;
+
+               // Set the target to the beginning of Stream Header.
+               coder->file_target_pos -= seek_amount;
+
+               if (coder->file_target_pos == 0) {
+                       // We would seek to the beginning of the file, but
+                       // since we already decoded that Stream Header in
+                       // SEQ_MAGIC_BYTES, we can use the cached value from
+                       // coder->first_header_flags to avoid the seek.
+                       coder->header_flags = coder->first_header_flags;
+                       coder->sequence = SEQ_HEADER_COMPARE;
+                       break;
+               }
+
+               coder->sequence = SEQ_HEADER_DECODE;
+
+               // Make coder->file_target_pos point to the end of
+               // the Stream Header field.
+               coder->file_target_pos += LZMA_STREAM_HEADER_SIZE;
+
+               // If coder->temp_size is non-zero, it points to the end
+               // of the Index field. Then the beginning of the Index
+               // field is at coder->temp[coder->temp_size
+               // - coder->footer_flags.backward_size].
+               assert(coder->temp_size == 0 || coder->temp_size
+                               >= coder->footer_flags.backward_size);
+
+               // If coder->temp contained the whole Index, see if it has
+               // enough data to contain also the Stream Header. If so,
+               // we avoid an external seek.
+               //
+               // NOTE: This can happen only with small .xz files and only
+               // for the non-first Stream as the Stream Flags of the first
+               // Stream are cached and already handled a few lines above.
+               // So this isn't as useful as the other seek-avoidance cases.
+               if (coder->temp_size != 0 && coder->temp_size
+                               - coder->footer_flags.backward_size
+                               >= seek_amount) {
+                       // Make temp_pos and temp_size point to the *end* of
+                       // Stream Header so that SEQ_HEADER_DECODE will find
+                       // the start of Stream Header from coder->temp[
+                       // coder->temp_size - LZMA_STREAM_HEADER_SIZE].
+                       coder->temp_pos = coder->temp_size
+                                       - coder->footer_flags.backward_size
+                                       - seek_amount
+                                       + LZMA_STREAM_HEADER_SIZE;
+                       coder->temp_size = coder->temp_pos;
+               } else {
+                       // Seek so that Stream Header will be at the end of
+                       // coder->temp. With typical multi-Stream files we
+                       // will usually also get the Stream Footer and Index
+                       // of the *previous* Stream in coder->temp and thus
+                       // won't need a separate seek for them.
+                       return_if_error(reverse_seek(coder,
+                                       in_start, in_pos, in_size));
+               }
+       }
+
+       // Fall through
+
+       case SEQ_HEADER_DECODE:
+               // Copy the Stream Header field into coder->temp.
+               // If Stream Header was already available in coder->temp
+               // in SEQ_INDEX_DECODE, then this does nothing.
+               if (fill_temp(coder, in, in_pos, in_size))
+                       return LZMA_OK;
+
+               // Make all these point to the beginning of Stream Header.
+               coder->file_target_pos -= LZMA_STREAM_HEADER_SIZE;
+               coder->temp_size -= LZMA_STREAM_HEADER_SIZE;
+               coder->temp_pos = coder->temp_size;
+
+               // Decode the Stream Header.
+               return_if_error(hide_format_error(lzma_stream_header_decode(
+                               &coder->header_flags,
+                               coder->temp + coder->temp_size)));
+
+               coder->sequence = SEQ_HEADER_COMPARE;
+
+       // Fall through
+
+       case SEQ_HEADER_COMPARE:
+               // Compare Stream Header against Stream Footer. They must
+               // match.
+               return_if_error(lzma_stream_flags_compare(
+                               &coder->header_flags, &coder->footer_flags));
+
+               // Store the decoded Stream Flags into the Index. Use the
+               // Footer Flags because it contains Backward Size, although
+               // it shouldn't matter in practice.
+               if (lzma_index_stream_flags(coder->this_index,
+                               &coder->footer_flags) != LZMA_OK)
+                       return LZMA_PROG_ERROR;
+
+               // Store also the size of the Stream Padding field. It is
+               // needed to calculate the offsets of the Streams correctly.
+               if (lzma_index_stream_padding(coder->this_index,
+                               coder->stream_padding) != LZMA_OK)
+                       return LZMA_PROG_ERROR;
+
+               // Reset it so that it's ready for the next Stream.
+               coder->stream_padding = 0;
+
+               // Append the earlier decoded Indexes after this_index.
+               if (coder->combined_index != NULL)
+                       return_if_error(lzma_index_cat(coder->this_index,
+                                       coder->combined_index, allocator));
+
+               coder->combined_index = coder->this_index;
+               coder->this_index = NULL;
+
+               // If the whole file was decoded, tell the caller that we
+               // are finished.
+               if (coder->file_target_pos == 0) {
+                       // The combined index must indicate the same file
+                       // size as was told to us at initialization.
+                       assert(lzma_index_file_size(coder->combined_index)
+                                       == coder->file_size);
+
+                       // Make the combined index available to
+                       // the application.
+                       *coder->dest_index = coder->combined_index;
+                       coder->combined_index = NULL;
+
+                       // Mark the input buffer as used since we may have
+                       // done internal seeking and thus don't know how
+                       // many input bytes were actually used. This way
+                       // lzma_stream.total_in gets a slightly better
+                       // estimate of the amount of input used.
+                       *in_pos = in_size;
+                       return LZMA_STREAM_END;
+               }
+
+               // We didn't hit the beginning of the file yet, so continue
+               // reading backwards in the file. If we have unprocessed
+               // data in coder->temp, use it before requesting more data
+               // from the application.
+               //
+               // coder->file_target_pos, coder->temp_size, and
+               // coder->temp_pos all point to the beginning of Stream Header
+               // and thus the end of the previous Stream in the file.
+               coder->sequence = coder->temp_size > 0
+                               ? SEQ_PADDING_DECODE : SEQ_PADDING_SEEK;
+               break;
+
+       default:
+               assert(0);
+               return LZMA_PROG_ERROR;
+       }
+}
+
+
+// Report the memory usage and the memory usage limit of the file info
+// decoder and optionally set a new limit.
+//
+// *memusage is set to the memory used by coder->combined_index plus the
+// memory used by the newest Index (coder->this_index or, while the
+// sequence is SEQ_INDEX_DECODE, the value reported by the Index decoder).
+// If that sum is zero, lzma_index_memusage(1, 0) is reported instead.
+// *old_memlimit is set to the limit that was in effect at the time of
+// the call.
+//
+// If new_memlimit is non-zero, it becomes the new limit, unless it is
+// smaller than *memusage, in which case LZMA_MEMLIMIT_ERROR is returned
+// and the limit is left unchanged. If new_memlimit is zero, the values
+// are only queried.
+//
+// Returns LZMA_OK on success, LZMA_MEMLIMIT_ERROR as described above, or
+// LZMA_PROG_ERROR if the Index decoder's memconfig callback fails.
+static lzma_ret
+file_info_decoder_memconfig(void *coder_ptr, uint64_t *memusage,
+               uint64_t *old_memlimit, uint64_t new_memlimit)
+{
+       lzma_file_info_coder *coder = coder_ptr;
+
+       // The memory usage calculation comes from three things:
+       //
+       // (1) The Indexes that have already been decoded and processed into
+       //     coder->combined_index.
+       //
+       // (2) The latest Index in coder->this_index that has been decoded but
+       //     not yet put into coder->combined_index.
+       //
+       // (3) The latest Index that we have started decoding but haven't
+       //     finished and thus isn't available in coder->this_index yet.
+       //     Memory usage and limit information needs to be communicated
+       //     from/to coder->index_decoder.
+       //
+       // Care has to be taken to not do both (2) and (3) when calculating
+       // the memory usage.
+       uint64_t combined_index_memusage = 0;
+       uint64_t this_index_memusage = 0;
+
+       // (1) If we have already successfully decoded one or more Indexes,
+       // get their memory usage.
+       if (coder->combined_index != NULL)
+               combined_index_memusage = lzma_index_memused(
+                               coder->combined_index);
+
+       // Choose between (2), (3), or neither.
+       if (coder->this_index != NULL) {
+               // (2) The latest Index is available. Use its memory usage.
+               this_index_memusage = lzma_index_memused(coder->this_index);
+
+       } else if (coder->sequence == SEQ_INDEX_DECODE) {
+               // (3) The Index decoder is active and hasn't yet stored
+               // the new index in coder->this_index. Get the memory usage
+               // information from the Index decoder.
+               //
+               // NOTE: If the Index decoder doesn't yet know how much memory
+               // it will eventually need, it will return a tiny value here.
+               //
+               // The new-limit argument is 0, so only a query is intended
+               // (presumably 0 means "don't change the limit" for the Index
+               // decoder too, like it does in this function; TODO confirm).
+               uint64_t dummy;
+               if (coder->index_decoder.memconfig(coder->index_decoder.coder,
+                                       &this_index_memusage, &dummy, 0)
+                               != LZMA_OK) {
+                       assert(0);
+                       return LZMA_PROG_ERROR;
+               }
+       }
+
+       // Now we know the total memory usage/requirement. If we had neither
+       // old Indexes nor a new Index, this will be zero which isn't
+       // acceptable as lzma_memusage() has to return non-zero on success
+       // and even with an empty .xz file we will end up with a lzma_index
+       // that takes some memory.
+       *memusage = combined_index_memusage + this_index_memusage;
+       if (*memusage == 0)
+               *memusage = lzma_index_memusage(1, 0);
+
+       // The old limit is reported even if setting the new limit fails below.
+       *old_memlimit = coder->memlimit;
+
+       // If requested, set a new memory usage limit.
+       if (new_memlimit != 0) {
+               if (new_memlimit < *memusage)
+                       return LZMA_MEMLIMIT_ERROR;
+
+               // In the condition (3) we need to tell the Index decoder
+               // its new memory usage limit.
+               if (coder->this_index == NULL
+                               && coder->sequence == SEQ_INDEX_DECODE) {
+                       // The Index decoder gets what is left of the new
+                       // limit after subtracting the already combined
+                       // Indexes. The check above ensured that new_memlimit
+                       // is at least *memusage, which is never smaller than
+                       // combined_index_memusage, so this cannot wrap.
+                       const uint64_t idec_new_memlimit = new_memlimit
+                                       - combined_index_memusage;
+
+                       assert(this_index_memusage > 0);
+                       assert(idec_new_memlimit > 0);
+
+                       uint64_t dummy1;
+                       uint64_t dummy2;
+
+                       if (coder->index_decoder.memconfig(
+                                       coder->index_decoder.coder,
+                                       &dummy1, &dummy2, idec_new_memlimit)
+                                       != LZMA_OK) {
+                               assert(0);
+                               return LZMA_PROG_ERROR;
+                       }
+               }
+
+               coder->memlimit = new_memlimit;
+       }
+
+       return LZMA_OK;
+}
+
+
+// Free everything owned by the file info decoder: the Index decoder,
+// coder->this_index, coder->combined_index, and finally the coder
+// structure itself. coder_ptr is freed here, so it must not be used
+// after this function returns.
+//
+// An Index that has already been handed to the application via
+// *coder->dest_index isn't touched: coder->combined_index is set to NULL
+// when that hand-over happens in SEQ_HEADER_COMPARE.
+static void
+file_info_decoder_end(void *coder_ptr, const lzma_allocator *allocator)
+{
+       lzma_file_info_coder *coder = coder_ptr;
+
+       lzma_next_end(&coder->index_decoder, allocator);
+
+       // Either of these may be NULL at this point. The initialization
+       // function also calls lzma_index_end() on pointers it has just set
+       // to NULL, so a NULL argument is relied on to be acceptable.
+       lzma_index_end(coder->this_index, allocator);
+       lzma_index_end(coder->combined_index, allocator);
+
+       lzma_free(coder, allocator);
+       return;
+}
+
+
+// Initialize or reinitialize the file info decoder in *next.
+//
+// next         Coder slot to set up. If next->coder is NULL, a new coder
+//              structure is allocated and the callbacks are installed;
+//              otherwise the existing structure is reused.
+// allocator    Allocator used for the coder structure and for freeing
+//              Indexes left over from an earlier use.
+// seek_pos     Stored in coder->external_seek_pos. The public
+//              lzma_file_info_decoder() passes &strm->seek_pos here.
+// dest_index   Where the combined Index will be stored when decoding
+//              finishes. Must not be NULL.
+// memlimit     Memory usage limit. Zero is stored as one.
+// file_size    Size of the input file; stored in coder->file_size.
+//
+// Returns LZMA_PROG_ERROR if dest_index is NULL, LZMA_MEM_ERROR if the
+// coder structure cannot be allocated, and LZMA_OK otherwise.
+// NOTE(review): lzma_next_coder_init() is a macro defined elsewhere and
+// may add its own early return; verify against its definition.
+static lzma_ret
+lzma_file_info_decoder_init(lzma_next_coder *next,
+               const lzma_allocator *allocator, uint64_t *seek_pos,
+               lzma_index **dest_index,
+               uint64_t memlimit, uint64_t file_size)
+{
+       lzma_next_coder_init(&lzma_file_info_decoder_init, next, allocator);
+
+       if (dest_index == NULL)
+               return LZMA_PROG_ERROR;
+
+       lzma_file_info_coder *coder = next->coder;
+       if (coder == NULL) {
+               // First use: allocate the coder and install the callbacks.
+               coder = lzma_alloc(sizeof(lzma_file_info_coder), allocator);
+               if (coder == NULL)
+                       return LZMA_MEM_ERROR;
+
+               next->coder = coder;
+               next->code = &file_info_decode;
+               next->end = &file_info_decoder_end;
+               next->memconfig = &file_info_decoder_memconfig;
+
+               // These must hold valid values before the common
+               // (re)initialization code below frees the old Indexes.
+               coder->index_decoder = LZMA_NEXT_CODER_INIT;
+               coder->this_index = NULL;
+               coder->combined_index = NULL;
+       }
+
+       // Everything below is done on both first use and reuse.
+       coder->sequence = SEQ_MAGIC_BYTES;
+       coder->file_cur_pos = 0;
+       coder->file_target_pos = 0;
+       coder->file_size = file_size;
+
+       // Free Indexes possibly left over from an earlier, unfinished use.
+       lzma_index_end(coder->this_index, allocator);
+       coder->this_index = NULL;
+
+       lzma_index_end(coder->combined_index, allocator);
+       coder->combined_index = NULL;
+
+       coder->stream_padding = 0;
+
+       coder->dest_index = dest_index;
+       coder->external_seek_pos = seek_pos;
+
+       // If memlimit is 0, make it 1 to ensure that lzma_memlimit_get()
+       // won't return 0 (which would indicate an error).
+       coder->memlimit = my_max(1, memlimit);
+
+       // Prepare these for reading the first Stream Header into coder->temp.
+       coder->temp_pos = 0;
+       coder->temp_size = LZMA_STREAM_HEADER_SIZE;
+
+       return LZMA_OK;
+}
+
+
+// Public API: initialize strm as a .xz file information decoder. The
+// application-facing documentation is in lzma/index.h.
+//
+// The address of strm->seek_pos is passed to the internal initialization
+// function, which stores it in the coder as external_seek_pos.
+//
+// Returns LZMA_OK on success.
+// NOTE(review): lzma_next_strm_init() is a macro defined elsewhere;
+// presumably it returns early with an error code if initialization
+// fails. Verify against its definition.
+extern LZMA_API(lzma_ret)
+lzma_file_info_decoder(lzma_stream *strm, lzma_index **dest_index,
+               uint64_t memlimit, uint64_t file_size)
+{
+       lzma_next_strm_init(lzma_file_info_decoder_init, strm, &strm->seek_pos,
+                       dest_index, memlimit, file_size);
+
+       // We allow LZMA_FINISH in addition to LZMA_RUN for convenience.
+       // lzma_code() is able to handle the LZMA_FINISH + LZMA_SEEK_NEEDED
+       // combination in a sane way. Applications still need to be careful
+       // if they use LZMA_FINISH so that they remember to reset it back
+       // to LZMA_RUN after seeking if needed.
+       strm->internal->supported_actions[LZMA_RUN] = true;
+       strm->internal->supported_actions[LZMA_FINISH] = true;
+
+       return LZMA_OK;
+}
diff --git a/src/liblzma/liblzma.map b/src/liblzma/liblzma.map

index f53a4ea30a3cc741566d22b578501498730af2e5..930e5e860232fb64befa8e22dd97f1bcdee446e6 100644 (file)
--- a/src/liblzma/liblzma.map
+++ b/src/liblzma/liblzma.map
@@ -102,7 +102,12 @@ global:
         lzma_get_progress;
         lzma_stream_encoder_mt;
         lzma_stream_encoder_mt_memusage;
+} XZ_5.0;
+
+XZ_5.3.0alpha {
+global:
+       lzma_file_info_decoder;
  
  local:
         *;
-} XZ_5.0;
+} XZ_5.2;
author	Lasse Collin <lasse.collin@tukaani.org>
	Mon, 24 Apr 2017 16:35:50 +0000 (19:35 +0300)
committer	Lasse Collin <lasse.collin@tukaani.org>
	Mon, 24 Apr 2017 16:48:04 +0000 (19:48 +0300)
src/liblzma/api/lzma/index.h		patch \| blob \| blame \| history
src/liblzma/common/Makefile.inc		patch \| blob \| blame \| history
src/liblzma/common/file_info.c	[new file with mode: 0644]	patch \| blob
src/liblzma/liblzma.map		patch \| blob \| blame \| history