Docs: Add doc/examples/12_seekable_decompress.c

author Lasse Collin <lasse.collin@tukaani.org>

Fri, 23 May 2025 11:38:49 +0000 (14:38 +0300)

committer Lasse Collin <lasse.collin@tukaani.org>

Fri, 23 May 2025 12:23:08 +0000 (15:23 +0300)
author Lasse Collin <lasse.collin@tukaani.org>
Fri, 23 May 2025 11:38:49 +0000 (14:38 +0300)
committer Lasse Collin <lasse.collin@tukaani.org>
Fri, 23 May 2025 12:23:08 +0000 (15:23 +0300)
diff --git a/Makefile.am b/Makefile.am

index fc54f477e1f83c40c2dd5c4d1b59fbc94d8c840f..1fd374a7f3d5b39ab4cd09f3dfca6aba9e5473b3 100644 (file)
--- a/Makefile.am
+++ b/Makefile.am
@@ -35,6 +35,7 @@ dist_examples_DATA = \
         doc/examples/03_compress_custom.c \
         doc/examples/04_compress_easy_mt.c \
         doc/examples/11_file_info.c \
+       doc/examples/12_seekable_decompress.c \
         doc/examples/Makefile
  endif
  
diff --git a/doc/examples/12_seekable_decompress.c b/doc/examples/12_seekable_decompress.c

new file mode 100644 (file)

index 0000000..db74650
--- /dev/null
+++ b/doc/examples/12_seekable_decompress.c
@@ -0,0 +1,464 @@
+// SPDX-License-Identifier: 0BSD
+
+///////////////////////////////////////////////////////////////////////////////
+//
+/// \file       12_seekable_decompress.c
+/// \brief      Decompress .xz file with limited random access
+///
+/// It's limited because acceptable performance requires that the offsets
+/// are close enough to the beginnings of Blocks or that Blocks are small.
+///
+/// Usage:      ./12_seekable_decompress INFILE.xz [OFFSET LENGTH]...
+///
+/// Example:    ./12_seekable_decompress foo.xz 1000 50 200 10
+//
+//  Author:     Lasse Collin
+//
+///////////////////////////////////////////////////////////////////////////////
+
+#include <stdbool.h>
+#include <inttypes.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <errno.h>
+#include <lzma.h>
+
+
+#define INBUF_SIZE BUFSIZ
+#define OUTBUF_SIZE BUFSIZ
+
+
+// Get lzma_index for the input .xz file. This function is based
+// on 11_file_info.c.
+//
+// On success, return a pointer to lzma_index. The caller must free it
+// using lzma_index_end(idx, NULL) when the lzma_index is no longer needed.
+static lzma_index *
+read_file_info(FILE *infile, const char *filename)
+{
+       // Get the file size. In standard C it can be done by seeking to
+       // the end of the file and then getting the file position.
+       // In POSIX one can use fstat() and then st_size from struct stat.
+       // Also note that fseek() and ftell() use long and thus don't support
+       // large files on 32-bit systems (POSIX versions fseeko() and
+       // ftello() can support large files).
+       if (fseek(infile, 0, SEEK_END)) {
+               fprintf(stderr, "Error seeking the file '%s': %s\n",
+                               filename, strerror(errno));
+               return NULL;
+       }
+
+       const long file_size = ftell(infile);
+
+       // The decoder wants to start from the beginning of the .xz file.
+       rewind(infile);
+
+       // Initialize the decoder.
+       lzma_stream strm = LZMA_STREAM_INIT;
+       lzma_index *idx;
+       lzma_ret ret = lzma_file_info_decoder(&strm, &idx, UINT64_MAX,
+                       (uint64_t)file_size);
+       switch (ret) {
+       case LZMA_OK:
+               // Initialization succeeded.
+               break;
+
+       case LZMA_MEM_ERROR:
+               fprintf(stderr, "Out of memory when initializing "
+                               "the .xz file info decoder\n");
+               return NULL;
+
+       case LZMA_PROG_ERROR:
+       default:
+               fprintf(stderr, "Unknown error, possibly a bug\n");
+               return NULL;
+       }
+
+       // This example program reuses the same lzma_stream structure,
+       // so we need to reset this when starting a new file.
+       strm.avail_in = 0;
+
+       // Buffer for input data.
+       uint8_t inbuf[INBUF_SIZE];
+
+       // Pass data to the decoder and seek when needed.
+       while (true) {
+               if (strm.avail_in == 0) {
+                       strm.next_in = inbuf;
+                       strm.avail_in = fread(inbuf, 1, sizeof(inbuf),
+                                       infile);
+
+                       if (ferror(infile)) {
+                               fprintf(stderr,
+                                       "Error reading from '%s': %s\n",
+                                       filename, strerror(errno));
+                               goto error;
+                       }
+
+                       // We don't need to care about hitting the end of
+                       // the file so no need to check for feof().
+               }
+
+               ret = lzma_code(&strm, LZMA_RUN);
+
+               switch (ret) {
+               case LZMA_OK:
+                       break;
+
+               case LZMA_SEEK_NEEDED:
+                       // The cast is safe because liblzma won't ask us to
+                       // seek past the known size of the input file which
+                       // did fit into a long.
+                       //
+                       // NOTE: Remember to change these to off_t if you
+                       // switch fseeko() or lseek().
+                       if (fseek(infile, (long)(strm.seek_pos), SEEK_SET)) {
+                               fprintf(stderr, "Error seeking the "
+                                               "file '%s': %s\n",
+                                               filename, strerror(errno));
+                               goto error;
+                       }
+
+                       // The old data in the inbuf is useless now. Set
+                       // avail_in to zero so that we will read new input
+                       // from the new file position on the next iteration
+                       // of this loop.
+                       strm.avail_in = 0;
+                       break;
+
+               case LZMA_STREAM_END:
+                       // File information was successfully decoded.
+                       lzma_end(&strm);
+                       return idx;
+
+               case LZMA_FORMAT_ERROR:
+                       // .xz magic bytes weren't found.
+                       fprintf(stderr, "The file '%s' is not "
+                                       "in the .xz format\n", filename);
+                       goto error;
+
+               case LZMA_OPTIONS_ERROR:
+                       fprintf(stderr, "The file '%s' has .xz headers that "
+                                       "are not supported by this liblzma "
+                                       "version\n", filename);
+                       goto error;
+
+               case LZMA_DATA_ERROR:
+                       fprintf(stderr, "The file '%s' is corrupt\n",
+                                       filename);
+                       goto error;
+
+               case LZMA_MEM_ERROR:
+                       fprintf(stderr, "Memory allocation failed when "
+                                       "decoding the file '%s'\n", filename);
+                       goto error;
+
+               // LZMA_MEMLIMIT_ERROR shouldn't happen because we used
+               // UINT64_MAX as the limit.
+               //
+               // LZMA_BUF_ERROR shouldn't happen because we always provide
+               // new input when the input buffer is empty. The decoder
+               // knows the input file size and thus won't try to read past
+               // the end of the file.
+               case LZMA_MEMLIMIT_ERROR:
+               case LZMA_BUF_ERROR:
+               case LZMA_PROG_ERROR:
+               default:
+                       fprintf(stderr, "Unknown error, possibly a bug\n");
+                       goto error;
+               }
+       }
+
+       // This line is never reached.
+
+error:
+       lzma_end(&strm);
+       return NULL;
+}
+
+
+static bool
+decode_from_offset(lzma_stream *strm, uint8_t inbuf[static INBUF_SIZE],
+               FILE *infile, const char *filename,
+               uint64_t offset, uint64_t len)
+{
+       // Buffer for output data. The input buffer we get as an argument
+       // to preserve read-but-unused input between calls to this function.
+       uint8_t outbuf[OUTBUF_SIZE];
+
+       // strm->next_out and strm->avail_out are set in the loop below.
+       // It's simpler to not duplicate it here.
+       strm->avail_out = 0;
+
+       // strm->next_in and ->avail_out aren't touched here. On the first
+       // call to this function, avail_in == 0, and on later calls there
+       // may be unused bytes waiting to be decoded. The unused bytes are
+       // discarded if we need to seek the input; otherwise the decoder
+       // can use them to continue decoding where it stopped earlier.
+
+       // Seek to the requested uncompressed offset. (To seek to a specific
+       // Block by its number in the file, use LZMA_SEEK_TO_BLOCK instead
+       // and set the Block number in strm->seek_pos.)
+       lzma_action action = LZMA_SEEK_TO_OFFSET;
+       strm->seek_pos = offset;
+
+       while (len > 0) {
+               // Read more input if all input has been consumed.
+               // However, don't do this on the first iteration of
+               // this loop (action != LZMA_RUN) because we might
+               // need to seek the input.
+               if (strm->avail_in == 0 && action == LZMA_RUN) {
+                       strm->next_in = inbuf;
+                       strm->avail_in = fread(inbuf, 1, INBUF_SIZE, infile);
+
+                       if (ferror(infile)) {
+                               fprintf(stderr,
+                                       "Error reading from '%s': %s\n",
+                                       filename, strerror(errno));
+                               return false;
+                       }
+               }
+
+               // Provide more output space. Note that on the first
+               // iteration of the loop strm->avail_out equals 0.
+               if (strm->avail_out == 0) {
+                       strm->next_out = outbuf;
+
+                       // Don't provide more space than needed to decode
+                       // the requested amount.
+                       strm->avail_out = len < OUTBUF_SIZE
+                                       ? len : OUTBUF_SIZE;
+               }
+
+               // On the first iteration of the loop, the 'action' argument
+               // will tell lzma_code() to seek.
+               const lzma_ret ret = lzma_code(strm, action);
+
+               // The seek request has been passed to lzma_code().
+               // If more data is needed, the later lzma_code() calls
+               // will decode more data normally.
+               action = LZMA_RUN;
+
+               // Write the decoded data out if the output buffer became
+               // full or if the end of the file was reached. Due to how
+               // we have set strm->avail_out, LZMA_STREAM_END is checked
+               // here only write the last bytes out in case len was so
+               // large that we would need to read past the end of the file.
+               if (strm->avail_out == 0 || ret == LZMA_STREAM_END) {
+                       // Use pointer arithmetic instead of calculating
+                       // OUTBUF_SIZE - avail_out because when
+                       // len < OUTBUF_SIZE, we set avail_out = len.
+                       size_t write_size = (size_t)(strm->next_out - outbuf);
+                       len -= write_size;
+
+                       if (fwrite(outbuf, 1, write_size, stdout)
+                                       != write_size) {
+                               fprintf(stderr, "Write error: %s\n",
+                               strerror(errno));
+                               return false;
+                       }
+               }
+
+               switch (ret) {
+               case LZMA_OK:
+                       break;
+
+               case LZMA_SEEK_NEEDED:
+                       // Even if we don't use LZMA_SEEK_TO_OFFSET or
+                       // LZMA_SEEK_TO_BLOCK, it's possible that
+                       // LZMA_SEEK_NEEDED is still returned under
+                       // some conditions.
+                       //
+                       // The cast is safe because liblzma won't ask us to
+                       // seek past the known size of the input file which
+                       // did fit into a long.
+                       //
+                       // NOTE: Remember to change these to off_t if you
+                       // switch fseeko() or lseek().
+                       if (fseek(infile, (long)(strm->seek_pos), SEEK_SET)) {
+                               fprintf(stderr, "Error seeking the "
+                                               "file '%s': %s\n",
+                                               filename, strerror(errno));
+                               return NULL;
+                       }
+
+                       // The old data in the inbuf is useless now. Set
+                       // avail_in to zero so that we will read new input
+                       // from the new file position on the next iteration
+                       // of this loop.
+                       strm->avail_in = 0;
+                       break;
+
+               case LZMA_SEEK_ERROR:
+                       // The requested position is greater than or equal
+                       // to the uncompressed size of the file.
+                       //
+                       // This error is recoverable: the input and output
+                       // positions of the lzma_stream weren't modified,
+                       // and one can either continue decoding from the
+                       // old position or try to seek again.
+                       fprintf(stderr, "The specified uncompressed offset "
+                                       "%" PRIu64 " is at or past the end of "
+                                       "the file '%s'\n", offset, filename);
+                       return false;
+
+               case LZMA_STREAM_END:
+                       // End of file was reached. If we didn't produce as
+                       // much data as requested, consider it an error.
+                       if (len != 0) {
+                               fprintf(stderr,
+                                       "Cannot read %" PRIu64 " byte(s) "
+                                       "past the end of the file '%s'\n",
+                                       len, filename);
+                               return false;
+                       }
+
+                       return true;
+
+               case LZMA_OPTIONS_ERROR:
+                       fprintf(stderr, "The file '%s' has .xz headers that "
+                                       "are not supported by this liblzma "
+                                       "version\n", filename);
+                       return false;
+
+               case LZMA_DATA_ERROR:
+                       fprintf(stderr, "The file '%s' is corrupt\n",
+                                       filename);
+                       return false;
+
+               case LZMA_MEM_ERROR:
+                       fprintf(stderr, "Memory allocation failed when "
+                                       "decoding the file '%s'\n", filename);
+                       return false;
+
+               // LZMA_MEMLIMIT_ERROR shouldn't happen because we used
+               // UINT64_MAX as the limit.
+               //
+               // LZMA_BUF_ERROR shouldn't happen because we always provide
+               // new input when the input buffer is empty. The decoder
+               // knows the input file size and thus won't try to read past
+               // the end of the file.
+               case LZMA_MEMLIMIT_ERROR:
+               case LZMA_BUF_ERROR:
+               case LZMA_PROG_ERROR:
+               default:
+                       fprintf(stderr, "Unknown error, possibly a bug\n");
+                       return false;
+               }
+       }
+
+       // Decoding was successful.
+       return true;
+}
+
+
+static bool
+str_to_uint64(uint64_t *result, const char *str)
+{
+       *result = 0;
+
+       char *endptr;
+       errno = 0;
+       *result = strtoull(str, &endptr, 10);
+
+       if (*str == '\0' || *endptr != '\0') {
+               fprintf(stderr, "Not a decimal integer: %s\n", str);
+               return false;
+       }
+
+       if (errno == ERANGE) {
+               fprintf(stderr, "Integer is too large: %s\n", str);
+               return false;
+       }
+
+       return true;
+}
+
+
+extern int
+main(int argc, char **argv)
+{
+       // We need one filename and an even number of integer arguments.
+       if (argc < 2 || (argc % 2) != 0) {
+               fprintf(stderr, "Usage: %s FILE.xz [OFFSET LENGTH]...\n",
+                               argv[0]);
+               return EXIT_FAILURE;
+       }
+
+       FILE *infile = fopen(argv[1], "rb");
+       if (infile == NULL) {
+               fprintf(stderr, "Cannot open the file '%s': %s\n",
+                               argv[1], strerror(errno));
+               return EXIT_FAILURE;
+       }
+
+       lzma_index *index = read_file_info(infile, argv[1]);
+       if (index == NULL) {
+               // Error message was already printed.
+               fclose(infile);
+               return EXIT_FAILURE;
+       }
+
+       lzma_stream strm = LZMA_STREAM_INIT;
+       lzma_ret ret = lzma_seekable_decoder(&strm, UINT64_MAX, 0, index);
+       if (ret != LZMA_OK) {
+               fprintf(stderr, "Error initializing seekable decoder\n");
+               lzma_index_end(index, NULL);
+               fclose(infile);
+               return EXIT_FAILURE;
+       }
+
+       // decode_from_offset() reads from infile in INBUF_SIZE chunks.
+       // Often the last chunk won't be consumed completely by the decoder.
+       // On the next call to decode_from_offset() the remaining data may
+       // be needed though. Thus, keep the same inbuf available between
+       // the calls to decode_from_offset().
+       uint8_t inbuf[INBUF_SIZE];
+
+       // Initially there is no already-read input. The initialization with
+       // LZMA_STREAM_INIT already set these and so in this example code
+       // these are redundant. If one was reusing the same lzma_stream,
+       // then strm.avail_in = 0 would be required here so that the newly-
+       // initialized decoder won't see stale data.
+       strm.next_in = NULL;
+       strm.avail_in = 0;
+
+       // If errors occur, this will be set to false.
+       bool success = true;
+
+       for (int i = 2; i < argc; i += 2) {
+               uint64_t offset;
+               uint64_t len;
+
+               if (!str_to_uint64(&offset, argv[i])) {
+                       success = false;
+                       break;
+               }
+
+               if (!str_to_uint64(&len, argv[i + 1])) {
+                       success = false;
+                       break;
+               }
+
+               if (!decode_from_offset(
+                               &strm, inbuf, infile, argv[1], offset, len)) {
+                       success = false;
+                       break;
+               }
+       }
+
+       // Free memory and close the input file.
+       lzma_end(&strm);
+       lzma_index_end(index, NULL);
+       fclose(infile);
+
+       // Close stdout to catch possible write errors that can occur
+       // when pending data is flushed from the stdio buffers.
+       if (success && fclose(stdout)) {
+               fprintf(stderr, "Write error: %s\n", strerror(errno));
+               return EXIT_FAILURE;
+       }
+
+       return success ? EXIT_SUCCESS : EXIT_FAILURE;
+}
diff --git a/doc/examples/Makefile b/doc/examples/Makefile

index f5b98788ece821f1727ec3644c30f7115bd9148f..3b4fe6862ecbe12485b9417d16cae1240439db1d 100644 (file)
--- a/doc/examples/Makefile
+++ b/doc/examples/Makefile
@@ -10,7 +10,8 @@ PROGS = \
         02_decompress \
         03_compress_custom \
         04_compress_easy_mt \
-       11_file_info
+       11_file_info \
+       12_seekable_decompress
  
  all: $(PROGS)
author	Lasse Collin <lasse.collin@tukaani.org>
	Fri, 23 May 2025 11:38:49 +0000 (14:38 +0300)
committer	Lasse Collin <lasse.collin@tukaani.org>
	Fri, 23 May 2025 12:23:08 +0000 (15:23 +0300)
Makefile.am		patch \| blob \| blame \| history
doc/examples/12_seekable_decompress.c	[new file with mode: 0644]	patch \| blob
doc/examples/Makefile		patch \| blob \| blame \| history