From: Lasse Collin <lasse.collin@tukaani.org>
Date: Fri, 23 May 2025 11:38:49 +0000 (+0300)
Subject: Docs: Add doc/examples/12_seekable_decompress.c
X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=refs%2Fheads%2Fseekdec;p=thirdparty%2Fxz.git

Docs: Add doc/examples/12_seekable_decompress.c
---

diff --git a/Makefile.am b/Makefile.am
index fc54f477..1fd374a7 100644
--- a/Makefile.am
+++ b/Makefile.am
@@ -35,6 +35,7 @@ dist_examples_DATA = \
 	doc/examples/03_compress_custom.c \
 	doc/examples/04_compress_easy_mt.c \
 	doc/examples/11_file_info.c \
+	doc/examples/12_seekable_decompress.c \
 	doc/examples/Makefile
 endif
 
diff --git a/doc/examples/12_seekable_decompress.c b/doc/examples/12_seekable_decompress.c
new file mode 100644
index 00000000..db74650c
--- /dev/null
+++ b/doc/examples/12_seekable_decompress.c
@@ -0,0 +1,464 @@
+// SPDX-License-Identifier: 0BSD
+
+///////////////////////////////////////////////////////////////////////////////
+//
+/// \file       12_seekable_decompress.c
+/// \brief      Decompress .xz file with limited random access
+///
+/// It's limited because acceptable performance requires that the offsets
+/// are close enough to the beginnings of Blocks or that Blocks are small.
+///
+/// Usage:      ./12_seekable_decompress INFILE.xz [OFFSET LENGTH]...
+///
+/// Example:    ./12_seekable_decompress foo.xz 1000 50 200 10
+//
+//  Author:     Lasse Collin
+//
+///////////////////////////////////////////////////////////////////////////////
+
+#include <stdbool.h>
+#include <inttypes.h>
+#include <stdlib.h>
+#include <stdio.h>
+#include <string.h>
+#include <errno.h>
+#include <lzma.h>
+
+
+#define INBUF_SIZE BUFSIZ
+#define OUTBUF_SIZE BUFSIZ
+
+
+// Get lzma_index for the input .xz file. This function is based
+// on 11_file_info.c.
+//
+// On success, return a pointer to lzma_index. The caller must free it
+// using lzma_index_end(idx, NULL) when the lzma_index is no longer needed.
+static lzma_index *
+read_file_info(FILE *infile, const char *filename)
+{
+	// Get the file size. In standard C it can be done by seeking to
+	// the end of the file and then getting the file position.
+	// In POSIX one can use fstat() and then st_size from struct stat.
+	// Also note that fseek() and ftell() use long and thus don't support
+	// large files on 32-bit systems (POSIX versions fseeko() and
+	// ftello() can support large files).
+	if (fseek(infile, 0, SEEK_END)) {
+		fprintf(stderr, "Error seeking the file '%s': %s\n",
+				filename, strerror(errno));
+		return NULL;
+	}
+
+	const long file_size = ftell(infile);
+
+	// The decoder wants to start from the beginning of the .xz file.
+	rewind(infile);
+
+	// Initialize the decoder.
+	lzma_stream strm = LZMA_STREAM_INIT;
+	lzma_index *idx;
+	lzma_ret ret = lzma_file_info_decoder(&strm, &idx, UINT64_MAX,
+			(uint64_t)file_size);
+	switch (ret) {
+	case LZMA_OK:
+		// Initialization succeeded.
+		break;
+
+	case LZMA_MEM_ERROR:
+		fprintf(stderr, "Out of memory when initializing "
+				"the .xz file info decoder\n");
+		return NULL;
+
+	case LZMA_PROG_ERROR:
+	default:
+		fprintf(stderr, "Unknown error, possibly a bug\n");
+		return NULL;
+	}
+
+	// This example program reuses the same lzma_stream structure,
+	// so we need to reset this when starting a new file.
+	strm.avail_in = 0;
+
+	// Buffer for input data.
+	uint8_t inbuf[INBUF_SIZE];
+
+	// Pass data to the decoder and seek when needed.
+	while (true) {
+		if (strm.avail_in == 0) {
+			strm.next_in = inbuf;
+			strm.avail_in = fread(inbuf, 1, sizeof(inbuf),
+					infile);
+
+			if (ferror(infile)) {
+				fprintf(stderr,
+					"Error reading from '%s': %s\n",
+					filename, strerror(errno));
+				goto error;
+			}
+
+			// We don't need to care about hitting the end of
+			// the file so no need to check for feof().
+		}
+
+		ret = lzma_code(&strm, LZMA_RUN);
+
+		switch (ret) {
+		case LZMA_OK:
+			break;
+
+		case LZMA_SEEK_NEEDED:
+			// The cast is safe because liblzma won't ask us to
+			// seek past the known size of the input file which
+			// did fit into a long.
+			//
+			// NOTE: Remember to change these to off_t if you
+			// switch fseeko() or lseek().
+			if (fseek(infile, (long)(strm.seek_pos), SEEK_SET)) {
+				fprintf(stderr, "Error seeking the "
+						"file '%s': %s\n",
+						filename, strerror(errno));
+				goto error;
+			}
+
+			// The old data in the inbuf is useless now. Set
+			// avail_in to zero so that we will read new input
+			// from the new file position on the next iteration
+			// of this loop.
+			strm.avail_in = 0;
+			break;
+
+		case LZMA_STREAM_END:
+			// File information was successfully decoded.
+			lzma_end(&strm);
+			return idx;
+
+		case LZMA_FORMAT_ERROR:
+			// .xz magic bytes weren't found.
+			fprintf(stderr, "The file '%s' is not "
+					"in the .xz format\n", filename);
+			goto error;
+
+		case LZMA_OPTIONS_ERROR:
+			fprintf(stderr, "The file '%s' has .xz headers that "
+					"are not supported by this liblzma "
+					"version\n", filename);
+			goto error;
+
+		case LZMA_DATA_ERROR:
+			fprintf(stderr, "The file '%s' is corrupt\n",
+					filename);
+			goto error;
+
+		case LZMA_MEM_ERROR:
+			fprintf(stderr, "Memory allocation failed when "
+					"decoding the file '%s'\n", filename);
+			goto error;
+
+		// LZMA_MEMLIMIT_ERROR shouldn't happen because we used
+		// UINT64_MAX as the limit.
+		//
+		// LZMA_BUF_ERROR shouldn't happen because we always provide
+		// new input when the input buffer is empty. The decoder
+		// knows the input file size and thus won't try to read past
+		// the end of the file.
+		case LZMA_MEMLIMIT_ERROR:
+		case LZMA_BUF_ERROR:
+		case LZMA_PROG_ERROR:
+		default:
+			fprintf(stderr, "Unknown error, possibly a bug\n");
+			goto error;
+		}
+	}
+
+	// This line is never reached.
+
+error:
+	lzma_end(&strm);
+	return NULL;
+}
+
+
+static bool
+decode_from_offset(lzma_stream *strm, uint8_t inbuf[static INBUF_SIZE],
+		FILE *infile, const char *filename,
+		uint64_t offset, uint64_t len)
+{
+	// Buffer for output data. The input buffer we get as an argument
+	// to preserve read-but-unused input between calls to this function.
+	uint8_t outbuf[OUTBUF_SIZE];
+
+	// strm->next_out and strm->avail_out are set in the loop below.
+	// It's simpler to not duplicate it here.
+	strm->avail_out = 0;
+
+	// strm->next_in and ->avail_out aren't touched here. On the first
+	// call to this function, avail_in == 0, and on later calls there
+	// may be unused bytes waiting to be decoded. The unused bytes are
+	// discarded if we need to seek the input; otherwise the decoder
+	// can use them to continue decoding where it stopped earlier.
+
+	// Seek to the requested uncompressed offset. (To seek to a specific
+	// Block by its number in the file, use LZMA_SEEK_TO_BLOCK instead
+	// and set the Block number in strm->seek_pos.)
+	lzma_action action = LZMA_SEEK_TO_OFFSET;
+	strm->seek_pos = offset;
+
+	while (len > 0) {
+		// Read more input if all input has been consumed.
+		// However, don't do this on the first iteration of
+		// this loop (action != LZMA_RUN) because we might
+		// need to seek the input.
+		if (strm->avail_in == 0 && action == LZMA_RUN) {
+			strm->next_in = inbuf;
+			strm->avail_in = fread(inbuf, 1, INBUF_SIZE, infile);
+
+			if (ferror(infile)) {
+				fprintf(stderr,
+					"Error reading from '%s': %s\n",
+					filename, strerror(errno));
+				return false;
+			}
+		}
+
+		// Provide more output space. Note that on the first
+		// iteration of the loop strm->avail_out equals 0.
+		if (strm->avail_out == 0) {
+			strm->next_out = outbuf;
+
+			// Don't provide more space than needed to decode
+			// the requested amount.
+			strm->avail_out = len < OUTBUF_SIZE
+					? len : OUTBUF_SIZE;
+		}
+
+		// On the first iteration of the loop, the 'action' argument
+		// will tell lzma_code() to seek.
+		const lzma_ret ret = lzma_code(strm, action);
+
+		// The seek request has been passed to lzma_code().
+		// If more data is needed, the later lzma_code() calls
+		// will decode more data normally.
+		action = LZMA_RUN;
+
+		// Write the decoded data out if the output buffer became
+		// full or if the end of the file was reached. Due to how
+		// we have set strm->avail_out, LZMA_STREAM_END is checked
+		// here only write the last bytes out in case len was so
+		// large that we would need to read past the end of the file.
+		if (strm->avail_out == 0 || ret == LZMA_STREAM_END) {
+			// Use pointer arithmetic instead of calculating
+			// OUTBUF_SIZE - avail_out because when
+			// len < OUTBUF_SIZE, we set avail_out = len.
+			size_t write_size = (size_t)(strm->next_out - outbuf);
+			len -= write_size;
+
+			if (fwrite(outbuf, 1, write_size, stdout)
+					!= write_size) {
+				fprintf(stderr, "Write error: %s\n",
+				strerror(errno));
+				return false;
+			}
+		}
+
+		switch (ret) {
+		case LZMA_OK:
+			break;
+
+		case LZMA_SEEK_NEEDED:
+			// Even if we don't use LZMA_SEEK_TO_OFFSET or
+			// LZMA_SEEK_TO_BLOCK, it's possible that
+			// LZMA_SEEK_NEEDED is still returned under
+			// some conditions.
+			//
+			// The cast is safe because liblzma won't ask us to
+			// seek past the known size of the input file which
+			// did fit into a long.
+			//
+			// NOTE: Remember to change these to off_t if you
+			// switch fseeko() or lseek().
+			if (fseek(infile, (long)(strm->seek_pos), SEEK_SET)) {
+				fprintf(stderr, "Error seeking the "
+						"file '%s': %s\n",
+						filename, strerror(errno));
+				return NULL;
+			}
+
+			// The old data in the inbuf is useless now. Set
+			// avail_in to zero so that we will read new input
+			// from the new file position on the next iteration
+			// of this loop.
+			strm->avail_in = 0;
+			break;
+
+		case LZMA_SEEK_ERROR:
+			// The requested position is greater than or equal
+			// to the uncompressed size of the file.
+			//
+			// This error is recoverable: the input and output
+			// positions of the lzma_stream weren't modified,
+			// and one can either continue decoding from the
+			// old position or try to seek again.
+			fprintf(stderr, "The specified uncompressed offset "
+					"%" PRIu64 " is at or past the end of "
+					"the file '%s'\n", offset, filename);
+			return false;
+
+		case LZMA_STREAM_END:
+			// End of file was reached. If we didn't produce as
+			// much data as requested, consider it an error.
+			if (len != 0) {
+				fprintf(stderr,
+					"Cannot read %" PRIu64 " byte(s) "
+					"past the end of the file '%s'\n",
+					len, filename);
+				return false;
+			}
+
+			return true;
+
+		case LZMA_OPTIONS_ERROR:
+			fprintf(stderr, "The file '%s' has .xz headers that "
+					"are not supported by this liblzma "
+					"version\n", filename);
+			return false;
+
+		case LZMA_DATA_ERROR:
+			fprintf(stderr, "The file '%s' is corrupt\n",
+					filename);
+			return false;
+
+		case LZMA_MEM_ERROR:
+			fprintf(stderr, "Memory allocation failed when "
+					"decoding the file '%s'\n", filename);
+			return false;
+
+		// LZMA_MEMLIMIT_ERROR shouldn't happen because we used
+		// UINT64_MAX as the limit.
+		//
+		// LZMA_BUF_ERROR shouldn't happen because we always provide
+		// new input when the input buffer is empty. The decoder
+		// knows the input file size and thus won't try to read past
+		// the end of the file.
+		case LZMA_MEMLIMIT_ERROR:
+		case LZMA_BUF_ERROR:
+		case LZMA_PROG_ERROR:
+		default:
+			fprintf(stderr, "Unknown error, possibly a bug\n");
+			return false;
+		}
+	}
+
+	// Decoding was successful.
+	return true;
+}
+
+
+static bool
+str_to_uint64(uint64_t *result, const char *str)
+{
+	*result = 0;
+
+	char *endptr;
+	errno = 0;
+	*result = strtoull(str, &endptr, 10);
+
+	if (*str == '\0' || *endptr != '\0') {
+		fprintf(stderr, "Not a decimal integer: %s\n", str);
+		return false;
+	}
+
+	if (errno == ERANGE) {
+		fprintf(stderr, "Integer is too large: %s\n", str);
+		return false;
+	}
+
+	return true;
+}
+
+
+extern int
+main(int argc, char **argv)
+{
+	// We need one filename and an even number of integer arguments.
+	if (argc < 2 || (argc % 2) != 0) {
+		fprintf(stderr, "Usage: %s FILE.xz [OFFSET LENGTH]...\n",
+				argv[0]);
+		return EXIT_FAILURE;
+	}
+
+	FILE *infile = fopen(argv[1], "rb");
+	if (infile == NULL) {
+		fprintf(stderr, "Cannot open the file '%s': %s\n",
+				argv[1], strerror(errno));
+		return EXIT_FAILURE;
+	}
+
+	lzma_index *index = read_file_info(infile, argv[1]);
+	if (index == NULL) {
+		// Error message was already printed.
+		fclose(infile);
+		return EXIT_FAILURE;
+	}
+
+	lzma_stream strm = LZMA_STREAM_INIT;
+	lzma_ret ret = lzma_seekable_decoder(&strm, UINT64_MAX, 0, index);
+	if (ret != LZMA_OK) {
+		fprintf(stderr, "Error initializing seekable decoder\n");
+		lzma_index_end(index, NULL);
+		fclose(infile);
+		return EXIT_FAILURE;
+	}
+
+	// decode_from_offset() reads from infile in INBUF_SIZE chunks.
+	// Often the last chunk won't be consumed completely by the decoder.
+	// On the next call to decode_from_offset() the remaining data may
+	// be needed though. Thus, keep the same inbuf available between
+	// the calls to decode_from_offset().
+	uint8_t inbuf[INBUF_SIZE];
+
+	// Initially there is no already-read input. The initialization with
+	// LZMA_STREAM_INIT already set these and so in this example code
+	// these are redundant. If one was reusing the same lzma_stream,
+	// then strm.avail_in = 0 would be required here so that the newly-
+	// initialized decoder won't see stale data.
+	strm.next_in = NULL;
+	strm.avail_in = 0;
+
+	// If errors occur, this will be set to false.
+	bool success = true;
+
+	for (int i = 2; i < argc; i += 2) {
+		uint64_t offset;
+		uint64_t len;
+
+		if (!str_to_uint64(&offset, argv[i])) {
+			success = false;
+			break;
+		}
+
+		if (!str_to_uint64(&len, argv[i + 1])) {
+			success = false;
+			break;
+		}
+
+		if (!decode_from_offset(
+				&strm, inbuf, infile, argv[1], offset, len)) {
+			success = false;
+			break;
+		}
+	}
+
+	// Free memory and close the input file.
+	lzma_end(&strm);
+	lzma_index_end(index, NULL);
+	fclose(infile);
+
+	// Close stdout to catch possible write errors that can occur
+	// when pending data is flushed from the stdio buffers.
+	if (success && fclose(stdout)) {
+		fprintf(stderr, "Write error: %s\n", strerror(errno));
+		return EXIT_FAILURE;
+	}
+
+	return success ? EXIT_SUCCESS : EXIT_FAILURE;
+}
diff --git a/doc/examples/Makefile b/doc/examples/Makefile
index f5b98788..3b4fe686 100644
--- a/doc/examples/Makefile
+++ b/doc/examples/Makefile
@@ -10,7 +10,8 @@ PROGS = \
 	02_decompress \
 	03_compress_custom \
 	04_compress_easy_mt \
-	11_file_info
+	11_file_info \
+	12_seekable_decompress
 
 all: $(PROGS)