From: Tim Kientzle Date: Sun, 30 Mar 2025 16:26:25 +0000 (-0700) Subject: Issue 2548: Reading GNU sparse entries (#2558) X-Git-Tag: v3.8.0~56 X-Git-Url: http://git.ipfire.org/gitweb/gitweb.cgi?a=commitdiff_plain;h=29fd918e1886abacca88864ad3676fa237ff21e2;p=thirdparty%2Flibarchive.git Issue 2548: Reading GNU sparse entries (#2558) My attempt to fix #2404 just made the confusion between the size of the extracted file and the size of the contents in the tar archive worse than it was before. @ferivoz in #2557 showed that the confusion stemmed from a point where we were setting the size in the entry (which is by definition the size of the file on disk) when we read the `GNU.sparse.size` and `GNU.sparse.realsize` attributes (which might represent the size on disk or in the archive) and then using that to determine whether to read the value in the ustar header (which represents the size of the data in the archive). The confusion stems from three issues: * The GNU.sparse.* fields mean different things depending on the version of GNU tar used. * The regular Pax `size` field overrides the value in the ustar header, but the GNU sparse size fields don't always do so. * The previous libarchive code tried to reconcile different size information as we went along, which is problematic because the order in which this information appears can vary. This PR makes one big structural change: We now have separate storage for every different size field we might encounter. We now just store these values and record which ones we saw. Then at the end, when we have all the information available at once, we can use this data to determine the size on disk and the size in the archive. A few key facts about GNU sparse formats: * GNU legacy sparse format: Stored all the relevant info in an extension of the ustar header. 
* GNU pax 0.0 format: Used `GNU.sparse.size` to store the size on disk * GNU pax 0.1 format: Used `GNU.sparse.size` to store the size on disk * GNU pax 1.0 format: Used `GNU.sparse.realsize` to store the size on disk; repurposed `GNU.sparse.size` to store the size in the archive, but omitted this in favor of the ustar size field when that could be used. And of course, some key precedence information: * Pax `size` field always overrides the ustar header size field. * GNU sparse size fields override it ONLY when they represent the size of the data in the archive. Resolves #2548 --- diff --git a/Makefile.am b/Makefile.am index 7860c5c50..efc491800 100644 --- a/Makefile.am +++ b/Makefile.am @@ -492,6 +492,7 @@ libarchive_test_SOURCES= \ libarchive/test/test_read_format_gtar_gz.c \ libarchive/test/test_read_format_gtar_lzma.c \ libarchive/test/test_read_format_gtar_sparse.c \ + libarchive/test/test_read_format_gtar_sparse_length.c \ libarchive/test/test_read_format_gtar_sparse_skip_entry.c \ libarchive/test/test_read_format_huge_rpm.c \ libarchive/test/test_read_format_iso_Z.c \ @@ -847,6 +848,7 @@ libarchive_test_EXTRA_DIST=\ libarchive/test/test_read_format_gtar_sparse_1_17_posix01.tar.uu \ libarchive/test/test_read_format_gtar_sparse_1_17_posix10.tar.uu \ libarchive/test/test_read_format_gtar_sparse_1_17_posix10_modified.tar.uu \ + libarchive/test/test_read_format_gtar_sparse_length.tar.Z.uu \ libarchive/test/test_read_format_gtar_sparse_skip_entry.tar.Z.uu \ libarchive/test/test_read_format_huge_rpm.rpm.uu \ libarchive/test/test_read_format_iso.iso.Z.uu \ diff --git a/libarchive/archive_read_support_format_tar.c b/libarchive/archive_read_support_format_tar.c index 34ba85409..7615d7cd7 100644 --- a/libarchive/archive_read_support_format_tar.c +++ b/libarchive/archive_read_support_format_tar.c @@ -129,7 +129,11 @@ struct tar { int64_t entry_offset; int64_t entry_padding; int64_t entry_bytes_unconsumed; - int64_t realsize; + int64_t disk_size; + int64_t 
GNU_sparse_realsize; + int64_t GNU_sparse_size; + int64_t SCHILY_sparse_realsize; + int64_t pax_size; struct sparse_block *sparse_list; struct sparse_block *sparse_last; int64_t sparse_offset; @@ -138,6 +142,7 @@ struct tar { int sparse_gnu_minor; char sparse_gnu_attributes_seen; char filetype; + char size_fields; /* Bits defined below */ struct archive_string localname; struct archive_string_conv *opt_sconv; @@ -148,9 +153,15 @@ struct tar { int compat_2x; int process_mac_extensions; int read_concatenated_archives; - int realsize_override; }; +/* Track which size fields were present in the headers */ +#define TAR_SIZE_PAX_SIZE 1 +#define TAR_SIZE_GNU_SPARSE_REALSIZE 2 +#define TAR_SIZE_GNU_SPARSE_SIZE 4 +#define TAR_SIZE_SCHILY_SPARSE_REALSIZE 8 + + static int archive_block_is_null(const char *p); static char *base64_decode(const char *, size_t, size_t *); static int gnu_add_sparse_entry(struct archive_read *, struct tar *, @@ -529,8 +540,7 @@ archive_read_format_tar_read_header(struct archive_read *a, tar = (struct tar *)(a->format->data); tar->entry_offset = 0; gnu_clear_sparse_list(tar); - tar->realsize = -1; /* Mark this as "unset" */ - tar->realsize_override = 0; + tar->size_fields = 0; /* We don't have any size info yet */ /* Setup default string conversion. */ tar->sconv = tar->opt_sconv; @@ -622,7 +632,7 @@ archive_read_format_tar_read_data(struct archive_read *a, tar->entry_padding = 0; *buff = NULL; *size = 0; - *offset = tar->realsize; + *offset = tar->disk_size; return (ARCHIVE_EOF); } @@ -1290,6 +1300,11 @@ read_body_to_string(struct archive_read *a, struct tar *tar, * allows header_old_tar and header_ustar * to handle filenames differently, while still putting most of the * common parsing into one place. + * + * This is called _after_ ustar, GNU tar, Schily, etc, special + * fields have already been parsed into the `tar` structure. + * So we can make final decisions here about how to reconcile + * size, mode, etc, information. 
*/ static int header_common(struct archive_read *a, struct tar *tar, @@ -1323,28 +1338,60 @@ header_common(struct archive_read *a, struct tar *tar, archive_entry_set_mtime(entry, tar_atol(header->mtime, sizeof(header->mtime)), 0); } - /* Update size information as appropriate */ - if (!archive_entry_size_is_set(entry)) { - tar->entry_bytes_remaining = tar_atol(header->size, sizeof(header->size)); - if (tar->entry_bytes_remaining < 0) { - tar->entry_bytes_remaining = 0; - archive_set_error(&a->archive, ARCHIVE_ERRNO_MISC, - "Tar entry has negative size"); - return (ARCHIVE_FATAL); - } - if (tar->entry_bytes_remaining > entry_limit) { - tar->entry_bytes_remaining = 0; - archive_set_error(&a->archive, ARCHIVE_ERRNO_MISC, - "Tar entry size overflow"); - return (ARCHIVE_FATAL); - } - if (!tar->realsize_override) { - tar->realsize = tar->entry_bytes_remaining; - } - archive_entry_set_size(entry, tar->realsize); - } else if (tar->realsize_override) { - tar->entry_bytes_remaining = tar->realsize; - archive_entry_set_size(entry, tar->realsize); + /* Reconcile the size info. */ + /* First, how big is the file on disk? */ + if ((tar->size_fields & TAR_SIZE_GNU_SPARSE_REALSIZE) != 0) { + /* GNU sparse format 1.0 uses `GNU.sparse.realsize` + * to hold the size of the file on disk. */ + tar->disk_size = tar->GNU_sparse_realsize; + } else if ((tar->size_fields & TAR_SIZE_GNU_SPARSE_SIZE) != 0 + && (tar->sparse_gnu_major == 0)) { + /* GNU sparse format 0.0 and 0.1 use `GNU.sparse.size` + * to hold the size of the file on disk. 
*/ + tar->disk_size = tar->GNU_sparse_size; + } else if ((tar->size_fields & TAR_SIZE_SCHILY_SPARSE_REALSIZE) != 0) { + tar->disk_size = tar->SCHILY_sparse_realsize; + } else if ((tar->size_fields & TAR_SIZE_PAX_SIZE) != 0) { + tar->disk_size = tar->pax_size; + } else { + /* There wasn't a suitable pax header, so use the ustar info */ + tar->disk_size = tar_atol(header->size, sizeof(header->size)); + } + + if (tar->disk_size < 0) { + archive_set_error(&a->archive, ARCHIVE_ERRNO_MISC, + "Tar entry has negative file size"); + return (ARCHIVE_FATAL); + } else if (tar->disk_size > entry_limit) { + archive_set_error(&a->archive, ARCHIVE_ERRNO_MISC, + "Tar entry size overflow"); + return (ARCHIVE_FATAL); + } else { + archive_entry_set_size(entry, tar->disk_size); + } + + /* Second, how big is the data in the archive? */ + if ((tar->size_fields & TAR_SIZE_GNU_SPARSE_SIZE) != 0 + && (tar->sparse_gnu_major == 1)) { + /* GNU sparse format 1.0 uses `GNU.sparse.size` + * to hold the size of the data in the archive. */ + tar->entry_bytes_remaining = tar->GNU_sparse_size; + } else if ((tar->size_fields & TAR_SIZE_PAX_SIZE) != 0) { + tar->entry_bytes_remaining = tar->pax_size; + } else { + tar->entry_bytes_remaining + = tar_atol(header->size, sizeof(header->size)); + } + if (tar->entry_bytes_remaining < 0) { + tar->entry_bytes_remaining = 0; + archive_set_error(&a->archive, ARCHIVE_ERRNO_MISC, + "Tar entry has negative size"); + return (ARCHIVE_FATAL); + } else if (tar->entry_bytes_remaining > entry_limit) { + tar->entry_bytes_remaining = 0; + archive_set_error(&a->archive, ARCHIVE_ERRNO_MISC, + "Tar entry size overflow"); + return (ARCHIVE_FATAL); } /* Handle the tar type flag appropriately. 
*/ @@ -2299,10 +2346,13 @@ pax_attribute(struct archive_read *a, struct tar *tar, struct archive_entry *ent } else if (key_length == 4 && memcmp(key, "size", 4) == 0) { /* GNU.sparse.size */ + /* This is either the size of stored entry OR the size of data on disk, + * depending on which GNU sparse format version is in use. + * Since pax attributes can be in any order, we may not actually + * know at this point how to interpret this. */ if ((err = pax_attribute_read_number(a, value_length, &t)) == ARCHIVE_OK) { - tar->realsize = t; - archive_entry_set_size(entry, tar->realsize); - tar->realsize_override = 1; + tar->GNU_sparse_size = t; + tar->size_fields |= TAR_SIZE_GNU_SPARSE_SIZE; } return (err); } @@ -2370,11 +2420,10 @@ pax_attribute(struct archive_read *a, struct tar *tar, struct archive_entry *ent return (err); } else if (key_length == 8 && memcmp(key, "realsize", 8) == 0) { - /* GNU.sparse.realsize */ + /* GNU.sparse.realsize = size of file on disk */ if ((err = pax_attribute_read_number(a, value_length, &t)) == ARCHIVE_OK) { - tar->realsize = t; - archive_entry_set_size(entry, tar->realsize); - tar->realsize_override = 1; + tar->GNU_sparse_realsize = t; + tar->size_fields |= TAR_SIZE_GNU_SPARSE_REALSIZE; } return (err); } @@ -2555,12 +2604,12 @@ pax_attribute(struct archive_read *a, struct tar *tar, struct archive_entry *ent } else if (key_length == 8 && memcmp(key, "realsize", 8) == 0) { if ((err = pax_attribute_read_number(a, value_length, &t)) == ARCHIVE_OK) { - tar->realsize = t; - tar->realsize_override = 1; - archive_entry_set_size(entry, tar->realsize); + tar->SCHILY_sparse_realsize = t; + tar->size_fields |= TAR_SIZE_SCHILY_SPARSE_REALSIZE; } return (err); } + /* TODO: Is there a SCHILY.sparse.size similar to GNU.sparse.size ? 
*/ else if (key_length > 6 && memcmp(key, "xattr.", 6) == 0) { key_length -= 6; key += 6; @@ -2727,19 +2776,8 @@ pax_attribute(struct archive_read *a, struct tar *tar, struct archive_entry *ent if (key_length == 4 && memcmp(key, "size", 4) == 0) { /* "size" is the size of the data in the entry. */ if ((err = pax_attribute_read_number(a, value_length, &t)) == ARCHIVE_OK) { - tar->entry_bytes_remaining = t; - /* - * The "size" pax header keyword always overrides the - * "size" field in the tar header. - * GNU.sparse.realsize, GNU.sparse.size and - * SCHILY.realsize override this value. - */ - if (!tar->realsize_override) { - archive_entry_set_size(entry, - tar->entry_bytes_remaining); - tar->realsize - = tar->entry_bytes_remaining; - } + tar->pax_size = t; + tar->size_fields |= TAR_SIZE_PAX_SIZE; } else if (t == INT64_MAX) { /* Note: pax_attr_read_number returns INT64_MAX on overflow or < 0 */ @@ -2851,11 +2889,6 @@ header_gnutar(struct archive_read *a, struct tar *tar, * filename is stored as in old-style archives. */ - /* Grab fields common to all tar variants. */ - err = header_common(a, tar, entry, h); - if (err == ARCHIVE_FATAL) - return (err); - /* Copy filename over (to ensure null termination). */ header = (const struct archive_entry_header_gnutar *)h; const char *existing_pathname = archive_entry_pathname(entry); @@ -2904,8 +2937,6 @@ header_gnutar(struct archive_read *a, struct tar *tar, archive_entry_set_rdev(entry, 0); } - tar->entry_padding = 0x1ff & (-tar->entry_bytes_remaining); - /* Grab GNU-specific fields. 
*/ if (!archive_entry_atime_is_set(entry)) { t = tar_atol(header->atime, sizeof(header->atime)); @@ -2919,10 +2950,10 @@ header_gnutar(struct archive_read *a, struct tar *tar, } if (header->realsize[0] != 0) { - tar->realsize + /* Treat as a synonym for the pax GNU.sparse.realsize attr */ + tar->GNU_sparse_realsize = tar_atol(header->realsize, sizeof(header->realsize)); - archive_entry_set_size(entry, tar->realsize); - tar->realsize_override = 1; + tar->size_fields |= TAR_SIZE_GNU_SPARSE_REALSIZE; } if (header->sparse[0].offset[0] != 0) { @@ -2935,6 +2966,13 @@ header_gnutar(struct archive_read *a, struct tar *tar, } } + /* Grab fields common to all tar variants. */ + err = header_common(a, tar, entry, h); + if (err == ARCHIVE_FATAL) + return (err); + + tar->entry_padding = 0x1ff & (-tar->entry_bytes_remaining); + return (err); } @@ -3114,8 +3152,7 @@ gnu_sparse_01_parse(struct archive_read *a, struct tar *tar, const char *p, size * it's not possible to support both variants. This code supports * the later variant at the expense of not supporting the former. * - * This variant also replaced GNU.sparse.size with GNU.sparse.realsize - * and introduced the GNU.sparse.major/GNU.sparse.minor attributes. + * This variant also introduced the GNU.sparse.major/GNU.sparse.minor attributes. 
*/ /* diff --git a/libarchive/test/CMakeLists.txt b/libarchive/test/CMakeLists.txt index 77f015099..c69601898 100644 --- a/libarchive/test/CMakeLists.txt +++ b/libarchive/test/CMakeLists.txt @@ -134,6 +134,7 @@ IF(ENABLE_TEST) test_read_format_gtar_gz.c test_read_format_gtar_lzma.c test_read_format_gtar_sparse.c + test_read_format_gtar_sparse_length.c test_read_format_gtar_sparse_skip_entry.c test_read_format_huge_rpm.c test_read_format_iso_Z.c diff --git a/libarchive/test/test_read_format_gtar_sparse_length.c b/libarchive/test/test_read_format_gtar_sparse_length.c new file mode 100644 index 000000000..e8f3955b3 --- /dev/null +++ b/libarchive/test/test_read_format_gtar_sparse_length.c @@ -0,0 +1,53 @@ +/*- + * Copyright (c) 2003-2025 Tim Kientzle + * All rights reserved. + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions + * are met: + * 1. Redistributions of source code must retain the above copyright + * notice, this list of conditions and the following disclaimer. + * 2. Redistributions in binary form must reproduce the above copyright + * notice, this list of conditions and the following disclaimer in the + * documentation and/or other materials provided with the distribution. + * + * THIS SOFTWARE IS PROVIDED BY THE AUTHOR(S) ``AS IS'' AND ANY EXPRESS OR + * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES + * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED. 
+ * IN NO EVENT SHALL THE AUTHOR(S) BE LIABLE FOR ANY DIRECT, INDIRECT, + * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT + * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, + * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY + * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT + * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF + * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + */ +#include "test.h" + + +DEFINE_TEST(test_read_format_gtar_sparse_length) +{ + const char *refname = "test_read_format_gtar_sparse_length.tar.Z"; + int err; + struct archive *a; + struct archive_entry *ae; + + extract_reference_file(refname); + + assert((a = archive_read_new()) != NULL); + assert(0 == archive_read_support_filter_all(a)); + assert(0 == archive_read_support_format_tar(a)); + failure("Can't open %s", refname); + assert(0 == archive_read_open_filename(a, refname, 3)); + + err = archive_read_next_header(a, &ae); + assertEqualIntA(a, ARCHIVE_OK, err); + err = archive_read_next_header(a, &ae); + assertEqualIntA(a, ARCHIVE_OK, err); + + err = archive_read_next_header(a, &ae); + assertEqualIntA(a, ARCHIVE_EOF, err); + + assertEqualIntA(a, ARCHIVE_OK, archive_read_close(a)); + assertEqualInt(ARCHIVE_OK, archive_read_free(a)); +} diff --git a/libarchive/test/test_read_format_gtar_sparse_length.tar.Z.uu b/libarchive/test/test_read_format_gtar_sparse_length.tar.Z.uu new file mode 100644 index 000000000..a8667b861 --- /dev/null +++ b/libarchive/test/test_read_format_gtar_sparse_length.tar.Z.uu @@ -0,0 +1,12 @@ +begin 644 test_read_format_gtar_sparse_length.tar.Z +M'YV09-+(>0$E#!XD9<*0*3/03!HV96(`F$BQHL6+&#-JW,BQ(T48(&W0H`$" +M`,@8-VK$*'DRY4J3(&.>C`$#1`P:-V#,J#$CAHP8-D#``%IC)``0>#PJ7;0"2,')APV=_FW2NG[]\T;@;W@'$81V*\>OGZE9.0 +MS9PT>LH4EG$CAX*XJ%.K7LVZM>O7L&/+GLTQX$"[5:8P[FODH5\8+QQ"E$C; +M=4R1)&&B5,ERJ,OF9VF"M(E3)T^?0(6BG#'CQE$8Q<,WC3JU*LBP6KFR\
555BE1U9[+;['7EGT93@= +.BCCFJ...//;HXX]`_@@` +` +end