]> git.ipfire.org Git - thirdparty/libarchive.git/commitdiff
Issue 2548: Reading GNU sparse entries (#2558)
author Tim Kientzle <kientzle@acm.org>
Sun, 30 Mar 2025 16:26:25 +0000 (09:26 -0700)
committer GitHub <noreply@github.com>
Sun, 30 Mar 2025 16:26:25 +0000 (09:26 -0700)
My attempt to fix #2404 just made the confusion between the size of the
extracted file and the size of the contents in the tar archive worse
than it was before.

@ferivoz in #2557 showed that the confusion stemmed from a point where
we were setting the size in the entry (which is by definition the size
of the file on disk) when we read the `GNU.sparse.size` and
`GNU.sparse.realsize` attributes (which might represent the size on disk
or in the archive) and then using that to determine whether to read the
value in the ustar header (which represents the size of the data in the
archive).

The confusion stems from three issues:
* The GNU.sparse.* fields mean different things depending on the version
of GNU tar used.
* The regular Pax `size` field overrides the value in the ustar header,
but the GNU sparse size fields don't always do so.
* The previous libarchive code tried to reconcile different size
information as we went along, which is problematic because the order in
which this information appears can vary.

This PR makes one big structural change: We now have separate storage
for every different size field we might encounter. We now just store
these values and record which one we saw. Then at the end, when we have
all the information available at once, we can use this data to determine
the size on disk and the size in the archive.

A few key facts about GNU sparse formats:

* GNU legacy sparse format: Stored all the relevant info in an extension
of the ustar header.
* GNU pax 0.0 format: Used `GNU.sparse.size` to store the size on disk.
* GNU pax 0.1 format: Used `GNU.sparse.size` to store the size on disk.
* GNU pax 1.0 format: Used `GNU.sparse.realsize` to store the size on
disk; repurposed `GNU.sparse.size` to store the size in the archive, but
omitted this in favor of the ustar size field when that could be used.

And of course, some key precedence information:
* Pax `size` field always overrides the ustar header size field.
* GNU sparse size fields override it ONLY when they represent the size
of the data in the archive.

Resolves #2548

Makefile.am
libarchive/archive_read_support_format_tar.c
libarchive/test/CMakeLists.txt
libarchive/test/test_read_format_gtar_sparse_length.c [new file with mode: 0644]
libarchive/test/test_read_format_gtar_sparse_length.tar.Z.uu [new file with mode: 0644]

index 7860c5c50375c768e26b5f5eb96a3a95266da59e..efc49180044c6081e32bbd1c946208f3996183bd 100644 (file)
@@ -492,6 +492,7 @@ libarchive_test_SOURCES= \
        libarchive/test/test_read_format_gtar_gz.c \
        libarchive/test/test_read_format_gtar_lzma.c \
        libarchive/test/test_read_format_gtar_sparse.c \
+       libarchive/test/test_read_format_gtar_sparse_length.c \
        libarchive/test/test_read_format_gtar_sparse_skip_entry.c \
        libarchive/test/test_read_format_huge_rpm.c \
        libarchive/test/test_read_format_iso_Z.c \
@@ -847,6 +848,7 @@ libarchive_test_EXTRA_DIST=\
        libarchive/test/test_read_format_gtar_sparse_1_17_posix01.tar.uu \
        libarchive/test/test_read_format_gtar_sparse_1_17_posix10.tar.uu \
        libarchive/test/test_read_format_gtar_sparse_1_17_posix10_modified.tar.uu \
+       libarchive/test/test_read_format_gtar_sparse_length.tar.Z.uu \
        libarchive/test/test_read_format_gtar_sparse_skip_entry.tar.Z.uu \
        libarchive/test/test_read_format_huge_rpm.rpm.uu \
        libarchive/test/test_read_format_iso.iso.Z.uu \
index 34ba85409418ba2590f5fd68ba240497318d1828..7615d7cd78ed281a3b82d7349ec9aeb4aada7cb5 100644 (file)
@@ -129,7 +129,11 @@ struct tar {
        int64_t                  entry_offset;
        int64_t                  entry_padding;
        int64_t                  entry_bytes_unconsumed;
-       int64_t                  realsize;
+       int64_t                  disk_size;
+       int64_t                  GNU_sparse_realsize;
+       int64_t                  GNU_sparse_size;
+       int64_t                  SCHILY_sparse_realsize;
+       int64_t                  pax_size;
        struct sparse_block     *sparse_list;
        struct sparse_block     *sparse_last;
        int64_t                  sparse_offset;
@@ -138,6 +142,7 @@ struct tar {
        int                      sparse_gnu_minor;
        char                     sparse_gnu_attributes_seen;
        char                     filetype;
+       char                     size_fields; /* Bits defined below */
 
        struct archive_string    localname;
        struct archive_string_conv *opt_sconv;
@@ -148,9 +153,15 @@ struct tar {
        int                      compat_2x;
        int                      process_mac_extensions;
        int                      read_concatenated_archives;
-       int                      realsize_override;
 };
 
+/* Track which size fields were present in the headers */
+#define TAR_SIZE_PAX_SIZE 1
+#define TAR_SIZE_GNU_SPARSE_REALSIZE 2
+#define TAR_SIZE_GNU_SPARSE_SIZE 4
+#define TAR_SIZE_SCHILY_SPARSE_REALSIZE 8
+
+
 static int     archive_block_is_null(const char *p);
 static char    *base64_decode(const char *, size_t, size_t *);
 static int     gnu_add_sparse_entry(struct archive_read *, struct tar *,
@@ -529,8 +540,7 @@ archive_read_format_tar_read_header(struct archive_read *a,
        tar = (struct tar *)(a->format->data);
        tar->entry_offset = 0;
        gnu_clear_sparse_list(tar);
-       tar->realsize = -1; /* Mark this as "unset" */
-       tar->realsize_override = 0;
+       tar->size_fields = 0; /* We don't have any size info yet */
 
        /* Setup default string conversion. */
        tar->sconv = tar->opt_sconv;
@@ -622,7 +632,7 @@ archive_read_format_tar_read_data(struct archive_read *a,
                        tar->entry_padding = 0;
                        *buff = NULL;
                        *size = 0;
-                       *offset = tar->realsize;
+                       *offset = tar->disk_size;
                        return (ARCHIVE_EOF);
                }
 
@@ -1290,6 +1300,11 @@ read_body_to_string(struct archive_read *a, struct tar *tar,
  * allows header_old_tar and header_ustar
  * to handle filenames differently, while still putting most of the
  * common parsing into one place.
+ *
+ * This is called _after_ ustar, GNU tar, Schily, etc, special
+ * fields have already been parsed into the `tar` structure.
+ * So we can make final decisions here about how to reconcile
+ * size, mode, etc, information.
  */
 static int
 header_common(struct archive_read *a, struct tar *tar,
@@ -1323,28 +1338,60 @@ header_common(struct archive_read *a, struct tar *tar,
                archive_entry_set_mtime(entry, tar_atol(header->mtime, sizeof(header->mtime)), 0);
        }
 
-       /* Update size information as appropriate */
-       if (!archive_entry_size_is_set(entry)) {
-               tar->entry_bytes_remaining = tar_atol(header->size, sizeof(header->size));
-               if (tar->entry_bytes_remaining < 0) {
-                       tar->entry_bytes_remaining = 0;
-                       archive_set_error(&a->archive, ARCHIVE_ERRNO_MISC,
-                                         "Tar entry has negative size");
-                       return (ARCHIVE_FATAL);
-               }
-               if (tar->entry_bytes_remaining > entry_limit) {
-                       tar->entry_bytes_remaining = 0;
-                       archive_set_error(&a->archive, ARCHIVE_ERRNO_MISC,
-                                         "Tar entry size overflow");
-                       return (ARCHIVE_FATAL);
-               }
-               if (!tar->realsize_override) {
-                       tar->realsize = tar->entry_bytes_remaining;
-               }
-               archive_entry_set_size(entry, tar->realsize);
-       } else if (tar->realsize_override) {
-               tar->entry_bytes_remaining = tar->realsize;
-               archive_entry_set_size(entry, tar->realsize);
+       /* Reconcile the size info. */
+       /* First, how big is the file on disk? */
+       if ((tar->size_fields & TAR_SIZE_GNU_SPARSE_REALSIZE) != 0) {
+               /* GNU sparse format 1.0 uses `GNU.sparse.realsize`
+                * to hold the size of the file on disk. */
+               tar->disk_size = tar->GNU_sparse_realsize;
+       } else if ((tar->size_fields & TAR_SIZE_GNU_SPARSE_SIZE) != 0
+                  && (tar->sparse_gnu_major == 0)) {
+               /* GNU sparse format 0.0 and 0.1 use `GNU.sparse.size`
+                * to hold the size of the file on disk. */
+               tar->disk_size = tar->GNU_sparse_size;
+       } else if ((tar->size_fields & TAR_SIZE_SCHILY_SPARSE_REALSIZE) != 0) {
+               tar->disk_size = tar->SCHILY_sparse_realsize;
+       } else if ((tar->size_fields & TAR_SIZE_PAX_SIZE) != 0) {
+               tar->disk_size = tar->pax_size;
+       } else {
+               /* There wasn't a suitable pax header, so use the ustar info */
+               tar->disk_size = tar_atol(header->size, sizeof(header->size));
+       }
+
+       if (tar->disk_size < 0) {
+               archive_set_error(&a->archive, ARCHIVE_ERRNO_MISC,
+                                 "Tar entry has negative file size");
+               return (ARCHIVE_FATAL);
+       } else if (tar->disk_size > entry_limit) {
+               archive_set_error(&a->archive, ARCHIVE_ERRNO_MISC,
+                                 "Tar entry size overflow");
+               return (ARCHIVE_FATAL);
+       } else {
+               archive_entry_set_size(entry, tar->disk_size);
+       }
+
+       /* Second, how big is the data in the archive? */
+       if ((tar->size_fields & TAR_SIZE_GNU_SPARSE_SIZE) != 0
+           && (tar->sparse_gnu_major == 1)) {
+               /* GNU sparse format 1.0 uses `GNU.sparse.size`
+                * to hold the size of the data in the archive. */
+               tar->entry_bytes_remaining = tar->GNU_sparse_size;
+       } else if ((tar->size_fields & TAR_SIZE_PAX_SIZE) != 0) {
+               tar->entry_bytes_remaining = tar->pax_size;
+       } else {
+               tar->entry_bytes_remaining
+                       = tar_atol(header->size, sizeof(header->size));
+       }
+       if (tar->entry_bytes_remaining < 0) {
+               tar->entry_bytes_remaining = 0;
+               archive_set_error(&a->archive, ARCHIVE_ERRNO_MISC,
+                                 "Tar entry has negative size");
+               return (ARCHIVE_FATAL);
+       } else if (tar->entry_bytes_remaining > entry_limit) {
+               tar->entry_bytes_remaining = 0;
+               archive_set_error(&a->archive, ARCHIVE_ERRNO_MISC,
+                                 "Tar entry size overflow");
+               return (ARCHIVE_FATAL);
        }
 
        /* Handle the tar type flag appropriately. */
@@ -2299,10 +2346,13 @@ pax_attribute(struct archive_read *a, struct tar *tar, struct archive_entry *ent
                                }
                                else if (key_length == 4 && memcmp(key, "size", 4) == 0) {
                                        /* GNU.sparse.size */
+                                       /* This is either the size of stored entry OR the size of data on disk,
+                                        * depending on which GNU sparse format version is in use.
+                                        * Since pax attributes can be in any order, we may not actually
+                                        * know at this point how to interpret this. */
                                        if ((err = pax_attribute_read_number(a, value_length, &t)) == ARCHIVE_OK) {
-                                               tar->realsize = t;
-                                               archive_entry_set_size(entry, tar->realsize);
-                                               tar->realsize_override = 1;
+                                               tar->GNU_sparse_size = t;
+                                               tar->size_fields |= TAR_SIZE_GNU_SPARSE_SIZE;
                                        }
                                        return (err);
                                }
@@ -2370,11 +2420,10 @@ pax_attribute(struct archive_read *a, struct tar *tar, struct archive_entry *ent
                                        return (err);
                                }
                                else if (key_length == 8 && memcmp(key, "realsize", 8) == 0) {
-                                       /* GNU.sparse.realsize */
+                                       /* GNU.sparse.realsize = size of file on disk */
                                        if ((err = pax_attribute_read_number(a, value_length, &t)) == ARCHIVE_OK) {
-                                               tar->realsize = t;
-                                               archive_entry_set_size(entry, tar->realsize);
-                                               tar->realsize_override = 1;
+                                               tar->GNU_sparse_realsize = t;
+                                               tar->size_fields |= TAR_SIZE_GNU_SPARSE_REALSIZE;
                                        }
                                        return (err);
                                }
@@ -2555,12 +2604,12 @@ pax_attribute(struct archive_read *a, struct tar *tar, struct archive_entry *ent
                        }
                        else if (key_length == 8 && memcmp(key, "realsize", 8) == 0) {
                                if ((err = pax_attribute_read_number(a, value_length, &t)) == ARCHIVE_OK) {
-                                       tar->realsize = t;
-                                       tar->realsize_override = 1;
-                                       archive_entry_set_size(entry, tar->realsize);
+                                       tar->SCHILY_sparse_realsize = t;
+                                       tar->size_fields |= TAR_SIZE_SCHILY_SPARSE_REALSIZE;
                                }
                                return (err);
                        }
+                       /* TODO: Is there a SCHILY.sparse.size similar to GNU.sparse.size ? */
                        else if (key_length > 6 && memcmp(key, "xattr.", 6) == 0) {
                                key_length -= 6;
                                key += 6;
@@ -2727,19 +2776,8 @@ pax_attribute(struct archive_read *a, struct tar *tar, struct archive_entry *ent
                if (key_length == 4 && memcmp(key, "size", 4) == 0) {
                        /* "size" is the size of the data in the entry. */
                        if ((err = pax_attribute_read_number(a, value_length, &t)) == ARCHIVE_OK) {
-                               tar->entry_bytes_remaining = t;
-                               /*
-                                * The "size" pax header keyword always overrides the
-                                * "size" field in the tar header.
-                                * GNU.sparse.realsize, GNU.sparse.size and
-                                * SCHILY.realsize override this value.
-                                */
-                               if (!tar->realsize_override) {
-                                       archive_entry_set_size(entry,
-                                                              tar->entry_bytes_remaining);
-                                       tar->realsize
-                                               = tar->entry_bytes_remaining;
-                               }
+                               tar->pax_size = t;
+                               tar->size_fields |= TAR_SIZE_PAX_SIZE;
                        }
                        else if (t == INT64_MAX) {
                                /* Note: pax_attr_read_number returns INT64_MAX on overflow or < 0 */
@@ -2851,11 +2889,6 @@ header_gnutar(struct archive_read *a, struct tar *tar,
         * filename is stored as in old-style archives.
         */
 
-       /* Grab fields common to all tar variants. */
-       err = header_common(a, tar, entry, h);
-       if (err == ARCHIVE_FATAL)
-               return (err);
-
        /* Copy filename over (to ensure null termination). */
        header = (const struct archive_entry_header_gnutar *)h;
        const char *existing_pathname = archive_entry_pathname(entry);
@@ -2904,8 +2937,6 @@ header_gnutar(struct archive_read *a, struct tar *tar,
                archive_entry_set_rdev(entry, 0);
        }
 
-       tar->entry_padding = 0x1ff & (-tar->entry_bytes_remaining);
-
        /* Grab GNU-specific fields. */
        if (!archive_entry_atime_is_set(entry)) {
                t = tar_atol(header->atime, sizeof(header->atime));
@@ -2919,10 +2950,10 @@ header_gnutar(struct archive_read *a, struct tar *tar,
        }
 
        if (header->realsize[0] != 0) {
-               tar->realsize
+               /* Treat as a synonym for the pax GNU.sparse.realsize attr */
+               tar->GNU_sparse_realsize
                    = tar_atol(header->realsize, sizeof(header->realsize));
-               archive_entry_set_size(entry, tar->realsize);
-               tar->realsize_override = 1;
+               tar->size_fields |= TAR_SIZE_GNU_SPARSE_REALSIZE;
        }
 
        if (header->sparse[0].offset[0] != 0) {
@@ -2935,6 +2966,13 @@ header_gnutar(struct archive_read *a, struct tar *tar,
                }
        }
 
+       /* Grab fields common to all tar variants. */
+       err = header_common(a, tar, entry, h);
+       if (err == ARCHIVE_FATAL)
+               return (err);
+
+       tar->entry_padding = 0x1ff & (-tar->entry_bytes_remaining);
+
        return (err);
 }
 
@@ -3114,8 +3152,7 @@ gnu_sparse_01_parse(struct archive_read *a, struct tar *tar, const char *p, size
  * it's not possible to support both variants.  This code supports
  * the later variant at the expense of not supporting the former.
  *
- * This variant also replaced GNU.sparse.size with GNU.sparse.realsize
- * and introduced the GNU.sparse.major/GNU.sparse.minor attributes.
+ * This variant also introduced the GNU.sparse.major/GNU.sparse.minor attributes.
  */
 
 /*
index 77f01509986dbb4c5b6e2d03551f9d6db8e0e396..c6960189830092e00c16e5076346c7b208c4d3e8 100644 (file)
@@ -134,6 +134,7 @@ IF(ENABLE_TEST)
     test_read_format_gtar_gz.c
     test_read_format_gtar_lzma.c
     test_read_format_gtar_sparse.c
+    test_read_format_gtar_sparse_length.c
     test_read_format_gtar_sparse_skip_entry.c
     test_read_format_huge_rpm.c
     test_read_format_iso_Z.c
diff --git a/libarchive/test/test_read_format_gtar_sparse_length.c b/libarchive/test/test_read_format_gtar_sparse_length.c
new file mode 100644 (file)
index 0000000..e8f3955
--- /dev/null
@@ -0,0 +1,53 @@
+/*-
+ * Copyright (c) 2003-2025 Tim Kientzle
+ * All rights reserved.
+ *
+ * Redistribution and use in source and binary forms, with or without
+ * modification, are permitted provided that the following conditions
+ * are met:
+ * 1. Redistributions of source code must retain the above copyright
+ *    notice, this list of conditions and the following disclaimer.
+ * 2. Redistributions in binary form must reproduce the above copyright
+ *    notice, this list of conditions and the following disclaimer in the
+ *    documentation and/or other materials provided with the distribution.
+ *
+ * THIS SOFTWARE IS PROVIDED BY THE AUTHOR(S) ``AS IS'' AND ANY EXPRESS OR
+ * IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE IMPLIED WARRANTIES
+ * OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE DISCLAIMED.
+ * IN NO EVENT SHALL THE AUTHOR(S) BE LIABLE FOR ANY DIRECT, INDIRECT,
+ * INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT
+ * NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE,
+ * DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY
+ * THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT
+ * (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF
+ * THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
+ */
+#include "test.h"
+
+
+DEFINE_TEST(test_read_format_gtar_sparse_length)
+{
+       const char *refname = "test_read_format_gtar_sparse_length.tar.Z";
+       int err;
+       struct archive *a;
+       struct archive_entry *ae;
+
+       extract_reference_file(refname);
+
+       assert((a = archive_read_new()) != NULL);
+       assert(0 == archive_read_support_filter_all(a));
+       assert(0 == archive_read_support_format_tar(a));
+       failure("Can't open %s", refname);
+       assert(0 == archive_read_open_filename(a, refname, 3));
+
+       err = archive_read_next_header(a, &ae);
+       assertEqualIntA(a, ARCHIVE_OK, err);
+       err = archive_read_next_header(a, &ae);
+       assertEqualIntA(a, ARCHIVE_OK, err);
+
+       err = archive_read_next_header(a, &ae);
+       assertEqualIntA(a, ARCHIVE_EOF, err);
+
+       assertEqualIntA(a, ARCHIVE_OK, archive_read_close(a));
+       assertEqualInt(ARCHIVE_OK, archive_read_free(a));
+}
diff --git a/libarchive/test/test_read_format_gtar_sparse_length.tar.Z.uu b/libarchive/test/test_read_format_gtar_sparse_length.tar.Z.uu
new file mode 100644 (file)
index 0000000..a8667b8
--- /dev/null
@@ -0,0 +1,12 @@
+begin 644 test_read_format_gtar_sparse_length.tar.Z
+M'YV09-+(>0$E#!XD9<*0*3/03!HV96(`F$BQHL6+&#-JW,BQ(T48(&W0H`$"
+M`,@8-VK$*'DRY4J3(&.>C`$#1`P:-V#,J#$CAHP8-D#``%IC)``0>#PJ7<JT
+MJ5.,=>;0"2,')APV=<Z(J?.0S-.*5[-N[?KUH\R:,&66+,NVK=NW<.'*D`'B
+MB),J+N;`H3JGC(LV8=2\D=,CAH*Y=>_FW2NG[]\T;@;W@'$81V*\>OGZE9.0
+MS9PT>LH4EG$CAX*XJ%.K7LVZM>O7L&/+GLTQX$"[5:8P[FODH5\8+QQ"E$C;
+M=4R1)&&B5,ERJ,OF9VF"M(E3)T^?0(6BG#'CQE$8Q<,WC3JU*LBP6KFR\<H6
+M_=CU;<^BE;]6O/W[KF4HH$S99^G]I^$GX(`$%FC@@0@F.*!M,KR@(&TQI51#
+M<\N]U!)S:<DWW4TY[=333T$-U1T-,AQ5PX/VD4>555BE1U9[+;['7EGT93@=
+.BCCFJ...//;HXX]`_@@`
+`
+end