From: Lennart Poettering
Date: Thu, 21 Aug 2025 17:08:01 +0000 (+0200)
Subject: tar-util: properly deal with sparse files
X-Git-Url: http://git.ipfire.org/gitweb.cgi?a=commitdiff_plain;h=a54f4520f36da05dc48322989f36b1c46ee7f01b;p=thirdparty%2Fsystemd.git

tar-util: properly deal with sparse files

The extractor already deals with sparse files properly (because
archive_read_data_into_fd() does). Let's also make sure the archiver also
does this, and attaches the necessary sparse file metadata to each file.
---

diff --git a/src/shared/libarchive-util.c b/src/shared/libarchive-util.c
index dcd532df1dc..e3387e6e97c 100644
--- a/src/shared/libarchive-util.c
+++ b/src/shared/libarchive-util.c
@@ -37,6 +37,7 @@ DLSYM_PROTOTYPE(archive_entry_set_rdevminor) = NULL;
 DLSYM_PROTOTYPE(archive_entry_set_size) = NULL;
 DLSYM_PROTOTYPE(archive_entry_set_symlink) = NULL;
 DLSYM_PROTOTYPE(archive_entry_set_uid) = NULL;
+DLSYM_PROTOTYPE(archive_entry_sparse_add_entry) = NULL;
 DLSYM_PROTOTYPE(archive_entry_symlink) = NULL;
 DLSYM_PROTOTYPE(archive_entry_uid) = NULL;
 #if HAVE_LIBARCHIVE_UID_IS_SET
@@ -61,7 +62,7 @@ DLSYM_PROTOTYPE(archive_write_new) = NULL;
 DLSYM_PROTOTYPE(archive_write_open_FILE) = NULL;
 DLSYM_PROTOTYPE(archive_write_open_fd) = NULL;
 DLSYM_PROTOTYPE(archive_write_set_format_filter_by_ext) = NULL;
-DLSYM_PROTOTYPE(archive_write_set_format_gnutar) = NULL;
+DLSYM_PROTOTYPE(archive_write_set_format_pax) = NULL;
 
 int dlopen_libarchive(void) {
         ELF_NOTE_DLOPEN("archive",
@@ -103,6 +104,7 @@ int dlopen_libarchive(void) {
                         DLSYM_ARG(archive_entry_set_size),
                         DLSYM_ARG(archive_entry_set_symlink),
                         DLSYM_ARG(archive_entry_set_uid),
+                        DLSYM_ARG(archive_entry_sparse_add_entry),
                         DLSYM_ARG(archive_entry_symlink),
                         DLSYM_ARG(archive_entry_uid),
 #if HAVE_LIBARCHIVE_UID_IS_SET
@@ -127,8 +129,7 @@ int dlopen_libarchive(void) {
                         DLSYM_ARG(archive_write_open_FILE),
                         DLSYM_ARG(archive_write_open_fd),
                         DLSYM_ARG(archive_write_set_format_filter_by_ext),
-                        DLSYM_ARG(archive_write_set_format_gnutar)
-        );
+                        DLSYM_ARG(archive_write_set_format_pax));
 }
 
 /* libarchive uses its own file type macros. They happen to be defined the same way as the Linux ones, and
diff --git a/src/shared/libarchive-util.h b/src/shared/libarchive-util.h
index e58e4d26a54..7534b0d016e 100644
--- a/src/shared/libarchive-util.h
+++ b/src/shared/libarchive-util.h
@@ -33,6 +33,7 @@ extern DLSYM_PROTOTYPE(archive_entry_set_rdevminor);
 extern DLSYM_PROTOTYPE(archive_entry_set_size);
 extern DLSYM_PROTOTYPE(archive_entry_set_symlink);
 extern DLSYM_PROTOTYPE(archive_entry_set_uid);
+extern DLSYM_PROTOTYPE(archive_entry_sparse_add_entry);
 extern DLSYM_PROTOTYPE(archive_entry_symlink);
 extern DLSYM_PROTOTYPE(archive_entry_uid);
 extern DLSYM_PROTOTYPE(archive_entry_xattr_add_entry);
@@ -54,7 +55,7 @@ extern DLSYM_PROTOTYPE(archive_write_new);
 extern DLSYM_PROTOTYPE(archive_write_open_FILE);
 extern DLSYM_PROTOTYPE(archive_write_open_fd);
 extern DLSYM_PROTOTYPE(archive_write_set_format_filter_by_ext);
-extern DLSYM_PROTOTYPE(archive_write_set_format_gnutar);
+extern DLSYM_PROTOTYPE(archive_write_set_format_pax);
 
 #if HAVE_LIBARCHIVE_UID_IS_SET
 extern DLSYM_PROTOTYPE(archive_entry_gid_is_set);
diff --git a/src/shared/tar-util.c b/src/shared/tar-util.c
index 9d28009f719..83b346d52ce 100644
--- a/src/shared/tar-util.c
+++ b/src/shared/tar-util.c
@@ -804,6 +804,61 @@ bypass:
         return 0;
 }
 
+/* Walks the data extents of the file behind 'fd' with lseek() SEEK_HOLE/SEEK_DATA and registers each
+ * extent found on 'entry' via archive_entry_sparse_add_entry(). Rewinds 'fd' to offset 0 before
+ * returning successfully, so that the caller can read the file contents afterwards. Returns 0 on
+ * success, or the (already logged) error from log_error_errno() if any lseek() call fails. */
+static int archive_generate_sparse(struct archive_entry *entry, int fd) {
+        assert(entry);
+        assert(fd >= 0); /* 0 is a valid fd, only negative values are invalid */
+
+        off_t c = 0;
+        for (;;) {
+                /* Look for the next hole */
+                off_t h = lseek(fd, c, SEEK_HOLE);
+                if (h < 0) {
+                        if (errno != ENXIO)
+                                return log_error_errno(errno, "Failed to issue SEEK_HOLE: %m");
+
+                        /* If errno == ENXIO, that means we've reached the final data of the file and
+                         * that data isn't followed by anything more */
+
+                        /* Figure out where the end of the file is */
+                        off_t e = lseek(fd, 0, SEEK_END);
+                        if (e < 0)
+                                return log_error_errno(errno, "Failed to issue SEEK_END: %m");
+
+                        /* Generate sparse entry for final block */
+                        if (e > c && c != 0) {
+                                log_debug("final sparse block %" PRIu64 "…%" PRIu64, (uint64_t) c, (uint64_t) e);
+                                sym_archive_entry_sparse_add_entry(entry, c, e - c);
+                        }
+
+                        break;
+                }
+
+                if (h > c) {
+                        log_debug("inner sparse block %" PRIu64 "…%" PRIu64 " (%" PRIu64 ")", (uint64_t) c, (uint64_t) h, (uint64_t) h - (uint64_t) c);
+                        sym_archive_entry_sparse_add_entry(entry, c, h - c);
+                }
+
+                /* Now look for the next data after the hole */
+                c = lseek(fd, h, SEEK_DATA);
+                if (c < 0) {
+                        if (errno != ENXIO)
+                                return log_error_errno(errno, "Failed to issue SEEK_DATA: %m");
+
+                        /* No data anymore */
+                        break;
+                }
+        }
+
+        if (lseek(fd, 0, SEEK_SET) < 0)
+                return log_error_errno(errno, "Failed to reset seek offset: %m");
+
+        return 0;
+}
+
 static int archive_item(
                 RecurseDirEvent event,
                 const char *path,
@@ -912,17 +967,24 @@ static int archive_item(
                 sym_archive_entry_xattr_add_entry(entry, xa, buf, size);
         }
 
-        if (sym_archive_write_header(d->archive, entry) != ARCHIVE_OK)
-                return log_error_errno(SYNTHETIC_ERRNO(ENOTRECOVERABLE), "Failed to write archive entry header: %s", sym_archive_error_string(d->archive));
-
+        _cleanup_close_ int data_fd = -EBADF;
         if (S_ISREG(sx->stx_mode)) {
-                _cleanup_close_ int data_fd = -EBADF;
-
-                /* Convert the O_PATH fd in a proper fd */
+                /* Convert the O_PATH fd into a proper fd */
                 data_fd = fd_reopen(inode_fd, O_RDONLY|O_CLOEXEC);
                 if (data_fd < 0)
                         return log_error_errno(data_fd, "Failed to open '%s': %m", path);
 
+                r = archive_generate_sparse(entry, data_fd);
+                if (r < 0)
+                        return r;
+        }
+
+        if (sym_archive_write_header(d->archive, entry) != ARCHIVE_OK)
+                return log_error_errno(SYNTHETIC_ERRNO(ENOTRECOVERABLE), "Failed to write archive entry header: %s", sym_archive_error_string(d->archive));
+
+        if (S_ISREG(sx->stx_mode)) {
+                assert(data_fd >= 0);
+
                 for (;;) {
                         char buffer[64*1024];
                         ssize_t l;
@@ -965,7 +1027,7 @@ int tar_c(int tree_fd, int output_fd, const char *filename, TarFlags flags) {
         if (filename)
                 r = sym_archive_write_set_format_filter_by_ext(a, filename);
         else
-                r = sym_archive_write_set_format_gnutar(a);
+                r = sym_archive_write_set_format_pax(a);
         if (r != ARCHIVE_OK)
                 return log_error_errno(SYNTHETIC_ERRNO(ENOTRECOVERABLE), "Failed to set libarchive output format: %s", sym_archive_error_string(a));
 