From 51300afefa29289b7fe88f0abf4f966ddf27df74 Mon Sep 17 00:00:00 2001 From: Joerg Sonnenberger Date: Tue, 13 May 2008 01:41:46 -0400 Subject: [PATCH] Add support to sparsify files on write. This scans output blocks for NUL bytes and seeks if a block is full of NULs. This is optional, but the penalty is very small. SVN-Revision: 51 --- libarchive/archive.h | 2 + libarchive/archive_write_disk.3 | 4 ++ libarchive/archive_write_disk.c | 90 +++++++++++++++++++++++++-------- tar/bsdtar.1 | 6 +++ tar/bsdtar.c | 5 +- 5 files changed, 86 insertions(+), 21 deletions(-) diff --git a/libarchive/archive.h b/libarchive/archive.h index 04207c973..0465cff1c 100644 --- a/libarchive/archive.h +++ b/libarchive/archive.h @@ -395,6 +395,8 @@ __LA_DECL int archive_read_data_into_fd(struct archive *, int fd); #define ARCHIVE_EXTRACT_NO_AUTODIR (0x0400) /* Default: Overwrite files, even if one on disk is newer. */ #define ARCHIVE_EXTRACT_NO_OVERWRITE_NEWER (0x0800) +/* Detect blocks of 0 and write holes instead. */ +#define ARCHIVE_EXTRACT_SPARSE (0x1000) __LA_DECL int archive_read_extract(struct archive *, struct archive_entry *, int flags); diff --git a/libarchive/archive_write_disk.3 b/libarchive/archive_write_disk.3 index 5d836e105..f71d7d548 100644 --- a/libarchive/archive_write_disk.3 +++ b/libarchive/archive_write_disk.3 @@ -170,6 +170,10 @@ Note that paths ending in .Pa .. always cause an error, regardless of this flag. .El +.It Cm ARCHIVE_EXTRACT_SPARSE +Scan data for blocks of NUL bytes and try to recreate them with holes. +This results in sparse files, independent of whether the archive format +supports or uses them. 
.It Xo .Fn archive_write_disk_set_group_lookup , .Fn archive_write_disk_set_user_lookup diff --git a/libarchive/archive_write_disk.c b/libarchive/archive_write_disk.c index 38ed2fc74..8010c1330 100644 --- a/libarchive/archive_write_disk.c +++ b/libarchive/archive_write_disk.c @@ -187,6 +187,8 @@ struct archive_write_disk { /* UID/GID to use in restoring this entry. */ uid_t uid; gid_t gid; + /* Last offset written to disk. */ + off_t last_offset; }; /* @@ -333,6 +335,7 @@ _archive_write_header(struct archive *_a, struct archive_entry *entry) } a->entry = archive_entry_clone(entry); a->fd = -1; + a->last_offset = 0; a->offset = 0; a->uid = a->user_uid; a->mode = archive_entry_mode(a->entry); @@ -488,6 +491,7 @@ _archive_write_data_block(struct archive *_a, { struct archive_write_disk *a = (struct archive_write_disk *)_a; ssize_t bytes_written = 0; + ssize_t block_size, bytes_to_write; int r = ARCHIVE_OK; __archive_check_magic(&a->archive, ARCHIVE_WRITE_DISK_MAGIC, @@ -498,33 +502,53 @@ _archive_write_data_block(struct archive *_a, } archive_clear_error(&a->archive); - /* Seek if necessary to the specified offset. */ - if (offset != a->offset) { - if (lseek(a->fd, offset, SEEK_SET) < 0) { - archive_set_error(&a->archive, errno, "Seek failed"); - return (ARCHIVE_WARN); - } - a->offset = offset; + if (a->flags & ARCHIVE_EXTRACT_SPARSE) { + if ((r = _archive_write_disk_lazy_stat(a)) != ARCHIVE_OK) + return (r); + block_size = a->pst->st_blksize; + } else + block_size = -1; + + if ((off_t)(offset + size) > a->filesize) { + size = (size_t)(a->filesize - a->offset); + archive_set_error(&a->archive, 0, + "Write request too large"); + r = ARCHIVE_WARN; } /* Write the data. 
*/ - while (size > 0 && a->offset < a->filesize) { - if ((off_t)(a->offset + size) > a->filesize) { - archive_set_error(&a->archive, 0, - "Write request too large (tried to write %u bytes, but only %u bytes remain)", - (unsigned int)size, - (unsigned int)(a->filesize - a->offset)); - r = ARCHIVE_WARN; - size = (size_t)(a->filesize - a->offset); - } + while (size > 0) { + if (block_size != -1) { + const char *buf; + + for (buf = buff; size; ++buf, --size, ++offset) { + if (*buf != '\0') + break; + } + if (size == 0) + break; + bytes_to_write = block_size - offset % block_size; + buff = buf; + } else + bytes_to_write = size; + /* Seek if necessary to the specified offset. */ + if (offset != a->last_offset) { + if (lseek(a->fd, offset, SEEK_SET) < 0) { + archive_set_error(&a->archive, errno, "Seek failed"); + return (ARCHIVE_FATAL); + } + } bytes_written = write(a->fd, buff, size); if (bytes_written < 0) { archive_set_error(&a->archive, errno, "Write failed"); return (ARCHIVE_WARN); } + buff = (const char *)buff + bytes_written; size -= bytes_written; - a->offset += bytes_written; + offset += bytes_written; + a->last_offset = a->offset = offset; } + a->offset = offset; return (r); } @@ -532,7 +556,6 @@ static ssize_t _archive_write_data(struct archive *_a, const void *buff, size_t size) { struct archive_write_disk *a = (struct archive_write_disk *)_a; - off_t offset; int r; __archive_check_magic(&a->archive, ARCHIVE_WRITE_DISK_MAGIC, @@ -540,11 +563,10 @@ _archive_write_data(struct archive *_a, const void *buff, size_t size) if (a->fd < 0) return (ARCHIVE_OK); - offset = a->offset; r = _archive_write_data_block(_a, buff, size, a->offset); if (r < ARCHIVE_OK) return (r); - return (a->offset - offset); + return size; } static int @@ -560,6 +582,34 @@ _archive_write_finish_entry(struct archive *_a) return (ARCHIVE_OK); archive_clear_error(&a->archive); + if (a->last_offset != a->filesize && a->fd >= 0) { + if (ftruncate(a->fd, a->filesize) == -1 && + a->filesize == 0) { 
+ archive_set_error(&a->archive, errno, + "File size could not be restored"); + return (ARCHIVE_FAILED); + } + /* + * Explicitly stat the file as some platforms might not + * implement the XSI option to extend files via ftruncate. + */ + a->pst = NULL; + if ((ret = _archive_write_disk_lazy_stat(a)) != ARCHIVE_OK) + return (ret); + if (a->st.st_size != a->filesize) { + const char nul = '\0'; + if (lseek(a->fd, a->st.st_size - 1, SEEK_SET) < 0) { + archive_set_error(&a->archive, errno, "Seek failed"); + return (ARCHIVE_FATAL); + } + if (write(a->fd, &nul, 1) < 0) { + archive_set_error(&a->archive, errno, + "Write to restore size failed"); + return (ARCHIVE_FATAL); + } + } + } + /* Restore metadata. */ /* diff --git a/tar/bsdtar.1 b/tar/bsdtar.1 index f3be2162a..60a5d8e77 100644 --- a/tar/bsdtar.1 +++ b/tar/bsdtar.1 @@ -335,6 +335,12 @@ By default, the archive is always read to the very end, since there can be multiple entries with the same name and, by convention, later entries overwrite earlier entries. This option is provided as a performance optimization. +.It Fl S +(x mode only) +Extract files as sparse files. +For every block on disk, check first if it contains only NUL bytes and, if so, +seek over it instead of writing it. +This works similarly to the conv=sparse option of dd. .It Fl -strip-components Ar count ( Fl W Cm strip-components Ns = Ns Ar count ) (x and t mode only) Remove the specified number of leading path elements. diff --git a/tar/bsdtar.c b/tar/bsdtar.c index 43881cf3f..23db60189 100644 --- a/tar/bsdtar.c +++ b/tar/bsdtar.c @@ -118,7 +118,7 @@ static void version(void); * non-option. Otherwise, GNU getopt() permutes the arguments and * screws up -C processing. */ -static const char *tar_opts = "+Bb:C:cf:HhI:jkLlmnOoPprtT:UuvW:wX:xyZz"; +static const char *tar_opts = "+Bb:C:cf:HhI:jkLlmnOoPprtST:UuvW:wX:xyZz"; /* * Most of these long options are deliberately not documented. 
They @@ -499,6 +499,9 @@ main(int argc, char **argv) case 'r': /* SUSv2 */ set_mode(bsdtar, opt); break; + case 'S': /* NetBSD pax-as-tar */ + bsdtar->extract_flags |= ARCHIVE_EXTRACT_SPARSE; + break; case OPTION_STRIP_COMPONENTS: /* GNU tar 1.15 */ bsdtar->strip_components = atoi(optarg); break; -- 2.47.3