From ac417cce0ca752aa85d4453c9e697a66f8545bc7 Mon Sep 17 00:00:00 2001 From: =?utf8?q?Dag-Erling=20Sm=C3=B8rgrav?= Date: Sat, 23 Mar 2024 01:35:29 +0100 Subject: [PATCH] Improved control over frame size in zstd filter. (#2081) Instead of just `min-frame-size` and `max-frame-size`, we now have four separate options: * `min-frame-in` delays the creation of a new frame on flush until the uncompressed size of the current frame passes a certain threshold. * `min-frame-out` delays the creation of a new frame on flush until the compressed size of the current frame passes a certain threshold. * `max-frame-in` forces the creation of a new frame as soon as possible after the uncompressed size of the current frame reaches a certain limit. * `max-frame-out` forces the creation of a new frame as soon as possible after the compressed size of the current frame reaches a certain limit. We now also support `k`, `kB`, `M`, `MB`, `G` and `GB` suffixes for all four options. The old options are retained as aliases for the corresponding new options. --- libarchive/archive_write_add_filter_zstd.c | 92 ++++++++++++++++----- libarchive/test/test_write_filter_zstd.c | 94 +++++++++++++++++++--- tar/bsdtar.1 | 50 ++++++++++-- 3 files changed, 198 insertions(+), 38 deletions(-) diff --git a/libarchive/archive_write_add_filter_zstd.c b/libarchive/archive_write_add_filter_zstd.c index 9a7b7f22c..9a74085b5 100644 --- a/libarchive/archive_write_add_filter_zstd.c +++ b/libarchive/archive_write_add_filter_zstd.c @@ -1,5 +1,6 @@ /*- * Copyright (c) 2017 Sean Purcell + * Copyright (c) 2023-2024 Klara, Inc. * All rights reserved. * * Redistribution and use in source and binary forms, with or without @@ -59,8 +60,10 @@ struct private_data { resetting, } state; int frame_per_file; - size_t min_frame_size; - size_t max_frame_size; + size_t min_frame_in; + size_t max_frame_in; + size_t min_frame_out; + size_t max_frame_out; size_t cur_frame; size_t cur_frame_in; size_t cur_frame_out; @@ -129,8 +132,10 @@ archive_write_add_filter_zstd(struct archive *_a) data->long_distance = 0; #if HAVE_ZSTD_H && HAVE_LIBZSTD_COMPRESSOR data->frame_per_file = 0; - data->min_frame_size = 0; - data->max_frame_size = SIZE_MAX; + data->min_frame_in = 0; + data->max_frame_in = SIZE_MAX; + data->min_frame_out = 0; + data->max_frame_out = SIZE_MAX; data->cur_frame_in = 0; data->cur_frame_out = 0; data->cstream = ZSTD_createCStream(); @@ -170,7 +175,8 @@ archive_compressor_zstd_free(struct archive_write_filter *f) return (ARCHIVE_OK); } -static int string_to_number(const char *string, intmax_t *numberp) +static int +string_to_number(const char *string, intmax_t *numberp) { char *end; @@ -184,6 +190,41 @@ static int string_to_number(const char *string, intmax_t *numberp) return (ARCHIVE_OK); } +static int +string_to_size(const char *string, size_t *numberp) +{ + uintmax_t number; + char *end; + unsigned int shift = 0; + + if (string == NULL || *string == '\0' || *string == '-') + return (ARCHIVE_WARN); + number = strtoumax(string, &end, 10); + if (end > string) { + if (*end == 'K' || *end == 'k') { + shift = 10; + end++; + } else if (*end == 'M' || *end == 'm') { + shift = 20; + end++; + } else if (*end == 'G' || *end == 'g') { + shift = 30; + end++; + } + if (*end == 'B' || *end == 'b') { + end++; + } + } + if (end == string || *end != '\0' || errno == EOVERFLOW) { + return (ARCHIVE_WARN); + } + if (number > (uintmax_t)SIZE_MAX >> shift) { + return (ARCHIVE_WARN); + } + *numberp = (size_t)(number << shift); + return (ARCHIVE_OK); +} + /* * Set write options. */ @@ -232,25 +273,29 @@ archive_compressor_zstd_options(struct archive_write_filter *f, const char *key, } else if (strcmp(key, "frame-per-file") == 0) { data->frame_per_file = 1; return (ARCHIVE_OK); - } else if (strcmp(key, "min-frame-size") == 0) { - intmax_t min_frame_size; - if (string_to_number(value, &min_frame_size) != ARCHIVE_OK) { + } else if (strcmp(key, "min-frame-in") == 0) { + if (string_to_size(value, &data->min_frame_in) != ARCHIVE_OK) { return (ARCHIVE_WARN); } - if (min_frame_size < 0) { + return (ARCHIVE_OK); + } else if (strcmp(key, "min-frame-out") == 0 || + strcmp(key, "min-frame-size") == 0) { + if (string_to_size(value, &data->min_frame_out) != ARCHIVE_OK) { return (ARCHIVE_WARN); } - data->min_frame_size = min_frame_size; return (ARCHIVE_OK); - } else if (strcmp(key, "max-frame-size") == 0) { - intmax_t max_frame_size; - if (string_to_number(value, &max_frame_size) != ARCHIVE_OK) { + } else if (strcmp(key, "max-frame-in") == 0 || + strcmp(key, "max-frame-size") == 0) { + if (string_to_size(value, &data->max_frame_in) != ARCHIVE_OK || + data->max_frame_in < 1024) { return (ARCHIVE_WARN); } - if (max_frame_size < 1024) { + return (ARCHIVE_OK); + } else if (strcmp(key, "max-frame-out") == 0) { + if (string_to_size(value, &data->max_frame_out) != ARCHIVE_OK || + data->max_frame_out < 1024) { return (ARCHIVE_WARN); } - data->max_frame_size = max_frame_size; return (ARCHIVE_OK); #endif } @@ -353,9 +398,12 @@ archive_compressor_zstd_flush(struct archive_write_filter *f) { struct private_data *data = (struct private_data *)f->data; - if (data->frame_per_file && data->state == running && - data->cur_frame_out > data->min_frame_size) - data->state = finishing; + if (data->frame_per_file && data->state == running) { + if (data->cur_frame_in > data->min_frame_in && + data->cur_frame_out > data->min_frame_out) { + data->state = finishing; + } + } return (drive_compressor(f, data, 1, NULL, 0)); } @@ -414,9 +462,11 @@ drive_compressor(struct archive_write_filter *f, data->total_in += in.pos - ipos; data->cur_frame_in += in.pos - ipos; data->cur_frame_out += data->out.pos - opos; - if (data->state == running && - data->cur_frame_in >= data->max_frame_size) { - data->state = finishing; + if (data->state == running) { + if (data->cur_frame_in >= data->max_frame_in || + data->cur_frame_out >= data->max_frame_out) { + data->state = finishing; + } } if (data->out.pos == data->out.size || (flush && data->out.pos > 0)) { diff --git a/libarchive/test/test_write_filter_zstd.c b/libarchive/test/test_write_filter_zstd.c index 04c0baa89..c68074e87 100644 --- a/libarchive/test/test_write_filter_zstd.c +++ b/libarchive/test/test_write_filter_zstd.c @@ -136,28 +136,98 @@ DEFINE_TEST(test_write_filter_zstd) /* frame-per-file: boolean */ assertEqualIntA(a, ARCHIVE_OK, archive_write_set_filter_option(a, NULL, "frame-per-file", "")); - /* min-frame-size: >= 0 */ + /* min-frame-in: >= 0 */ assertEqualIntA(a, ARCHIVE_FAILED, - archive_write_set_filter_option(a, NULL, "min-frame-size", "")); + archive_write_set_filter_option(a, NULL, "min-frame-out", "")); assertEqualIntA(a, ARCHIVE_FAILED, - archive_write_set_filter_option(a, NULL, "min-frame-size", "-1")); + archive_write_set_filter_option(a, NULL, "min-frame-out", "-1")); assertEqualIntA(a, ARCHIVE_OK, - archive_write_set_filter_option(a, NULL, "min-frame-size", "0")); + archive_write_set_filter_option(a, NULL, "min-frame-out", "0")); assertEqualIntA(a, ARCHIVE_OK, - archive_write_set_filter_option(a, NULL, "min-frame-size", "1048576")); - /* max-frame-size: >= 1024 */ + archive_write_set_filter_option(a, NULL, "min-frame-out", "1048576")); + assertEqualIntA(a, ARCHIVE_OK, + archive_write_set_filter_option(a, NULL, "min-frame-out", "1k")); + assertEqualIntA(a, ARCHIVE_OK, + archive_write_set_filter_option(a, NULL, "min-frame-out", "1kB")); + assertEqualIntA(a, ARCHIVE_OK, + archive_write_set_filter_option(a, NULL, "min-frame-out", "1M")); + assertEqualIntA(a, ARCHIVE_OK, + archive_write_set_filter_option(a, NULL, "min-frame-out", "1MB")); + assertEqualIntA(a, ARCHIVE_OK, + archive_write_set_filter_option(a, NULL, "min-frame-out", "1G")); + assertEqualIntA(a, ARCHIVE_OK, + archive_write_set_filter_option(a, NULL, "min-frame-out", "1GB")); + /* min-frame-out: >= 0 */ + assertEqualIntA(a, ARCHIVE_FAILED, + archive_write_set_filter_option(a, NULL, "min-frame-in", "")); + assertEqualIntA(a, ARCHIVE_FAILED, + archive_write_set_filter_option(a, NULL, "min-frame-in", "-1")); + assertEqualIntA(a, ARCHIVE_OK, + archive_write_set_filter_option(a, NULL, "min-frame-in", "0")); + assertEqualIntA(a, ARCHIVE_OK, + archive_write_set_filter_option(a, NULL, "min-frame-in", "1048576")); + assertEqualIntA(a, ARCHIVE_OK, + archive_write_set_filter_option(a, NULL, "min-frame-in", "1k")); + assertEqualIntA(a, ARCHIVE_OK, + archive_write_set_filter_option(a, NULL, "min-frame-in", "1kB")); + assertEqualIntA(a, ARCHIVE_OK, + archive_write_set_filter_option(a, NULL, "min-frame-in", "1M")); + assertEqualIntA(a, ARCHIVE_OK, + archive_write_set_filter_option(a, NULL, "min-frame-in", "1MB")); + assertEqualIntA(a, ARCHIVE_OK, + archive_write_set_filter_option(a, NULL, "min-frame-in", "1G")); + assertEqualIntA(a, ARCHIVE_OK, + archive_write_set_filter_option(a, NULL, "min-frame-in", "1GB")); + /* max-frame-in: >= 1024 */ + assertEqualIntA(a, ARCHIVE_FAILED, + archive_write_set_filter_option(a, NULL, "max-frame-in", "")); assertEqualIntA(a, ARCHIVE_FAILED, - archive_write_set_filter_option(a, NULL, "max-frame-size", "")); + archive_write_set_filter_option(a, NULL, "max-frame-in", "-1")); assertEqualIntA(a, ARCHIVE_FAILED, - archive_write_set_filter_option(a, NULL, "max-frame-size", "-1")); + archive_write_set_filter_option(a, NULL, "max-frame-in", "0")); assertEqualIntA(a, ARCHIVE_FAILED, - archive_write_set_filter_option(a, NULL, "max-frame-size", "0")); + archive_write_set_filter_option(a, NULL, "max-frame-in", "1023")); + assertEqualIntA(a, ARCHIVE_OK, + archive_write_set_filter_option(a, NULL, "max-frame-in", "1024")); + assertEqualIntA(a, ARCHIVE_OK, + archive_write_set_filter_option(a, NULL, "max-frame-in", "1048576")); + assertEqualIntA(a, ARCHIVE_OK, + archive_write_set_filter_option(a, NULL, "max-frame-in", "1k")); + assertEqualIntA(a, ARCHIVE_OK, + archive_write_set_filter_option(a, NULL, "max-frame-in", "1kB")); + assertEqualIntA(a, ARCHIVE_OK, + archive_write_set_filter_option(a, NULL, "max-frame-in", "1M")); + assertEqualIntA(a, ARCHIVE_OK, + archive_write_set_filter_option(a, NULL, "max-frame-in", "1MB")); + assertEqualIntA(a, ARCHIVE_OK, + archive_write_set_filter_option(a, NULL, "max-frame-in", "1G")); + assertEqualIntA(a, ARCHIVE_OK, + archive_write_set_filter_option(a, NULL, "max-frame-in", "1GB")); + /* max-frame-out: >= 1024 */ + assertEqualIntA(a, ARCHIVE_FAILED, + archive_write_set_filter_option(a, NULL, "max-frame-out", "")); assertEqualIntA(a, ARCHIVE_FAILED, - archive_write_set_filter_option(a, NULL, "max-frame-size", "1023")); + archive_write_set_filter_option(a, NULL, "max-frame-out", "-1")); + assertEqualIntA(a, ARCHIVE_FAILED, + archive_write_set_filter_option(a, NULL, "max-frame-out", "0")); + assertEqualIntA(a, ARCHIVE_FAILED, + archive_write_set_filter_option(a, NULL, "max-frame-out", "1023")); + assertEqualIntA(a, ARCHIVE_OK, + archive_write_set_filter_option(a, NULL, "max-frame-out", "1024")); + assertEqualIntA(a, ARCHIVE_OK, + archive_write_set_filter_option(a, NULL, "max-frame-out", "1048576")); + assertEqualIntA(a, ARCHIVE_OK, + archive_write_set_filter_option(a, NULL, "max-frame-out", "1k")); + assertEqualIntA(a, ARCHIVE_OK, + archive_write_set_filter_option(a, NULL, "max-frame-out", "1kB")); + assertEqualIntA(a, ARCHIVE_OK, + archive_write_set_filter_option(a, NULL, "max-frame-out", "1M")); + assertEqualIntA(a, ARCHIVE_OK, + archive_write_set_filter_option(a, NULL, "max-frame-out", "1MB")); assertEqualIntA(a, ARCHIVE_OK, - archive_write_set_filter_option(a, NULL, "max-frame-size", "1024")); + archive_write_set_filter_option(a, NULL, "max-frame-out", "1G")); assertEqualIntA(a, ARCHIVE_OK, - archive_write_set_filter_option(a, NULL, "max-frame-size", "1048576")); + archive_write_set_filter_option(a, NULL, "max-frame-out", "1GB")); #endif #if ZSTD_VERSION_NUMBER >= MINVER_LONG if ((int)(sizeof(size_t) == 4)) diff --git a/tar/bsdtar.1 b/tar/bsdtar.1 index ae84e00fc..e570d2a48 100644 --- a/tar/bsdtar.1 +++ b/tar/bsdtar.1 @@ -23,7 +23,7 @@ .\" OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF .\" SUCH DAMAGE. .\" -.Dd December 10, 2023 +.Dd March 1, 2024 .Dt TAR 1 .Os .Sh NAME @@ -655,16 +655,56 @@ use as many threads as there are CPU cores on the system. .It Cm zstd:frame-per-file Start a new compression frame at the beginning of each file in the archive. -.It Cm zstd:min-frame-size Ns = Ns Ar N +.It Cm zstd:min-frame-in Ns = Ns Ar N In combination with .Cm zstd:frame-per-file , -do not start a new compression frame unless the current frame is at least +do not start a new compression frame unless the uncompressed size of +the current frame is at least .Ar N bytes. -.It Cm zstd:max-frame-size Ns = Ns Ar N -Start a new compression frame as soon as the current frame exceeds +The number may be followed by +.Li k / Li kB , +.Li M / Li MB , +or +.Li G / Li GB +to indicate kilobytes, megabytes or gigabytes respectively. +.It Cm zstd:min-frame-out Ns = Ns Ar N , Cm zstd:min-frame-size Ns = Ns Ar N +In combination with +.Cm zstd:frame-per-file , +do not start a new compression frame unless the compressed size of the +current frame is at least +.Ar N +bytes. +The number may be followed by +.Li k / Li kB , +.Li M / Li MB , +or +.Li G / Li GB +to indicate kilobytes, megabytes or gigabytes respectively. +.It Cm zstd:max-frame-in Ns = Ns Ar N , Cm zstd:max-frame-size Ns = Ns Ar N +Start a new compression frame as soon as possible after the +uncompressed size of the current frame exceeds .Ar N bytes. +The number may be followed by +.Li k / Li kB , +.Li M / Li MB , +or +.Li G / Li GB +to indicate kilobytes, megabytes or gigabytes respectively. +Values less than 1,024 will be rejected. +.It Cm zstd:max-frame-out Ns = Ns Ar N +Start a new compression frame as soon as possible after the compressed +size of the current frame exceeds +.Ar N +bytes. +The number may be followed by +.Li k / Li kB , +.Li M / Li MB , +or +.Li G / Li GB +to indicate kilobytes, megabytes or gigabytes respectively. +Values less than 1,024 will be rejected. .It Cm lzop:compression-level A decimal integer from 1 to 9 specifying the lzop compression level. .It Cm xz:compression-level -- 2.47.2