From: Karel Zak Date: Mon, 1 Nov 2021 11:00:21 +0000 (+0100) Subject: hardlink: add --cache-size X-Git-Tag: v2.38-rc1~196^2~2 X-Git-Url: http://git.ipfire.org/?a=commitdiff_plain;h=f3212b9165ada01d7fc761c5bb29ecb0449eb0c1;p=thirdparty%2Futil-linux.git hardlink: add --cache-size Signed-off-by: Karel Zak --- diff --git a/misc-utils/hardlink.1.adoc b/misc-utils/hardlink.1.adoc index bdd83d0978..36f8bfdbf7 100644 --- a/misc-utils/hardlink.1.adoc +++ b/misc-utils/hardlink.1.adoc @@ -24,6 +24,15 @@ hardlink - link multiple copies of a file *hardlink* is a tool which replaces copies of a file with hardlinks, therefore saving space. +*hardlink* creates a binary tree from file sizes and then compares files of the +same size. There are two basic content comparison methods. The *memcmp* method directly reads +data blocks from files and compares them. The other methods are based on checksums (like SHA256); +in this case a checksum is calculated for each data block by the Linux kernel crypto API, and this +checksum is stored in userspace and used for file comparison. For each file an +"intro" buffer (32 bytes) is also cached; this buffer is used independently of the comparison method and the requested +cache-size and io-size. The "intro" buffer dramatically reduces operations on file content, as +files very often differ right from the beginning. + == OPTIONS *-h*, *--help*:: @@ -79,12 +88,19 @@ A regular expression to include files. If the option *--exclude* has been given, The minimum size to consider. By default this is 1, so empty files will not be linked. The _size_ argument may be followed by the multiplicative suffixes KiB (=1024), MiB (=1024*1024), and so on for GiB, TiB, PiB, EiB, ZiB and YiB (the "iB" is optional, e.g., "K" has the same meaning as "KiB"). *-b*, *--io-size* _size_:: -The size of read or sendfile buffer used when comparing file contents. The -_size_ argument may be followed by the multiplicative suffixes KiB, MiB, etc. 
-The "iB" is optional, e.g., "K" has the same meaning as "KiB". The default is -8KiB for memcmp method and 1MiB for the other methods. The only memcmp method -uses process memory for the buffer, other methods use zero-copy way and I/O -operation is done in kernel. +The size of the read() or sendfile() buffer used when comparing file contents. +The _size_ argument may be followed by the multiplicative suffixes KiB, MiB, +etc. The "iB" is optional, e.g., "K" has the same meaning as "KiB". The +default is 8KiB for the memcmp method and 1MiB for the other methods. Only the +memcmp method uses process memory for the buffer; the other methods use a zero-copy +approach where the I/O operation is done in the kernel. The size may be altered on the fly +to fit a number of cached content checksums. + +*-r*, *--cache-size* _size_:: +The size of the cache for content checksums. All non-memcmp methods calculate a checksum for each +file content block (see *--io-size*); these checksums are cached for the next comparison. The +size is important for large files or large sets of files of the same size. The default is +10MiB. 
== ARGUMENTS diff --git a/misc-utils/hardlink.c b/misc-utils/hardlink.c index 035e3f43ec..20998ffeaf 100644 --- a/misc-utils/hardlink.c +++ b/misc-utils/hardlink.c @@ -156,6 +156,7 @@ static struct options { unsigned int dry_run:1; uintmax_t min_size; size_t io_size; + size_t cache_size; } opts = { /* default setting */ .method = "sha256", @@ -164,7 +165,8 @@ static struct options { .respect_time = TRUE, .respect_xattrs = FALSE, .keep_oldest = FALSE, - .min_size = 1 + .min_size = 1, + .cache_size = 10*1024*1024 }; /* @@ -824,7 +826,9 @@ static void visitor(const void *nodep, const VISIT which, const int depth) nnodes = count_nodes(master); if (!nnodes) continue; - memsiz = (10*1024*1024)/nnodes; + + /* per-file cache size */ + memsiz = opts.cache_size / nnodes; /* filesiz, readsiz, memsiz */ ul_fileeq_set_size(&fileeq, master->st.st_size, opts.io_size, memsiz); @@ -912,6 +916,7 @@ static void __attribute__((__noreturn__)) usage(void) fputs(_(" -i, --include regular expression to include files/dirs\n"), out); fputs(_(" -s, --minimum-size minimum size for files.\n"), out); fputs(_(" -b, --io-size I/O buffer size for file reading (speedup, using more RAM)\n"), out); + fputs(_(" -r, --cache-size memory limit for cached file content data\n"), out); fputs(_(" -c, --content compare only file contents, same as -pot\n"), out); fputs(USAGE_SEPARATOR, out); @@ -928,7 +933,7 @@ static void __attribute__((__noreturn__)) usage(void) */ static int parse_options(int argc, char *argv[]) { - static const char optstr[] = "VhvnfpotXcmMOx:y:i:s:b:q"; + static const char optstr[] = "VhvnfpotXcmMOx:y:i:r:s:b:q"; static const struct option long_options[] = { {"version", no_argument, NULL, 'V'}, {"help", no_argument, NULL, 'h'}, @@ -949,6 +954,7 @@ static int parse_options(int argc, char *argv[]) {"io-size", required_argument, NULL, 'b'}, {"content", no_argument, NULL, 'c'}, {"quiet", no_argument, NULL, 'q'}, + {"cache-size", required_argument, NULL, 'r'}, {NULL, 0, NULL, 0} }; static 
const ul_excl_t excl[] = { @@ -1015,6 +1021,9 @@ static int parse_options(int argc, char *argv[]) case 's': opts.min_size = strtosize_or_err(optarg, _("failed to parse size")); break; + case 'r': + opts.cache_size = strtosize_or_err(optarg, _("failed to parse cache size")); + break; case 'b': opts.io_size = strtosize_or_err(optarg, _("failed to parse I/O size")); break;