*hardlink* is a tool which replaces copies of a file with hardlinks, therefore saving space.
+*hardlink* creates a binary tree from file sizes and then compares files of the
+same size. There are two basic content comparison methods. The *memcmp* method directly reads
+data blocks from files and compares them. The other methods are based on checksums (like SHA256);
+in this case, a checksum is calculated for each data block by the Linux kernel crypto API, and this
+checksum is stored in userspace and used for file comparison. An "intro" buffer (32 bytes) is also
+cached for each file; this buffer is used independently of the comparison method and the requested
+cache-size and io-size. The "intro" buffer dramatically reduces operations on file content, as
+files very often differ right from the beginning.
+
== OPTIONS
*-h*, *--help*::
The minimum size to consider. By default this is 1, so empty files will not be linked. The _size_ argument may be followed by the multiplicative suffixes KiB (=1024), MiB (=1024*1024), and so on for GiB, TiB, PiB, EiB, ZiB and YiB (the "iB" is optional, e.g., "K" has the same meaning as "KiB").
*-b*, *--io-size* _size_::
-The size of read or sendfile buffer used when comparing file contents. The
-_size_ argument may be followed by the multiplicative suffixes KiB, MiB, etc.
-The "iB" is optional, e.g., "K" has the same meaning as "KiB". The default is
-8KiB for memcmp method and 1MiB for the other methods. The only memcmp method
-uses process memory for the buffer, other methods use zero-copy way and I/O
-operation is done in kernel.
+The size of the read() or sendfile() buffer used when comparing file contents.
+The _size_ argument may be followed by the multiplicative suffixes KiB, MiB,
+etc. The "iB" is optional, e.g., "K" has the same meaning as "KiB". The
+default is 8KiB for the memcmp method and 1MiB for the other methods. Only the
+memcmp method uses process memory for the buffer; the other methods use a
+zero-copy approach and the I/O operation is done in the kernel. The size may be
+altered on the fly to fit the number of cached content checksums.
+
+*-r*, *--cache-size* _size_::
+The size of the cache for content checksums. All non-memcmp methods calculate a checksum for each
+file content block (see --io-size); these checksums are cached for the next comparison. The
+size is important for large files or large sets of files of the same size. The default is
+10MiB.
== ARGUMENTS
unsigned int dry_run:1;
uintmax_t min_size;
size_t io_size;
+ size_t cache_size;
} opts = {
/* default setting */
.method = "sha256",
.respect_time = TRUE,
.respect_xattrs = FALSE,
.keep_oldest = FALSE,
- .min_size = 1
+ .min_size = 1,
+ .cache_size = 10*1024*1024
};
/*
nnodes = count_nodes(master);
if (!nnodes)
continue;
- memsiz = (10*1024*1024)/nnodes;
+
+ /* per-file cache size */
+ memsiz = opts.cache_size / nnodes;
/* filesiz, readsiz, memsiz */
ul_fileeq_set_size(&fileeq, master->st.st_size, opts.io_size, memsiz);
fputs(_(" -i, --include <regex> regular expression to include files/dirs\n"), out);
fputs(_(" -s, --minimum-size <size> minimum size for files.\n"), out);
fputs(_(" -b, --io-size <size> I/O buffer size for file reading (speedup, using more RAM)\n"), out);
+ fputs(_(" -r, --cache-size <size> memory limit for cached file content data\n"), out);
fputs(_(" -c, --content compare only file contents, same as -pot\n"), out);
fputs(USAGE_SEPARATOR, out);
*/
static int parse_options(int argc, char *argv[])
{
- static const char optstr[] = "VhvnfpotXcmMOx:y:i:s:b:q";
+ static const char optstr[] = "VhvnfpotXcmMOx:y:i:r:s:b:q";
static const struct option long_options[] = {
{"version", no_argument, NULL, 'V'},
{"help", no_argument, NULL, 'h'},
{"io-size", required_argument, NULL, 'b'},
{"content", no_argument, NULL, 'c'},
{"quiet", no_argument, NULL, 'q'},
+ {"cache-size", required_argument, NULL, 'r'},
{NULL, 0, NULL, 0}
};
static const ul_excl_t excl[] = {
case 's':
opts.min_size = strtosize_or_err(optarg, _("failed to parse size"));
break;
+ case 'r':
+ /* -r/--cache-size: memory limit for cached file content data */
+ opts.cache_size = strtosize_or_err(optarg, _("failed to parse cache size"));
+ break;
case 'b':
opts.io_size = strtosize_or_err(optarg, _("failed to parse I/O size"));
break;