From: Karel Zak Date: Fri, 29 Oct 2021 11:42:58 +0000 (+0200) Subject: hardlink: rewrite files content comparison X-Git-Tag: v2.38-rc1~196^2~5 X-Git-Url: http://git.ipfire.org/?a=commitdiff_plain;h=259bed153a9e97d3eec0f7d90ed0cef6fa407332;p=thirdparty%2Futil-linux.git hardlink: rewrite files content comparison Example: # hardlink --ignore-time --dry-run /var/lib Old version, based on memcmp(), nothing cached: Mode: dry-run Files: 93453 Linked: 36172 files Compared: 0 xattrs Compared: 25755615 files Saved: 80.11 MiB Duration: 204.557185 seconds New version, --method=memcmp, 32 bytes cached: Mode: dry-run Method: memcmp Files: 93453 Linked: 36172 files Compared: 0 xattrs Compared: 25755615 files Saved: 80.11 MiB Duration: 5.248426 seconds New version, zero-copy (Linux crypto API), sha1, crc32 and sha256: Mode: dry-run Method: sha1 Files: 93453 Linked: 36172 files Compared: 0 xattrs Compared: 25755615 files Saved: 80.11 MiB Duration: 2.870973 seconds Mode: dry-run Method: crc32 Files: 93453 Linked: 36172 files Compared: 0 xattrs Compared: 25755615 files Saved: 80.11 MiB Duration: 2.582651 seconds Mode: dry-run Method: sha256 Files: 93453 Linked: 36172 files Compared: 0 xattrs Compared: 25755615 files Saved: 80.11 MiB Duration: 3.152825 seconds The default is sha256 to be robust.. Signed-off-by: Karel Zak --- diff --git a/configure.ac b/configure.ac index 60e63592bd..60ee3b4235 100644 --- a/configure.ac +++ b/configure.ac @@ -315,6 +315,7 @@ AC_CHECK_HEADERS([ \ linux/tiocl.h \ linux/version.h \ linux/watchdog.h \ + linux/if_alg.h \ locale.h \ mntent.h \ net/if_dl.h \ diff --git a/misc-utils/Makemodule.am b/misc-utils/Makemodule.am index 6415f0cf0f..f5f12b8f13 100644 --- a/misc-utils/Makemodule.am +++ b/misc-utils/Makemodule.am @@ -241,7 +241,7 @@ if BUILD_HARDLINK usrbin_exec_PROGRAMS += hardlink MANPAGES += misc-utils/hardlink.1 dist_noinst_DATA += misc-utils/hardlink.1.adoc -hardlink_SOURCES = misc-utils/hardlink.c lib/monotonic.c +hardlink_SOURCES = misc-utils/hardlink.c lib/monotonic.c lib/fileeq.c hardlink_LDADD = $(LDADD) libcommon.la $(REALTIME_LIBS) hardlink_CFLAGS = $(AM_CFLAGS) endif diff --git a/misc-utils/hardlink.1.adoc b/misc-utils/hardlink.1.adoc index 84365656a2..cf7ce72bd5 100644 --- a/misc-utils/hardlink.1.adoc +++ b/misc-utils/hardlink.1.adoc @@ -38,6 +38,13 @@ Quiet mode, don't print anything. *-n*, *--dry-run*:: Do not act, just print what would happen. +*-y*, *--method* _name_:: +Set the file content comparison method. The currently supported methods are +sha256, sha1, crc32c and memcpy. The default is sha256 and memcmp if Linux +Crypto API is not available. The methods based on checksums are implemented in +zero-copy way, in this case file content is not copied to the userspace and all +calculation is done in kernel. + *-f*, *--respect-name*:: Only try to link files with the same (base)name. It's strongly recommended to use long options rather than *-f* which is interpreted in a different way by other *hardlink* implementations. @@ -77,6 +84,7 @@ costs some additional memory but potentially reduces the amount of seek operations and therefore improve performance, especially with mechanic disk drives. Optional factor suffixes are supported, like with the *-s* option. This is mostly efficient with other filters (i.e. with *-f* or *-X*) and can be less efficient with *-top* options. + == ARGUMENTS *hardlink* takes one or more directories which will be searched for files to be linked. diff --git a/misc-utils/hardlink.c b/misc-utils/hardlink.c index 7abec29006..d00d1db7ce 100644 --- a/misc-utils/hardlink.c +++ b/misc-utils/hardlink.c @@ -43,6 +43,7 @@ #include "strutils.h" #include "monotonic.h" #include "optutils.h" +#include "fileeq.h" #include /* regcomp(), regsearch() */ @@ -52,6 +53,8 @@ static int quiet; /* don't print anything */ +static struct ul_fileeq fileeq; + /** * struct file - Information about a file * @st: The stat buffer associated with the file @@ -63,6 +66,8 @@ static int quiet; /* don't print anything */ */ struct file { struct stat st; + struct ul_fileeq_data data; + struct file *next; struct link { struct link *next; @@ -142,6 +147,7 @@ static struct options { struct hdl_regex *include; struct hdl_regex *exclude; + const char *method; signed int verbosity; unsigned int respect_mode:1; unsigned int respect_owner:1; @@ -156,6 +162,7 @@ static struct options { size_t bufsiz; } opts = { /* default setting */ + .method = "sha256", .respect_mode = TRUE, .respect_owner = TRUE, .respect_time = TRUE, @@ -174,12 +181,6 @@ static struct options { static void *files; static void *files_by_ino; -/* - * Temporary buffers for reading file contents - */ -static char *buf_a = NULL; -static char *buf_b = NULL; - /* * last_signal * @@ -319,6 +320,7 @@ static void print_stats(void) jlog(JLOG_SUMMARY, "%-15s %s", _("Mode:"), opts.dry_run ? _("dry-run") : _("real")); + jlog(JLOG_SUMMARY, "%-15s %s", _("Method:"), opts.method); jlog(JLOG_SUMMARY, "%-15s %zu", _("Files:"), stats.files); jlog(JLOG_SUMMARY, _("%-15s %zu files"), _("Linked:"), stats.linked); @@ -553,82 +555,13 @@ static int file_xattrs_equal(const struct file *a, const struct file *b) } #endif /* HAVE_SYS_XATTR_H */ -/** - * file_contents_equal - Compare contents of two files for equality - * @a: The first file - * @b: The second file - * - * Compare the contents of the files for equality - */ -static int file_contents_equal(const struct file *a, const struct file *b) -{ - FILE *fa = NULL; - FILE *fb = NULL; - int cmp = 0; /* zero => equal */ - off_t off = 0; /* current offset */ - - assert(a->links != NULL); - assert(b->links != NULL); - - jlog(JLOG_VERBOSE2, _("Comparing %s to %s"), a->links->path, - b->links->path); - - stats.comparisons++; - - if ((fa = fopen(a->links->path, "rb")) == NULL) - goto err; - if ((fb = fopen(b->links->path, "rb")) == NULL) - goto err; - -#if defined(POSIX_FADV_SEQUENTIAL) && defined(HAVE_POSIX_FADVISE) - ignore_result( posix_fadvise(fileno(fa), 0, 0, POSIX_FADV_SEQUENTIAL) ); - ignore_result( posix_fadvise(fileno(fb), 0, 0, POSIX_FADV_SEQUENTIAL) ); -#endif - - while (!handle_interrupt() && cmp == 0) { - size_t ca; - size_t cb; - - ca = fread(buf_a, 1, opts.bufsiz, fa); - if (ca < sizeof(buf_a) && ferror(fa)) - goto err; - - cb = fread(buf_b, 1, opts.bufsiz, fb); - if (cb < sizeof(buf_b) && ferror(fb)) - goto err; - - off += ca; - - if ((ca != cb || ca == 0)) { - cmp = CMP(ca, cb); - break; - } - cmp = memcmp(buf_a, buf_b, ca); - } - out: - if (fa != NULL) - fclose(fa); - if (fb != NULL) - fclose(fb); - return !handle_interrupt() && cmp == 0; - err: - if (fa == NULL || fb == NULL) - warn(_("cannot open %s"), fa ? b->links->path : a->links->path); - else - warn(_("cannot read %s"), - ferror(fa) ? a->links->path : b->links->path); - cmp = 1; - goto out; -} - /** * file_may_link_to - Check whether a file may replace another one * @a: The first file * @b: The second file * - * Check whether the two fies are considered equal and can be linked - * together. If the two files are identical, the result will be FALSE, - * as replacing a link with an identical one is stupid. + * Check whether the two files are considered equal attributes and can be + * linked. This function does not compare content od the files! */ static int file_may_link_to(const struct file *a, const struct file *b) { @@ -644,8 +577,7 @@ static int file_may_link_to(const struct file *a, const struct file *b) (!opts.respect_name || strcmp(a->links->path + a->links->basename, b->links->path + b->links->basename) == 0) && - (!opts.respect_xattrs || file_xattrs_equal(a, b)) && - file_contents_equal(a, b)); + (!opts.respect_xattrs || file_xattrs_equal(a, b))); } /** @@ -854,6 +786,16 @@ static int inserter(const char *fpath, const struct stat *sb, return 0; } +static inline size_t count_nodes(struct file *x) +{ + size_t ct = 0; + + for ( ; x != NULL; x = x->next) + ct++; + + return ct; +} + /** * visitor - Callback for twalk() * @nodep: Pointer to a pointer to a #struct file @@ -867,6 +809,7 @@ static int inserter(const char *fpath, const struct stat *sb, static void visitor(const void *nodep, const VISIT which, const int depth) { struct file *master = *(struct file **)nodep; + struct file *begin = master; struct file *other; (void)depth; @@ -875,26 +818,66 @@ static void visitor(const void *nodep, const VISIT which, const int depth) return; for (; master != NULL; master = master->next) { + size_t nnodes, memsiz; if (handle_interrupt()) exit(EXIT_FAILURE); if (master->links == NULL) continue; + /* calculate per file max memory use */ + nnodes = count_nodes(master); + if (!nnodes) + continue; + memsiz = (10*1024*1024)/nnodes; + /* filesiz, readsiz, memsiz */ + ul_fileeq_set_size(&fileeq, master->st.st_size, 1024*1024, memsiz); + for (other = master->next; other != NULL; other = other->next) { + int eq; + if (handle_interrupt()) exit(EXIT_FAILURE); assert(other != other->next); assert(other->st.st_size == master->st.st_size); - if (other->links == NULL - || !file_may_link_to(master, other)) + /* check file attributes, etc. */ + if (!other->links || !file_may_link_to(master, other)) continue; - if (!file_link(master, other) && errno == EMLINK) + /* initialize content comparison */ + if (!ul_fileeq_data_associated(&master->data)) + ul_fileeq_data_set_file(&master->data, master->links->path); + if (!ul_fileeq_data_associated(&other->data)) + ul_fileeq_data_set_file(&other->data, other->links->path); + + /* compare files */ + eq = ul_fileeq(&fileeq, &master->data, &other->data); + + /* reduce number of open files, keep only master open */ + ul_fileeq_data_close_file(&other->data); + + stats.comparisons++; + + if (!eq) + continue; + + /* link files */ + if (!file_link(master, other) && errno == EMLINK) { + ul_fileeq_data_deinit(&master->data); master = other; + } } + + /* don't keep master data in memory */ + ul_fileeq_data_deinit(&master->data); + } + + /* final cleanup */ + for (other = begin; other != NULL; other = other->next) { + if (ul_fileeq_data_associated(&other->data)) + ul_fileeq_data_deinit(&other->data); } } @@ -916,6 +899,8 @@ static void __attribute__((__noreturn__)) usage(void) fputs(_(" -v, --verbose verbose output (repeat for more verbosity)\n"), out); fputs(_(" -q, --quiet quiet mode - don't print anything\n"), out); fputs(_(" -n, --dry-run don't actually link anything\n"), out); + fputs(_(" -y, --method file content comparison method\n"), out); + fputs(_(" -f, --respect-name filenames have to be identical\n"), out); fputs(_(" -p, --ignore-mode ignore changes of file mode\n"), out); fputs(_(" -o, --ignore-owner ignore owner changes\n"), out); @@ -941,19 +926,6 @@ static void __attribute__((__noreturn__)) usage(void) exit(EXIT_SUCCESS); } - -static void init_buffers(size_t bufsiz) -{ - buf_a = xmalloc(bufsiz); - buf_b = xmalloc(bufsiz); -} - -static void deinit_buffers(void) -{ - free(buf_a); - free(buf_b); -} - /** * parse_options - Parse the command line options * @argc: Number of options @@ -961,7 +933,7 @@ static void deinit_buffers(void) */ static int parse_options(int argc, char *argv[]) { - static const char optstr[] = "VhvnfpotXcmMOx:i:s:S:q"; + static const char optstr[] = "VhvnfpotXcmMOx:y:i:s:S:q"; static const struct option long_options[] = { {"version", no_argument, NULL, 'V'}, {"help", no_argument, NULL, 'h'}, @@ -977,6 +949,7 @@ static int parse_options(int argc, char *argv[]) {"keep-oldest", no_argument, NULL, 'O'}, {"exclude", required_argument, NULL, 'x'}, {"include", required_argument, NULL, 'i'}, + {"method", required_argument, NULL, 'y' }, {"minimum-size", required_argument, NULL, 's'}, {"buffer-size", required_argument, NULL, 'S'}, {"content", no_argument, NULL, 'c'}, @@ -1038,6 +1011,9 @@ static int parse_options(int argc, char *argv[]) case 'x': register_regex(&opts.exclude, optarg); break; + case 'y': + opts.method = optarg; + break; case 'i': register_regex(&opts.include, optarg); break; @@ -1082,6 +1058,7 @@ static void sighandler(int i) int main(int argc, char *argv[]) { struct sigaction sa; + int rc; sa.sa_handler = sighandler; sa.sa_flags = SA_RESTART; @@ -1104,7 +1081,14 @@ int main(int argc, char *argv[]) gettime_monotonic(&stats.start_time); - init_buffers(opts.bufsiz); + rc = ul_fileeq_init(&fileeq, opts.method); + if (rc != 0 && strcmp(opts.method, "memcmp") != 0) { + warnx(_("cannot initialize %s method, use 'memcmp' fallback"), opts.method); + opts.method = "memcmp"; + rc = ul_fileeq_init(&fileeq, opts.method); + } + if (rc < 0) + err(EXIT_FAILURE, _("failed to initialize files comparior")); stats.started = TRUE; @@ -1116,7 +1100,6 @@ int main(int argc, char *argv[]) twalk(files, visitor); - deinit_buffers(); - + ul_fileeq_deinit(&fileeq); return 0; }