1 /* hardlink.c - Link multiple identical files together
3 * Copyright (C) 2008 - 2014 Julian Andres Klode <jak@jak-linux.org>
4 * Copyright (C) 2021 Karel Zak <kzak@redhat.com>
6 * SPDX-License-Identifier: MIT
8 * Permission is hereby granted, free of charge, to any person obtaining a copy
9 * of this software and associated documentation files (the "Software"), to deal
10 * in the Software without restriction, including without limitation the rights
11 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
12 * copies of the Software, and to permit persons to whom the Software is
13 * furnished to do so, subject to the following conditions:
15 * The above copyright notice and this permission notice shall be included in
16 * all copies or substantial portions of the Software.
18 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
19 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
20 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
21 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
22 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
23 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
26 #define _POSIX_C_SOURCE 200112L /* POSIX functions */
27 #define _XOPEN_SOURCE 600 /* nftw() */
29 #include <sys/types.h> /* stat */
30 #include <sys/stat.h> /* stat */
31 #include <sys/time.h> /* getrlimit, getrusage */
32 #include <sys/resource.h> /* getrlimit, getrusage */
33 #include <fcntl.h> /* posix_fadvise */
34 #include <ftw.h> /* ftw */
35 #include <search.h> /* tsearch() and friends */
36 #include <signal.h> /* SIG*, sigaction */
37 #include <getopt.h> /* getopt_long() */
38 #include <ctype.h> /* tolower() */
39 #include <sys/ioctl.h>
41 #if defined(HAVE_LINUX_FIEMAP_H) && defined(HAVE_SYS_VFS_H)
42 # include <linux/fs.h>
43 # include <linux/fiemap.h>
45 # define USE_REFLINK 1
53 #include "monotonic.h"
58 # include "statfs_magic.h"
61 #include <regex.h> /* regcomp(), regexec() */
63 #if defined(HAVE_SYS_XATTR_H) && defined(HAVE_LLISTXATTR) && defined(HAVE_LGETXATTR)
64 # include <sys/xattr.h>
68 static int quiet
; /* don't print anything */
69 static int rootbasesz
; /* size of the directory for nftw() */
77 static int reflink_mode
= REFLINK_NEVER
;
78 static int reflinks_skip
;
81 static struct ul_fileeq fileeq
;
84 * struct file - Information about a file
85 * @st: The stat buffer associated with the file
86 * @next: Next file with the same size
87 * @basename: The offset of the basename in the filename
88 * @path: The path of the file
90 * This contains all information we need about a file.
94 struct ul_fileeq_data data
;
101 #if __STDC_VERSION__ >= 199901L
112 * enum log_level - Logging levels
113 * @JLOG_SUMMARY: Default log level
114 * @JLOG_INFO: Verbose logging (verbose == 1)
115 * @JLOG_VERBOSE1: Verbosity 2
116 * @JLOG_VERBOSE2: Verbosity 3
126 * struct statistics - Statistics about the file
127 * @started: Whether we are post command-line processing
128 * @files: The number of files worked on
129 * @linked: The number of files replaced by a hardlink to a master
130 * @xattr_comparisons: The number of extended attribute comparisons
131 * @comparisons: The number of comparisons
132 * @saved: The (exaggerated) amount of space saved
133 * @start_time: The time we started at
135 static struct statistics
{
139 size_t xattr_comparisons
;
141 size_t ignored_reflinks
;
143 struct timeval start_time
;
148 regex_t re
; /* POSIX compatible regex handler */
150 struct hdl_regex
*next
;
154 * struct options - Processed command-line options
155 * @include: A linked list of regular expressions for the --include option
156 * @exclude: A linked list of regular expressions for the --exclude option
157 * @verbosity: The verbosity. Should be one of #enum log_level
158 * @respect_mode: Whether to respect file modes (default = TRUE)
159 * @respect_owner: Whether to respect file owners (uid, gid; default = TRUE)
160 * @respect_name: Whether to respect file names (default = FALSE)
161 * @respect_time: Whether to respect file modification times (default = TRUE)
162 * @respect_xattrs: Whether to respect extended attributes (default = FALSE)
163 * @maximise: Choose the file with the highest link count as master
164 * @minimise: Choose the file with the lowest link count as master
165 * @keep_oldest: Choose the file with oldest timestamp as master (default = FALSE)
166 * @dry_run: Specifies whether hardlink should not link files (default = FALSE)
167 * @min_size: Minimum size of files to consider. (default = 1 byte)
168 * @max_size: Maximum size of files to consider, 0 means unlimited. (default = 0 byte)
170 static struct options
{
171 struct hdl_regex
*include
;
172 struct hdl_regex
*exclude
;
175 signed int verbosity
;
176 unsigned int respect_mode
:1;
177 unsigned int respect_owner
:1;
178 unsigned int respect_name
:1;
179 unsigned int respect_dir
:1;
180 unsigned int respect_time
:1;
181 unsigned int respect_xattrs
:1;
182 unsigned int maximise
:1;
183 unsigned int minimise
:1;
184 unsigned int keep_oldest
:1;
185 unsigned int dry_run
:1;
191 /* default setting */
192 #ifdef USE_FILEEQ_CRYPTOAPI
197 .respect_mode
= TRUE
,
198 .respect_owner
= TRUE
,
199 .respect_time
= TRUE
,
200 .respect_xattrs
= FALSE
,
201 .keep_oldest
= FALSE
,
203 .cache_size
= 10*1024*1024
209 * A binary tree of files, managed using tsearch(). To see which nodes
210 * are considered equal, see compare_nodes()
213 static void *files_by_ino
;
218 * The last signal we received. We store the signal here in order to be able
219 * to break out of loops gracefully and to return from our nftw() handler.
221 static volatile sig_atomic_t last_signal
;
224 #define is_log_enabled(_level) (quiet == 0 && (_level) <= (unsigned int)opts.verbosity)
227 * jlog - Logging for hardlink
228 * @level: The log level
229 * @format: A format string for printf()
231 __attribute__((format(printf
, 2, 3)))
232 static void jlog(enum log_level level
, const char *format
, ...)
236 if (!is_log_enabled(level
))
239 va_start(args
, format
);
240 vfprintf(stdout
, format
, args
);
246 * CMP - Compare two numerical values, return 1, 0, or -1
250 * Used to compare two integers of any size while avoiding overflow.
252 #define CMP(a, b) ((a) > (b) ? 1 : ((a) < (b) ? -1 : 0))
255 * register_regex - Compile and insert a regular expression into list
256 * @pregs: Pointer to a linked list of regular expressions
257 * @regex: String containing the regular expression to be compiled
259 static void register_regex(struct hdl_regex
**pregs
, const char *regex
)
261 struct hdl_regex
*link
;
264 link
= xmalloc(sizeof(*link
));
266 if ((err
= regcomp(&link
->re
, regex
, REG_NOSUB
| REG_EXTENDED
)) != 0) {
267 size_t size
= regerror(err
, &link
->re
, NULL
, 0);
268 char *buf
= xmalloc(size
+ 1);
270 regerror(err
, &link
->re
, buf
, size
);
272 errx(EXIT_FAILURE
, _("could not compile regular expression %s: %s"),
275 link
->next
= *pregs
; *pregs
= link
;
279 * match_any_regex - Match against multiple regular expressions
280 * @pregs: A linked list of regular expressions
281 * @what: The string to match against
283 * Checks whether any of the regular expressions in the list matches the
286 static int match_any_regex(struct hdl_regex
*pregs
, const char *what
)
288 for (; pregs
!= NULL
; pregs
= pregs
->next
) {
289 if (regexec(&pregs
->re
, what
, 0, NULL
, 0) == 0)
296 * compare_nodes - Node comparison function
297 * @_a: The first node (a #struct file)
298 * @_b: The second node (a #struct file)
300 * Compare the two nodes for the binary tree.
302 static int compare_nodes(const void *_a
, const void *_b
)
304 const struct file
*a
= _a
;
305 const struct file
*b
= _b
;
309 diff
= CMP(a
->st
.st_dev
, b
->st
.st_dev
);
311 diff
= CMP(a
->st
.st_size
, b
->st
.st_size
);
316 /* Compare only filenames */
317 static inline int filename_strcmp(const struct file
*a
, const struct file
*b
)
319 return strcmp( a
->links
->path
+ a
->links
->basename
,
320 b
->links
->path
+ b
->links
->basename
);
324 * Compare only directory names (ignores root directory and basename (filename))
326 * The complete path contains three fragments:
328 * <rootdir> is specified on hardlink command line
329 * <dirname> is everything between rootdir and filename
330 * <filename> is last component (aka basename)
332 static inline int dirname_strcmp(const struct file
*a
, const struct file
*b
)
335 int asz
= a
->links
->basename
- a
->links
->dirname
,
336 bsz
= b
->links
->basename
- b
->links
->dirname
;
338 diff
= CMP(asz
, bsz
);
341 const char *a_start
, *b_start
;
343 a_start
= a
->links
->path
+ a
->links
->dirname
;
344 b_start
= b
->links
->path
+ b
->links
->dirname
;
346 diff
= strncmp(a_start
, b_start
, asz
);
352 * compare_nodes_ino - Node comparison function
353 * @_a: The first node (a #struct file)
354 * @_b: The second node (a #struct file)
356 * Compare the two nodes for the binary tree.
358 static int compare_nodes_ino(const void *_a
, const void *_b
)
360 const struct file
*a
= _a
;
361 const struct file
*b
= _b
;
365 diff
= CMP(a
->st
.st_dev
, b
->st
.st_dev
);
367 diff
= CMP(a
->st
.st_ino
, b
->st
.st_ino
);
369 /* If opts.respect_name is used, we will restrict a struct file to
370 * contain only links with the same basename to keep the rest simple.
372 if (diff
== 0 && opts
.respect_name
)
373 diff
= filename_strcmp(a
, b
);
374 if (diff
== 0 && opts
.respect_dir
)
375 diff
= dirname_strcmp(a
, b
);
381 * print_stats - Print statistics to stdout
383 static void print_stats(void)
385 struct timeval end
= { 0, 0 }, delta
= { 0, 0 };
388 gettime_monotonic(&end
);
389 timersub(&end
, &stats
.start_time
, &delta
);
391 jlog(JLOG_SUMMARY
, "%-25s %s", _("Mode:"),
392 opts
.dry_run
? _("dry-run") : _("real"));
393 jlog(JLOG_SUMMARY
, "%-25s %s", _("Method:"), opts
.method
);
394 jlog(JLOG_SUMMARY
, "%-25s %zu", _("Files:"), stats
.files
);
395 jlog(JLOG_SUMMARY
, _("%-25s %zu files"), _("Linked:"), stats
.linked
);
398 jlog(JLOG_SUMMARY
, _("%-25s %zu xattrs"), _("Compared:"),
399 stats
.xattr_comparisons
);
401 jlog(JLOG_SUMMARY
, _("%-25s %zu files"), _("Compared:"),
405 jlog(JLOG_SUMMARY
, _("%-25s %zu files"), _("Skipped reflinks:"),
406 stats
.ignored_reflinks
);
408 ssz
= size_to_human_string(SIZE_SUFFIX_3LETTER
|
410 SIZE_DECIMAL_2DIGITS
, stats
.saved
);
412 jlog(JLOG_SUMMARY
, "%-25s %s", _("Saved:"), ssz
);
415 jlog(JLOG_SUMMARY
, _("%-25s %"PRId64
".%06"PRId64
" seconds"), _("Duration:"),
416 (int64_t)delta
.tv_sec
, (int64_t)delta
.tv_usec
);
420 * handle_interrupt - Handle a signal
422 * Returns: %TRUE on SIGINT, SIGTERM; %FALSE on all other signals.
424 static int handle_interrupt(void)
426 switch (last_signal
) {
443 * llistxattr_or_die - Wrapper for llistxattr()
445 * This does the same thing as llistxattr() except that it aborts if any error
446 * other than "not supported" is detected.
448 static ssize_t
llistxattr_or_die(const char *path
, char *list
, size_t size
)
450 ssize_t len
= llistxattr(path
, list
, size
);
452 if (len
< 0 && errno
!= ENOTSUP
)
453 err(EXIT_FAILURE
, _("cannot get xattr names for %s"), path
);
459 * lgetxattr_or_die - Wrapper for lgetxattr()
461 * This does the same thing as lgetxattr() except that it aborts upon error.
463 static ssize_t
lgetxattr_or_die(const char *path
,
464 const char *name
, void *value
, size_t size
)
466 ssize_t len
= lgetxattr(path
, name
, value
, size
);
469 err(EXIT_FAILURE
, _("cannot get xattr value of %s for %s"),
476 * get_xattr_name_count - Count the number of xattr names
477 * @names: a non-empty table of concatenated, null-terminated xattr names
478 * @len: the total length of the table
480 * @Returns the number of xattr names
482 static int get_xattr_name_count(const char *const names
, ssize_t len
)
487 for (name
= names
; name
< (names
+ len
); name
+= strlen(name
) + 1)
494 * cmp_xattr_name_ptrs - Compare two pointers to xattr names by comparing
495 * the names they point to.
static int cmp_xattr_name_ptrs(const void *ptr1, const void *ptr2)
{
	/*
	 * Used as a qsort() comparator over a table of name pointers: each
	 * argument is the address of a table slot, so one dereference is
	 * needed to reach the NUL-terminated name itself.
	 */
	const char *const *lhs = ptr1;
	const char *const *rhs = ptr2;

	return strcmp(*lhs, *rhs);
}
503 * get_sorted_xattr_name_table - Create a sorted table of xattr names.
504 * @names - table of concatenated, null-terminated xattr names
505 * @n - the number of names
507 * @Returns allocated table of pointers to the names, sorted alphabetically
509 static const char **get_sorted_xattr_name_table(const char *names
, int n
)
511 const char **table
= xmalloc(n
* sizeof(char *));
514 for (i
= 0; i
< n
; i
++) {
516 names
+= strlen(names
) + 1;
519 qsort(table
, n
, sizeof(char *), cmp_xattr_name_ptrs
);
525 * file_xattrs_equal - Compare the extended attributes of two files
527 * @b: The second file
529 * @Returns: %TRUE if and only if extended attributes are equal
531 static int file_xattrs_equal(const struct file
*a
, const struct file
*b
)
535 char *names_a
= NULL
;
536 char *names_b
= NULL
;
539 const char **name_ptrs_a
= NULL
;
540 const char **name_ptrs_b
= NULL
;
541 void *value_a
= NULL
;
542 void *value_b
= NULL
;
546 assert(a
->links
!= NULL
);
547 assert(b
->links
!= NULL
);
549 jlog(JLOG_VERBOSE1
, _("Comparing xattrs of %s to %s"), a
->links
->path
,
552 stats
.xattr_comparisons
++;
554 len_a
= llistxattr_or_die(a
->links
->path
, NULL
, 0);
555 len_b
= llistxattr_or_die(b
->links
->path
, NULL
, 0);
557 if (len_a
<= 0 && len_b
<= 0)
558 return TRUE
; // xattrs not supported or neither file has any
561 return FALSE
; // total lengths of xattr names differ
563 names_a
= xmalloc(len_a
);
564 names_b
= xmalloc(len_b
);
566 len_a
= llistxattr_or_die(a
->links
->path
, names_a
, len_a
);
567 len_b
= llistxattr_or_die(b
->links
->path
, names_b
, len_b
);
568 assert((len_a
> 0) && (len_a
== len_b
));
570 n_a
= get_xattr_name_count(names_a
, len_a
);
571 n_b
= get_xattr_name_count(names_b
, len_b
);
574 goto exit
; // numbers of xattrs differ
576 name_ptrs_a
= get_sorted_xattr_name_table(names_a
, n_a
);
577 name_ptrs_b
= get_sorted_xattr_name_table(names_b
, n_b
);
579 // We now have two sorted tables of xattr names.
581 for (i
= 0; i
< n_a
; i
++) {
582 if (handle_interrupt())
583 goto exit
; // user wants to quit
585 if (strcmp(name_ptrs_a
[i
], name_ptrs_b
[i
]) != 0)
586 goto exit
; // names at same slot differ
589 lgetxattr_or_die(a
->links
->path
, name_ptrs_a
[i
], NULL
, 0);
591 lgetxattr_or_die(b
->links
->path
, name_ptrs_b
[i
], NULL
, 0);
594 goto exit
; // xattrs with same name, different value lengths
596 value_a
= xmalloc(len_a
);
597 value_b
= xmalloc(len_b
);
599 len_a
= lgetxattr_or_die(a
->links
->path
, name_ptrs_a
[i
],
601 len_b
= lgetxattr_or_die(b
->links
->path
, name_ptrs_b
[i
],
603 assert((len_a
>= 0) && (len_a
== len_b
));
605 if (memcmp(value_a
, value_b
, len_a
) != 0)
606 goto exit
; // xattrs with same name, different values
625 #else /* !USE_XATTR */
626 static int file_xattrs_equal(const struct file
*a
, const struct file
*b
)
630 #endif /* USE_XATTR */
633 * file_may_link_to - Check whether a file may replace another one
635 * @b: The second file
637 * Check whether the two files have equal attributes and can be
638 * linked. This function does not compare the content of the files!
640 static int file_may_link_to(const struct file
*a
, const struct file
*b
)
642 return (a
->st
.st_size
!= 0 &&
643 a
->st
.st_size
== b
->st
.st_size
&&
644 a
->links
!= NULL
&& b
->links
!= NULL
&&
645 a
->st
.st_dev
== b
->st
.st_dev
&&
646 a
->st
.st_ino
!= b
->st
.st_ino
&&
647 (!opts
.respect_mode
|| a
->st
.st_mode
== b
->st
.st_mode
) &&
648 (!opts
.respect_owner
|| a
->st
.st_uid
== b
->st
.st_uid
) &&
649 (!opts
.respect_owner
|| a
->st
.st_gid
== b
->st
.st_gid
) &&
650 (!opts
.respect_time
|| a
->st
.st_mtime
== b
->st
.st_mtime
) &&
651 (!opts
.respect_name
|| filename_strcmp(a
, b
) == 0) &&
652 (!opts
.respect_dir
|| dirname_strcmp(a
, b
) == 0) &&
653 (!opts
.respect_xattrs
|| file_xattrs_equal(a
, b
)));
657 * file_compare - Compare two files to decide which should be master
659 * @b: The second file
661 * Check which of the files should be considered greater and thus serve
662 * as the master when linking (the master is the file that all equal files
663 * will be replaced with).
665 static int file_compare(const struct file
*a
, const struct file
*b
)
668 if (a
->st
.st_dev
== b
->st
.st_dev
&& a
->st
.st_ino
== b
->st
.st_ino
)
671 if (res
== 0 && opts
.maximise
)
672 res
= CMP(a
->st
.st_nlink
, b
->st
.st_nlink
);
673 if (res
== 0 && opts
.minimise
)
674 res
= CMP(b
->st
.st_nlink
, a
->st
.st_nlink
);
676 res
= opts
.keep_oldest
? CMP(b
->st
.st_mtime
, a
->st
.st_mtime
)
677 : CMP(a
->st
.st_mtime
, b
->st
.st_mtime
);
679 res
= CMP(b
->st
.st_ino
, a
->st
.st_ino
);
685 static inline int do_link(struct file
*a
, struct file
*b
,
686 const char *new_name
, int reflink
)
689 int dest
= -1, src
= -1;
691 dest
= open(new_name
, O_CREAT
|O_WRONLY
|O_TRUNC
, 0600);
694 if (fchmod(dest
, b
->st
.st_mode
) != 0)
696 if (fchown(dest
, b
->st
.st_uid
, b
->st
.st_gid
) != 0)
698 src
= open(a
->links
->path
, O_RDONLY
);
701 if (ioctl(dest
, FICLONE
, src
) != 0)
714 if (reflink_mode
== REFLINK_ALWAYS
)
716 jlog(JLOG_VERBOSE2
,_("Reflinking failed, fallback to hardlinking"));
719 return link(a
->links
->path
, new_name
);
722 static inline int do_link(struct file
*a
,
723 struct file
*b
__attribute__((__unused__
)),
724 const char *new_name
,
725 int reflink
__attribute__((__unused__
)))
727 return link(a
->links
->path
, new_name
);
729 #endif /* USE_REFLINK */
732 * file_link - Replace b with a link to a
734 * @b: The second file
736 * Link the file, replacing @b with the current one. The file is first
737 * linked to a temporary name, and then renamed to the name of @b, making
738 * the replace atomic (@b will always exist).
740 static int file_link(struct file
*a
, struct file
*b
, int reflink
)
744 assert(a
->links
!= NULL
);
745 assert(b
->links
!= NULL
);
747 if (is_log_enabled(JLOG_INFO
)) {
748 char *ssz
= size_to_human_string(SIZE_SUFFIX_3LETTER
|
750 SIZE_DECIMAL_2DIGITS
, a
->st
.st_size
);
751 jlog(JLOG_INFO
, _("%s%sLinking %s to %s (-%s)"),
752 opts
.dry_run
? _("[DryRun] ") : "",
753 reflink
? "Ref" : "",
754 a
->links
->path
, b
->links
->path
,
763 xasprintf(&new_path
, "%s.hardlink-temporary", b
->links
->path
);
765 if (do_link(a
, b
, new_path
, reflink
) != 0)
766 warn(_("cannot link %s to %s"), a
->links
->path
, new_path
);
768 else if (rename(new_path
, b
->links
->path
) != 0) {
769 warn(_("cannot rename %s to %s"), a
->links
->path
, new_path
);
779 /* Update statistics */
782 /* Increase the link count of this file, and set stat() of other file */
786 if (b
->st
.st_nlink
== 0)
787 stats
.saved
+= a
->st
.st_size
;
789 /* Move the link from file b to a */
791 struct link
*new_link
= b
->links
;
793 b
->links
= b
->links
->next
;
794 new_link
->next
= a
->links
->next
;
795 a
->links
->next
= new_link
;
805 static int has_fpath(struct file
*node
, const char *path
)
809 for (l
= node
->links
; l
; l
= l
->next
) {
810 if (strcmp(l
->path
, path
) == 0)
819 * inserter - Callback function for nftw()
820 * @fpath: The path of the file being visited
821 * @sb: The stat information of the file
822 * @typeflag: The type flag
823 * @ftwbuf: Contains current level of nesting and offset of basename
825 * Called by nftw() for the files. See the manual page for nftw() for
826 * further information.
828 static int inserter(const char *fpath
, const struct stat
*sb
,
829 int typeflag
, struct FTW
*ftwbuf
)
837 if (handle_interrupt())
839 if (typeflag
== FTW_DNR
|| typeflag
== FTW_NS
)
840 warn(_("cannot read %s"), fpath
);
841 if (typeflag
!= FTW_F
|| !S_ISREG(sb
->st_mode
))
844 included
= match_any_regex(opts
.include
, fpath
);
845 excluded
= match_any_regex(opts
.exclude
, fpath
);
847 if ((opts
.exclude
&& excluded
&& !included
) ||
848 (!opts
.exclude
&& opts
.include
&& !included
))
853 if ((uintmax_t) sb
->st_size
< opts
.min_size
) {
855 _("Skipped %s (smaller than configured size)"), fpath
);
859 jlog(JLOG_VERBOSE2
, " %5zu: [%" PRIu64
"/%" PRIu64
"/%zu] %s",
860 stats
.files
, sb
->st_dev
, sb
->st_ino
,
861 (size_t) sb
->st_nlink
, fpath
);
863 if ((opts
.max_size
> 0) && ((uintmax_t) sb
->st_size
> opts
.max_size
)) {
865 _("Skipped %s (greater than configured size)"), fpath
);
869 pathlen
= strlen(fpath
) + 1;
871 fil
= xcalloc(1, sizeof(*fil
));
872 fil
->links
= xcalloc(1, sizeof(struct link
) + pathlen
);
875 fil
->links
->basename
= ftwbuf
->base
;
876 fil
->links
->dirname
= rootbasesz
;
877 fil
->links
->next
= NULL
;
879 memcpy(fil
->links
->path
, fpath
, pathlen
);
881 node
= tsearch(fil
, &files_by_ino
, compare_nodes_ino
);
887 /* Already known inode, add link to inode information */
888 assert((*node
)->st
.st_dev
== sb
->st_dev
);
889 assert((*node
)->st
.st_ino
== sb
->st_ino
);
891 if (has_fpath(*node
, fpath
)) {
893 _("Skipped %s (specified more than once)"), fpath
);
896 fil
->links
->next
= (*node
)->links
;
897 (*node
)->links
= fil
->links
;
902 /* New inode, insert into by-size table */
903 node
= tsearch(fil
, &files
, compare_nodes
);
911 if (file_compare(fil
, *node
) >= 0) {
915 for (l
= *node
; l
!= NULL
; l
= l
->next
) {
917 && file_compare(fil
, l
->next
) < 0)
932 warn(_("cannot continue")); /* probably ENOMEM */
937 static int is_reflink_compatible(dev_t devno
, const char *filename
)
939 static dev_t last_dev
= 0;
940 static int last_status
= 0;
942 if (last_dev
!= devno
) {
945 if (statfs(filename
, &vfs
) != 0)
949 switch (vfs
.f_type
) {
950 case STATFS_BTRFS_MAGIC
:
951 case STATFS_XFS_MAGIC
:
963 static int is_reflink(struct file
*xa
, struct file
*xb
)
965 int last
= 0, rc
= 0;
966 char abuf
[BUFSIZ
] = { 0 },
967 bbuf
[BUFSIZ
] = { 0 };
969 struct fiemap
*amap
= (struct fiemap
*) abuf
,
970 *bmap
= (struct fiemap
*) bbuf
;
972 int af
= open(xa
->links
->path
, O_RDONLY
),
973 bf
= open(xb
->links
->path
, O_RDONLY
);
975 if (af
< 0 || bf
< 0)
981 amap
->fm_length
= ~0ULL;
982 amap
->fm_flags
= FIEMAP_FLAG_SYNC
;
983 amap
->fm_extent_count
= (sizeof(abuf
) - sizeof(*amap
)) / sizeof(struct fiemap_extent
);
985 bmap
->fm_length
= ~0ULL;
986 bmap
->fm_flags
= FIEMAP_FLAG_SYNC
;
987 bmap
->fm_extent_count
= (sizeof(bbuf
) - sizeof(*bmap
)) / sizeof(struct fiemap_extent
);
989 if (ioctl(af
, FS_IOC_FIEMAP
, (unsigned long) amap
) < 0)
991 if (ioctl(bf
, FS_IOC_FIEMAP
, (unsigned long) bmap
) < 0)
994 if (amap
->fm_mapped_extents
== 0 ||
995 amap
->fm_mapped_extents
!= bmap
->fm_mapped_extents
)
998 for (i
= 0; i
< amap
->fm_mapped_extents
; i
++) {
999 struct fiemap_extent
*a
= &amap
->fm_extents
[i
];
1000 struct fiemap_extent
*b
= &bmap
->fm_extents
[i
];
1002 if (a
->fe_logical
!= b
->fe_logical
||
1003 a
->fe_length
!= b
->fe_length
||
1004 a
->fe_physical
!= b
->fe_physical
)
1006 if (!(a
->fe_flags
& FIEMAP_EXTENT_SHARED
) ||
1007 !(b
->fe_flags
& FIEMAP_EXTENT_SHARED
))
1009 if (a
->fe_flags
& FIEMAP_EXTENT_LAST
)
1013 bmap
->fm_start
= amap
->fm_start
=
1014 amap
->fm_extents
[amap
->fm_mapped_extents
- 1].fe_logical
+
1015 amap
->fm_extents
[amap
->fm_mapped_extents
- 1].fe_length
;
1016 } while (last
== 0);
1026 #endif /* USE_REFLINK */
1028 static inline size_t count_nodes(struct file
*x
)
1032 for ( ; x
!= NULL
; x
= x
->next
)
1039 * visitor - Callback for twalk()
1040 * @nodep: Pointer to a pointer to a #struct file
1041 * @which: At which point this visit is (preorder, postorder, endorder)
1042 * @depth: The depth of the node in the tree
1044 * Visit the nodes in the binary tree. For each node, call hardlinker()
1045 * on each #struct file in the linked list of #struct file instances located
1048 static void visitor(const void *nodep
, const VISIT which
, const int depth
)
1050 struct file
*master
= *(struct file
**)nodep
;
1051 struct file
*begin
= master
;
1056 if (which
!= leaf
&& which
!= endorder
)
1059 for (; master
!= NULL
; master
= master
->next
) {
1060 size_t nnodes
, memsiz
;
1061 int may_reflink
= 0;
1063 if (handle_interrupt())
1065 if (master
->links
== NULL
)
1068 /* calculate per file max memory use */
1069 nnodes
= count_nodes(master
);
1073 /* per-file cache size */
1074 memsiz
= opts
.cache_size
/ nnodes
;
1075 /* filesiz, readsiz, memsiz */
1076 ul_fileeq_set_size(&fileeq
, master
->st
.st_size
, opts
.io_size
, memsiz
);
1079 if (reflink_mode
|| reflinks_skip
) {
1081 reflink_mode
== REFLINK_ALWAYS
? 1 :
1082 is_reflink_compatible(master
->st
.st_dev
,
1083 master
->links
->path
);
1086 for (other
= master
->next
; other
!= NULL
; other
= other
->next
) {
1089 if (handle_interrupt())
1092 assert(other
!= other
->next
);
1093 assert(other
->st
.st_size
== master
->st
.st_size
);
1098 /* check file attributes, etc. */
1099 if (!file_may_link_to(master
, other
)) {
1101 _("Skipped (attributes mismatch) %s"), other
->links
->path
);
1105 if (may_reflink
&& reflinks_skip
&& is_reflink(master
, other
)) {
1107 _("Skipped (already reflink) %s"), other
->links
->path
);
1108 stats
.ignored_reflinks
++;
1112 /* initialize content comparison */
1113 if (!ul_fileeq_data_associated(&master
->data
))
1114 ul_fileeq_data_set_file(&master
->data
, master
->links
->path
);
1115 if (!ul_fileeq_data_associated(&other
->data
))
1116 ul_fileeq_data_set_file(&other
->data
, other
->links
->path
);
1119 eq
= ul_fileeq(&fileeq
, &master
->data
, &other
->data
);
1121 /* reduce number of open files, keep only master open */
1122 ul_fileeq_data_close_file(&other
->data
);
1124 stats
.comparisons
++;
1128 _("Skipped (content mismatch) %s"), other
->links
->path
);
1133 if (!file_link(master
, other
, may_reflink
) && errno
== EMLINK
) {
1134 ul_fileeq_data_deinit(&master
->data
);
1139 /* don't keep master data in memory */
1140 ul_fileeq_data_deinit(&master
->data
);
1144 for (other
= begin
; other
!= NULL
; other
= other
->next
) {
1145 if (ul_fileeq_data_associated(&other
->data
))
1146 ul_fileeq_data_deinit(&other
->data
);
1151 * usage - Print the program help and exit
1153 static void __attribute__((__noreturn__
)) usage(void)
1157 fputs(USAGE_HEADER
, out
);
1158 fprintf(out
, _(" %s [options] <directory>|<file> ...\n"),
1159 program_invocation_short_name
);
1161 fputs(USAGE_SEPARATOR
, out
);
1162 fputs(_("Consolidate duplicate files using hardlinks.\n"), out
);
1164 fputs(USAGE_OPTIONS
, out
);
1165 fputs(_(" -c, --content compare only file contents, same as -pot\n"), out
);
1166 fputs(_(" -b, --io-size <size> I/O buffer size for file reading\n"
1167 " (speedup, using more RAM)\n"), out
);
1168 fputs(_(" -d, --respect-dir directory names have to be identical\n"), out
);
1169 fputs(_(" -f, --respect-name filenames have to be identical\n"), out
);
1170 fputs(_(" -i, --include <regex> regular expression to include files/dirs\n"), out
);
1171 fputs(_(" -m, --maximize maximize the hardlink count, remove the file with\n"
1172 " lowest hardlink count\n"), out
);
1173 fputs(_(" -M, --minimize reverse the meaning of -m\n"), out
);
1174 fputs(_(" -n, --dry-run don't actually link anything\n"), out
);
1175 fputs(_(" -o, --ignore-owner ignore owner changes\n"), out
);
1176 fputs(_(" -O, --keep-oldest keep the oldest file of multiple equal files\n"
1177 " (lower precedence than minimize/maximize)\n"), out
);
1178 fputs(_(" -p, --ignore-mode ignore changes of file mode\n"), out
);
1179 fputs(_(" -q, --quiet quiet mode - don't print anything\n"), out
);
1180 fputs(_(" -r, --cache-size <size> memory limit for cached file content data\n"), out
);
1181 fputs(_(" -s, --minimum-size <size> minimum size for files.\n"), out
);
1182 fputs(_(" -S, --maximum-size <size> maximum size for files.\n"), out
);
1183 fputs(_(" -t, --ignore-time ignore timestamps (when testing for equality)\n"), out
);
1184 fputs(_(" -v, --verbose verbose output (repeat for more verbosity)\n"), out
);
1185 fputs(_(" -x, --exclude <regex> regular expression to exclude files\n"), out
);
1187 fputs(_(" -X, --respect-xattrs respect extended attributes\n"), out
);
1189 fputs(_(" -y, --method <name> file content comparison method\n"), out
);
1192 fputs(_(" --reflink[=<when>] create clone/CoW copies (auto, always, never)\n"), out
);
1193 fputs(_(" --skip-reflinks skip already cloned files (enabled on --reflink)\n"), out
);
1195 fputs(USAGE_SEPARATOR
, out
);
1196 fprintf(out
, USAGE_HELP_OPTIONS(28));
1197 fprintf(out
, USAGE_MAN_TAIL("hardlink(1)"));
1203 * parse_options - Parse the command line options
1204 * @argc: Number of options
1205 * @argv: Array of options
1207 static int parse_options(int argc
, char *argv
[])
1210 OPT_REFLINK
= CHAR_MAX
+ 1,
1213 static const char optstr
[] = "VhvndfpotXcmMOx:y:i:r:S:s:b:q";
1214 static const struct option long_options
[] = {
1215 {"version", no_argument
, NULL
, 'V'},
1216 {"help", no_argument
, NULL
, 'h'},
1217 {"verbose", no_argument
, NULL
, 'v'},
1218 {"dry-run", no_argument
, NULL
, 'n'},
1219 {"respect-name", no_argument
, NULL
, 'f'},
1220 {"respect-dir", no_argument
, NULL
, 'd'},
1221 {"ignore-mode", no_argument
, NULL
, 'p'},
1222 {"ignore-owner", no_argument
, NULL
, 'o'},
1223 {"ignore-time", no_argument
, NULL
, 't'},
1224 {"respect-xattrs", no_argument
, NULL
, 'X'},
1225 {"maximize", no_argument
, NULL
, 'm'},
1226 {"minimize", no_argument
, NULL
, 'M'},
1227 {"keep-oldest", no_argument
, NULL
, 'O'},
1228 {"exclude", required_argument
, NULL
, 'x'},
1229 {"include", required_argument
, NULL
, 'i'},
1230 {"method", required_argument
, NULL
, 'y' },
1231 {"minimum-size", required_argument
, NULL
, 's'},
1232 {"maximum-size", required_argument
, NULL
, 'S'},
1234 {"reflink", optional_argument
, NULL
, OPT_REFLINK
},
1235 {"skip-reflinks", no_argument
, NULL
, OPT_SKIP_RELINKS
},
1237 {"io-size", required_argument
, NULL
, 'b'},
1238 {"content", no_argument
, NULL
, 'c'},
1239 {"quiet", no_argument
, NULL
, 'q'},
1240 {"cache-size", required_argument
, NULL
, 'r'},
1243 static const ul_excl_t excl
[] = {
1247 int excl_st
[ARRAY_SIZE(excl
)] = UL_EXCL_STATUS_INIT
;
1248 int c
, content_only
= 0;
1250 while ((c
= getopt_long(argc
, argv
, optstr
, long_options
, NULL
)) != -1) {
1252 err_exclusive_options(c
, long_options
, excl
, excl_st
);
1256 opts
.respect_mode
= FALSE
;
1259 opts
.respect_owner
= FALSE
;
1262 opts
.respect_time
= FALSE
;
1265 opts
.respect_xattrs
= TRUE
;
1268 opts
.maximise
= TRUE
;
1271 opts
.minimise
= TRUE
;
1274 opts
.keep_oldest
= TRUE
;
1277 opts
.respect_name
= TRUE
;
1280 opts
.respect_dir
= TRUE
;
1295 register_regex(&opts
.exclude
, optarg
);
1298 opts
.method
= optarg
;
1301 register_regex(&opts
.include
, optarg
);
1304 opts
.min_size
= strtosize_or_err(optarg
, _("failed to parse minimum size"));
1307 opts
.max_size
= strtosize_or_err(optarg
, _("failed to parse maximum size"));
1310 opts
.cache_size
= strtosize_or_err(optarg
, _("failed to parse cache size"));
1313 opts
.io_size
= strtosize_or_err(optarg
, _("failed to parse I/O size"));
1317 reflink_mode
= REFLINK_AUTO
;
1319 if (strcmp(optarg
, "auto") == 0)
1320 reflink_mode
= REFLINK_AUTO
;
1321 else if (strcmp(optarg
, "always") == 0)
1322 reflink_mode
= REFLINK_ALWAYS
;
1323 else if (strcmp(optarg
, "never") == 0)
1324 reflink_mode
= REFLINK_NEVER
;
1326 errx(EXIT_FAILURE
, _("unsupported reflink mode; %s"), optarg
);
1328 if (reflink_mode
!= REFLINK_NEVER
)
1331 case OPT_SKIP_RELINKS
:
1339 static const char *features
[] = {
1343 #ifdef USE_FILEEQ_CRYPTOAPI
1348 print_version_with_features(EXIT_SUCCESS
, features
);
1351 errtryhelp(EXIT_FAILURE
);
1356 opts
.respect_mode
= FALSE
;
1357 opts
.respect_name
= FALSE
;
1358 opts
.respect_dir
= FALSE
;
1359 opts
.respect_owner
= FALSE
;
1360 opts
.respect_time
= FALSE
;
1361 opts
.respect_xattrs
= FALSE
;
1367 * to_be_called_atexit - Cleanup handler, also prints statistics.
1369 static void to_be_called_atexit(void)
1376 * sighandler - Signal handler, sets the global last_signal variable
1377 * @i: The signal number
1379 static void sighandler(int i
)
1381 if (last_signal
!= SIGINT
)
1384 /* can't use stdio on signal handler */
1385 ignore_result(write(STDOUT_FILENO
, "\n", sizeof("\n")-1));
1388 int main(int argc
, char *argv
[])
1390 struct sigaction sa
;
1393 sa
.sa_handler
= sighandler
;
1394 sa
.sa_flags
= SA_RESTART
;
1395 sigfillset(&sa
.sa_mask
);
1397 /* If we receive a SIGINT, end the processing */
1398 sigaction(SIGINT
, &sa
, NULL
);
1399 sigaction(SIGUSR1
, &sa
, NULL
);
1401 /* Localize messages, number formatting, and anything else. */
1402 setlocale(LC_ALL
, "");
1403 bindtextdomain(PACKAGE
, LOCALEDIR
);
1404 textdomain(PACKAGE
);
1406 if (atexit(to_be_called_atexit
) != 0)
1407 err(EXIT_FAILURE
, _("cannot register exit handler"));
1409 parse_options(argc
, argv
);
1412 errx(EXIT_FAILURE
, _("no directory or file specified"));
1414 gettime_monotonic(&stats
.start_time
);
1416 rc
= ul_fileeq_init(&fileeq
, opts
.method
);
1417 if (rc
!= 0 && strcmp(opts
.method
, "memcmp") != 0) {
1418 jlog(JLOG_INFO
, _("cannot initialize %s method, use 'memcmp' fallback"), opts
.method
);
1419 opts
.method
= "memcmp";
1420 rc
= ul_fileeq_init(&fileeq
, opts
.method
);
1423 err(EXIT_FAILURE
, _("failed to initialize files comparior"));
1425 /* default I/O size */
1426 if (!opts
.io_size
) {
1427 if (strcmp(opts
.method
, "memcmp") == 0)
1428 opts
.io_size
= 8*1024;
1430 opts
.io_size
= 1024*1024;
1433 stats
.started
= TRUE
;
1435 jlog(JLOG_VERBOSE2
, _("Scanning [device/inode/links]:"));
1436 for (; optind
< argc
; optind
++) {
1437 char *path
= realpath(argv
[optind
], NULL
);
1440 warn(_("cannot get realpath: %s"), argv
[optind
]);
1443 if (opts
.respect_dir
)
1444 rootbasesz
= strlen(path
);
1445 if (nftw(path
, inserter
, 20, FTW_PHYS
) == -1)
1446 warn(_("cannot process %s"), path
);
1451 twalk(files
, visitor
);
1453 ul_fileeq_deinit(&fileeq
);