]> git.ipfire.org Git - thirdparty/util-linux.git/blob - misc-utils/hardlink.c
hardlink: cleanup summary
[thirdparty/util-linux.git] / misc-utils / hardlink.c
1 /* hardlink.c - Link multiple identical files together
2 *
3 * Copyright (C) 2008 - 2014 Julian Andres Klode <jak@jak-linux.org>
4 * Copyright (C) 2021 Karel Zak <kzak@redhat.com>
5 *
6 * SPDX-License-Identifier: MIT
7 *
8 * Permission is hereby granted, free of charge, to any person obtaining a copy
9 * of this software and associated documentation files (the "Software"), to deal
10 * in the Software without restriction, including without limitation the rights
11 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
12 * copies of the Software, and to permit persons to whom the Software is
13 * furnished to do so, subject to the following conditions:
14 *
15 * The above copyright notice and this permission notice shall be included in
16 * all copies or substantial portions of the Software.
17 *
18 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
19 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
20 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
21 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
22 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
23 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
24 * THE SOFTWARE.
25 */
26 #define _POSIX_C_SOURCE 200112L /* POSIX functions */
27 #define _XOPEN_SOURCE 600 /* nftw() */
28
29 #include <sys/types.h> /* stat */
30 #include <sys/stat.h> /* stat */
31 #include <sys/time.h> /* getrlimit, getrusage */
32 #include <sys/resource.h> /* getrlimit, getrusage */
33 #include <fcntl.h> /* posix_fadvise */
34 #include <ftw.h> /* ftw */
35 #include <search.h> /* tsearch() and friends */
36 #include <signal.h> /* SIG*, sigaction */
37 #include <getopt.h> /* getopt_long() */
38 #include <ctype.h> /* tolower() */
39
40 #include "nls.h"
41 #include "c.h"
42 #include "xalloc.h"
43 #include "strutils.h"
44 #include "monotonic.h"
45
46 /* Use libpcre2posix if it's available */
47 #ifdef HAVE_PCRE2_POSIX
48 # include <pcre2posix.h>
49 # undef REG_NOSUB
50 # define REG_NOSUB 0 /* we do want backreferences in PCRE mode */
51 #else
52 # include <regex.h> /* regcomp(), regsearch() */
53 #endif
54
55 #ifdef HAVE_SYS_XATTR_H
56 # include <sys/xattr.h> /* listxattr, getxattr */
57 #endif
58
59 /**
60 * struct file - Information about a file
61 * @st: The stat buffer associated with the file
62 * @next: Next file with the same size
63 * @basename: The offset off the basename in the filename
64 * @path: The path of the file
65 *
66 * This contains all information we need about a file.
67 */
68 struct file {
69 struct stat st;
70 struct file *next;
71 struct link {
72 struct link *next;
73 int basename;
74 #if __STDC_VERSION__ >= 199901L
75 char path[];
76 #elif __GNUC__
77 char path[0];
78 #else
79 char path[1];
80 #endif
81 } *links;
82 };
83
84 /**
85 * enum log_level - Logging levels
86 * @JLOG_SUMMARY: Default log level
87 * @JLOG_INFO: Verbose logging (verbose == 1)
88 * @JLOG_DEBUG1: Verbosity 2
89 * @JLOG_DEBUG2: Verbosity 3
90 */
91 enum log_level {
92 JLOG_SUMMARY,
93 JLOG_INFO,
94 JLOG_DEBUG1,
95 JLOG_DEBUG2
96 };
97
98 /**
99 * struct statistic - Statistics about the file
100 * @started: Whether we are post command-line processing
101 * @files: The number of files worked on
102 * @linked: The number of files replaced by a hardlink to a master
103 * @xattr_comparisons: The number of extended attribute comparisons
104 * @comparisons: The number of comparisons
105 * @saved: The (exaggerated) amount of space saved
106 * @start_time: The time we started at
107 */
108 static struct statistics {
109 int started;
110 size_t files;
111 size_t linked;
112 size_t xattr_comparisons;
113 size_t comparisons;
114 double saved;
115 struct timeval start_time;
116 } stats;
117
118 /**
119 * struct options - Processed command-line options
120 * @include: A linked list of regular expressions for the --include option
121 * @exclude: A linked list of regular expressions for the --exclude option
122 * @verbosity: The verbosity. Should be one of #enum log_level
123 * @respect_mode: Whether to respect file modes (default = TRUE)
124 * @respect_owner: Whether to respect file owners (uid, gid; default = TRUE)
125 * @respect_name: Whether to respect file names (default = FALSE)
126 * @respect_time: Whether to respect file modification times (default = TRUE)
127 * @respect_xattrs: Whether to respect extended attributes (default = FALSE)
128 * @maximise: Chose the file with the highest link count as master
129 * @minimise: Chose the file with the lowest link count as master
130 * @keep_oldest: Choose the file with oldest timestamp as master (default = FALSE)
131 * @dry_run: Specifies whether hardlink should not link files (default = FALSE)
132 * @min_size: Minimum size of files to consider. (default = 1 byte)
133 */
134 static struct options {
135 struct regex_link {
136 regex_t preg;
137 struct regex_link *next;
138 } *include, *exclude;
139 signed int verbosity;
140 unsigned int respect_mode:1;
141 unsigned int respect_owner:1;
142 unsigned int respect_name:1;
143 unsigned int respect_time:1;
144 unsigned int respect_xattrs:1;
145 unsigned int maximise:1;
146 unsigned int minimise:1;
147 unsigned int keep_oldest:1;
148 unsigned int dry_run:1;
149 uintmax_t min_size;
150 } opts = {
151 /* default setting */
152 .respect_mode = TRUE,
153 .respect_owner = TRUE,
154 .respect_time = TRUE,
155 .respect_xattrs = FALSE,
156 .keep_oldest = FALSE,
157 .min_size = 1
158 };
159
160 /*
161 * files
162 *
163 * A binary tree of files, managed using tsearch(). To see which nodes
164 * are considered equal, see compare_nodes()
165 */
166 static void *files;
167 static void *files_by_ino;
168
169 /*
170 * last_signal
171 *
172 * The last signal we received. We store the signal here in order to be able
173 * to break out of loops gracefully and to return from our nftw() handler.
174 */
175 static int last_signal;
176
177 /**
178 * jlog - Logging for hardlink
179 * @level: The log level
180 * @format: A format string for printf()
181 */
182 __attribute__ ((format(printf, 2, 3)))
183 static void jlog(enum log_level level, const char *format, ...)
184 {
185 va_list args;
186
187 if (level > (unsigned int) opts.verbosity)
188 return;
189
190 va_start(args, format);
191 vfprintf(stdout, format, args);
192 va_end(args);
193 fputc('\n', stdout);
194 }
195
196 /**
197 * CMP - Compare two numerical values, return 1, 0, or -1
198 * @a: First value
199 * @b: Second value
200 *
201 * Used to compare two integers of any size while avoiding overflow.
202 */
203 #define CMP(a, b) ((a) > (b) ? 1 : ((a) < (b) ? -1 : 0))
204
205 /**
206 * regexec_any - Match against multiple regular expressions
207 * @pregs: A linked list of regular expressions
208 * @what: The string to match against
209 *
210 * Checks whether any of the regular expressions in the list matches the
211 * string.
212 */
213 static int regexec_any(struct regex_link *pregs, const char *what)
214 {
215 for (; pregs != NULL; pregs = pregs->next)
216 if (regexec(&pregs->preg, what, 0, NULL, 0) == 0)
217 return TRUE;
218 return FALSE;
219 }
220
221 /**
222 * compare_nodes - Node comparison function
223 * @_a: The first node (a #struct file)
224 * @_b: The second node (a #struct file)
225 *
226 * Compare the two nodes for the binary tree.
227 */
228 static int compare_nodes(const void *_a, const void *_b)
229 {
230 const struct file *a = _a;
231 const struct file *b = _b;
232 int diff = 0;
233
234 if (diff == 0)
235 diff = CMP(a->st.st_dev, b->st.st_dev);
236 if (diff == 0)
237 diff = CMP(a->st.st_size, b->st.st_size);
238
239 return diff;
240 }
241
242 /**
243 * compare_nodes_ino - Node comparison function
244 * @_a: The first node (a #struct file)
245 * @_b: The second node (a #struct file)
246 *
247 * Compare the two nodes for the binary tree.
248 */
249 static int compare_nodes_ino(const void *_a, const void *_b)
250 {
251 const struct file *a = _a;
252 const struct file *b = _b;
253 int diff = 0;
254
255 if (diff == 0)
256 diff = CMP(a->st.st_dev, b->st.st_dev);
257 if (diff == 0)
258 diff = CMP(a->st.st_ino, b->st.st_ino);
259
260 /* If opts.respect_name is used, we will restrict a struct file to
261 * contain only links with the same basename to keep the rest simple.
262 */
263 if (diff == 0 && opts.respect_name)
264 diff = strcmp(a->links->path + a->links->basename,
265 b->links->path + b->links->basename);
266
267 return diff;
268 }
269
270 /**
271 * print_stats - Print statistics to stdout
272 */
273 static void print_stats(void)
274 {
275 struct timeval end = { 0, 0 }, delta = { 0, 0 };
276 char *ssz;
277
278 gettime_monotonic(&end);
279 timersub(&end, &stats.start_time, &delta);
280
281 jlog(JLOG_SUMMARY, "%-15s %s", _("Mode:"),
282 opts.dry_run ? _("dry-run") : _("real"));
283 jlog(JLOG_SUMMARY, "%-15s %zu", _("Files:"), stats.files);
284 jlog(JLOG_SUMMARY, _("%-15s %zu files"), _("Linked:"), stats.linked);
285 #ifdef HAVE_SYS_XATTR_H
286 jlog(JLOG_SUMMARY, _("%-15s %zu xattrs"), _("Compared:"), stats.xattr_comparisons);
287 #endif
288 jlog(JLOG_SUMMARY, _("%-15s %zu files"), _("Compared:"), stats.comparisons);
289
290 ssz = size_to_human_string(SIZE_SUFFIX_3LETTER|
291 SIZE_SUFFIX_SPACE|
292 SIZE_DECIMAL_2DIGITS, stats.saved);
293
294 jlog(JLOG_SUMMARY, "%-15s %s", _("Saved:"), ssz);
295 free(ssz);
296
297 jlog(JLOG_SUMMARY, _("%-15s %ld.%06ld seconds"), _("Duration:"),
298 (long)delta.tv_sec, (long)delta.tv_usec);
299 }
300
301 /**
302 * handle_interrupt - Handle a signal
303 *
304 * Returns: %TRUE on SIGINT, SIGTERM; %FALSE on all other signals.
305 */
306 static int handle_interrupt(void)
307 {
308 switch (last_signal) {
309 case SIGINT:
310 case SIGTERM:
311 return TRUE;
312 case SIGUSR1:
313 print_stats();
314 putchar('\n');
315 break;
316 }
317
318 last_signal = 0;
319 return FALSE;
320 }
321
322 #ifdef HAVE_SYS_XATTR_H
323
324 /**
325 * llistxattr_or_die - Wrapper for llistxattr()
326 *
327 * This does the same thing as llistxattr() except that it aborts if any error
328 * other than "not supported" is detected.
329 */
330 static ssize_t llistxattr_or_die(const char *path, char *list, size_t size)
331 {
332 ssize_t len = llistxattr(path, list, size);
333
334 if (len < 0 && errno != ENOTSUP)
335 err(EXIT_FAILURE, _("cannot get xattr names for %s"), path);
336
337 return len;
338 }
339
340 /**
341 * lgetxattr_or_die - Wrapper for lgetxattr()
342 *
343 * This does the same thing as lgetxattr() except that it aborts upon error.
344 */
345 static ssize_t lgetxattr_or_die(const char *path, const char *name, void *value,
346 size_t size)
347 {
348 ssize_t len = lgetxattr(path, name, value, size);
349
350 if (len < 0)
351 err(EXIT_FAILURE, _("cannot get xattr value of %s for %s"), name, path);
352
353 return len;
354 }
355
356 /**
357 * get_xattr_name_count - Count the number of xattr names
358 * @names: a non-empty table of concatenated, null-terminated xattr names
359 * @len: the total length of the table
360 *
361 * @Returns the number of xattr names
362 */
363 static int get_xattr_name_count(const char *const names, ssize_t len)
364 {
365 int count = 0;
366 const char *name;
367
368 for (name = names; name < (names + len); name += strlen(name) + 1)
369 count++;
370
371 return count;
372 }
373
374 /**
375 * cmp_xattr_name_ptrs - Compare two pointers to xattr names by comparing
376 * the names they point to.
377 */
378 static int cmp_xattr_name_ptrs(const void *ptr1, const void *ptr2)
379 {
380 return strcmp(*(char *const *) ptr1, *(char *const *) ptr2);
381 }
382
383 /**
384 * get_sorted_xattr_name_table - Create a sorted table of xattr names.
385 * @names - table of concatenated, null-terminated xattr names
386 * @n - the number of names
387 *
388 * @Returns allocated table of pointers to the names, sorted alphabetically
389 */
390 static const char **get_sorted_xattr_name_table(const char *names, int n)
391 {
392 const char **table = xmalloc(n * sizeof(char *));
393 int i;
394
395 for (i = 0; i < n; i++) {
396 table[i] = names;
397 names += strlen(names) + 1;
398 }
399
400 qsort(table, n, sizeof(char *), cmp_xattr_name_ptrs);
401
402 return table;
403 }
404
405 /**
406 * file_xattrs_equal - Compare the extended attributes of two files
407 * @a: The first file
408 * @b: The second file
409 *
410 * @Returns: %TRUE if and only if extended attributes are equal
411 */
412 static int file_xattrs_equal(const struct file *a, const struct file *b)
413 {
414 ssize_t len_a;
415 ssize_t len_b;
416 char *names_a = NULL;
417 char *names_b = NULL;
418 int n_a;
419 int n_b;
420 const char **name_ptrs_a = NULL;
421 const char **name_ptrs_b = NULL;
422 void *value_a = NULL;
423 void *value_b = NULL;
424 int ret = FALSE;
425 int i;
426
427 assert(a->links != NULL);
428 assert(b->links != NULL);
429
430 jlog(JLOG_DEBUG1, "Comparing xattrs of %s to %s", a->links->path,
431 b->links->path);
432
433 stats.xattr_comparisons++;
434
435 len_a = llistxattr_or_die(a->links->path, NULL, 0);
436 len_b = llistxattr_or_die(b->links->path, NULL, 0);
437
438 if (len_a <= 0 && len_b <= 0)
439 return TRUE; // xattrs not supported or neither file has any
440
441 if (len_a != len_b)
442 return FALSE; // total lengths of xattr names differ
443
444 names_a = xmalloc(len_a);
445 names_b = xmalloc(len_b);
446
447 len_a = llistxattr_or_die(a->links->path, names_a, len_a);
448 len_b = llistxattr_or_die(b->links->path, names_b, len_b);
449 assert((len_a > 0) && (len_a == len_b));
450
451 n_a = get_xattr_name_count(names_a, len_a);
452 n_b = get_xattr_name_count(names_b, len_b);
453
454 if (n_a != n_b)
455 goto exit; // numbers of xattrs differ
456
457 name_ptrs_a = get_sorted_xattr_name_table(names_a, n_a);
458 name_ptrs_b = get_sorted_xattr_name_table(names_b, n_b);
459
460 // We now have two sorted tables of xattr names.
461
462 for (i = 0; i < n_a; i++) {
463 if (handle_interrupt())
464 goto exit; // user wants to quit
465
466 if (strcmp(name_ptrs_a[i], name_ptrs_b[i]) != 0)
467 goto exit; // names at same slot differ
468
469 len_a = lgetxattr_or_die(a->links->path, name_ptrs_a[i], NULL, 0);
470 len_b = lgetxattr_or_die(b->links->path, name_ptrs_b[i], NULL, 0);
471
472 if (len_a != len_b)
473 goto exit; // xattrs with same name, different value lengths
474
475 value_a = xmalloc(len_a);
476 value_b = xmalloc(len_b);
477
478 len_a = lgetxattr_or_die(a->links->path, name_ptrs_a[i],
479 value_a, len_a);
480 len_b = lgetxattr_or_die(b->links->path, name_ptrs_b[i],
481 value_b, len_b);
482 assert((len_a >= 0) && (len_a == len_b));
483
484 if (memcmp(value_a, value_b, len_a) != 0)
485 goto exit; // xattrs with same name, different values
486
487 free(value_a);
488 free(value_b);
489 value_a = NULL;
490 value_b = NULL;
491 }
492
493 ret = TRUE;
494
495 exit:
496 free(names_a);
497 free(names_b);
498 free(name_ptrs_a);
499 free(name_ptrs_b);
500 free(value_a);
501 free(value_b);
502 return ret;
503 }
504 #else
505 static int file_xattrs_equal(const struct file *a, const struct file *b)
506 {
507 return TRUE;
508 }
509 #endif
510
511 /**
512 * file_contents_equal - Compare contents of two files for equality
513 * @a: The first file
514 * @b: The second file
515 *
516 * Compare the contents of the files for equality
517 */
518 static int file_contents_equal(const struct file *a, const struct file *b)
519 {
520 FILE *fa = NULL;
521 FILE *fb = NULL;
522 char buf_a[8192];
523 char buf_b[8192];
524 int cmp = 0; /* zero => equal */
525 off_t off = 0; /* current offset */
526
527 assert(a->links != NULL);
528 assert(b->links != NULL);
529
530 jlog(JLOG_DEBUG1, "Comparing %s to %s", a->links->path, b->links->path);
531
532 stats.comparisons++;
533
534 if ((fa = fopen(a->links->path, "rb")) == NULL)
535 goto err;
536 if ((fb = fopen(b->links->path, "rb")) == NULL)
537 goto err;
538
539 #if defined(POSIX_FADV_SEQUENTIAL) && defined(HAVE_POSIX_FADVISE)
540 posix_fadvise(fileno(fa), 0, 0, POSIX_FADV_SEQUENTIAL);
541 posix_fadvise(fileno(fb), 0, 0, POSIX_FADV_SEQUENTIAL);
542 #endif
543
544 while (!handle_interrupt() && cmp == 0) {
545 size_t ca;
546 size_t cb;
547
548 ca = fread(buf_a, 1, sizeof(buf_a), fa);
549 if (ca < sizeof(buf_a) && ferror(fa))
550 goto err;
551
552 cb = fread(buf_b, 1, sizeof(buf_b), fb);
553 if (cb < sizeof(buf_b) && ferror(fb))
554 goto err;
555
556 off += ca;
557
558 if ((ca != cb || ca == 0)) {
559 cmp = CMP(ca, cb);
560 break;
561 }
562 cmp = memcmp(buf_a, buf_b, ca);
563 }
564 out:
565 if (fa != NULL)
566 fclose(fa);
567 if (fb != NULL)
568 fclose(fb);
569 return !handle_interrupt() && cmp == 0;
570 err:
571 if (fa == NULL || fb == NULL)
572 warn(_("cannot open %s"), fa ? b->links->path : a->links->path);
573 else
574 warn(_("cannot read %s"), ferror(fa) ? a->links->path : b->links->path);
575 cmp = 1;
576 goto out;
577 }
578
579 /**
580 * file_may_link_to - Check whether a file may replace another one
581 * @a: The first file
582 * @b: The second file
583 *
584 * Check whether the two fies are considered equal and can be linked
585 * together. If the two files are identical, the result will be FALSE,
586 * as replacing a link with an identical one is stupid.
587 */
588 static int file_may_link_to(const struct file *a, const struct file *b)
589 {
590 return (a->st.st_size != 0 &&
591 a->st.st_size == b->st.st_size &&
592 a->links != NULL && b->links != NULL &&
593 a->st.st_dev == b->st.st_dev &&
594 a->st.st_ino != b->st.st_ino &&
595 (!opts.respect_mode || a->st.st_mode == b->st.st_mode) &&
596 (!opts.respect_owner || a->st.st_uid == b->st.st_uid) &&
597 (!opts.respect_owner || a->st.st_gid == b->st.st_gid) &&
598 (!opts.respect_time || a->st.st_mtime == b->st.st_mtime) &&
599 (!opts.respect_name
600 || strcmp(a->links->path + a->links->basename,
601 b->links->path + b->links->basename) == 0) &&
602 (!opts.respect_xattrs || file_xattrs_equal(a, b)) &&
603 file_contents_equal(a, b));
604 }
605
606 /**
607 * file_compare - Compare two files to decide which should be master
608 * @a: The first file
609 * @b: The second file
610 *
611 * Check which of the files should be considered greater and thus serve
612 * as the master when linking (the master is the file that all equal files
613 * will be replaced with).
614 */
615 static int file_compare(const struct file *a, const struct file *b)
616 {
617 int res = 0;
618 if (a->st.st_dev == b->st.st_dev && a->st.st_ino == b->st.st_ino)
619 return 0;
620
621 if (res == 0 && opts.maximise)
622 res = CMP(a->st.st_nlink, b->st.st_nlink);
623 if (res == 0 && opts.minimise)
624 res = CMP(b->st.st_nlink, a->st.st_nlink);
625 if (res == 0)
626 res = opts.keep_oldest ? CMP(b->st.st_mtime, a->st.st_mtime)
627 : CMP(a->st.st_mtime, b->st.st_mtime);
628 if (res == 0)
629 res = CMP(b->st.st_ino, a->st.st_ino);
630
631 return res;
632 }
633
634 /**
635 * file_link - Replace b with a link to a
636 * @a: The first file
637 * @b: The second file
638 *
639 * Link the file, replacing @b with the current one. The file is first
640 * linked to a temporary name, and then renamed to the name of @b, making
641 * the replace atomic (@b will always exist).
642 */
643 static int file_link(struct file *a, struct file *b)
644 {
645 char *ssz;
646
647 file_link:
648 assert(a->links != NULL);
649 assert(b->links != NULL);
650
651 ssz = size_to_human_string(SIZE_SUFFIX_3LETTER|
652 SIZE_SUFFIX_SPACE|
653 SIZE_DECIMAL_2DIGITS, a->st.st_size);
654 jlog(JLOG_INFO, "%sLinking %s to %s (-%s)",
655 opts.dry_run ? "[DryRun] " : "", a->links->path, b->links->path, ssz);
656 free(ssz);
657
658 if (!opts.dry_run) {
659 size_t len = strlen(b->links->path) + strlen(".hardlink-temporary") + 1;
660 char *new_path = xmalloc(len);
661
662 snprintf(new_path, len, "%s.hardlink-temporary", b->links->path);
663
664 if (link(a->links->path, new_path) != 0) {
665 warn(_("cannot link %s to %s"), a->links->path, new_path);
666 free(new_path);
667 return FALSE;
668 } else if (rename(new_path, b->links->path) != 0) {
669 warn(_("cannot rename %s to %s"), a->links->path, new_path);
670 unlink(new_path); /* cleanup failed rename */
671 free(new_path);
672 return FALSE;
673 }
674 free(new_path);
675 }
676
677 /* Update statistics */
678 stats.linked++;
679
680 /* Increase the link count of this file, and set stat() of other file */
681 a->st.st_nlink++;
682 b->st.st_nlink--;
683
684 if (b->st.st_nlink == 0)
685 stats.saved += a->st.st_size;
686
687 /* Move the link from file b to a */
688 {
689 struct link *new_link = b->links;
690
691 b->links = b->links->next;
692 new_link->next = a->links->next;
693 a->links->next = new_link;
694 }
695
696 // Do it again
697 if (b->links)
698 goto file_link;
699
700 return TRUE;
701 }
702
703 /**
704 * inserter - Callback function for nftw()
705 * @fpath: The path of the file being visited
706 * @sb: The stat information of the file
707 * @typeflag: The type flag
708 * @ftwbuf: Contains current level of nesting and offset of basename
709 *
710 * Called by nftw() for the files. See the manual page for nftw() for
711 * further information.
712 */
713 static int inserter(const char *fpath, const struct stat *sb, int typeflag,
714 struct FTW *ftwbuf)
715 {
716 struct file *fil;
717 struct file **node;
718 size_t pathlen;
719 int included;
720 int excluded;
721
722 if (handle_interrupt())
723 return 1;
724 if (typeflag == FTW_DNR || typeflag == FTW_NS)
725 warn(_("cannot read %s"), fpath);
726 if (typeflag != FTW_F || !S_ISREG(sb->st_mode))
727 return 0;
728
729 included = regexec_any(opts.include, fpath);
730 excluded = regexec_any(opts.exclude, fpath);
731
732 if ((opts.exclude && excluded && !included) ||
733 (!opts.exclude && opts.include && !included))
734 return 0;
735
736 stats.files++;
737
738 if ((uintmax_t) sb->st_size < opts.min_size) {
739 jlog(JLOG_DEBUG1, "Skipped %s (smaller than configured size)", fpath);
740 return 0;
741 }
742
743 jlog(JLOG_DEBUG2, "Visiting %s (file %zu)", fpath, stats.files);
744
745 pathlen = strlen(fpath) + 1;
746
747 fil = xcalloc(1, sizeof(*fil));
748 fil->links = xcalloc(1, sizeof(struct link) + pathlen);
749
750 fil->st = *sb;
751 fil->links->basename = ftwbuf->base;
752 fil->links->next = NULL;
753
754 memcpy(fil->links->path, fpath, pathlen);
755
756 node = tsearch(fil, &files_by_ino, compare_nodes_ino);
757
758 if (node == NULL)
759 goto fail;
760
761 if (*node != fil) {
762 /* Already known inode, add link to inode information */
763 assert((*node)->st.st_dev == sb->st_dev);
764 assert((*node)->st.st_ino == sb->st_ino);
765
766 fil->links->next = (*node)->links;
767 (*node)->links = fil->links;
768
769 free(fil);
770 } else {
771 /* New inode, insert into by-size table */
772 node = tsearch(fil, &files, compare_nodes);
773
774 if (node == NULL)
775 goto fail;
776
777 if (*node != fil) {
778 struct file *l;
779
780 if (file_compare(fil, *node) >= 0) {
781 fil->next = *node;
782 *node = fil;
783 } else {
784 for (l = *node; l != NULL; l = l->next) {
785 if (l->next != NULL && file_compare(fil, l->next) < 0)
786 continue;
787
788 fil->next = l->next;
789 l->next = fil;
790
791 break;
792 }
793 }
794 }
795 }
796
797 return 0;
798
799 fail:
800 warn(_("cannot continue")); /* probably ENOMEM */
801 return 0;
802 }
803
804 /**
805 * visitor - Callback for twalk()
806 * @nodep: Pointer to a pointer to a #struct file
807 * @which: At which point this visit is (preorder, postorder, endorder)
808 * @depth: The depth of the node in the tree
809 *
810 * Visit the nodes in the binary tree. For each node, call hardlinker()
811 * on each #struct file in the linked list of #struct file instances located
812 * at that node.
813 */
814 static void visitor(const void *nodep, const VISIT which, const int depth)
815 {
816 struct file *master = *(struct file **) nodep;
817 struct file *other;
818
819 (void) depth;
820
821 if (which != leaf && which != endorder)
822 return;
823
824 for (; master != NULL; master = master->next) {
825 if (handle_interrupt())
826 exit(EXIT_FAILURE);
827 if (master->links == NULL)
828 continue;
829
830 for (other = master->next; other != NULL; other = other->next) {
831 if (handle_interrupt())
832 exit(EXIT_FAILURE);
833
834 assert(other != other->next);
835 assert(other->st.st_size == master->st.st_size);
836
837 if (other->links == NULL || !file_may_link_to(master, other))
838 continue;
839
840 if (!file_link(master, other) && errno == EMLINK)
841 master = other;
842 }
843 }
844 }
845
846 /**
847 * usage - Print the program help and exit
848 */
849 static void __attribute__((__noreturn__)) usage(void)
850 {
851 FILE *out = stdout;
852
853 fputs(USAGE_HEADER, out);
854 fprintf(out, _(" %s [options] <directory>|<file> ...\n"), program_invocation_short_name);
855
856 fputs(USAGE_SEPARATOR, out);
857 fputs(_("Consolidate duplicate files using hardlinks.\n"), out);
858
859 fputs(USAGE_OPTIONS, out);
860 fputs(_(" -v, --verbose verbose output (repeat for more verbosity)\n"), out);
861 fputs(_(" -n, --dry-run don't actually link anything\n"), out);
862 fputs(_(" -f, --respect-name filenames have to be identical\n"), out);
863 fputs(_(" -p, --ignore-mode ignore changes of file mode\n"), out);
864 fputs(_(" -o, --ignore-owner ignore owner changes\n"), out);
865 fputs(_(" -t, --ignore-time ignore timestamps (when testing for equality)\n"), out);
866 #ifdef HAVE_SYS_XATTR_H
867 fputs(_(" -X, --respect-xattrs respect extended attributes\n"), out);
868 #endif
869 fputs(_(" -m, --maximize maximize the hardlink count, remove the file with\n"
870 " lowest hardlink count\n"), out);
871 fputs(_(" -M, --minimize reverse the meaning of -m\n"), out);
872 fputs(_(" -O, --keep-oldest keep the oldest file of multiple equal files\n"
873 " (lower precedence than minimize/maximize)\n"), out);
874 fputs(_(" -x, --exclude <regex> regular expression to exclude files\n"), out);
875 fputs(_(" -i, --include <regex> regular expression to include files/dirs\n"), out);
876 fputs(_(" -s, --minimum-size <size> minimum size for files.\n"), out);
877 fputs(_(" -c, --content compare only file contents, same as -pot\n"), out);
878
879 fputs(USAGE_SEPARATOR, out);
880 printf(USAGE_HELP_OPTIONS(28));
881
882 printf(USAGE_MAN_TAIL("hardlink(1)"));
883
884 exit(EXIT_SUCCESS);
885 }
886
887 /**
888 * register_regex - Compile and insert a regular expression into list
889 * @pregs: Pointer to a linked list of regular expressions
890 * @regex: String containing the regular expression to be compiled
891 */
892 static void register_regex(struct regex_link **pregs, const char *regex)
893 {
894 struct regex_link *link = xmalloc(sizeof(*link));
895 int err;
896
897 if ((err = regcomp(&link->preg, regex, REG_NOSUB | REG_EXTENDED)) != 0) {
898 size_t size = regerror(err, &link->preg, NULL, 0);
899 char *buf = xmalloc(size + 1);
900
901 regerror(err, &link->preg, buf, size);
902
903 errx(EXIT_FAILURE, _("could not compile regular expression %s: %s"),
904 regex, buf);
905 }
906
907 link->next = *pregs;
908 *pregs = link;
909 }
910
911 /**
912 * parse_options - Parse the command line options
913 * @argc: Number of options
914 * @argv: Array of options
915 */
916 static int parse_options(int argc, char *argv[])
917 {
918 static const char optstr[] = "VhvnfpotXcmMOx:i:s:";
919 static const struct option long_options[] = {
920 {"version", no_argument, NULL, 'V'},
921 {"help", no_argument, NULL, 'h'},
922 {"verbose", no_argument, NULL, 'v'},
923 {"dry-run", no_argument, NULL, 'n'},
924 {"respect-name", no_argument, NULL, 'f'},
925 {"ignore-mode", no_argument, NULL, 'p'},
926 {"ignore-owner", no_argument, NULL, 'o'},
927 {"ignore-time", no_argument, NULL, 't'},
928 {"respect-xattrs", no_argument, NULL, 'X'},
929 {"maximize", no_argument, NULL, 'm'},
930 {"minimize", no_argument, NULL, 'M'},
931 {"keep-oldest", no_argument, NULL, 'O'},
932 {"exclude", required_argument, NULL, 'x'},
933 {"include", required_argument, NULL, 'i'},
934 {"minimum-size", required_argument, NULL, 's'},
935 {"content", no_argument, NULL, 'c'},
936 {NULL, 0, NULL, 0}
937 };
938 int c;
939
940 while ((c = getopt_long(argc, argv, optstr, long_options, NULL)) != -1) {
941 switch (c) {
942 case 'p':
943 opts.respect_mode = FALSE;
944 break;
945 case 'o':
946 opts.respect_owner = FALSE;
947 break;
948 case 't':
949 opts.respect_time = FALSE;
950 break;
951 case 'X':
952 opts.respect_xattrs = TRUE;
953 break;
954 case 'm':
955 opts.maximise = TRUE;
956 break;
957 case 'M':
958 opts.minimise = TRUE;
959 break;
960 case 'O':
961 opts.keep_oldest = TRUE;
962 break;
963 case 'f':
964 opts.respect_name = TRUE;
965 break;
966 case 'v':
967 opts.verbosity++;
968 break;
969 case 'c':
970 opts.respect_mode = FALSE;
971 opts.respect_name = FALSE;
972 opts.respect_owner = FALSE;
973 opts.respect_time = FALSE;
974 opts.respect_xattrs = FALSE;
975 break;
976 case 'n':
977 opts.dry_run = 1;
978 break;
979 case 'x':
980 register_regex(&opts.exclude, optarg);
981 break;
982 case 'i':
983 register_regex(&opts.include, optarg);
984 break;
985 case 's':
986 opts.min_size = strtosize_or_err(optarg, _("failed to parse size"));
987 break;
988
989 case 'h':
990 usage();
991 case 'V':
992 print_version(EXIT_SUCCESS);
993 default:
994 errtryhelp(EXIT_FAILURE);
995 }
996 }
997 return 0;
998 }
999
1000 /**
1001 * to_be_called_atexit - Cleanup handler, also prints statistics.
1002 */
1003 static void to_be_called_atexit(void)
1004 {
1005 if (stats.started)
1006 print_stats();
1007 }
1008
1009 /**
1010 * sighandler - Signal handler, sets the global last_signal variable
1011 * @i: The signal number
1012 */
1013 static void sighandler(int i)
1014 {
1015 if (last_signal != SIGINT)
1016 last_signal = i;
1017 if (i == SIGINT)
1018 putchar('\n');
1019 }
1020
1021 int main(int argc, char *argv[])
1022 {
1023 struct sigaction sa;
1024
1025 sa.sa_handler = sighandler;
1026 sa.sa_flags = SA_RESTART;
1027 sigfillset(&sa.sa_mask);
1028
1029 /* If we receive a SIGINT, end the processing */
1030 sigaction(SIGINT, &sa, NULL);
1031 sigaction(SIGUSR1, &sa, NULL);
1032
1033 /* Pretty print numeric output */
1034 setlocale(LC_NUMERIC, "");
1035
1036 if (atexit(to_be_called_atexit) != 0)
1037 err(EXIT_FAILURE, _("cannot register exit handler"));
1038
1039 parse_options(argc, argv);
1040
1041 if (optind == argc)
1042 errx(EXIT_FAILURE, _("no directory of dile specified"));
1043
1044 gettime_monotonic(&stats.start_time);
1045 stats.started = TRUE;
1046
1047 for (; optind < argc; optind++)
1048 if (nftw(argv[optind], inserter, 20, FTW_PHYS) == -1)
1049 warn(_("cannot process %s"), argv[optind]);
1050
1051 twalk(files, visitor);
1052
1053 return 0;
1054 }