]> git.ipfire.org Git - thirdparty/util-linux.git/blob - misc-utils/hardlink.c
hardlink: fix compiler warning [-Wformat=]
[thirdparty/util-linux.git] / misc-utils / hardlink.c
1 /* hardlink.c - Link multiple identical files together
2 *
3 * Copyright (C) 2008 - 2014 Julian Andres Klode <jak@jak-linux.org>
4 * Copyright (C) 2021 Karel Zak <kzak@redhat.com>
5 *
6 * SPDX-License-Identifier: MIT
7 *
8 * Permission is hereby granted, free of charge, to any person obtaining a copy
9 * of this software and associated documentation files (the "Software"), to deal
10 * in the Software without restriction, including without limitation the rights
11 * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
12 * copies of the Software, and to permit persons to whom the Software is
13 * furnished to do so, subject to the following conditions:
14 *
15 * The above copyright notice and this permission notice shall be included in
16 * all copies or substantial portions of the Software.
17 *
18 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
19 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
20 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
21 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
22 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
23 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
24 * THE SOFTWARE.
25 */
26 #define _POSIX_C_SOURCE 200112L /* POSIX functions */
27 #define _XOPEN_SOURCE 600 /* nftw() */
28
29 #include <sys/types.h> /* stat */
30 #include <sys/stat.h> /* stat */
31 #include <sys/time.h> /* getrlimit, getrusage */
32 #include <sys/resource.h> /* getrlimit, getrusage */
33 #include <fcntl.h> /* posix_fadvise */
34 #include <ftw.h> /* ftw */
35 #include <search.h> /* tsearch() and friends */
36 #include <signal.h> /* SIG*, sigaction */
37 #include <getopt.h> /* getopt_long() */
38 #include <ctype.h> /* tolower() */
39
40 #include "nls.h"
41 #include "c.h"
42 #include "xalloc.h"
43 #include "strutils.h"
44 #include "monotonic.h"
45 #include "optutils.h"
46 #include "fileeq.h"
47
48 #include <regex.h> /* regcomp(), regexec() */
49
50 #if defined(HAVE_SYS_XATTR_H) && defined(HAVE_LLISTXATTR) && defined(HAVE_LGETXATTR)
51 # include <sys/xattr.h>
52 # define USE_XATTR 1
53 #endif
54
55 static int quiet; /* don't print anything */
56
57 static struct ul_fileeq fileeq;
58
59 /**
60 * struct file - Information about a file
61 * @st: The stat buffer associated with the file
62 * @next: Next file with the same size
63 * @basename: The offset off the basename in the filename
64 * @path: The path of the file
65 *
66 * This contains all information we need about a file.
67 */
68 struct file {
69 struct stat st;
70 struct ul_fileeq_data data;
71
72 struct file *next;
73 struct link {
74 struct link *next;
75 int basename;
76 #if __STDC_VERSION__ >= 199901L
77 char path[];
78 #elif __GNUC__
79 char path[0];
80 #else
81 char path[1];
82 #endif
83 } *links;
84 };
85
86 /**
87 * enum log_level - Logging levels
88 * @JLOG_SUMMARY: Default log level
89 * @JLOG_INFO: Verbose logging (verbose == 1)
90 * @JLOG_VERBOSE1: Verbosity 2
91 * @JLOG_VERBOSE2: Verbosity 3
92 */
93 enum log_level {
94 JLOG_SUMMARY,
95 JLOG_INFO,
96 JLOG_VERBOSE1,
97 JLOG_VERBOSE2
98 };
99
100 /**
101 * struct statistic - Statistics about the file
102 * @started: Whether we are post command-line processing
103 * @files: The number of files worked on
104 * @linked: The number of files replaced by a hardlink to a master
105 * @xattr_comparisons: The number of extended attribute comparisons
106 * @comparisons: The number of comparisons
107 * @saved: The (exaggerated) amount of space saved
108 * @start_time: The time we started at
109 */
110 static struct statistics {
111 int started;
112 size_t files;
113 size_t linked;
114 size_t xattr_comparisons;
115 size_t comparisons;
116 double saved;
117 struct timeval start_time;
118 } stats;
119
120
121 struct hdl_regex {
122 regex_t re; /* POSIX compatible regex handler */
123
124 struct hdl_regex *next;
125 };
126
127 /**
128 * struct options - Processed command-line options
129 * @include: A linked list of regular expressions for the --include option
130 * @exclude: A linked list of regular expressions for the --exclude option
131 * @verbosity: The verbosity. Should be one of #enum log_level
132 * @respect_mode: Whether to respect file modes (default = TRUE)
133 * @respect_owner: Whether to respect file owners (uid, gid; default = TRUE)
134 * @respect_name: Whether to respect file names (default = FALSE)
135 * @respect_time: Whether to respect file modification times (default = TRUE)
136 * @respect_xattrs: Whether to respect extended attributes (default = FALSE)
137 * @maximise: Chose the file with the highest link count as master
138 * @minimise: Chose the file with the lowest link count as master
139 * @keep_oldest: Choose the file with oldest timestamp as master (default = FALSE)
140 * @dry_run: Specifies whether hardlink should not link files (default = FALSE)
141 * @min_size: Minimum size of files to consider. (default = 1 byte)
142 * @max_size: Maximum size of files to consider, 0 means umlimited. (default = 0 byte)
143 */
144 static struct options {
145 struct hdl_regex *include;
146 struct hdl_regex *exclude;
147
148 const char *method;
149 signed int verbosity;
150 unsigned int respect_mode:1;
151 unsigned int respect_owner:1;
152 unsigned int respect_name:1;
153 unsigned int respect_time:1;
154 unsigned int respect_xattrs:1;
155 unsigned int maximise:1;
156 unsigned int minimise:1;
157 unsigned int keep_oldest:1;
158 unsigned int dry_run:1;
159 uintmax_t min_size;
160 uintmax_t max_size;
161 size_t io_size;
162 size_t cache_size;
163 } opts = {
164 /* default setting */
165 .method = "sha256",
166 .respect_mode = TRUE,
167 .respect_owner = TRUE,
168 .respect_time = TRUE,
169 .respect_xattrs = FALSE,
170 .keep_oldest = FALSE,
171 .min_size = 1,
172 .cache_size = 10*1024*1024
173 };
174
175 /*
176 * files
177 *
178 * A binary tree of files, managed using tsearch(). To see which nodes
179 * are considered equal, see compare_nodes()
180 */
181 static void *files;
182 static void *files_by_ino;
183
184 /*
185 * last_signal
186 *
187 * The last signal we received. We store the signal here in order to be able
188 * to break out of loops gracefully and to return from our nftw() handler.
189 */
190 static int last_signal;
191
192 /**
193 * jlog - Logging for hardlink
194 * @level: The log level
195 * @format: A format string for printf()
196 */
197 __attribute__((format(printf, 2, 3)))
198 static void jlog(enum log_level level, const char *format, ...)
199 {
200 va_list args;
201
202 if (quiet || level > (unsigned int)opts.verbosity)
203 return;
204
205 va_start(args, format);
206 vfprintf(stdout, format, args);
207 va_end(args);
208 fputc('\n', stdout);
209 }
210
211 /**
212 * CMP - Compare two numerical values, return 1, 0, or -1
213 * @a: First value
214 * @b: Second value
215 *
216 * Used to compare two integers of any size while avoiding overflow.
217 */
218 #define CMP(a, b) ((a) > (b) ? 1 : ((a) < (b) ? -1 : 0))
219
220 /**
221 * register_regex - Compile and insert a regular expression into list
222 * @pregs: Pointer to a linked list of regular expressions
223 * @regex: String containing the regular expression to be compiled
224 */
225 static void register_regex(struct hdl_regex **pregs, const char *regex)
226 {
227 struct hdl_regex *link;
228 int err;
229
230 link = xmalloc(sizeof(*link));
231
232 if ((err = regcomp(&link->re, regex, REG_NOSUB | REG_EXTENDED)) != 0) {
233 size_t size = regerror(err, &link->re, NULL, 0);
234 char *buf = xmalloc(size + 1);
235
236 regerror(err, &link->re, buf, size);
237
238 errx(EXIT_FAILURE, _("could not compile regular expression %s: %s"),
239 regex, buf);
240 }
241 link->next = *pregs; *pregs = link;
242 }
243
244 /**
245 * match_any_regex - Match against multiple regular expressions
246 * @pregs: A linked list of regular expressions
247 * @what: The string to match against
248 *
249 * Checks whether any of the regular expressions in the list matches the
250 * string.
251 */
252 static int match_any_regex(struct hdl_regex *pregs, const char *what)
253 {
254 for (; pregs != NULL; pregs = pregs->next) {
255 if (regexec(&pregs->re, what, 0, NULL, 0) == 0)
256 return TRUE;
257 }
258 return FALSE;
259 }
260
261 /**
262 * compare_nodes - Node comparison function
263 * @_a: The first node (a #struct file)
264 * @_b: The second node (a #struct file)
265 *
266 * Compare the two nodes for the binary tree.
267 */
268 static int compare_nodes(const void *_a, const void *_b)
269 {
270 const struct file *a = _a;
271 const struct file *b = _b;
272 int diff = 0;
273
274 if (diff == 0)
275 diff = CMP(a->st.st_dev, b->st.st_dev);
276 if (diff == 0)
277 diff = CMP(a->st.st_size, b->st.st_size);
278
279 return diff;
280 }
281
282 /**
283 * compare_nodes_ino - Node comparison function
284 * @_a: The first node (a #struct file)
285 * @_b: The second node (a #struct file)
286 *
287 * Compare the two nodes for the binary tree.
288 */
289 static int compare_nodes_ino(const void *_a, const void *_b)
290 {
291 const struct file *a = _a;
292 const struct file *b = _b;
293 int diff = 0;
294
295 if (diff == 0)
296 diff = CMP(a->st.st_dev, b->st.st_dev);
297 if (diff == 0)
298 diff = CMP(a->st.st_ino, b->st.st_ino);
299
300 /* If opts.respect_name is used, we will restrict a struct file to
301 * contain only links with the same basename to keep the rest simple.
302 */
303 if (diff == 0 && opts.respect_name)
304 diff = strcmp(a->links->path + a->links->basename,
305 b->links->path + b->links->basename);
306
307 return diff;
308 }
309
310 /**
311 * print_stats - Print statistics to stdout
312 */
313 static void print_stats(void)
314 {
315 struct timeval end = { 0, 0 }, delta = { 0, 0 };
316 char *ssz;
317
318 gettime_monotonic(&end);
319 timersub(&end, &stats.start_time, &delta);
320
321 jlog(JLOG_SUMMARY, "%-15s %s", _("Mode:"),
322 opts.dry_run ? _("dry-run") : _("real"));
323 jlog(JLOG_SUMMARY, "%-15s %s", _("Method:"), opts.method);
324 jlog(JLOG_SUMMARY, "%-15s %zu", _("Files:"), stats.files);
325 jlog(JLOG_SUMMARY, _("%-15s %zu files"), _("Linked:"), stats.linked);
326
327 #ifdef USE_XATTR
328 jlog(JLOG_SUMMARY, _("%-15s %zu xattrs"), _("Compared:"),
329 stats.xattr_comparisons);
330 #endif
331 jlog(JLOG_SUMMARY, _("%-15s %zu files"), _("Compared:"),
332 stats.comparisons);
333
334 ssz = size_to_human_string(SIZE_SUFFIX_3LETTER |
335 SIZE_SUFFIX_SPACE |
336 SIZE_DECIMAL_2DIGITS, stats.saved);
337
338 jlog(JLOG_SUMMARY, "%-15s %s", _("Saved:"), ssz);
339 free(ssz);
340
341 jlog(JLOG_SUMMARY, _("%-15s %"PRId64".%06"PRId64" seconds"), _("Duration:"),
342 (int64_t)delta.tv_sec, (int64_t)delta.tv_usec);
343 }
344
345 /**
346 * handle_interrupt - Handle a signal
347 *
348 * Returns: %TRUE on SIGINT, SIGTERM; %FALSE on all other signals.
349 */
350 static int handle_interrupt(void)
351 {
352 switch (last_signal) {
353 case SIGINT:
354 case SIGTERM:
355 return TRUE;
356 case SIGUSR1:
357 print_stats();
358 putchar('\n');
359 break;
360 }
361
362 last_signal = 0;
363 return FALSE;
364 }
365
366 #ifdef USE_XATTR
367
368 /**
369 * llistxattr_or_die - Wrapper for llistxattr()
370 *
371 * This does the same thing as llistxattr() except that it aborts if any error
372 * other than "not supported" is detected.
373 */
374 static ssize_t llistxattr_or_die(const char *path, char *list, size_t size)
375 {
376 ssize_t len = llistxattr(path, list, size);
377
378 if (len < 0 && errno != ENOTSUP)
379 err(EXIT_FAILURE, _("cannot get xattr names for %s"), path);
380
381 return len;
382 }
383
384 /**
385 * lgetxattr_or_die - Wrapper for lgetxattr()
386 *
387 * This does the same thing as lgetxattr() except that it aborts upon error.
388 */
389 static ssize_t lgetxattr_or_die(const char *path,
390 const char *name, void *value, size_t size)
391 {
392 ssize_t len = lgetxattr(path, name, value, size);
393
394 if (len < 0)
395 err(EXIT_FAILURE, _("cannot get xattr value of %s for %s"),
396 name, path);
397
398 return len;
399 }
400
401 /**
402 * get_xattr_name_count - Count the number of xattr names
403 * @names: a non-empty table of concatenated, null-terminated xattr names
404 * @len: the total length of the table
405 *
406 * @Returns the number of xattr names
407 */
408 static int get_xattr_name_count(const char *const names, ssize_t len)
409 {
410 int count = 0;
411 const char *name;
412
413 for (name = names; name < (names + len); name += strlen(name) + 1)
414 count++;
415
416 return count;
417 }
418
419 /**
420 * cmp_xattr_name_ptrs - Compare two pointers to xattr names by comparing
421 * the names they point to.
422 */
423 static int cmp_xattr_name_ptrs(const void *ptr1, const void *ptr2)
424 {
425 return strcmp(*(char *const *)ptr1, *(char *const *)ptr2);
426 }
427
428 /**
429 * get_sorted_xattr_name_table - Create a sorted table of xattr names.
430 * @names - table of concatenated, null-terminated xattr names
431 * @n - the number of names
432 *
433 * @Returns allocated table of pointers to the names, sorted alphabetically
434 */
435 static const char **get_sorted_xattr_name_table(const char *names, int n)
436 {
437 const char **table = xmalloc(n * sizeof(char *));
438 int i;
439
440 for (i = 0; i < n; i++) {
441 table[i] = names;
442 names += strlen(names) + 1;
443 }
444
445 qsort(table, n, sizeof(char *), cmp_xattr_name_ptrs);
446
447 return table;
448 }
449
450 /**
451 * file_xattrs_equal - Compare the extended attributes of two files
452 * @a: The first file
453 * @b: The second file
454 *
455 * @Returns: %TRUE if and only if extended attributes are equal
456 */
457 static int file_xattrs_equal(const struct file *a, const struct file *b)
458 {
459 ssize_t len_a;
460 ssize_t len_b;
461 char *names_a = NULL;
462 char *names_b = NULL;
463 int n_a;
464 int n_b;
465 const char **name_ptrs_a = NULL;
466 const char **name_ptrs_b = NULL;
467 void *value_a = NULL;
468 void *value_b = NULL;
469 int ret = FALSE;
470 int i;
471
472 assert(a->links != NULL);
473 assert(b->links != NULL);
474
475 jlog(JLOG_VERBOSE1, _("Comparing xattrs of %s to %s"), a->links->path,
476 b->links->path);
477
478 stats.xattr_comparisons++;
479
480 len_a = llistxattr_or_die(a->links->path, NULL, 0);
481 len_b = llistxattr_or_die(b->links->path, NULL, 0);
482
483 if (len_a <= 0 && len_b <= 0)
484 return TRUE; // xattrs not supported or neither file has any
485
486 if (len_a != len_b)
487 return FALSE; // total lengths of xattr names differ
488
489 names_a = xmalloc(len_a);
490 names_b = xmalloc(len_b);
491
492 len_a = llistxattr_or_die(a->links->path, names_a, len_a);
493 len_b = llistxattr_or_die(b->links->path, names_b, len_b);
494 assert((len_a > 0) && (len_a == len_b));
495
496 n_a = get_xattr_name_count(names_a, len_a);
497 n_b = get_xattr_name_count(names_b, len_b);
498
499 if (n_a != n_b)
500 goto exit; // numbers of xattrs differ
501
502 name_ptrs_a = get_sorted_xattr_name_table(names_a, n_a);
503 name_ptrs_b = get_sorted_xattr_name_table(names_b, n_b);
504
505 // We now have two sorted tables of xattr names.
506
507 for (i = 0; i < n_a; i++) {
508 if (handle_interrupt())
509 goto exit; // user wants to quit
510
511 if (strcmp(name_ptrs_a[i], name_ptrs_b[i]) != 0)
512 goto exit; // names at same slot differ
513
514 len_a =
515 lgetxattr_or_die(a->links->path, name_ptrs_a[i], NULL, 0);
516 len_b =
517 lgetxattr_or_die(b->links->path, name_ptrs_b[i], NULL, 0);
518
519 if (len_a != len_b)
520 goto exit; // xattrs with same name, different value lengths
521
522 value_a = xmalloc(len_a);
523 value_b = xmalloc(len_b);
524
525 len_a = lgetxattr_or_die(a->links->path, name_ptrs_a[i],
526 value_a, len_a);
527 len_b = lgetxattr_or_die(b->links->path, name_ptrs_b[i],
528 value_b, len_b);
529 assert((len_a >= 0) && (len_a == len_b));
530
531 if (memcmp(value_a, value_b, len_a) != 0)
532 goto exit; // xattrs with same name, different values
533
534 free(value_a);
535 free(value_b);
536 value_a = NULL;
537 value_b = NULL;
538 }
539
540 ret = TRUE;
541
542 exit:
543 free(names_a);
544 free(names_b);
545 free(name_ptrs_a);
546 free(name_ptrs_b);
547 free(value_a);
548 free(value_b);
549 return ret;
550 }
551 #else /* !USE_XATTR */
552 static int file_xattrs_equal(const struct file *a, const struct file *b)
553 {
554 return TRUE;
555 }
556 #endif /* USE_XATTR */
557
558 /**
559 * file_may_link_to - Check whether a file may replace another one
560 * @a: The first file
561 * @b: The second file
562 *
563 * Check whether the two files are considered equal attributes and can be
564 * linked. This function does not compare content od the files!
565 */
566 static int file_may_link_to(const struct file *a, const struct file *b)
567 {
568 return (a->st.st_size != 0 &&
569 a->st.st_size == b->st.st_size &&
570 a->links != NULL && b->links != NULL &&
571 a->st.st_dev == b->st.st_dev &&
572 a->st.st_ino != b->st.st_ino &&
573 (!opts.respect_mode || a->st.st_mode == b->st.st_mode) &&
574 (!opts.respect_owner || a->st.st_uid == b->st.st_uid) &&
575 (!opts.respect_owner || a->st.st_gid == b->st.st_gid) &&
576 (!opts.respect_time || a->st.st_mtime == b->st.st_mtime) &&
577 (!opts.respect_name
578 || strcmp(a->links->path + a->links->basename,
579 b->links->path + b->links->basename) == 0) &&
580 (!opts.respect_xattrs || file_xattrs_equal(a, b)));
581 }
582
583 /**
584 * file_compare - Compare two files to decide which should be master
585 * @a: The first file
586 * @b: The second file
587 *
588 * Check which of the files should be considered greater and thus serve
589 * as the master when linking (the master is the file that all equal files
590 * will be replaced with).
591 */
592 static int file_compare(const struct file *a, const struct file *b)
593 {
594 int res = 0;
595 if (a->st.st_dev == b->st.st_dev && a->st.st_ino == b->st.st_ino)
596 return 0;
597
598 if (res == 0 && opts.maximise)
599 res = CMP(a->st.st_nlink, b->st.st_nlink);
600 if (res == 0 && opts.minimise)
601 res = CMP(b->st.st_nlink, a->st.st_nlink);
602 if (res == 0)
603 res = opts.keep_oldest ? CMP(b->st.st_mtime, a->st.st_mtime)
604 : CMP(a->st.st_mtime, b->st.st_mtime);
605 if (res == 0)
606 res = CMP(b->st.st_ino, a->st.st_ino);
607
608 return res;
609 }
610
611 /**
612 * file_link - Replace b with a link to a
613 * @a: The first file
614 * @b: The second file
615 *
616 * Link the file, replacing @b with the current one. The file is first
617 * linked to a temporary name, and then renamed to the name of @b, making
618 * the replace atomic (@b will always exist).
619 */
620 static int file_link(struct file *a, struct file *b)
621 {
622 char *ssz;
623
624 file_link:
625 assert(a->links != NULL);
626 assert(b->links != NULL);
627
628 ssz = size_to_human_string(SIZE_SUFFIX_3LETTER |
629 SIZE_SUFFIX_SPACE |
630 SIZE_DECIMAL_2DIGITS, a->st.st_size);
631 jlog(JLOG_INFO, _("%sLinking %s to %s (-%s)"),
632 opts.dry_run ? _("[DryRun] ") : "", a->links->path, b->links->path,
633 ssz);
634 free(ssz);
635
636 if (!opts.dry_run) {
637 size_t len =
638 strlen(b->links->path) + strlen(".hardlink-temporary") + 1;
639 char *new_path = xmalloc(len);
640
641 snprintf(new_path, len, "%s.hardlink-temporary",
642 b->links->path);
643
644 if (link(a->links->path, new_path) != 0) {
645 warn(_("cannot link %s to %s"), a->links->path,
646 new_path);
647 free(new_path);
648 return FALSE;
649 } else if (rename(new_path, b->links->path) != 0) {
650 warn(_("cannot rename %s to %s"), a->links->path,
651 new_path);
652 unlink(new_path); /* cleanup failed rename */
653 free(new_path);
654 return FALSE;
655 }
656 free(new_path);
657 }
658
659 /* Update statistics */
660 stats.linked++;
661
662 /* Increase the link count of this file, and set stat() of other file */
663 a->st.st_nlink++;
664 b->st.st_nlink--;
665
666 if (b->st.st_nlink == 0)
667 stats.saved += a->st.st_size;
668
669 /* Move the link from file b to a */
670 {
671 struct link *new_link = b->links;
672
673 b->links = b->links->next;
674 new_link->next = a->links->next;
675 a->links->next = new_link;
676 }
677
678 /* Do it again */
679 if (b->links)
680 goto file_link;
681
682 return TRUE;
683 }
684
685 /**
686 * inserter - Callback function for nftw()
687 * @fpath: The path of the file being visited
688 * @sb: The stat information of the file
689 * @typeflag: The type flag
690 * @ftwbuf: Contains current level of nesting and offset of basename
691 *
692 * Called by nftw() for the files. See the manual page for nftw() for
693 * further information.
694 */
695 static int inserter(const char *fpath, const struct stat *sb,
696 int typeflag, struct FTW *ftwbuf)
697 {
698 struct file *fil;
699 struct file **node;
700 size_t pathlen;
701 int included;
702 int excluded;
703
704 if (handle_interrupt())
705 return 1;
706 if (typeflag == FTW_DNR || typeflag == FTW_NS)
707 warn(_("cannot read %s"), fpath);
708 if (typeflag != FTW_F || !S_ISREG(sb->st_mode))
709 return 0;
710
711 included = match_any_regex(opts.include, fpath);
712 excluded = match_any_regex(opts.exclude, fpath);
713
714 if ((opts.exclude && excluded && !included) ||
715 (!opts.exclude && opts.include && !included))
716 return 0;
717
718 stats.files++;
719
720 if ((uintmax_t) sb->st_size < opts.min_size) {
721 jlog(JLOG_VERBOSE1,
722 _("Skipped %s (smaller than configured size)"), fpath);
723 return 0;
724 }
725
726 jlog(JLOG_VERBOSE2, " %5zu: [%ld/%ld/%zu] %s",
727 stats.files, sb->st_dev, sb->st_ino,
728 (size_t) sb->st_nlink, fpath);
729
730 if ((opts.max_size > 0) && ((uintmax_t) sb->st_size > opts.max_size)) {
731 jlog(JLOG_VERBOSE1,
732 _("Skipped %s (greater than configured size)"), fpath);
733 return 0;
734 }
735
736 pathlen = strlen(fpath) + 1;
737
738 fil = xcalloc(1, sizeof(*fil));
739 fil->links = xcalloc(1, sizeof(struct link) + pathlen);
740
741 fil->st = *sb;
742 fil->links->basename = ftwbuf->base;
743 fil->links->next = NULL;
744
745 memcpy(fil->links->path, fpath, pathlen);
746
747 node = tsearch(fil, &files_by_ino, compare_nodes_ino);
748
749 if (node == NULL)
750 goto fail;
751
752 if (*node != fil) {
753 /* Already known inode, add link to inode information */
754 assert((*node)->st.st_dev == sb->st_dev);
755 assert((*node)->st.st_ino == sb->st_ino);
756
757 fil->links->next = (*node)->links;
758 (*node)->links = fil->links;
759
760 free(fil);
761 } else {
762 /* New inode, insert into by-size table */
763 node = tsearch(fil, &files, compare_nodes);
764
765 if (node == NULL)
766 goto fail;
767
768 if (*node != fil) {
769 struct file *l;
770
771 if (file_compare(fil, *node) >= 0) {
772 fil->next = *node;
773 *node = fil;
774 } else {
775 for (l = *node; l != NULL; l = l->next) {
776 if (l->next != NULL
777 && file_compare(fil, l->next) < 0)
778 continue;
779
780 fil->next = l->next;
781 l->next = fil;
782
783 break;
784 }
785 }
786 }
787 }
788
789 return 0;
790
791 fail:
792 warn(_("cannot continue")); /* probably ENOMEM */
793 return 0;
794 }
795
796 static inline size_t count_nodes(struct file *x)
797 {
798 size_t ct = 0;
799
800 for ( ; x != NULL; x = x->next)
801 ct++;
802
803 return ct;
804 }
805
806 /**
807 * visitor - Callback for twalk()
808 * @nodep: Pointer to a pointer to a #struct file
809 * @which: At which point this visit is (preorder, postorder, endorder)
810 * @depth: The depth of the node in the tree
811 *
812 * Visit the nodes in the binary tree. For each node, call hardlinker()
813 * on each #struct file in the linked list of #struct file instances located
814 * at that node.
815 */
816 static void visitor(const void *nodep, const VISIT which, const int depth)
817 {
818 struct file *master = *(struct file **)nodep;
819 struct file *begin = master;
820 struct file *other;
821
822 (void)depth;
823
824 if (which != leaf && which != endorder)
825 return;
826
827 for (; master != NULL; master = master->next) {
828 size_t nnodes, memsiz;
829
830 if (handle_interrupt())
831 exit(EXIT_FAILURE);
832 if (master->links == NULL)
833 continue;
834
835 /* calculate per file max memory use */
836 nnodes = count_nodes(master);
837 if (!nnodes)
838 continue;
839
840 /* per-file cache size */
841 memsiz = opts.cache_size / nnodes;
842 /* filesiz, readsiz, memsiz */
843 ul_fileeq_set_size(&fileeq, master->st.st_size, opts.io_size, memsiz);
844
845 for (other = master->next; other != NULL; other = other->next) {
846 int eq;
847
848 if (handle_interrupt())
849 exit(EXIT_FAILURE);
850
851 assert(other != other->next);
852 assert(other->st.st_size == master->st.st_size);
853
854 if (!other->links)
855 continue;
856
857 /* check file attributes, etc. */
858 if (!file_may_link_to(master, other)) {
859 jlog(JLOG_VERBOSE2,
860 _("Skipped (attributes mismatch) %s"), other->links->path);
861 continue;
862 }
863
864 /* initialize content comparison */
865 if (!ul_fileeq_data_associated(&master->data))
866 ul_fileeq_data_set_file(&master->data, master->links->path);
867 if (!ul_fileeq_data_associated(&other->data))
868 ul_fileeq_data_set_file(&other->data, other->links->path);
869
870 /* compare files */
871 eq = ul_fileeq(&fileeq, &master->data, &other->data);
872
873 /* reduce number of open files, keep only master open */
874 ul_fileeq_data_close_file(&other->data);
875
876 stats.comparisons++;
877
878 if (!eq) {
879 jlog(JLOG_VERBOSE2,
880 _("Skipped (content mismatch) %s"), other->links->path);
881 continue;
882 }
883
884 /* link files */
885 if (!file_link(master, other) && errno == EMLINK) {
886 ul_fileeq_data_deinit(&master->data);
887 master = other;
888 }
889 }
890
891 /* don't keep master data in memory */
892 ul_fileeq_data_deinit(&master->data);
893 }
894
895 /* final cleanup */
896 for (other = begin; other != NULL; other = other->next) {
897 if (ul_fileeq_data_associated(&other->data))
898 ul_fileeq_data_deinit(&other->data);
899 }
900 }
901
902 /**
903 * usage - Print the program help and exit
904 */
905 static void __attribute__((__noreturn__)) usage(void)
906 {
907 FILE *out = stdout;
908
909 fputs(USAGE_HEADER, out);
910 fprintf(out, _(" %s [options] <directory>|<file> ...\n"),
911 program_invocation_short_name);
912
913 fputs(USAGE_SEPARATOR, out);
914 fputs(_("Consolidate duplicate files using hardlinks.\n"), out);
915
916 fputs(USAGE_OPTIONS, out);
917 fputs(_(" -v, --verbose verbose output (repeat for more verbosity)\n"), out);
918 fputs(_(" -q, --quiet quiet mode - don't print anything\n"), out);
919 fputs(_(" -n, --dry-run don't actually link anything\n"), out);
920 fputs(_(" -y, --method <name> file content comparison method\n"), out);
921
922 fputs(_(" -f, --respect-name filenames have to be identical\n"), out);
923 fputs(_(" -p, --ignore-mode ignore changes of file mode\n"), out);
924 fputs(_(" -o, --ignore-owner ignore owner changes\n"), out);
925 fputs(_(" -t, --ignore-time ignore timestamps (when testing for equality)\n"), out);
926 #ifdef USE_XATTR
927 fputs(_(" -X, --respect-xattrs respect extended attributes\n"), out);
928 #endif
929 fputs(_(" -m, --maximize maximize the hardlink count, remove the file with\n"
930 " lowest hardlink count\n"), out);
931 fputs(_(" -M, --minimize reverse the meaning of -m\n"), out);
932 fputs(_(" -O, --keep-oldest keep the oldest file of multiple equal files\n"
933 " (lower precedence than minimize/maximize)\n"), out);
934 fputs(_(" -x, --exclude <regex> regular expression to exclude files\n"), out);
935 fputs(_(" -i, --include <regex> regular expression to include files/dirs\n"), out);
936 fputs(_(" -s, --minimum-size <size> minimum size for files.\n"), out);
937 fputs(_(" -S, --maximum-size <size> maximum size for files.\n"), out);
938 fputs(_(" -b, --io-size <size> I/O buffer size for file reading (speedup, using more RAM)\n"), out);
939 fputs(_(" -r, --cache-size <size> memory limit for cached file content data\n"), out);
940 fputs(_(" -c, --content compare only file contents, same as -pot\n"), out);
941
942 fputs(USAGE_SEPARATOR, out);
943 printf(USAGE_HELP_OPTIONS(28));
944 printf(USAGE_MAN_TAIL("hardlink(1)"));
945
946 exit(EXIT_SUCCESS);
947 }
948
949 /**
950 * parse_options - Parse the command line options
951 * @argc: Number of options
952 * @argv: Array of options
953 */
954 static int parse_options(int argc, char *argv[])
955 {
956 static const char optstr[] = "VhvnfpotXcmMOx:y:i:r:S:s:b:q";
957 static const struct option long_options[] = {
958 {"version", no_argument, NULL, 'V'},
959 {"help", no_argument, NULL, 'h'},
960 {"verbose", no_argument, NULL, 'v'},
961 {"dry-run", no_argument, NULL, 'n'},
962 {"respect-name", no_argument, NULL, 'f'},
963 {"ignore-mode", no_argument, NULL, 'p'},
964 {"ignore-owner", no_argument, NULL, 'o'},
965 {"ignore-time", no_argument, NULL, 't'},
966 {"respect-xattrs", no_argument, NULL, 'X'},
967 {"maximize", no_argument, NULL, 'm'},
968 {"minimize", no_argument, NULL, 'M'},
969 {"keep-oldest", no_argument, NULL, 'O'},
970 {"exclude", required_argument, NULL, 'x'},
971 {"include", required_argument, NULL, 'i'},
972 {"method", required_argument, NULL, 'y' },
973 {"minimum-size", required_argument, NULL, 's'},
974 {"maximum-size", required_argument, NULL, 'S'},
975 {"io-size", required_argument, NULL, 'b'},
976 {"content", no_argument, NULL, 'c'},
977 {"quiet", no_argument, NULL, 'q'},
978 {"cache-size", required_argument, NULL, 'r'},
979 {NULL, 0, NULL, 0}
980 };
981 static const ul_excl_t excl[] = {
982 {'q', 'v'},
983 {0}
984 };
985 int excl_st[ARRAY_SIZE(excl)] = UL_EXCL_STATUS_INIT;
986 int c;
987
988 while ((c = getopt_long(argc, argv, optstr, long_options, NULL)) != -1) {
989
990 err_exclusive_options(c, long_options, excl, excl_st);
991
992 switch (c) {
993 case 'p':
994 opts.respect_mode = FALSE;
995 break;
996 case 'o':
997 opts.respect_owner = FALSE;
998 break;
999 case 't':
1000 opts.respect_time = FALSE;
1001 break;
1002 case 'X':
1003 opts.respect_xattrs = TRUE;
1004 break;
1005 case 'm':
1006 opts.maximise = TRUE;
1007 break;
1008 case 'M':
1009 opts.minimise = TRUE;
1010 break;
1011 case 'O':
1012 opts.keep_oldest = TRUE;
1013 break;
1014 case 'f':
1015 opts.respect_name = TRUE;
1016 break;
1017 case 'v':
1018 opts.verbosity++;
1019 break;
1020 case 'q':
1021 quiet = TRUE;
1022 break;
1023 case 'c':
1024 opts.respect_mode = FALSE;
1025 opts.respect_name = FALSE;
1026 opts.respect_owner = FALSE;
1027 opts.respect_time = FALSE;
1028 opts.respect_xattrs = FALSE;
1029 break;
1030 case 'n':
1031 opts.dry_run = 1;
1032 break;
1033 case 'x':
1034 register_regex(&opts.exclude, optarg);
1035 break;
1036 case 'y':
1037 opts.method = optarg;
1038 break;
1039 case 'i':
1040 register_regex(&opts.include, optarg);
1041 break;
1042 case 's':
1043 opts.min_size = strtosize_or_err(optarg, _("failed to parse minimum size"));
1044 break;
1045 case 'S':
1046 opts.max_size = strtosize_or_err(optarg, _("failed to parse maximum size"));
1047 break;
1048 case 'r':
1049 opts.cache_size = strtosize_or_err(optarg, _("failed to cache size"));
1050 break;
1051 case 'b':
1052 opts.io_size = strtosize_or_err(optarg, _("failed to parse I/O size"));
1053 break;
1054 case 'h':
1055 usage();
1056 case 'V':
1057 print_version(EXIT_SUCCESS);
1058 default:
1059 errtryhelp(EXIT_FAILURE);}
1060 }
1061
1062 return 0;
1063 }
1064
1065 /**
1066 * to_be_called_atexit - Cleanup handler, also prints statistics.
1067 */
1068 static void to_be_called_atexit(void)
1069 {
1070 if (stats.started)
1071 print_stats();
1072 }
1073
1074 /**
1075 * sighandler - Signal handler, sets the global last_signal variable
1076 * @i: The signal number
1077 */
1078 static void sighandler(int i)
1079 {
1080 if (last_signal != SIGINT)
1081 last_signal = i;
1082 if (i == SIGINT)
1083 putchar('\n');
1084 }
1085
1086 int main(int argc, char *argv[])
1087 {
1088 struct sigaction sa;
1089 int rc;
1090
1091 sa.sa_handler = sighandler;
1092 sa.sa_flags = SA_RESTART;
1093 sigfillset(&sa.sa_mask);
1094
1095 /* If we receive a SIGINT, end the processing */
1096 sigaction(SIGINT, &sa, NULL);
1097 sigaction(SIGUSR1, &sa, NULL);
1098
1099 /* Pretty print numeric output */
1100 setlocale(LC_NUMERIC, "");
1101
1102 if (atexit(to_be_called_atexit) != 0)
1103 err(EXIT_FAILURE, _("cannot register exit handler"));
1104
1105 parse_options(argc, argv);
1106
1107 if (optind == argc)
1108 errx(EXIT_FAILURE, _("no directory or file specified"));
1109
1110 gettime_monotonic(&stats.start_time);
1111
1112 rc = ul_fileeq_init(&fileeq, opts.method);
1113 if (rc != 0 && strcmp(opts.method, "memcmp") != 0) {
1114 warnx(_("cannot initialize %s method, use 'memcmp' fallback"), opts.method);
1115 opts.method = "memcmp";
1116 rc = ul_fileeq_init(&fileeq, opts.method);
1117 }
1118 if (rc < 0)
1119 err(EXIT_FAILURE, _("failed to initialize files comparior"));
1120
1121 /* defautl I/O size */
1122 if (!opts.io_size) {
1123 if (strcmp(opts.method, "memcmp") == 0)
1124 opts.io_size = 8*1024;
1125 else
1126 opts.io_size = 1024*1024;
1127 }
1128
1129 stats.started = TRUE;
1130
1131 jlog(JLOG_VERBOSE2, _("Scanning [device/inode/links]:"));
1132 for (; optind < argc; optind++) {
1133 if (nftw(argv[optind], inserter, 20, FTW_PHYS) == -1)
1134 warn(_("cannot process %s"), argv[optind]);
1135 }
1136
1137 twalk(files, visitor);
1138
1139 ul_fileeq_deinit(&fileeq);
1140 return 0;
1141 }