2 * hardlink - consolidate duplicate files via hardlinks
4 * Copyright (C) 2018 Red Hat, Inc. All rights reserved.
5 * Written by Jakub Jelinek <jakub@redhat.com>
7 * Copyright (C) 2019 Karel Zak <kzak@redhat.com>
9 * This program is free software; you can redistribute it and/or modify
10 * it under the terms of the GNU General Public License as published by
11 * the Free Software Foundation; either version 2 of the License, or
12 * (at your option) any later version.
14 * This program is distributed in the hope that it would be useful,
15 * but WITHOUT ANY WARRANTY; without even the implied warranty of
16 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
17 * GNU General Public License for more details.
19 * You should have received a copy of the GNU General Public License along
20 * with this program; if not, write to the Free Software Foundation, Inc.,
21 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
23 #include <sys/types.h>
35 # define PCRE2_CODE_UNIT_WIDTH 8
42 #include "closestream.h"
44 #define NHASH (1<<17) /* Must be a power of 2! */
/*
 * Hash table entry. Entries that share a bucket are linked through `next`;
 * `chain` is the head of the hardlink_file list kept for the entry.
 */
49 struct hardlink_hash
{
50 struct hardlink_hash
*next
;
51 struct hardlink_file
*chain
;
/* Link to the following hardlink_dir in the list headed by `dirs` below. */
57 struct hardlink_dir
*next
;
/* Per-file record; records are singly linked through `next`. */
61 struct hardlink_file
{
62 struct hardlink_file
*next
;
/* Growable string; other code in this file uses its `buf` and `alloc` members. */
69 struct hardlink_dynstr
{
/*
 * The members below are reached through the control structure elsewhere in
 * this file (ctl->dirs, ctl->hps[], ctl->nlinks, ...): the directory list
 * and the NHASH hash buckets.
 */
75 struct hardlink_dir
*dirs
;
76 struct hardlink_hash
*hps
[NHASH
];
79 /* summary counters */
80 unsigned long long ndirs
;
81 unsigned long long nobjects
;
82 unsigned long long nregfiles
;
83 unsigned long long ncomp
;
84 unsigned long long nlinks
;
85 unsigned long long nsaved
;
95 /* ctl is in global scope due to its use in atexit() */
96 struct hardlink_ctl global_ctl
;
98 __attribute__ ((always_inline
))
99 static inline unsigned int hash(off_t size
, time_t mtime
)
101 return (size
^ mtime
) & (NHASH
- 1);
/*
 * Compare the stat data of two files.
 *
 * With content_scope set only the sizes are compared; otherwise mode, owner,
 * group, size and mtime must all agree.
 *
 * Returns 0 when the compared fields are equal and non-zero when any of them
 * differs. Neither structure is modified.
 */
__attribute__ ((always_inline))
static inline int stcmp(const struct stat *st1, const struct stat *st2,
			int content_scope)
{
	if (content_scope)
		return st1->st_size != st2->st_size;

	return st1->st_mode != st2->st_mode
		|| st1->st_uid != st2->st_uid
		|| st1->st_gid != st2->st_gid
		|| st1->st_size != st2->st_size
		|| st1->st_mtime != st2->st_mtime;
}
/*
 * Print the run statistics collected in global_ctl.
 *
 * Registered with atexit() in main(), which is why it takes no arguments and
 * reads the global control structure directly.
 *
 * NOTE(review): the counters are declared unsigned long long but are printed
 * with %lld; %llu would match the declared type.
 */
117 static void print_summary(void)
119 struct hardlink_ctl
const *const ctl
= &global_ctl
;
/* Only relevant when per-file output was enabled (verbose > 1) and links were counted. */
124 if (ctl
->verbose
> 1 && ctl
->nlinks
)
127 printf(_("Directories: %9lld\n"), ctl
->ndirs
);
128 printf(_("Objects: %9lld\n"), ctl
->nobjects
);
129 printf(_("Regular files: %9lld\n"), ctl
->nregfiles
);
130 printf(_("Comparisons: %9lld\n"), ctl
->ncomp
);
/* The label of the last two rows depends on ctl->no_link. */
131 printf( "%s%9lld\n", (ctl
->no_link
?
133 _("Linked: ")), ctl
->nlinks
);
134 printf( "%s %9lld\n", (ctl
->no_link
?
136 _("Saved: ")), ctl
->nsaved
);
/*
 * Write the help text (synopsis, one-line description and option list) to
 * stdout. Declared __noreturn__, so callers must not expect control back.
 */
139 static void __attribute__((__noreturn__
)) usage(void)
141 fputs(USAGE_HEADER
, stdout
);
142 printf(_(" %s [options] directory...\n"), program_invocation_short_name
);
144 fputs(USAGE_SEPARATOR
, stdout
);
145 puts(_("Consolidate duplicate files using hardlinks."));
147 fputs(USAGE_OPTIONS
, stdout
);
148 puts(_(" -c, --content compare only contents, ignore permission, etc."));
149 puts(_(" -n, --dry-run don't actually link anything"));
150 puts(_(" -v, --verbose print summary after hardlinking"));
151 puts(_(" -vv print every hardlinked file and summary"));
152 puts(_(" -f, --force force hardlinking across filesystems"));
153 puts(_(" -x, --exclude <regex> exclude files matching pattern"));
155 fputs(USAGE_SEPARATOR
, stdout
);
156 printf(USAGE_HELP_OPTIONS(16)); /* char offset to align option descriptions */
157 printf(USAGE_MAN_TAIL("hardlink(1)"));
/*
 * Addition helper for two size_t values, used for size computations in this
 * file (see add3() and growstr()).
 *
 * Reports "integer overflow" through errx(EXIT_FAILURE, ...). The condition
 * guarding that call is not visible here -- presumably wraparound of a + b
 * (TODO confirm).
 */
161 __attribute__ ((always_inline
))
162 static inline size_t add2(size_t a
, size_t b
)
167 errx(EXIT_FAILURE
, _("integer overflow"));
/*
 * Sum three size_t values. Every partial addition goes through add2(), so
 * the overflow handling implemented there covers the whole sum.
 */
__attribute__ ((always_inline))
static inline size_t add3(size_t a, size_t b, size_t c)
{
	return add2(add2(a, b), c);
}
177 static void growstr(struct hardlink_dynstr
*str
, size_t newlen
)
179 if (newlen
< str
->alloc
)
181 str
->buf
= xrealloc(str
->buf
, str
->alloc
= add2(newlen
, 1));
/*
 * Examine a single path and record it in ctl.
 *
 * The path is lstat()ed. A directory gets a hardlink_dir record chained in
 * front of ctl->dirs. A regular file is checksummed over its first
 * sizeof(buf) bytes, looked up in ctl->hps[] by (size, mtime), and compared
 * against previously recorded files with the same checksum; a match may be
 * replaced by a hardlink created through a temporary name.
 *
 * NOTE(review): not every statement of this function is visible in this
 * view; the comments below describe only what can be read here.
 */
184 static void process_path(struct hardlink_ctl
*ctl
, const char *name
)
186 struct stat st
, st2
, st3
;
187 const size_t namelen
= strlen(name
);
/* lstat(): a symbolic link is examined itself rather than followed. */
190 if (lstat(name
, &st
))
/* The path lives on another device than ctl->dev and ctl->force is not set. */
193 if (st
.st_dev
!= ctl
->dev
&& !ctl
->force
) {
196 _("%s is on different filesystem than the rest "
197 "(use -f option to override)."), name
);
198 ctl
->dev
= st
.st_dev
;
/* Directory: copy the name (including its NUL) into a new hardlink_dir. */
200 if (S_ISDIR(st
.st_mode
)) {
201 struct hardlink_dir
*dp
= xmalloc(add3(sizeof(*dp
), namelen
, 1));
202 memcpy(dp
->name
, name
, namelen
+ 1);
203 dp
->next
= ctl
->dirs
;
/* Regular file: candidate for hardlinking. */
206 } else if (S_ISREG(st
.st_mode
)) {
208 struct hardlink_file
*fp
, *fp2
;
209 struct hardlink_hash
*hp
;
211 unsigned int buf
[NBUF
];
212 int cksumsize
= sizeof(buf
);
/* With content_only set, mtime is left out of the lookup key (0 is used). */
214 time_t mtime
= ctl
->content_only
? 0 : st
.st_mtime
;
215 unsigned int hsh
= hash(st
.st_size
, mtime
);
219 if (ctl
->verbose
> 1)
220 printf("%s\n", name
);
222 fd
= open(name
, O_RDONLY
);
/*
 * File shorter than buf: checksum only st_size bytes. The memset zero-fills
 * the remainder of the last, partially filled word of buf.
 */
226 if ((size_t)st
.st_size
< sizeof(buf
)) {
227 cksumsize
= st
.st_size
;
228 memset(((char *)buf
) + cksumsize
, 0,
229 (sizeof(buf
) - cksumsize
) % sizeof(buf
[0]));
231 if (read(fd
, buf
, cksumsize
) != cksumsize
) {
/* From here on cksumsize counts buf[] words (rounded up), not bytes. */
235 cksumsize
= (cksumsize
+ sizeof(buf
[0]) - 1) / sizeof(buf
[0]);
236 for (i
= 0, cksum
= 0; i
< cksumsize
; i
++) {
/* Presumably detects wraparound of the running sum (TODO confirm cksum is unsigned). */
237 if (cksum
+ buf
[i
] < cksum
)
/* Search bucket hsh for an entry with the same size and mtime key. */
242 for (hp
= ctl
->hps
[hsh
]; hp
; hp
= hp
->next
) {
243 if (hp
->size
== st
.st_size
&& hp
->mtime
== mtime
)
/* New entry for this key, linked in front of the bucket list. */
247 hp
= xmalloc(sizeof(*hp
));
248 hp
->size
= st
.st_size
;
251 hp
->next
= ctl
->hps
[hsh
];
/* Find the first recorded file with the same checksum. */
254 for (fp
= hp
->chain
; fp
; fp
= fp
->next
) {
255 if (fp
->cksum
== cksum
)
/* Is this very inode (same ino and dev) already recorded? */
258 for (fp2
= fp
; fp2
&& fp2
->cksum
== cksum
; fp2
= fp2
->next
) {
259 if (fp2
->ino
== st
.st_ino
&& fp2
->dev
== st
.st_dev
) {
/* Try every recorded file with a matching checksum as a link candidate. */
264 for (fp2
= fp
; fp2
&& fp2
->cksum
== cksum
; fp2
= fp2
->next
) {
/*
 * The candidate must still be a regular file, agree with st per stcmp(),
 * be a different inode and live on the same device.
 */
266 if (!lstat(fp2
->name
, &st2
) && S_ISREG(st2
.st_mode
) &&
267 !stcmp(&st
, &st2
, ctl
->content_only
) &&
268 st2
.st_ino
!= st
.st_ino
&&
269 st2
.st_dev
== st
.st_dev
) {
271 int fd2
= open(fp2
->name
, O_RDONLY
);
/* Re-check the opened candidate through its descriptor. */
275 if (fstat(fd2
, &st2
) || !S_ISREG(st2
.st_mode
)
276 || st2
.st_size
== 0) {
/* Rewind fd: its first bytes were already consumed for the checksum. */
281 lseek(fd
, 0, SEEK_SET
);
/* Compare both files in chunks of at most sizeof(ctl->iobuf1) bytes. */
283 for (fsize
= st
.st_size
; fsize
> 0;
284 fsize
-= (off_t
)sizeof(ctl
->iobuf1
)) {
286 ssize_t rsize
= fsize
> (ssize_t
) sizeof(ctl
->iobuf1
) ?
287 (ssize_t
) sizeof(ctl
->iobuf1
) : fsize
;
289 if ((xsz
= read(fd
, ctl
->iobuf1
, rsize
)) != rsize
)
290 warn(_("cannot read %s"), name
);
291 else if ((xsz
= read(fd2
, ctl
->iobuf2
, rsize
)) != rsize
)
292 warn(_("cannot read %s"), fp2
->name
);
299 if (memcmp(ctl
->iobuf1
, ctl
->iobuf2
, rsize
) != 0)
/*
 * Stat name again to notice a modification made while comparing; st3.st_atime
 * is overwritten with st.st_atime before the full stcmp() check.
 */
305 if (lstat(name
, &st3
)) {
306 warn(_("cannot stat %s"), name
);
310 st3
.st_atime
= st
.st_atime
;
311 if (stcmp(&st
, &st3
, 0)) {
312 warnx(_("file %s changed underneath us"), name
);
/* Suffix appended to n2 to build the temporary link name nam2. */
321 ".$$$___cleanit___$$$";
322 const size_t suffixlen
= strlen(suffix
);
323 size_t n2len
= strlen(n2
);
324 struct hardlink_dynstr nam2
= { NULL
, 0 };
326 growstr(&nam2
, add2(n2len
, suffixlen
));
327 memcpy(nam2
.buf
, n2
, n2len
);
328 memcpy(&nam2
.buf
[n2len
], suffix
,
330 /* First create a temporary link to n1 under a new name */
331 if (link(n1
, nam2
.buf
)) {
332 warn(_("failed to hardlink %s to %s (create temporary link as %s failed)"),
337 /* Then rename into place over the existing n2 */
338 if (rename(nam2
.buf
, n2
)) {
339 warn(_("failed to hardlink %s to %s (rename temporary link to %s failed)"),
341 /* Something went wrong, try to remove the now redundant temporary link */
342 if (unlink(nam2
.buf
))
343 warn(_("failed to remove temporary link %s"), nam2
.buf
);
350 if (st3
.st_nlink
> 1) {
351 /* We actually did not save anything this time, since the link second argument
352 had some other links as well. */
353 if (ctl
->verbose
> 1)
354 printf(_(" %s %s to %s\n"),
355 (ctl
->no_link
? _("Would link") : _("Linked")),
/* Saved space is accounted in whole 4096-byte units (size rounded up). */
358 ctl
->nsaved
+= ((st
.st_size
+ 4095) / 4096) * 4096;
359 if (ctl
->verbose
> 1)
360 printf(_(" %s %s to %s, %s %jd\n"),
361 (ctl
->no_link
? _("Would link") : _("Linked")),
363 (ctl
->no_link
? _("would save") : _("saved")),
364 (intmax_t)st
.st_size
);
/* Remember this file: allocate a record with room for the name and its NUL. */
370 fp2
= xmalloc(add3(sizeof(*fp2
), namelen
, 1));
372 fp2
->ino
= st
.st_ino
;
373 fp2
->dev
= st
.st_dev
;
375 memcpy(fp2
->name
, name
, namelen
+ 1);
/* Two insertion points are visible: right after fp, or at the head of hp->chain. */
378 fp2
->next
= fp
->next
;
381 fp2
->next
= hp
->chain
;
388 int main(int argc
, char **argv
)
394 PCRE2_SIZE erroroffset
;
395 pcre2_code
*re
= NULL
;
396 PCRE2_SPTR exclude_pattern
= NULL
;
397 pcre2_match_data
*match_data
= NULL
;
399 struct hardlink_dynstr nam1
= { NULL
, 0 };
400 struct hardlink_ctl
*ctl
= &global_ctl
;
402 static const struct option longopts
[] = {
403 { "content", no_argument
, NULL
, 'c' },
404 { "dry-run", no_argument
, NULL
, 'n' },
405 { "exclude", required_argument
, NULL
, 'x' },
406 { "force", no_argument
, NULL
, 'f' },
407 { "help", no_argument
, NULL
, 'h' },
408 { "verbose", no_argument
, NULL
, 'v' },
409 { "version", no_argument
, NULL
, 'V' },
410 { NULL
, 0, NULL
, 0 },
413 setlocale(LC_ALL
, "");
414 bindtextdomain(PACKAGE
, LOCALEDIR
);
416 close_stdout_atexit();
418 while ((ch
= getopt_long(argc
, argv
, "cnvfx:Vh", longopts
, NULL
)) != -1) {
427 ctl
->content_only
= 1;
434 exclude_pattern
= (PCRE2_SPTR
) optarg
;
437 _("option --exclude not supported (built without pcre2)"));
441 print_version(EXIT_SUCCESS
);
445 errtryhelp(EXIT_FAILURE
);
449 if (optind
== argc
) {
450 warnx(_("no directory specified"));
451 errtryhelp(EXIT_FAILURE
);
455 if (exclude_pattern
) {
456 re
= pcre2_compile(exclude_pattern
, /* the pattern */
457 PCRE2_ZERO_TERMINATED
, /* indicates pattern is zero-terminated */
458 0, /* default options */
459 &errornumber
, &erroroffset
, NULL
); /* use default compile context */
461 PCRE2_UCHAR buffer
[256];
462 pcre2_get_error_message(errornumber
, buffer
,
464 errx(EXIT_FAILURE
, _("pattern error at offset %d: %s"),
465 (int)erroroffset
, buffer
);
467 match_data
= pcre2_match_data_create_from_pattern(re
, NULL
);
470 atexit(print_summary
);
472 for (i
= optind
; i
< argc
; i
++)
473 process_path(ctl
, argv
[i
]);
478 struct hardlink_dir
*dp
= ctl
->dirs
;
479 size_t nam1baselen
= strlen(dp
->name
);
481 ctl
->dirs
= dp
->next
;
482 growstr(&nam1
, add2(nam1baselen
, 1));
483 memcpy(nam1
.buf
, dp
->name
, nam1baselen
);
485 nam1
.buf
[nam1baselen
++] = '/';
486 nam1
.buf
[nam1baselen
] = 0;
487 dh
= opendir(nam1
.buf
);
493 while ((di
= readdir(dh
)) != NULL
) {
496 if (di
->d_name
[0] == '.') {
497 if (!di
->d_name
[1] || !strcmp(di
->d_name
, ".."))
501 if (re
&& pcre2_match(re
, /* compiled regex */
502 (PCRE2_SPTR
) di
->d_name
, strlen(di
->d_name
), 0, /* start at offset 0 */
503 0, /* default options */
504 match_data
, /* block for storing the result */
505 NULL
) /* use default match context */
508 nam1
.buf
[nam1baselen
] = 0;
509 printf(_("Skipping %s%s\n"), nam1
.buf
, di
->d_name
);
517 add2(nam1baselen
, subdirlen
=
518 strlen(di
->d_name
)));
519 memcpy(&nam1
.buf
[nam1baselen
], di
->d_name
,
522 process_path(ctl
, nam1
.buf
);