2 * hardlink - consolidate duplicate files via hardlinks
4 * Copyright (C) 2018 Red Hat, Inc. All rights reserved.
5 * Written by Jakub Jelinek <jakub@redhat.com>
7 * This program is free software; you can redistribute it and/or modify
8 * it under the terms of the GNU General Public License as published by
9 * the Free Software Foundation; either version 2 of the License, or
10 * (at your option) any later version.
12 * This program is distributed in the hope that it would be useful,
13 * but WITHOUT ANY WARRANTY; without even the implied warranty of
14 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
15 * GNU General Public License for more details.
17 * You should have received a copy of the GNU General Public License along
18 * with this program; if not, write to the Free Software Foundation, Inc.,
19 * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA.
22 #include <sys/types.h>
33 # define PCRE2_CODE_UNIT_WIDTH 8
/* Number of hash buckets; hash() masks its result with (NHASH - 1). */
37 #define NHASH (1<<17) /* Must be a power of 2! */
/* Size, in bytes, of each of the two comparison buffers iobuf1 and iobuf2. */
38 #define NIOBUF (1<<12)
/* Pattern for the -x option: main() assigns it from optarg and hands it to pcre2_compile(). */
44 PCRE2_SPTR exclude_pattern
;
/* Match-data block main() creates from the compiled pattern and passes to pcre2_match(). */
45 pcre2_match_data
*match_data
;
78 __attribute__ ((always_inline
))
79 static inline unsigned int hash(off_t size
, time_t mtime
)
81 return (size
^ mtime
) & (NHASH
- 1);
/*
 * stcmp - tell whether two stat records differ for linking purposes.
 *
 * @st1, @st2:    stat records to compare (not modified)
 * @content_only: non-zero to compare st_size only; zero to additionally
 *                require equal st_mode, st_uid, st_gid and st_mtime
 *
 * Returns 0 when every compared field matches and non-zero otherwise,
 * i.e. "0 means equal", like strcmp().
 */
__attribute__ ((always_inline))
static inline int stcmp(const struct stat *st1, const struct stat *st2,
			int content_only)
{
	/* Content-only mode ignores ownership, permissions and timestamps. */
	if (content_only)
		return st1->st_size != st2->st_size;

	return st1->st_mode != st2->st_mode || st1->st_uid != st2->st_uid ||
	       st1->st_gid != st2->st_gid || st1->st_size != st2->st_size ||
	       st1->st_mtime != st2->st_mtime;
}
/*
 * Run statistics.  doexit() prints ndirs, nobjects, nregfiles, ncomp and
 * nlinks under the labels "Directories", "Objects", "IFREG", "Comparisons"
 * and "Linked"/"Would link".
 * NOTE(review): nsaved presumably goes with the "saved"/"Would save" line;
 * the argument of that fprintf is not visible in this excerpt - confirm.
 */
94 long long ndirs
, nobjects
, nregfiles
, ncomp
, nlinks
, nsaved
;
/*
 * doexit - write the run statistics to stderr.
 *
 * The labels switch between "Would link"/"Linked" and "Would save"/"saved"
 * depending on no_link.
 * NOTE(review): parameter i is presumably the process exit status; the
 * statement that consumes it, and any condition guarding the report, are
 * not visible in this excerpt - confirm against the full file.
 */
96 static void doexit(int i
)
99 fprintf(stderr
, "\n\n");
100 fprintf(stderr
, "Directories %lld\n", ndirs
);
101 fprintf(stderr
, "Objects %lld\n", nobjects
);
102 fprintf(stderr
, "IFREG %lld\n", nregfiles
);
103 fprintf(stderr
, "Comparisons %lld\n", ncomp
);
104 fprintf(stderr
, "%s %lld\n",
105 (no_link
? "Would link" : "Linked"), nlinks
);
106 fprintf(stderr
, "%s %lld\n", (no_link
? "Would save" : "saved"),
/*
 * usage - print the command-line help text to stderr.
 *
 * @prog: program name substituted into the "Usage:" line.
 *
 * NOTE(review): what happens after the text is printed (return or exit)
 * is not visible in this excerpt.
 */
112 static void usage(char *prog
)
114 fprintf(stderr
, "Usage: %s [-cnvhf] [-x pat] directories...\n", prog
);
116 " -c When finding candidates for linking, compare only file contents.\n");
118 " -n Don't actually link anything, just report what would be done.\n");
119 fprintf(stderr
, " -v Print summary after hardlinking.\n");
121 " -vv Print every hardlinked file and bytes saved + summary.\n");
122 fprintf(stderr
, " -f Force hardlinking across filesystems.\n");
123 fprintf(stderr
, " -x pat Exclude files matching pattern.\n");
124 fprintf(stderr
, " -h Show help.\n");
/* Buffer rf() reads the start of each file into and sums, word by word, as the file checksum. */
128 unsigned int buf
[NBUF
];
/* Two NIOBUF-byte buffers rf() compares with memcmp() when checking file contents. */
129 char iobuf1
[NIOBUF
], iobuf2
[NIOBUF
];
/*
 * add2 - combine two size_t values; used for size computations passed to
 * malloc()/realloc() elsewhere in this file.
 *
 * The visible error path prints "Integer overflow" to stderr.
 * NOTE(review): presumably returns a + b and bails out when the sum wraps;
 * the addition, the overflow test and the failure action are not visible
 * in this excerpt - confirm against the full file.
 */
131 __attribute__ ((always_inline
))
132 static inline size_t add2(size_t a
, size_t b
)
136 fprintf(stderr
, "\nInteger overflow\n");
/*
 * add3 - three-operand counterpart of add2().
 *
 * @a, @b, @c: sizes to combine
 *
 * Chains two add2() calls, so each intermediate result goes through
 * whatever checking add2() performs (its error path reports
 * "Integer overflow").  Returns the value of the outer add2() call.
 */
__attribute__ ((always_inline))
static inline size_t add3(size_t a, size_t b, size_t c)
{
	return add2(add2(a, b), c);
}
/*
 * growstr - make sure a dynstr has room for newlen bytes plus one more.
 *
 * @str:    dynamic string; str->buf and str->alloc are updated
 * @newlen: number of bytes the caller wants to store
 *
 * When newlen is already below str->alloc the size test succeeds
 * (presumably followed by an early return - that line is not visible).
 * Otherwise the buffer is realloc()ed to add2(newlen, 1) bytes and that
 * size is recorded in str->alloc.  "Out of memory 4" is the message of the
 * failure path.
 * NOTE(review): the realloc() result is stored straight into str->buf, so
 * the old pointer is lost on failure; that is only harmless if the failure
 * path terminates the process - confirm.
 */
153 static void growstr(dynstr
* str
, size_t newlen
)
155 if (newlen
< str
->alloc
)
157 str
->buf
= realloc(str
->buf
, str
->alloc
= add2(newlen
, 1));
159 fprintf(stderr
, "\nOut of memory 4\n");
/*
 * rf - examine one path and, for regular files, try to hardlink it to an
 * already recorded file with identical contents.
 *
 * @name: path to examine; it is lstat()ed, so symlinks are not followed.
 *
 * Directories get a freshly allocated d node holding a copy of the name.
 * Regular files are checksummed over their first sizeof(buf) bytes, looked
 * up by (size, mtime) bucket and checksum, compared chunk by chunk against
 * candidates, and - on a match - replaced by a link created under a
 * temporary name and renamed into place.  Files that are not linked are
 * recorded in a new f node.
 * NOTE(review): many statements of this function (error exits, closes,
 * returns, list insertions) are not visible in this excerpt; the comments
 * below describe only what the visible lines show.
 */
165 static void rf(const char *name
)
167 struct stat st
, st2
, st3
;
168 const size_t namelen
= strlen(name
);
170 if (lstat(name
, &st
))
/* Paths on a different device than `dev` are rejected unless force is set. */
172 if (st
.st_dev
!= dev
&& !force
) {
175 "%s is on different filesystem than the rest.\nUse -f option to override.\n",
/* Directory: allocate a d node sized for the header plus name and its NUL. */
181 if (S_ISDIR(st
.st_mode
)) {
182 d
*dp
= malloc(add3(sizeof(d
), namelen
, 1));
184 fprintf(stderr
, "\nOut of memory 3\n");
187 memcpy(dp
->name
, name
, namelen
+ 1);
190 } else if (S_ISREG(st
.st_mode
)) {
195 int cksumsize
= sizeof(buf
);
/* In content_only mode mtime is forced to 0 so it affects neither the bucket nor the match. */
197 time_t mtime
= content_only
? 0 : st
.st_mtime
;
198 unsigned int hsh
= hash(st
.st_size
, mtime
);
202 fprintf(stderr
, " %s", name
);
203 fd
= open(name
, O_RDONLY
);
/*
 * File shorter than buf: read only st_size bytes and zero the bytes from
 * the end of the data up to the next buf[] element boundary, so the last
 * partially filled word sums deterministically.
 */
206 if ((size_t)st
.st_size
< sizeof(buf
)) {
207 cksumsize
= st
.st_size
;
208 memset(((char *)buf
) + cksumsize
, 0,
209 (sizeof(buf
) - cksumsize
) % sizeof(buf
[0]));
211 if (read(fd
, buf
, cksumsize
) != cksumsize
) {
213 if (verbose
> 1 && namelen
<= NAMELEN
)
214 fprintf(stderr
, "\r%*s\r", (int)(namelen
+ 2),
/* Convert the byte count into a count of buf[] elements, rounding up. */
218 cksumsize
= (cksumsize
+ sizeof(buf
[0]) - 1) / sizeof(buf
[0]);
219 for (i
= 0, cksum
= 0; i
< cksumsize
; i
++) {
/* True when adding buf[i] would wrap the unsigned accumulator. */
220 if (cksum
+ buf
[i
] < cksum
)
/* Look for an existing bucket entry with the same size and mtime. */
225 for (hp
= hps
[hsh
]; hp
; hp
= hp
->next
)
226 if (hp
->size
== st
.st_size
&& hp
->mtime
== mtime
)
229 hp
= malloc(sizeof(h
));
231 fprintf(stderr
, "\nOut of memory 1\n");
234 hp
->size
= st
.st_size
;
/* Find the first recorded file with the same checksum. */
240 for (fp
= hp
->chain
; fp
; fp
= fp
->next
)
241 if (fp
->cksum
== cksum
)
/* Same inode on the same device is already recorded under another name. */
243 for (fp2
= fp
; fp2
&& fp2
->cksum
== cksum
; fp2
= fp2
->next
)
244 if (fp2
->ino
== st
.st_ino
&& fp2
->dev
== st
.st_dev
) {
246 if (verbose
> 1 && namelen
<= NAMELEN
)
247 fprintf(stderr
, "\r%*s\r",
248 (int)(namelen
+ 2), "");
/*
 * Candidate must still be a regular file, match per stcmp(), be a
 * different inode, and live on the same device.
 */
251 for (fp2
= fp
; fp2
&& fp2
->cksum
== cksum
; fp2
= fp2
->next
)
252 if (!lstat(fp2
->name
, &st2
) && S_ISREG(st2
.st_mode
) &&
253 !stcmp(&st
, &st2
, content_only
) &&
254 st2
.st_ino
!= st
.st_ino
&&
255 st2
.st_dev
== st
.st_dev
) {
256 int fd2
= open(fp2
->name
, O_RDONLY
);
259 if (fstat(fd2
, &st2
) || !S_ISREG(st2
.st_mode
)
260 || st2
.st_size
== 0) {
/* Rewind this file and compare it with the candidate in chunks of at most NIOBUF bytes. */
265 lseek(fd
, 0, SEEK_SET
);
266 for (fsize
= st
.st_size
; fsize
> 0;
269 fsize
>= NIOBUF
? NIOBUF
: fsize
;
270 if (read(fd
, iobuf1
, rsize
) != rsize
276 "\nReading error\n");
279 if (memcmp(iobuf1
, iobuf2
, rsize
))
/* Stat the file again and compare with the first stat to detect concurrent changes. */
285 if (lstat(name
, &st3
)) {
287 "\nCould not stat %s again\n",
292 st3
.st_atime
= st
.st_atime
;
293 if (stcmp(&st
, &st3
, 0)) {
295 "\nFile %s changed underneath us\n",
304 ".$$$___cleanit___$$$";
305 const size_t suffixlen
= strlen(suffix
);
306 size_t n2len
= strlen(n2
);
307 dynstr nam2
= { NULL
, 0 };
/* Build the temporary link name: n2 followed by the suffix. */
308 growstr(&nam2
, add2(n2len
, suffixlen
));
309 memcpy(nam2
.buf
, n2
, n2len
);
310 memcpy(&nam2
.buf
[n2len
], suffix
,
312 /* First create a temporary link to n1 under a new name */
313 if (link(n1
, nam2
.buf
)) {
315 "\nFailed to hardlink %s to %s (create temporary link as %s failed - %s)\n",
321 /* Then rename into place over the existing n2 */
322 if (rename(nam2
.buf
, n2
)) {
324 "\nFailed to hardlink %s to %s (rename temporary link to %s failed - %s)\n",
327 /* Something went wrong, try to remove the now redundant temporary link */
328 if (unlink(nam2
.buf
)) {
330 "\nFailed to remove temporary link %s - %s\n",
341 if (st3
.st_nlink
> 1) {
342 /* We actually did not save anything this time, since the link second argument
343 had some other links as well. */
346 "\r%*s\r%s %s to %s\n",
351 (no_link
? "Would link"
352 : "Linked"), n1
, n2
);
/* File size rounded up to a multiple of 4096 bytes. */
355 ((st
.st_size
+ 4095) / 4096) * 4096;
358 "\r%*s\r%s %s to %s, %s %jd\n",
363 (no_link
? "Would link"
365 (no_link
? "would save"
367 (intmax_t)st
.st_size
);
/* Record this file: allocate an f node sized for the header plus name and its NUL. */
372 fp2
= malloc(add3(sizeof(f
), namelen
, 1));
374 fprintf(stderr
, "\nOut of memory 2\n");
378 fp2
->ino
= st
.st_ino
;
379 fp2
->dev
= st
.st_dev
;
381 memcpy(fp2
->name
, name
, namelen
+ 1);
383 fp2
->next
= fp
->next
;
386 fp2
->next
= hp
->chain
;
/* Blank out the progress text that was printed for this file. */
389 if (verbose
> 1 && namelen
<= NAMELEN
)
390 fprintf(stderr
, "\r%*s\r", (int)(namelen
+ 2), "");
395 int main(int argc
, char **argv
)
401 PCRE2_SIZE erroroffset
;
403 dynstr nam1
= { NULL
, 0 };
404 while ((ch
= getopt(argc
, argv
, "cnvhfx:")) != -1) {
420 exclude_pattern
= (PCRE2_SPTR
) optarg
;
422 fprintf(stderr
, "option x not supported (built without pcre2)\n");
434 if (exclude_pattern
) {
435 re
= pcre2_compile(exclude_pattern
, /* the pattern */
436 PCRE2_ZERO_TERMINATED
, /* indicates pattern is zero-terminate */
437 0, /* default options */
438 &errornumber
, &erroroffset
, NULL
); /* use default compile context */
440 PCRE2_UCHAR buffer
[256];
441 pcre2_get_error_message(errornumber
, buffer
,
443 fprintf(stderr
, "pattern error at offset %d: %s\n",
444 (int)erroroffset
, buffer
);
447 match_data
= pcre2_match_data_create_from_pattern(re
, NULL
);
450 for (i
= optind
; i
< argc
; i
++)
456 size_t nam1baselen
= strlen(dp
->name
);
458 growstr(&nam1
, add2(nam1baselen
, 1));
459 memcpy(nam1
.buf
, dp
->name
, nam1baselen
);
461 nam1
.buf
[nam1baselen
++] = '/';
462 nam1
.buf
[nam1baselen
] = 0;
463 dh
= opendir(nam1
.buf
);
467 while ((di
= readdir(dh
)) != NULL
) {
470 if (di
->d_name
[0] == '.') {
471 if (!di
->d_name
[1] || !strcmp(di
->d_name
, ".."))
475 if (re
&& pcre2_match(re
, /* compiled regex */
476 (PCRE2_SPTR
) di
->d_name
, strlen(di
->d_name
), 0, /* start at offset 0 */
477 0, /* default options */
478 match_data
, /* block for storing the result */
479 NULL
) /* use default match context */
482 nam1
.buf
[nam1baselen
] = 0;
483 fprintf(stderr
, "Skipping %s%s\n",
484 nam1
.buf
, di
->d_name
);
492 add2(nam1baselen
, subdirlen
=
493 strlen(di
->d_name
)));
494 memcpy(&nam1
.buf
[nam1baselen
], di
->d_name
,