2 * compare files content
4 * The goal is to minimize number of data we need to read from the files and be
5 * ready to compare large set of files, it means reuse the previous data if
6 * possible. It never read entire file if not necessary.
8 * The another goal is to minimize number of open files (imagine "hardlink /"),
9 * the code can open only two files and reopen the file next time if
12 * This code supports multiple comparison methods. The very basic step which is
13 * generic for all methods is to read and compare an "intro" (a few bytes from
14 * the begging of the file). This intro buffer is always cached in 'struct
15 * ul_fileeq_data', this intro buffer is addresses as block=0. This primitive
16 * thing can reduce a lot ...
18 * The next steps depend on selected method:
20 * * memcmp method: always read data to userspace, nothing is cached, compare
21 * directly files content; fast for small sets of the small files.
23 * * Linux crypto API: zero-copy method based on sendfile(), data blocks are
24 * send to the kernel hash functions (sha1, ...), and only hash digest is read
25 * and cached in usersapce. Fast for large set of (large) files.
28 * No copyright is claimed. This code is in the public domain; do with
31 * Written by Karel Zak <kzak@redhat.com> [October 2021]
36 #include <sys/types.h>
41 #ifdef HAVE_LINUX_IF_ALG_H
42 # include <sys/socket.h>
43 # include <linux/if_alg.h>
44 # include <sys/param.h>
45 # include <sys/sendfile.h>
53 static UL_DEBUG_DEFINE_MASK(ulfileeq
);
54 UL_DEBUG_DEFINE_MASKNAMES(ulfileeq
) = UL_DEBUG_EMPTY_MASKNAMES
;
56 #define ULFILEEQ_DEBUG_INIT (1 << 1)
57 #define ULFILEEQ_DEBUG_CRYPTO (1 << 2)
58 #define ULFILEEQ_DEBUG_DATA (1 << 3)
59 #define ULFILEEQ_DEBUG_EQ (1 << 4)
61 #define DBG(m, x) __UL_DBG(ulfileeq, ULFILEEQ_DEBUG_, m, x)
62 #define ON_DBG(m, x) __UL_DBG_CALL(ulfileeq, ULFILEEQ_DEBUG_, m, x)
64 #define UL_DEBUG_CURRENT_MASK UL_DEBUG_MASK(ulfileeq)
67 static void ul_fileeq_init_debug(void)
69 if (ulfileeq_debug_mask
)
71 __UL_INIT_DEBUG_FROM_ENV(ulfileeq
, ULFILEEQ_DEBUG_
, 0, ULFILEEQ_DEBUG
);
81 struct ul_fileeq_method
{
83 const char *name
; /* name used by applications */
84 const char *kname
; /* name used by kernel crypto */
88 static const struct ul_fileeq_method ul_eq_methods
[] = {
89 [UL_FILEEQ_MEMCMP
] = {
90 .id
= UL_FILEEQ_MEMCMP
, .name
= "memcmp"
92 #ifdef USE_FILEEQ_CRYPTOAPI
94 .id
= UL_FILEEQ_SHA1
, .name
= "sha1",
95 .digsiz
= 20, .kname
= "sha1"
97 [UL_FILEEQ_SHA256
] = {
98 .id
= UL_FILEEQ_SHA256
, .name
= "sha256",
99 .digsiz
= 32, .kname
= "sha256"
102 [UL_FILEEQ_CRC32
] = {
103 .id
= UL_FILEEQ_CRC32
, .name
= "crc32",
104 .digsiz
= 4, .kname
= "crc32c"
109 #ifdef USE_FILEEQ_CRYPTOAPI
110 static void deinit_crypto_api(struct ul_fileeq
*eq
)
115 DBG(CRYPTO
, ul_debugobj(eq
, "deinit"));
122 eq
->fd_cip
= eq
->fd_api
= -1;
126 static int init_crypto_api(struct ul_fileeq
*eq
)
128 struct sockaddr_alg sa
= {
129 .salg_family
= AF_ALG
,
134 assert(eq
->method
->kname
);
135 assert(eq
->fd_api
== -1);
136 assert(eq
->fd_cip
== -1);
138 DBG(CRYPTO
, ul_debugobj(eq
, "init [%s]", eq
->method
->kname
));
140 assert(sizeof(sa
.salg_name
) > strlen(eq
->method
->kname
) + 1);
141 memcpy(&sa
.salg_name
, eq
->method
->kname
, strlen(eq
->method
->kname
) + 1);
143 if ((eq
->fd_api
= socket(AF_ALG
, SOCK_SEQPACKET
, 0)) < 0)
145 if (bind(eq
->fd_api
, (struct sockaddr
*) &sa
, sizeof(sa
)) != 0)
147 if ((eq
->fd_cip
= accept(eq
->fd_api
, NULL
, 0)) < 0)
151 deinit_crypto_api(eq
);
156 int ul_fileeq_init(struct ul_fileeq
*eq
, const char *method
)
160 ul_fileeq_init_debug();
161 DBG(EQ
, ul_debugobj(eq
, "init [%s]", method
));
163 memset(eq
, 0, sizeof(*eq
));
167 for (i
= 0; i
< ARRAY_SIZE(ul_eq_methods
); i
++) {
168 const struct ul_fileeq_method
*m
= &ul_eq_methods
[i
];
170 if (strcmp(m
->name
, method
) == 0) {
178 #ifdef USE_FILEEQ_CRYPTOAPI
179 if (eq
->method
->id
!= UL_FILEEQ_MEMCMP
180 && init_crypto_api(eq
) != 0)
186 void ul_fileeq_deinit(struct ul_fileeq
*eq
)
191 DBG(EQ
, ul_debugobj(eq
, "deinit"));
192 #ifdef USE_FILEEQ_CRYPTOAPI
193 deinit_crypto_api(eq
);
199 void ul_fileeq_data_close_file(struct ul_fileeq_data
*data
)
204 DBG(DATA
, ul_debugobj(data
, "close"));
210 void ul_fileeq_data_init(struct ul_fileeq_data
*data
)
212 DBG(DATA
, ul_debugobj(data
, "init"));
213 memset(data
, 0, sizeof(*data
));
217 void ul_fileeq_data_deinit(struct ul_fileeq_data
*data
)
221 DBG(DATA
, ul_debugobj(data
, "deinit"));
229 ul_fileeq_data_close_file(data
);
232 int ul_fileeq_data_associated(struct ul_fileeq_data
*data
)
234 return data
->name
!= NULL
;
237 void ul_fileeq_data_set_file(struct ul_fileeq_data
*data
, const char *name
)
242 DBG(DATA
, ul_debugobj(data
, "set file: %s", name
));
243 ul_fileeq_data_init(data
);
247 size_t ul_fileeq_set_size(struct ul_fileeq
*eq
, uint64_t filesiz
,
248 size_t readsiz
, size_t memsiz
)
250 uint64_t nreads
, maxdigs
;
255 eq
->filesiz
= filesiz
;
257 switch (eq
->method
->id
) {
258 case UL_FILEEQ_MEMCMP
:
259 /* align file size */
260 filesiz
= (filesiz
+ readsiz
) / readsiz
* readsiz
;
263 digsiz
= eq
->method
->digsiz
;
264 if (readsiz
< digsiz
)
266 /* align file size */
267 filesiz
= (filesiz
+ readsiz
) / readsiz
* readsiz
;
268 /* calculate limits */
269 maxdigs
= memsiz
/ digsiz
;
272 nreads
= filesiz
/ readsiz
;
273 /* enlarge readsize for large files */
274 if (nreads
> maxdigs
)
275 readsiz
= filesiz
/ maxdigs
;
279 eq
->readsiz
= readsiz
;
280 eq
->blocksmax
= filesiz
/ readsiz
;
282 DBG(EQ
, ul_debugobj(eq
, "set sizes: filesiz=%ju, maxblocks=%" PRIu64
", readsiz=%zu",
283 eq
->filesiz
, eq
->blocksmax
, eq
->readsiz
));
284 return eq
->blocksmax
;
287 static unsigned char *get_buffer(struct ul_fileeq
*eq
)
290 eq
->buf_a
= malloc(eq
->readsiz
);
292 eq
->buf_b
= malloc(eq
->readsiz
);
294 if (!eq
->buf_a
|| !eq
->buf_b
)
297 if (eq
->buf_last
== eq
->buf_b
)
298 eq
->buf_last
= eq
->buf_a
;
300 eq
->buf_last
= eq
->buf_b
;
305 #define get_cached_nblocks(_d) \
306 ((_d)->nblocks ? (_d)->nblocks - 1 : 0)
308 #define get_cached_offset(_e, _d) \
309 ((_d)->nblocks == 0 ? 0 : \
310 sizeof((_d)->intro) \
311 + (get_cached_nblocks(_d) * (_e)->readsiz))
314 static int get_fd(struct ul_fileeq
*eq
, struct ul_fileeq_data
*data
, off_t
*off
)
316 off_t o
= get_cached_offset(eq
, data
);
323 DBG(DATA
, ul_debugobj(data
, "open: %s", data
->name
));
324 data
->fd
= open(data
->name
, O_RDONLY
);
328 #if defined(POSIX_FADV_SEQUENTIAL) && defined(HAVE_POSIX_FADVISE)
329 ignore_result( posix_fadvise(data
->fd
, o
, 0, POSIX_FADV_SEQUENTIAL
) );
332 DBG(DATA
, ul_debugobj(data
, "lseek off=%ju", (uintmax_t) o
));
333 lseek(data
->fd
, o
, SEEK_SET
);
343 static void memcmp_reset(struct ul_fileeq
*eq
, struct ul_fileeq_data
*data
)
345 /* only intro[] is cached */
348 /* reset file possition */
350 lseek(data
->fd
, get_cached_offset(eq
, data
), SEEK_SET
);
354 static ssize_t
read_block(struct ul_fileeq
*eq
, struct ul_fileeq_data
*data
,
355 size_t n
, unsigned char **block
)
361 if (data
->is_eof
|| n
> eq
->blocksmax
)
364 fd
= get_fd(eq
, data
, &off
);
368 DBG(DATA
, ul_debugobj(data
, " read block off=%ju", (uintmax_t) off
));
370 *block
= get_buffer(eq
);
374 rsz
= read_all(data
->fd
, (char *) *block
, eq
->readsiz
);
376 DBG(DATA
, ul_debugobj(data
, " read failed"));
382 if (rsz
== 0 || (uint64_t) off
>= eq
->filesiz
) {
384 ul_fileeq_data_close_file(data
);
387 DBG(DATA
, ul_debugobj(data
, " read sz=%zu", rsz
));
391 #ifdef USE_FILEEQ_CRYPTOAPI
392 static ssize_t
get_digest(struct ul_fileeq
*eq
, struct ul_fileeq_data
*data
,
393 size_t n
, unsigned char **block
)
400 if (n
> eq
->blocksmax
)
403 /* return already cached if alvalable */
404 if (n
< get_cached_nblocks(data
)) {
405 DBG(DATA
, ul_debugobj(data
, " digest cached"));
406 assert(data
->blocks
);
407 *block
= data
->blocks
+ (n
* eq
->method
->digsiz
);
408 return eq
->method
->digsiz
;
412 DBG(DATA
, ul_debugobj(data
, " file EOF"));
417 fd
= get_fd(eq
, data
, &off
);
421 DBG(DATA
, ul_debugobj(data
, " read digest off=%ju", (uintmax_t) off
));
423 sz
= eq
->method
->digsiz
;
426 DBG(DATA
, ul_debugobj(data
, " alloc cache %" PRIu64
, eq
->blocksmax
* sz
));
427 data
->blocks
= malloc(eq
->blocksmax
* sz
);
432 assert(n
<= eq
->blocksmax
);
434 rsz
= sendfile(eq
->fd_cip
, data
->fd
, NULL
, eq
->readsiz
);
435 DBG(DATA
, ul_debugobj(data
, " sent %zu [%zu wanted] to cipher", rsz
, eq
->readsiz
));
442 /* get block digest (note 1st block is data->intro */
443 *block
= data
->blocks
+ (n
* eq
->method
->digsiz
);
444 rsz
= read_all(eq
->fd_cip
, (char *) *block
, sz
);
448 if (rsz
== 0 || (uint64_t) off
>= eq
->filesiz
) {
450 ul_fileeq_data_close_file(data
);
452 DBG(DATA
, ul_debugobj(data
, " get %zuB digest", rsz
));
457 static ssize_t
get_intro(struct ul_fileeq
*eq
, struct ul_fileeq_data
*data
,
458 unsigned char **block
)
460 if (data
->nblocks
== 0) {
461 int fd
= get_fd(eq
, data
, NULL
);
466 rsz
= read_all(fd
, (char *) data
->intro
, sizeof(data
->intro
));
467 DBG(DATA
, ul_debugobj(data
, " read %zu bytes intro", sizeof(data
->intro
)));
473 DBG(DATA
, ul_debugobj(data
, " return intro"));
474 *block
= data
->intro
;
475 return sizeof(data
->intro
);
478 static ssize_t
get_cmp_data(struct ul_fileeq
*eq
, struct ul_fileeq_data
*data
,
479 size_t blockno
, unsigned char **block
)
482 return get_intro(eq
, data
, block
);
486 switch (eq
->method
->id
) {
487 case UL_FILEEQ_MEMCMP
:
488 return read_block(eq
, data
, blockno
, block
);
492 #ifdef USE_FILEEQ_CRYPTOAPI
493 return get_digest(eq
, data
, blockno
, block
);
499 #define CMP(a, b) ((a) > (b) ? 1 : ((a) < (b) ? -1 : 0))
501 int ul_fileeq(struct ul_fileeq
*eq
,
502 struct ul_fileeq_data
*a
, struct ul_fileeq_data
*b
)
507 DBG(EQ
, ul_debugobj(eq
, "--> compare %s %s", a
->name
, b
->name
));
509 if (eq
->method
->id
== UL_FILEEQ_MEMCMP
) {
515 unsigned char *da
, *db
;
518 DBG(EQ
, ul_debugobj(eq
, "compare block #%zu", n
));
520 ca
= get_cmp_data(eq
, a
, n
, &da
);
523 cb
= get_cmp_data(eq
, b
, n
, &db
);
526 if (ca
!= cb
|| ca
== 0) {
531 cmp
= memcmp(da
, db
, ca
);
532 DBG(EQ
, ul_debugobj(eq
, "#%zu=%s", n
, cmp
== 0 ? "match" : "not-match"));
537 if (!a
->is_eof
|| !b
->is_eof
)
538 goto done
; /* filesize chnaged? */
540 DBG(EQ
, ul_debugobj(eq
, "<-- MATCH"));
544 DBG(EQ
, ul_debugobj(eq
, " <-- NOT-MATCH"));
548 #ifdef TEST_PROGRAM_FILEEQ
552 int main(int argc
, char *argv
[])
555 struct ul_fileeq_data a
, b
, c
;
556 const char *method
= "sha1";
557 static const struct option longopts
[] = {
558 { "method", required_argument
, NULL
, 'm' },
559 { "help", no_argument
, NULL
, 'h' },
563 const char *file_a
= NULL
, *file_b
= NULL
, *file_c
= NULL
;
564 struct stat st_a
, st_b
, st_c
;
566 while ((ch
= getopt_long(argc
, argv
, "m:", longopts
, NULL
)) != -1) {
572 printf("usage: %s [options] <file> <file>\n"
573 " -m, --method <memcmp|sha1|crc32> compare method\n",
574 program_invocation_short_name
);
580 file_a
= argv
[optind
++];
582 file_b
= argv
[optind
++];
584 file_c
= argv
[optind
++];
586 if (!file_a
|| !file_b
)
587 errx(EXIT_FAILURE
, "no files specified, see --help");
589 if (stat(file_a
, &st_a
) != 0 || !S_ISREG(st_a
.st_mode
))
590 errx(EXIT_FAILURE
, "%s: wrong file", file_a
);
591 if (stat(file_b
, &st_b
) != 0 || !S_ISREG(st_b
.st_mode
))
592 errx(EXIT_FAILURE
, "%s: wrong file", file_b
);
593 if (file_c
&& (stat(file_c
, &st_c
) != 0 || !S_ISREG(st_c
.st_mode
)))
594 errx(EXIT_FAILURE
, "%s: wrong file", file_c
);
597 if (st_a
.st_size
!= st_b
.st_size
||
598 (file_c
&& st_a
.st_size
!= st_c
.st_size
))
599 errx(EXIT_FAILURE
, "size of the files does not match");
602 rc
= ul_fileeq_init(&eq
, method
);
603 if (rc
!= 0 && strcmp(method
, "memcmp") != 0) {
605 rc
= ul_fileeq_init(&eq
, method
);
608 err(EXIT_FAILURE
, "failed to initialize files comparior");
610 ul_fileeq_data_set_file(&a
, file_a
);
611 ul_fileeq_data_set_file(&b
, file_b
);
613 /* 3rd is optional */
615 ul_fileeq_data_set_file(&c
, file_c
);
617 /* filesiz, readsiz, memsiz */
618 ul_fileeq_set_size(&eq
, st_a
.st_size
, 1024*1024, 4*1024);
620 rc
= ul_fileeq(&eq
, &a
, &b
);
622 printf("1st vs. 2nd: %s\n", rc
== 1 ? "MATCH" : "NOT-MATCH");
624 rc
= ul_fileeq(&eq
, &a
, &c
);
625 printf("1st vs. 3rd: %s\n", rc
== 1 ? "MATCH" : "NOT-MATCH");
627 rc
= ul_fileeq(&eq
, &b
, &c
);
628 printf("2st vs. 3rd: %s\n", rc
== 1 ? "MATCH" : "NOT-MATCH");
631 ul_fileeq_data_deinit(&a
);
632 ul_fileeq_data_deinit(&b
);
635 ul_fileeq_data_deinit(&c
);
637 ul_fileeq_deinit(&eq
);
640 #endif /* TEST_PROGRAM_FILEEQ */