]> git.ipfire.org Git - thirdparty/util-linux.git/blame - lib/fileeq.c
Merge branch 'uuid-time64_t' of https://github.com/thkukuk/util-linux
[thirdparty/util-linux.git] / lib / fileeq.c
CommitLineData
ee2d371c
KZ
1/*
2 * compare files content
3 *
4 * The goal is to minimize number of data we need to read from the files and be
5 * ready to compare large set of files, it means reuse the previous data if
6 * possible. It never read entire file if not necessary.
7 *
8 * The another goal is to minimize number of open files (imagine "hardlink /"),
9 * the code can open only two files and reopen the file next time if
10 * necessary.
11 *
12 * This code supports multiple comparison methods. The very basic step which is
13 * generic for all methods is to read and compare an "intro" (a few bytes from
14 * the begging of the file). This intro buffer is always cached in 'struct
15 * ul_fileeq_data', this intro buffer is addresses as block=0. This primitive
16 * thing can reduce a lot ...
17 *
18 * The next steps depend on selected method:
19 *
20 * * memcmp method: always read data to userspace, nothing is cached, compare
21 * directly files content; fast for small sets of the small files.
22 *
23 * * Linux crypto API: zero-copy method based on sendfile(), data blocks are
24 * send to the kernel hash functions (sha1, ...), and only hash digest is read
25 * and cached in usersapce. Fast for large set of (large) files.
26 *
27 *
28 * No copyright is claimed. This code is in the public domain; do with
29 * it what you wish.
30 *
31 * Written by Karel Zak <kzak@redhat.com> [October 2021]
32 */
33
710e8ecb 34#include <inttypes.h>
ee2d371c
KZ
35#include <stdio.h>
36#include <sys/types.h>
37#include <sys/stat.h>
38#include <unistd.h>
39
40/* Linux crypto */
41#ifdef HAVE_LINUX_IF_ALG_H
42# include <sys/socket.h>
43# include <linux/if_alg.h>
44# include <sys/param.h>
45# include <sys/sendfile.h>
ee2d371c
KZ
46#endif
47
48#include "c.h"
49#include "all-io.h"
50#include "fileeq.h"
51#include "debug.h"
52
53static UL_DEBUG_DEFINE_MASK(ulfileeq);
54UL_DEBUG_DEFINE_MASKNAMES(ulfileeq) = UL_DEBUG_EMPTY_MASKNAMES;
55
56#define ULFILEEQ_DEBUG_INIT (1 << 1)
57#define ULFILEEQ_DEBUG_CRYPTO (1 << 2)
58#define ULFILEEQ_DEBUG_DATA (1 << 3)
59#define ULFILEEQ_DEBUG_EQ (1 << 4)
60
61#define DBG(m, x) __UL_DBG(ulfileeq, ULFILEEQ_DEBUG_, m, x)
62#define ON_DBG(m, x) __UL_DBG_CALL(ulfileeq, ULFILEEQ_DEBUG_, m, x)
63
64#define UL_DEBUG_CURRENT_MASK UL_DEBUG_MASK(ulfileeq)
65#include "debugobj.h"
66
67static void ul_fileeq_init_debug(void)
68{
69 if (ulfileeq_debug_mask)
70 return;
71 __UL_INIT_DEBUG_FROM_ENV(ulfileeq, ULFILEEQ_DEBUG_, 0, ULFILEEQ_DEBUG);
72}
73
74enum {
75 UL_FILEEQ_MEMCMP,
76 UL_FILEEQ_SHA1,
77 UL_FILEEQ_SHA256,
78 UL_FILEEQ_CRC32
79};
80
81struct ul_fileeq_method {
82 int id;
83 const char *name; /* name used by applications */
84 const char *kname; /* name used by kernel crypto */
85 short digsiz;
86};
87
88static const struct ul_fileeq_method ul_eq_methods[] = {
89 [UL_FILEEQ_MEMCMP] = {
90 .id = UL_FILEEQ_MEMCMP, .name = "memcmp"
91 },
4f5e14fd 92#ifdef USE_FILEEQ_CRYPTOAPI
ee2d371c
KZ
93 [UL_FILEEQ_SHA1] = {
94 .id = UL_FILEEQ_SHA1, .name = "sha1",
95 .digsiz = 20, .kname = "sha1"
96 },
97 [UL_FILEEQ_SHA256] = {
98 .id = UL_FILEEQ_SHA256, .name = "sha256",
99 .digsiz = 32, .kname = "sha256"
100 },
101
102 [UL_FILEEQ_CRC32] = {
103 .id = UL_FILEEQ_CRC32, .name = "crc32",
104 .digsiz = 4, .kname = "crc32c"
105 }
106#endif
107};
108
4f5e14fd 109#ifdef USE_FILEEQ_CRYPTOAPI
ee2d371c
KZ
110static void deinit_crypto_api(struct ul_fileeq *eq)
111{
112 if (!eq)
113 return;
114
115 DBG(CRYPTO, ul_debugobj(eq, "deinit"));
116
117 if (eq->fd_cip >= 0)
118 close(eq->fd_cip);
119 if (eq->fd_api >= 0)
120 close(eq->fd_api);
121
122 eq->fd_cip = eq->fd_api = -1;
123
124}
125
126static int init_crypto_api(struct ul_fileeq *eq)
127{
128 struct sockaddr_alg sa = {
129 .salg_family = AF_ALG,
130 .salg_type = "hash",
131 };
132
133 assert(eq->method);
134 assert(eq->method->kname);
135 assert(eq->fd_api == -1);
136 assert(eq->fd_cip == -1);
137
138 DBG(CRYPTO, ul_debugobj(eq, "init [%s]", eq->method->kname));
139
140 assert(sizeof(sa.salg_name) > strlen(eq->method->kname) + 1);
141 memcpy(&sa.salg_name, eq->method->kname, strlen(eq->method->kname) + 1);
142
143 if ((eq->fd_api = socket(AF_ALG, SOCK_SEQPACKET, 0)) < 0)
144 goto fail;
145 if (bind(eq->fd_api, (struct sockaddr *) &sa, sizeof(sa)) != 0)
146 goto fail;
147 if ((eq->fd_cip = accept(eq->fd_api, NULL, 0)) < 0)
148 goto fail;
149 return 0;
150fail:
151 deinit_crypto_api(eq);
152 return -1;
153}
154#endif
155
156int ul_fileeq_init(struct ul_fileeq *eq, const char *method)
157{
158 size_t i;
159
160 ul_fileeq_init_debug();
161 DBG(EQ, ul_debugobj(eq, "init [%s]", method));
162
163 memset(eq, 0, sizeof(*eq));
164 eq->fd_api = -1;
165 eq->fd_cip = -1;
166
167 for (i = 0; i < ARRAY_SIZE(ul_eq_methods); i++) {
168 const struct ul_fileeq_method *m = &ul_eq_methods[i];
169
170 if (strcmp(m->name, method) == 0) {
171 eq->method = m;
172 break;
173 }
174 }
175
176 if (!eq->method)
177 return -1;
4f5e14fd 178#ifdef USE_FILEEQ_CRYPTOAPI
ee2d371c
KZ
179 if (eq->method->id != UL_FILEEQ_MEMCMP
180 && init_crypto_api(eq) != 0)
181 return -1;
182#endif
183 return 0;
184}
185
186void ul_fileeq_deinit(struct ul_fileeq *eq)
187{
188 if (!eq)
189 return;
190
191 DBG(EQ, ul_debugobj(eq, "deinit"));
4f5e14fd 192#ifdef USE_FILEEQ_CRYPTOAPI
ee2d371c
KZ
193 deinit_crypto_api(eq);
194#endif
195 free(eq->buf_a);
196 free(eq->buf_b);
197}
198
199void ul_fileeq_data_close_file(struct ul_fileeq_data *data)
200{
201 assert(data);
202
203 if (data->fd >= 0) {
204 DBG(DATA, ul_debugobj(data, "close"));
205 close(data->fd);
206 }
207 data->fd = -1;
208}
209
210void ul_fileeq_data_init(struct ul_fileeq_data *data)
211{
212 DBG(DATA, ul_debugobj(data, "init"));
213 memset(data, 0, sizeof(*data));
214 data->fd = -1;
215}
216
217void ul_fileeq_data_deinit(struct ul_fileeq_data *data)
218{
219 assert(data);
220
221 DBG(DATA, ul_debugobj(data, "deinit"));
222 free(data->blocks);
223 data->blocks = NULL;
224 data->nblocks = 0;
225 data->maxblocks = 0;
226 data->is_eof = 0;
227 data->name = NULL;
228
229 ul_fileeq_data_close_file(data);
230}
231
232int ul_fileeq_data_associated(struct ul_fileeq_data *data)
233{
234 return data->name != NULL;
235}
236
237void ul_fileeq_data_set_file(struct ul_fileeq_data *data, const char *name)
238{
239 assert(data);
240 assert(name);
241
242 DBG(DATA, ul_debugobj(data, "set file: %s", name));
243 ul_fileeq_data_init(data);
244 data->name = name;
245}
246
247size_t ul_fileeq_set_size(struct ul_fileeq *eq, uint64_t filesiz,
248 size_t readsiz, size_t memsiz)
249{
250 uint64_t nreads, maxdigs;
251 size_t digsiz;
252
253 assert(eq);
254
255 eq->filesiz = filesiz;
256
257 switch (eq->method->id) {
258 case UL_FILEEQ_MEMCMP:
259 /* align file size */
260 filesiz = (filesiz + readsiz) / readsiz * readsiz;
261 break;
262 default:
263 digsiz = eq->method->digsiz;
264 if (readsiz < digsiz)
265 readsiz = digsiz;
266 /* align file size */
267 filesiz = (filesiz + readsiz) / readsiz * readsiz;
268 /* calculate limits */
269 maxdigs = memsiz / digsiz;
b547b4a7
KZ
270 if (maxdigs == 0)
271 maxdigs = 1;
ee2d371c
KZ
272 nreads = filesiz / readsiz;
273 /* enlarge readsize for large files */
274 if (nreads > maxdigs)
275 readsiz = filesiz / maxdigs;
276 break;
277 }
278
279 eq->readsiz = readsiz;
280 eq->blocksmax = filesiz / readsiz;
281
710e8ecb 282 DBG(EQ, ul_debugobj(eq, "set sizes: filesiz=%ju, maxblocks=%" PRIu64 ", readsiz=%zu",
ee2d371c
KZ
283 eq->filesiz, eq->blocksmax, eq->readsiz));
284 return eq->blocksmax;
285}
286
287static unsigned char *get_buffer(struct ul_fileeq *eq)
288{
289 if (!eq->buf_a)
290 eq->buf_a = malloc(eq->readsiz);
291 if (!eq->buf_b)
292 eq->buf_b = malloc(eq->readsiz);
293
294 if (!eq->buf_a || !eq->buf_b)
295 return NULL;
296
297 if (eq->buf_last == eq->buf_b)
298 eq->buf_last = eq->buf_a;
299 else
300 eq->buf_last = eq->buf_b;
301
302 return eq->buf_last;
303}
304
305#define get_cached_nblocks(_d) \
306 ((_d)->nblocks ? (_d)->nblocks - 1 : 0)
307
308#define get_cached_offset(_e, _d) \
309 ((_d)->nblocks == 0 ? 0 : \
310 sizeof((_d)->intro) \
311 + (get_cached_nblocks(_d) * (_e)->readsiz))
312
313
314static int get_fd(struct ul_fileeq *eq, struct ul_fileeq_data *data, off_t *off)
315{
316 off_t o = get_cached_offset(eq, data);
317
318 assert(eq);
319 assert(data);
320
321
322 if (data->fd < 0) {
323 DBG(DATA, ul_debugobj(data, "open: %s", data->name));
324 data->fd = open(data->name, O_RDONLY);
325 if (data->fd < 0)
326 return data->fd;
327
328#if defined(POSIX_FADV_SEQUENTIAL) && defined(HAVE_POSIX_FADVISE)
329 ignore_result( posix_fadvise(data->fd, o, 0, POSIX_FADV_SEQUENTIAL) );
330#endif
331 if (o) {
332 DBG(DATA, ul_debugobj(data, "lseek off=%ju", (uintmax_t) o));
333 lseek(data->fd, o, SEEK_SET);
334 }
335 }
336
337 if (off)
338 *off = o;
339
340 return data->fd;
341}
342
343static void memcmp_reset(struct ul_fileeq *eq, struct ul_fileeq_data *data)
344{
345 /* only intro[] is cached */
346 if (data->nblocks)
347 data->nblocks = 1;
348 /* reset file possition */
349 if (data->fd >= 0)
350 lseek(data->fd, get_cached_offset(eq, data), SEEK_SET);
351 data->is_eof = 0;
352}
353
354static ssize_t read_block(struct ul_fileeq *eq, struct ul_fileeq_data *data,
355 size_t n, unsigned char **block)
356{
357 int fd;
358 off_t off = 0;
359 ssize_t rsz;
360
361 if (data->is_eof || n > eq->blocksmax)
362 return 0;
363
364 fd = get_fd(eq, data, &off);
365 if (fd < 0)
366 return fd;
367
368 DBG(DATA, ul_debugobj(data, " read block off=%ju", (uintmax_t) off));
369
370 *block = get_buffer(eq);
371 if (!*block)
372 return -ENOMEM;
373
374 rsz = read_all(data->fd, (char *) *block, eq->readsiz);
375 if (rsz < 0) {
376 DBG(DATA, ul_debugobj(data, " read failed"));
377 return rsz;
378 }
379 off += rsz;
380 data->nblocks++;
381
382 if (rsz == 0 || (uint64_t) off >= eq->filesiz) {
383 data->is_eof = 1;
384 ul_fileeq_data_close_file(data);
385 }
386
387 DBG(DATA, ul_debugobj(data, " read sz=%zu", rsz));
388 return rsz;
389}
390
4f5e14fd 391#ifdef USE_FILEEQ_CRYPTOAPI
ee2d371c
KZ
392static ssize_t get_digest(struct ul_fileeq *eq, struct ul_fileeq_data *data,
393 size_t n, unsigned char **block)
394{
395 off_t off = 0;
396 ssize_t rsz;
397 size_t sz;
398 int fd;
399
400 if (n > eq->blocksmax)
401 return 0;
402
403 /* return already cached if alvalable */
404 if (n < get_cached_nblocks(data)) {
405 DBG(DATA, ul_debugobj(data, " digest cached"));
406 assert(data->blocks);
407 *block = data->blocks + (n * eq->method->digsiz);
408 return eq->method->digsiz;
409 }
410
411 if (data->is_eof) {
412 DBG(DATA, ul_debugobj(data, " file EOF"));
413 return 0;
414 }
415
416 /* read new block */
417 fd = get_fd(eq, data, &off);
418 if (fd < 0)
419 return fd;
420
421 DBG(DATA, ul_debugobj(data, " read digest off=%ju", (uintmax_t) off));
422
423 sz = eq->method->digsiz;
424
425 if (!data->blocks) {
006612cd 426 DBG(DATA, ul_debugobj(data, " alloc cache %" PRIu64, eq->blocksmax * sz));
ee2d371c
KZ
427 data->blocks = malloc(eq->blocksmax * sz);
428 if (!data->blocks)
429 return -ENOMEM;
430 }
431
432 assert(n <= eq->blocksmax);
433
434 rsz = sendfile(eq->fd_cip, data->fd, NULL, eq->readsiz);
435 DBG(DATA, ul_debugobj(data, " sent %zu [%zu wanted] to cipher", rsz, eq->readsiz));
436
437 if (rsz < 0)
438 return rsz;
439
440 off += rsz;
441
442 /* get block digest (note 1st block is data->intro */
443 *block = data->blocks + (n * eq->method->digsiz);
444 rsz = read_all(eq->fd_cip, (char *) *block, sz);
445
446 if (rsz > 0)
447 data->nblocks++;
448 if (rsz == 0 || (uint64_t) off >= eq->filesiz) {
449 data->is_eof = 1;
450 ul_fileeq_data_close_file(data);
451 }
452 DBG(DATA, ul_debugobj(data, " get %zuB digest", rsz));
453 return rsz;
454}
455#endif
456
457static ssize_t get_intro(struct ul_fileeq *eq, struct ul_fileeq_data *data,
458 unsigned char **block)
459{
460 if (data->nblocks == 0) {
461 int fd = get_fd(eq, data, NULL);
462 ssize_t rsz;
463
464 if (fd < 0)
465 return -1;
466 rsz = read_all(fd, (char *) data->intro, sizeof(data->intro));
467 DBG(DATA, ul_debugobj(data, " read %zu bytes intro", sizeof(data->intro)));
468 if (rsz <= 0)
469 return -1;
470 data->nblocks = 1;
471 }
472
473 DBG(DATA, ul_debugobj(data, " return intro"));
474 *block = data->intro;
475 return sizeof(data->intro);
476}
477
478static ssize_t get_cmp_data(struct ul_fileeq *eq, struct ul_fileeq_data *data,
479 size_t blockno, unsigned char **block)
480{
481 if (blockno == 0)
482 return get_intro(eq, data, block);
483
484 blockno--;
485
486 switch (eq->method->id) {
487 case UL_FILEEQ_MEMCMP:
488 return read_block(eq, data, blockno, block);
489 default:
490 break;
491 }
4f5e14fd 492#ifdef USE_FILEEQ_CRYPTOAPI
ee2d371c
KZ
493 return get_digest(eq, data, blockno, block);
494#else
495 return -1;
496#endif
497}
498
499#define CMP(a, b) ((a) > (b) ? 1 : ((a) < (b) ? -1 : 0))
500
501int ul_fileeq(struct ul_fileeq *eq,
502 struct ul_fileeq_data *a, struct ul_fileeq_data *b)
503{
504 int cmp;
505 size_t n = 0;
506
507 DBG(EQ, ul_debugobj(eq, "--> compare %s %s", a->name, b->name));
508
509 if (eq->method->id == UL_FILEEQ_MEMCMP) {
510 memcmp_reset(eq, a);
511 memcmp_reset(eq, b);
512 }
513
514 do {
515 unsigned char *da, *db;
516 ssize_t ca, cb;
517
518 DBG(EQ, ul_debugobj(eq, "compare block #%zu", n));
519
520 ca = get_cmp_data(eq, a, n, &da);
521 if (ca < 0)
522 goto done;
523 cb = get_cmp_data(eq, b, n, &db);
524 if (cb < 0)
525 goto done;
526 if (ca != cb || ca == 0) {
527 cmp = CMP(ca, cb);
528 break;
529
530 }
531 cmp = memcmp(da, db, ca);
532 DBG(EQ, ul_debugobj(eq, "#%zu=%s", n, cmp == 0 ? "match" : "not-match"));
533 n++;
534 } while (cmp == 0);
535
536 if (cmp == 0) {
537 if (!a->is_eof || !b->is_eof)
538 goto done; /* filesize chnaged? */
539
540 DBG(EQ, ul_debugobj(eq, "<-- MATCH"));
541 return 1;
542 }
543done:
544 DBG(EQ, ul_debugobj(eq, " <-- NOT-MATCH"));
545 return 0;
546}
547
548#ifdef TEST_PROGRAM_FILEEQ
549# include <getopt.h>
550# include <err.h>
551
552int main(int argc, char *argv[])
553{
554 struct ul_fileeq eq;
555 struct ul_fileeq_data a, b, c;
556 const char *method = "sha1";
557 static const struct option longopts[] = {
558 { "method", required_argument, NULL, 'm' },
559 { "help", no_argument, NULL, 'h' },
560 { NULL, 0, NULL, 0 }
561 };
562 int ch, rc;
563 const char *file_a = NULL, *file_b = NULL, *file_c = NULL;
564 struct stat st_a, st_b, st_c;
565
566 while ((ch = getopt_long(argc, argv, "m:", longopts, NULL)) != -1) {
567 switch (ch) {
568 case 'm':
569 method = optarg;
570 break;
571 case 'h':
572 printf("usage: %s [options] <file> <file>\n"
573 " -m, --method <memcmp|sha1|crc32> compare method\n",
574 program_invocation_short_name);
575 return EXIT_FAILURE;
576 }
577 }
578
579 if (optind < argc)
580 file_a = argv[optind++];
581 if (optind < argc)
582 file_b = argv[optind++];
583 if (optind < argc)
584 file_c = argv[optind++];
585
586 if (!file_a || !file_b)
587 errx(EXIT_FAILURE, "no files specified, see --help");
588
589 if (stat(file_a, &st_a) != 0 || !S_ISREG(st_a.st_mode))
590 errx(EXIT_FAILURE, "%s: wrong file", file_a);
591 if (stat(file_b, &st_b) != 0 || !S_ISREG(st_b.st_mode))
592 errx(EXIT_FAILURE, "%s: wrong file", file_b);
593 if (file_c && (stat(file_c, &st_c) != 0 || !S_ISREG(st_c.st_mode)))
594 errx(EXIT_FAILURE, "%s: wrong file", file_c);
595
596
597 if (st_a.st_size != st_b.st_size ||
598 (file_c && st_a.st_size != st_c.st_size))
599 errx(EXIT_FAILURE, "size of the files does not match");
600
601
602 rc = ul_fileeq_init(&eq, method);
603 if (rc != 0 && strcmp(method, "memcmp") != 0) {
604 method = "memcmp";
605 rc = ul_fileeq_init(&eq, method);
606 }
607 if (rc < 0)
608 err(EXIT_FAILURE, "failed to initialize files comparior");
609
610 ul_fileeq_data_set_file(&a, file_a);
611 ul_fileeq_data_set_file(&b, file_b);
612
613 /* 3rd is optional */
614 if (file_c)
615 ul_fileeq_data_set_file(&c, file_c);
616
617 /* filesiz, readsiz, memsiz */
618 ul_fileeq_set_size(&eq, st_a.st_size, 1024*1024, 4*1024);
619
620 rc = ul_fileeq(&eq, &a, &b);
621
622 printf("1st vs. 2nd: %s\n", rc == 1 ? "MATCH" : "NOT-MATCH");
623 if (file_c) {
624 rc = ul_fileeq(&eq, &a, &c);
625 printf("1st vs. 3rd: %s\n", rc == 1 ? "MATCH" : "NOT-MATCH");
626
627 rc = ul_fileeq(&eq, &b, &c);
628 printf("2st vs. 3rd: %s\n", rc == 1 ? "MATCH" : "NOT-MATCH");
629 }
630
631 ul_fileeq_data_deinit(&a);
632 ul_fileeq_data_deinit(&b);
633
634 if (file_c)
635 ul_fileeq_data_deinit(&c);
636
637 ul_fileeq_deinit(&eq);
638 return EXIT_FAILURE;
639}
640#endif /* TEST_PROGRAM_FILEEQ */