/*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/

/***
  This file is part of systemd.

  Copyright 2010 Lennart Poettering

  systemd is free software; you can redistribute it and/or modify it
  under the terms of the GNU Lesser General Public License as published by
  the Free Software Foundation; either version 2.1 of the License, or
  (at your option) any later version.

  systemd is distributed in the hope that it will be useful, but
  WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  Lesser General Public License for more details.

  You should have received a copy of the GNU Lesser General Public License
  along with systemd; If not, see <http://www.gnu.org/licenses/>.
***/

#include <errno.h>
#include <inttypes.h>
#include <fcntl.h>
#include <linux/limits.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/select.h>
#include <sys/time.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <unistd.h>
#include <linux/fanotify.h>
#include <sys/signalfd.h>
#include <sys/poll.h>
#include <sys/mman.h>
#include <linux/fs.h>
#include <linux/fiemap.h>
#include <sys/ioctl.h>
#include <sys/vfs.h>
#include <getopt.h>
#include <sys/inotify.h>
#include <math.h>

#ifdef HAVE_LINUX_BTRFS_H
#include <linux/btrfs.h>
#endif

#ifdef HAVE_FANOTIFY_INIT
#include <sys/fanotify.h>
#endif

#include "systemd/sd-daemon.h"

#include "missing.h"
#include "util.h"
#include "set.h"
#include "ioprio.h"
#include "readahead-common.h"
#include "virt.h"

/* fixme:
 *
 * - detect ssd on btrfs/lvm...
 * - read ahead directories
 * - gzip?
 * - remount rw?
 * - handle files where nothing is in mincore
 * - does ioprio_set work with fadvise()?
 */

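/* Pointer into the state shared between the readahead processes, and
 * the monotonic timestamp at which collection started; the latter is
 * used below to group file accesses into time bins. */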
static ReadaheadShared *shared = NULL;
static usec_t starttime;

/* Avoid collisions with the NULL pointer */
#define SECTOR_TO_PTR(s) ULONG_TO_PTR((s)+1)
#define PTR_TO_SECTOR(p) (PTR_TO_ULONG(p)-1)

static int btrfs_defrag(int fd) {
        struct btrfs_ioctl_vol_args data = { .fd = fd };

        return ioctl(fd, BTRFS_IOC_DEFRAG, &data);
}

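/* Appends one record to the pack file: the file name terminated by a
 * newline, the 64-bit inode number (so that deleted or replaced files
 * can be detected later), and then pairs of 32-bit page indices for the
 * ranges that mincore() reported as resident, terminated by a 0/0 pair.
 * No range data at all means the entire file should be read. */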
static int pack_file(FILE *pack, const char *fn, bool on_btrfs) {
        struct stat st;
        void *start = MAP_FAILED;
        uint8_t *vec;
        uint32_t b, c;
        uint64_t inode;
        size_t l, pages;
        bool mapped;
        int r = 0, fd = -1, k;

        assert(pack);
        assert(fn);

        fd = open(fn, O_RDONLY|O_CLOEXEC|O_NOATIME|O_NOCTTY|O_NOFOLLOW);
        if (fd < 0) {

                if (errno == ENOENT)
                        return 0;

                if (errno == EPERM || errno == EACCES)
                        return 0;

                log_warning("open(%s) failed: %m", fn);
                r = -errno;
                goto finish;
        }

        k = file_verify(fd, fn, arg_file_size_max, &st);
        if (k <= 0) {
                r = k;
                goto finish;
        }

        if (on_btrfs)
                btrfs_defrag(fd);

        l = PAGE_ALIGN(st.st_size);
        start = mmap(NULL, l, PROT_READ, MAP_SHARED, fd, 0);
        if (start == MAP_FAILED) {
                log_warning("mmap(%s) failed: %m", fn);
                r = -errno;
                goto finish;
        }

        pages = l / page_size();
        vec = alloca0(pages);
        if (mincore(start, l, vec) < 0) {
                log_warning("mincore(%s) failed: %m", fn);
                r = -errno;
                goto finish;
        }

        fputs(fn, pack);
        fputc('\n', pack);

        /* Store the inode, so that we notice when the file is deleted */
        inode = (uint64_t) st.st_ino;
        fwrite(&inode, sizeof(inode), 1, pack);

        mapped = false;
        for (c = 0; c < pages; c++) {
                bool new_mapped = !!(vec[c] & 1);

                if (!mapped && new_mapped)
                        b = c;
                else if (mapped && !new_mapped) {
                        fwrite(&b, sizeof(b), 1, pack);
                        fwrite(&c, sizeof(c), 1, pack);

                        log_debug("%s: page %u to %u", fn, b, c);
                }

                mapped = new_mapped;
        }

        /* We don't write any range data if we should read the entire file */
        if (mapped && b > 0) {
                fwrite(&b, sizeof(b), 1, pack);
                fwrite(&c, sizeof(c), 1, pack);

                log_debug("%s: page %u to %u", fn, b, c);
        }

        /* End marker */
        b = 0;
        fwrite(&b, sizeof(b), 1, pack);
        fwrite(&b, sizeof(b), 1, pack);

finish:
        if (start != MAP_FAILED)
                munmap(start, l);

        safe_close(fd);

        return r;
}

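/* Returns the physical address of the file's first extent, as reported
 * by the FIEMAP ioctl, or 0 if it cannot be determined. The result is
 * only used as a sort key for rotating media, so 0 is a safe fallback. */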
static unsigned long fd_first_block(int fd) {
        struct {
                struct fiemap fiemap;
                struct fiemap_extent extent;
        } data = {
                .fiemap.fm_length = ~0ULL,
                .fiemap.fm_extent_count = 1,
        };

        if (ioctl(fd, FS_IOC_FIEMAP, &data) < 0)
                return 0;

        if (data.fiemap.fm_mapped_extents <= 0)
                return 0;

        if (data.fiemap.fm_extents[0].fe_flags & FIEMAP_EXTENT_UNKNOWN)
                return 0;

        return (unsigned long) data.fiemap.fm_extents[0].fe_physical;
}

struct item {
        const char *path;
        unsigned long block;
        unsigned long bin;
};

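/* Sorts entries first by the time bin in which they were first accessed
 * (2 s buckets since collection started), then by the physical location
 * of their first block, and finally by path to keep the order stable. */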
static int qsort_compare(const void *a, const void *b) {
        const struct item *i, *j;

        i = a;
        j = b;

        /* sort by bin first */
        if (i->bin < j->bin)
                return -1;
        if (i->bin > j->bin)
                return 1;

        /* then sort by sector */
        if (i->block < j->block)
                return -1;
        if (i->block > j->block)
                return 1;

        return strcmp(i->path, j->path);
}

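/* Watches file accesses below 'root' via fanotify for a limited time,
 * remembers each accessed file together with its first physical block
 * and access time bin, and finally writes the result out as the pack
 * file consumed on replay. Collection ends early on SIGINT/SIGTERM or
 * when a "done" or "cancel" file shows up in /run/systemd/readahead. */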
static int collect(const char *root) {
        enum {
                FD_FANOTIFY,  /* Get the actual fs events */
                FD_SIGNAL,
                FD_INOTIFY,   /* We get notifications to quit early via this fd */
                _FD_MAX
        };
        struct pollfd pollfd[_FD_MAX] = {};
        int fanotify_fd = -1, signal_fd = -1, inotify_fd = -1, r = 0;
        pid_t my_pid;
        Hashmap *files = NULL;
        Iterator i;
        char *p, *q;
        sigset_t mask;
        FILE *pack = NULL;
        char *pack_fn_new = NULL, *pack_fn = NULL;
        bool on_ssd, on_btrfs;
        struct statfs sfs;
        usec_t not_after;
        uint64_t previous_block_readahead;
        bool previous_block_readahead_set = false;

        assert(root);

        if (asprintf(&pack_fn, "%s/.readahead", root) < 0) {
                r = log_oom();
                goto finish;
        }

        starttime = now(CLOCK_MONOTONIC);

        /* If there's no pack file yet we lower the kernel readahead
         * so that mincore() is accurate. If there is a pack file
         * already we assume it is accurate enough so that kernel
         * readahead is never triggered. */
        previous_block_readahead_set =
                access(pack_fn, F_OK) < 0 &&
                block_get_readahead(root, &previous_block_readahead) >= 0 &&
                block_set_readahead(root, 8*1024) >= 0;

        if (ioprio_set(IOPRIO_WHO_PROCESS, getpid(), IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0)) < 0)
                log_warning("Failed to set IDLE IO priority class: %m");

        assert_se(sigemptyset(&mask) == 0);
        sigset_add_many(&mask, SIGINT, SIGTERM, -1);
        assert_se(sigprocmask(SIG_SETMASK, &mask, NULL) == 0);

        if ((signal_fd = signalfd(-1, &mask, SFD_NONBLOCK|SFD_CLOEXEC)) < 0) {
                log_error("signalfd(): %m");
                r = -errno;
                goto finish;
        }

        files = hashmap_new(&string_hash_ops);
        if (!files) {
                log_error("Failed to allocate hashmap.");
                r = -ENOMEM;
                goto finish;
        }

        fanotify_fd = fanotify_init(FAN_CLOEXEC|FAN_NONBLOCK, O_RDONLY|O_LARGEFILE|O_CLOEXEC|O_NOATIME);
        if (fanotify_fd < 0) {
                log_error("Failed to create fanotify object: %m");
                r = -errno;
                goto finish;
        }

        if (fanotify_mark(fanotify_fd, FAN_MARK_ADD|FAN_MARK_MOUNT, FAN_OPEN, AT_FDCWD, root) < 0) {
                log_error("Failed to mark %s: %m", root);
                r = -errno;
                goto finish;
        }

        inotify_fd = open_inotify();
        if (inotify_fd < 0) {
                r = inotify_fd;
                goto finish;
        }

        not_after = now(CLOCK_MONOTONIC) + arg_timeout;

        my_pid = getpid();

        pollfd[FD_FANOTIFY].fd = fanotify_fd;
        pollfd[FD_FANOTIFY].events = POLLIN;
        pollfd[FD_SIGNAL].fd = signal_fd;
        pollfd[FD_SIGNAL].events = POLLIN;
        pollfd[FD_INOTIFY].fd = inotify_fd;
        pollfd[FD_INOTIFY].events = POLLIN;

        sd_notify(0,
                  "READY=1\n"
                  "STATUS=Collecting readahead data");

        log_debug("Collecting...");

        if (access("/run/systemd/readahead/cancel", F_OK) >= 0) {
                log_debug("Collection canceled");
                r = -ECANCELED;
                goto finish;
        }

        if (access("/run/systemd/readahead/done", F_OK) >= 0) {
                log_debug("Got termination request");
                goto done;
        }

        for (;;) {
                union {
                        struct fanotify_event_metadata metadata;
                        char buffer[4096];
                } data;
                ssize_t n;
                struct fanotify_event_metadata *m;
                usec_t t;
                int h;

                if (hashmap_size(files) > arg_files_max) {
                        log_debug("Reached maximum number of read ahead files, ending collection.");
                        break;
                }

                t = now(CLOCK_MONOTONIC);
                if (t >= not_after) {
                        log_debug("Reached maximum collection time, ending collection.");
                        break;
                }

                if ((h = poll(pollfd, _FD_MAX, (int) ((not_after - t) / USEC_PER_MSEC))) < 0) {

                        if (errno == EINTR)
                                continue;

                        log_error("poll(): %m");
                        r = -errno;
                        goto finish;
                }

                if (h == 0) {
                        log_debug("Reached maximum collection time, ending collection.");
                        break;
                }

                if (pollfd[FD_SIGNAL].revents) {
                        log_debug("Got signal.");
                        break;
                }

                if (pollfd[FD_INOTIFY].revents) {
                        uint8_t inotify_buffer[sizeof(struct inotify_event) + FILENAME_MAX];
                        struct inotify_event *e;

                        if ((n = read(inotify_fd, &inotify_buffer, sizeof(inotify_buffer))) < 0) {
                                if (errno == EINTR || errno == EAGAIN)
                                        continue;

                                log_error("Failed to read inotify event: %m");
                                r = -errno;
                                goto finish;
                        }

                        e = (struct inotify_event*) inotify_buffer;
                        while (n > 0) {
                                size_t step;

                                if ((e->mask & IN_CREATE) && streq(e->name, "cancel")) {
                                        log_debug("Collection canceled");
                                        r = -ECANCELED;
                                        goto finish;
                                }

                                if ((e->mask & IN_CREATE) && streq(e->name, "done")) {
                                        log_debug("Got termination request");
                                        goto done;
                                }

                                step = sizeof(struct inotify_event) + e->len;
                                assert(step <= (size_t) n);

                                e = (struct inotify_event*) ((uint8_t*) e + step);
                                n -= step;
                        }
                }

                n = read(fanotify_fd, &data, sizeof(data));
                if (n < 0) {

                        if (errno == EINTR || errno == EAGAIN)
                                continue;

                        /* fanotify sometimes returns EACCES on read()
                         * where it shouldn't. For now let's just
                         * ignore it here (which is safe), but
                         * eventually this should be
                         * dropped when the kernel is fixed.
                         *
                         * https://bugzilla.redhat.com/show_bug.cgi?id=707577 */
                        if (errno == EACCES)
                                continue;

                        log_error("Failed to read event: %m");
                        r = -errno;
                        goto finish;
                }

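                /* Resolve each fanotify event to a path via /proc/self/fd
                 * and remember it, skipping our own accesses and those of
                 * the replay process, files under /tmp, deleted files and
                 * anything we have already seen. */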
                for (m = &data.metadata; FAN_EVENT_OK(m, n); m = FAN_EVENT_NEXT(m, n)) {
                        char fn[sizeof("/proc/self/fd/") + DECIMAL_STR_MAX(int)];
                        int k;

                        if (m->fd < 0)
                                goto next_iteration;

                        if (m->pid == my_pid)
                                goto next_iteration;

                        __sync_synchronize();
                        if (m->pid == shared->replay)
                                goto next_iteration;

                        snprintf(fn, sizeof(fn), "/proc/self/fd/%i", m->fd);
                        k = readlink_malloc(fn, &p);
                        if (k >= 0) {
                                if (startswith(p, "/tmp") ||
                                    endswith(p, " (deleted)") ||
                                    hashmap_get(files, p))
                                        /* Not interesting, or
                                         * already read */
                                        free(p);
                                else {
                                        unsigned long ul;
                                        usec_t entrytime;
                                        struct item *entry;

                                        entry = new0(struct item, 1);
                                        if (!entry) {
                                                r = log_oom();
                                                goto finish;
                                        }

                                        ul = fd_first_block(m->fd);

                                        entrytime = now(CLOCK_MONOTONIC);

                                        entry->block = ul;
                                        entry->path = strdup(p);
                                        if (!entry->path) {
                                                free(entry);
                                                r = log_oom();
                                                goto finish;
                                        }
                                        entry->bin = (entrytime - starttime) / 2000000;

                                        k = hashmap_put(files, p, entry);
                                        if (k < 0) {
                                                log_warning("hashmap_put() failed: %s", strerror(-k));
                                                free(p);
                                        }
                                }

                        } else
                                log_warning("readlink(%s) failed: %s", fn, strerror(-k));

                next_iteration:
                        safe_close(m->fd);
                }
        }

done:
        fanotify_fd = safe_close(fanotify_fd);

        log_debug("Writing Pack File...");

        on_ssd = fs_on_ssd(root) > 0;
        log_debug("On SSD: %s", yes_no(on_ssd));

        on_btrfs = statfs(root, &sfs) >= 0 && F_TYPE_EQUAL(sfs.f_type, BTRFS_SUPER_MAGIC);
        log_debug("On btrfs: %s", yes_no(on_btrfs));

        if (asprintf(&pack_fn_new, "%s/.readahead.new", root) < 0) {
                r = log_oom();
                goto finish;
        }

        pack = fopen(pack_fn_new, "we");
        if (!pack) {
                log_error("Failed to open pack file: %m");
                r = -errno;
                goto finish;
        }

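        /* The pack file starts with a header identifying the host
         * architecture and the pack file format version, followed by a
         * single character recording whether the data was collected on
         * solid-state ('S') or rotating ('R') media. */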
        fputs(CANONICAL_HOST READAHEAD_PACK_FILE_VERSION, pack);
        putc(on_ssd ? 'S' : 'R', pack);

        if (on_ssd || on_btrfs) {

                /* On SSD or on btrfs, just write things out in the
                 * order the files were accessed. */

                HASHMAP_FOREACH_KEY(q, p, files, i)
                        pack_file(pack, p, on_btrfs);
        } else {
                unsigned n;

                /* On rotating media, order things by the block
                 * numbers */

                log_debug("Ordering...");

                n = hashmap_size(files);
                if (n) {
                        _cleanup_free_ struct item *ordered;
                        struct item *j;
                        unsigned k;

                        ordered = new(struct item, n);
                        if (!ordered) {
                                r = log_oom();
                                goto finish;
                        }

                        j = ordered;
                        HASHMAP_FOREACH_KEY(q, p, files, i) {
                                memcpy(j, q, sizeof(struct item));
                                j++;
                        }

                        assert(ordered + n == j);

                        qsort(ordered, n, sizeof(struct item), qsort_compare);

                        for (k = 0; k < n; k++)
                                pack_file(pack, ordered[k].path, on_btrfs);
                } else
                        log_warning("No files to pack.");
        }

        log_debug("Finalizing...");

        fflush(pack);

        if (ferror(pack)) {
                log_error("Failed to write pack file.");
                r = -EIO;
                goto finish;
        }

        if (rename(pack_fn_new, pack_fn) < 0) {
                log_error("Failed to rename readahead file: %m");
                r = -errno;
                goto finish;
        }

        fclose(pack);
        pack = NULL;

        log_debug("Done.");

finish:
        safe_close(fanotify_fd);
        safe_close(signal_fd);
        safe_close(inotify_fd);

        if (pack) {
                fclose(pack);
                unlink(pack_fn_new);
        }
        free(pack_fn_new);
        free(pack_fn);

        while ((p = hashmap_steal_first_key(files)))
                free(p);

        hashmap_free(files);

        if (previous_block_readahead_set) {
                uint64_t bytes;

                /* Restore the original kernel readahead setting if we
                 * changed it, and nobody has overwritten it in the
                 * meantime. */
                if (block_get_readahead(root, &bytes) >= 0 && bytes == 8*1024)
                        block_set_readahead(root, previous_block_readahead);
        }

        return r;
}

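/* Entry point for the collection side: bails out early when collection
 * cannot be useful (read-only media, low memory), records the collector
 * PID in the shared readahead state (the counterpart of the replay PID
 * checked above), and then runs the actual collection. */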
int main_collect(const char *root) {

        if (!root)
                root = "/";

        /* Skip this step on read-only media. Note that we check the
         * underlying block device here, not the read-only flag of the
         * file system on top, since that one is most likely mounted
         * read-only anyway at boot, even if the underlying block
         * device is theoretically writable. */
        if (fs_on_read_only(root) > 0) {
                log_info("Disabling readahead collector due to read-only media.");
                return EXIT_SUCCESS;
        }

        if (!enough_ram()) {
                log_info("Disabling readahead collector due to low memory.");
                return EXIT_SUCCESS;
        }

        shared = shared_get();
        if (!shared)
                return EXIT_FAILURE;

        shared->collect = getpid();
        __sync_synchronize();

        if (collect(root) < 0)
                return EXIT_FAILURE;

        return EXIT_SUCCESS;
}