]> git.ipfire.org Git - thirdparty/systemd.git/blob - src/readahead/readahead-collect.c
relicense to LGPLv2.1 (with exceptions)
[thirdparty/systemd.git] / src / readahead / readahead-collect.c
1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3 /***
4 This file is part of systemd.
5
6 Copyright 2010 Lennart Poettering
7
8 systemd is free software; you can redistribute it and/or modify it
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
11 (at your option) any later version.
12
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Lesser General Public License for more details.
17
18 You should have received a copy of the GNU Lesser General Public License
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
20 ***/
21
22 #include <errno.h>
23 #include <inttypes.h>
24 #include <fcntl.h>
25 #include <linux/limits.h>
26 #include <stdbool.h>
27 #include <stdio.h>
28 #include <stdlib.h>
29 #include <string.h>
30 #include <sys/select.h>
31 #include <sys/time.h>
32 #include <sys/types.h>
33 #include <sys/stat.h>
34 #include <unistd.h>
35 #include <linux/fanotify.h>
36 #include <sys/signalfd.h>
37 #include <sys/poll.h>
38 #include <sys/mman.h>
39 #include <linux/fs.h>
40 #include <linux/fiemap.h>
41 #include <sys/ioctl.h>
42 #include <sys/vfs.h>
43 #include <getopt.h>
44 #include <sys/inotify.h>
45
46 #include <systemd/sd-daemon.h>
47
48 #include "missing.h"
49 #include "util.h"
50 #include "set.h"
51 #include "ioprio.h"
52 #include "readahead-common.h"
53 #include "virt.h"
54
55 /* fixme:
56 *
57 * - detect ssd on btrfs/lvm...
58 * - read ahead directories
59 * - gzip?
60 * - remount rw?
61 * - handle files where nothing is in mincore
62 * - does ioprio_set work with fadvise()?
63 */
64
65 static unsigned arg_files_max = 16*1024;
66 static off_t arg_file_size_max = READAHEAD_FILE_SIZE_MAX;
67 static usec_t arg_timeout = 2*USEC_PER_MINUTE;
68
69 static ReadaheadShared *shared = NULL;
70
71 /* Avoid collisions with the NULL pointer */
72 #define SECTOR_TO_PTR(s) ULONG_TO_PTR((s)+1)
73 #define PTR_TO_SECTOR(p) (PTR_TO_ULONG(p)-1)
74
75 static int btrfs_defrag(int fd) {
76 struct btrfs_ioctl_vol_args data;
77
78 zero(data);
79 data.fd = fd;
80
81 return ioctl(fd, BTRFS_IOC_DEFRAG, &data);
82 }
83
84 static int pack_file(FILE *pack, const char *fn, bool on_btrfs) {
85 struct stat st;
86 void *start = MAP_FAILED;
87 uint8_t *vec;
88 uint32_t b, c;
89 size_t l, pages;
90 bool mapped;
91 int r = 0, fd = -1, k;
92
93 assert(pack);
94 assert(fn);
95
96 if ((fd = open(fn, O_RDONLY|O_CLOEXEC|O_NOATIME|O_NOCTTY|O_NOFOLLOW)) < 0) {
97
98 if (errno == ENOENT)
99 return 0;
100
101 if (errno == EPERM || errno == EACCES)
102 return 0;
103
104 log_warning("open(%s) failed: %m", fn);
105 r = -errno;
106 goto finish;
107 }
108
109 if ((k = file_verify(fd, fn, arg_file_size_max, &st)) <= 0) {
110 r = k;
111 goto finish;
112 }
113
114 if (on_btrfs)
115 btrfs_defrag(fd);
116
117 l = PAGE_ALIGN(st.st_size);
118 if ((start = mmap(NULL, l, PROT_READ, MAP_SHARED, fd, 0)) == MAP_FAILED) {
119 log_warning("mmap(%s) failed: %m", fn);
120 r = -errno;
121 goto finish;
122 }
123
124 pages = l / page_size();
125
126 vec = alloca(pages);
127 memset(vec, 0, pages);
128 if (mincore(start, l, vec) < 0) {
129 log_warning("mincore(%s) failed: %m", fn);
130 r = -errno;
131 goto finish;
132 }
133
134 fputs(fn, pack);
135 fputc('\n', pack);
136
137 mapped = false;
138 for (c = 0; c < pages; c++) {
139 bool new_mapped = !!(vec[c] & 1);
140
141 if (!mapped && new_mapped)
142 b = c;
143 else if (mapped && !new_mapped) {
144 fwrite(&b, sizeof(b), 1, pack);
145 fwrite(&c, sizeof(c), 1, pack);
146
147 log_debug("%s: page %u to %u", fn, b, c);
148 }
149
150 mapped = new_mapped;
151 }
152
153 /* We don't write any range data if we should read the entire file */
154 if (mapped && b > 0) {
155 fwrite(&b, sizeof(b), 1, pack);
156 fwrite(&c, sizeof(c), 1, pack);
157
158 log_debug("%s: page %u to %u", fn, b, c);
159 }
160
161 /* End marker */
162 b = 0;
163 fwrite(&b, sizeof(b), 1, pack);
164 fwrite(&b, sizeof(b), 1, pack);
165
166 finish:
167 if (start != MAP_FAILED)
168 munmap(start, l);
169
170 if (fd >= 0)
171 close_nointr_nofail(fd);
172
173 return r;
174 }
175
/* Asks FS_IOC_FIEMAP for the first extent of the file behind 'fd'.
 *
 * Returns the fe_physical value of that extent, truncated to unsigned long,
 * or 0 if the ioctl fails, no extent is mapped, or the extent is flagged
 * FIEMAP_EXTENT_UNKNOWN. */
static unsigned long fd_first_block(int fd) {
        /* Request header immediately followed by room for one extent */
        struct {
                struct fiemap fiemap;
                struct fiemap_extent extent;
        } data;
        const struct fiemap_extent *first;

        memset(&data, 0, sizeof(data));
        data.fiemap.fm_extent_count = 1;
        data.fiemap.fm_length = ~0ULL;

        if (ioctl(fd, FS_IOC_FIEMAP, &data) < 0 ||
            data.fiemap.fm_mapped_extents <= 0)
                return 0;

        first = &data.fiemap.fm_extents[0];
        if (first->fe_flags & FIEMAP_EXTENT_UNKNOWN)
                return 0;

        return (unsigned long) first->fe_physical;
}
197
/* One file to pack, together with the value it is sorted by */
struct item {
        const char *path;
        unsigned long block;
};

/* qsort() comparator for struct item: ascending by block, with ties broken
 * by strcmp() on the path. */
static int qsort_compare(const void *a, const void *b) {
        const struct item *lhs = a, *rhs = b;

        if (lhs->block != rhs->block)
                return lhs->block < rhs->block ? -1 : 1;

        return strcmp(lhs->path, rhs->path);
}
216
217 static int collect(const char *root) {
218 enum {
219 FD_FANOTIFY, /* Get the actual fs events */
220 FD_SIGNAL,
221 FD_INOTIFY, /* We get notifications to quit early via this fd */
222 _FD_MAX
223 };
224 struct pollfd pollfd[_FD_MAX];
225 int fanotify_fd = -1, signal_fd = -1, inotify_fd = -1, r = 0;
226 pid_t my_pid;
227 Hashmap *files = NULL;
228 Iterator i;
229 char *p, *q;
230 sigset_t mask;
231 FILE *pack = NULL;
232 char *pack_fn_new = NULL, *pack_fn = NULL;
233 bool on_ssd, on_btrfs;
234 struct statfs sfs;
235 usec_t not_after;
236
237 assert(root);
238
239 write_one_line_file("/proc/self/oom_score_adj", "1000");
240
241 if (ioprio_set(IOPRIO_WHO_PROCESS, getpid(), IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0)) < 0)
242 log_warning("Failed to set IDLE IO priority class: %m");
243
244 assert_se(sigemptyset(&mask) == 0);
245 sigset_add_many(&mask, SIGINT, SIGTERM, -1);
246 assert_se(sigprocmask(SIG_SETMASK, &mask, NULL) == 0);
247
248 if ((signal_fd = signalfd(-1, &mask, SFD_NONBLOCK|SFD_CLOEXEC)) < 0) {
249 log_error("signalfd(): %m");
250 r = -errno;
251 goto finish;
252 }
253
254 if (!(files = hashmap_new(string_hash_func, string_compare_func))) {
255 log_error("Failed to allocate set.");
256 r = -ENOMEM;
257 goto finish;
258 }
259
260 if ((fanotify_fd = fanotify_init(FAN_CLOEXEC|FAN_NONBLOCK, O_RDONLY|O_LARGEFILE|O_CLOEXEC|O_NOATIME)) < 0) {
261 log_error("Failed to create fanotify object: %m");
262 r = -errno;
263 goto finish;
264 }
265
266 if (fanotify_mark(fanotify_fd, FAN_MARK_ADD|FAN_MARK_MOUNT, FAN_OPEN, AT_FDCWD, root) < 0) {
267 log_error("Failed to mark %s: %m", root);
268 r = -errno;
269 goto finish;
270 }
271
272 if ((inotify_fd = open_inotify()) < 0) {
273 r = inotify_fd;
274 goto finish;
275 }
276
277 not_after = now(CLOCK_MONOTONIC) + arg_timeout;
278
279 my_pid = getpid();
280
281 zero(pollfd);
282 pollfd[FD_FANOTIFY].fd = fanotify_fd;
283 pollfd[FD_FANOTIFY].events = POLLIN;
284 pollfd[FD_SIGNAL].fd = signal_fd;
285 pollfd[FD_SIGNAL].events = POLLIN;
286 pollfd[FD_INOTIFY].fd = inotify_fd;
287 pollfd[FD_INOTIFY].events = POLLIN;
288
289 sd_notify(0,
290 "READY=1\n"
291 "STATUS=Collecting readahead data");
292
293 log_debug("Collecting...");
294
295 if (access("/run/systemd/readahead/cancel", F_OK) >= 0) {
296 log_debug("Collection canceled");
297 r = -ECANCELED;
298 goto finish;
299 }
300
301 if (access("/run/systemd/readahead/done", F_OK) >= 0) {
302 log_debug("Got termination request");
303 goto done;
304 }
305
306 for (;;) {
307 union {
308 struct fanotify_event_metadata metadata;
309 char buffer[4096];
310 } data;
311 ssize_t n;
312 struct fanotify_event_metadata *m;
313 usec_t t;
314 int h;
315
316 if (hashmap_size(files) > arg_files_max) {
317 log_debug("Reached maximum number of read ahead files, ending collection.");
318 break;
319 }
320
321 t = now(CLOCK_MONOTONIC);
322 if (t >= not_after) {
323 log_debug("Reached maximum collection time, ending collection.");
324 break;
325 }
326
327 if ((h = poll(pollfd, _FD_MAX, (int) ((not_after - t) / USEC_PER_MSEC))) < 0) {
328
329 if (errno == EINTR)
330 continue;
331
332 log_error("poll(): %m");
333 r = -errno;
334 goto finish;
335 }
336
337 if (h == 0) {
338 log_debug("Reached maximum collection time, ending collection.");
339 break;
340 }
341
342 if (pollfd[FD_SIGNAL].revents) {
343 log_debug("Got signal.");
344 break;
345 }
346
347 if (pollfd[FD_INOTIFY].revents) {
348 uint8_t inotify_buffer[sizeof(struct inotify_event) + FILENAME_MAX];
349 struct inotify_event *e;
350
351 if ((n = read(inotify_fd, &inotify_buffer, sizeof(inotify_buffer))) < 0) {
352 if (errno == EINTR || errno == EAGAIN)
353 continue;
354
355 log_error("Failed to read inotify event: %m");
356 r = -errno;
357 goto finish;
358 }
359
360 e = (struct inotify_event*) inotify_buffer;
361 while (n > 0) {
362 size_t step;
363
364 if ((e->mask & IN_CREATE) && streq(e->name, "cancel")) {
365 log_debug("Collection canceled");
366 r = -ECANCELED;
367 goto finish;
368 }
369
370 if ((e->mask & IN_CREATE) && streq(e->name, "done")) {
371 log_debug("Got termination request");
372 goto done;
373 }
374
375 step = sizeof(struct inotify_event) + e->len;
376 assert(step <= (size_t) n);
377
378 e = (struct inotify_event*) ((uint8_t*) e + step);
379 n -= step;
380 }
381 }
382
383 if ((n = read(fanotify_fd, &data, sizeof(data))) < 0) {
384
385 if (errno == EINTR || errno == EAGAIN)
386 continue;
387
388 /* fanotify sometimes returns EACCES on read()
389 * where it shouldn't. For now let's just
390 * ignore it here (which is safe), but
391 * eventually this should be
392 * dropped when the kernel is fixed.
393 *
394 * https://bugzilla.redhat.com/show_bug.cgi?id=707577 */
395 if (errno == EACCES)
396 continue;
397
398 log_error("Failed to read event: %m");
399 r = -errno;
400 goto finish;
401 }
402
403 for (m = &data.metadata; FAN_EVENT_OK(m, n); m = FAN_EVENT_NEXT(m, n)) {
404 char fn[PATH_MAX];
405 int k;
406
407 if (m->fd < 0)
408 goto next_iteration;
409
410 if (m->pid == my_pid)
411 goto next_iteration;
412
413 __sync_synchronize();
414 if (m->pid == shared->replay)
415 goto next_iteration;
416
417 snprintf(fn, sizeof(fn), "/proc/self/fd/%i", m->fd);
418 char_array_0(fn);
419
420 if ((k = readlink_malloc(fn, &p)) >= 0) {
421 if (startswith(p, "/tmp") ||
422 endswith(p, " (deleted)") ||
423 hashmap_get(files, p))
424 /* Not interesting, or
425 * already read */
426 free(p);
427 else {
428 unsigned long ul;
429
430 ul = fd_first_block(m->fd);
431
432 if ((k = hashmap_put(files, p, SECTOR_TO_PTR(ul))) < 0) {
433 log_warning("set_put() failed: %s", strerror(-k));
434 free(p);
435 }
436 }
437
438 } else
439 log_warning("readlink(%s) failed: %s", fn, strerror(-k));
440
441 next_iteration:
442 if (m->fd)
443 close_nointr_nofail(m->fd);
444 }
445 }
446
447 done:
448 if (fanotify_fd >= 0) {
449 close_nointr_nofail(fanotify_fd);
450 fanotify_fd = -1;
451 }
452
453 log_debug("Writing Pack File...");
454
455 on_ssd = fs_on_ssd(root) > 0;
456 log_debug("On SSD: %s", yes_no(on_ssd));
457
458 on_btrfs = statfs(root, &sfs) >= 0 && (long) sfs.f_type == (long) BTRFS_SUPER_MAGIC;
459 log_debug("On btrfs: %s", yes_no(on_btrfs));
460
461 asprintf(&pack_fn, "%s/.readahead", root);
462 asprintf(&pack_fn_new, "%s/.readahead.new", root);
463
464 if (!pack_fn || !pack_fn_new) {
465 log_error("Out of memory");
466 r = -ENOMEM;
467 goto finish;
468 }
469
470 if (!(pack = fopen(pack_fn_new, "we"))) {
471 log_error("Failed to open pack file: %m");
472 r = -errno;
473 goto finish;
474 }
475
476 fputs(CANONICAL_HOST "\n", pack);
477 putc(on_ssd ? 'S' : 'R', pack);
478
479 if (on_ssd || on_btrfs) {
480
481 /* On SSD or on btrfs, just write things out in the
482 * order the files were accessed. */
483
484 HASHMAP_FOREACH_KEY(q, p, files, i)
485 pack_file(pack, p, on_btrfs);
486 } else {
487 struct item *ordered, *j;
488 unsigned k, n;
489
490 /* On rotating media, order things by the block
491 * numbers */
492
493 log_debug("Ordering...");
494
495 n = hashmap_size(files);
496 if (!(ordered = new(struct item, n))) {
497 log_error("Out of memory");
498 r = -ENOMEM;
499 goto finish;
500 }
501
502 j = ordered;
503 HASHMAP_FOREACH_KEY(q, p, files, i) {
504 j->path = p;
505 j->block = PTR_TO_SECTOR(q);
506 j++;
507 }
508
509 assert(ordered + n == j);
510
511 qsort(ordered, n, sizeof(struct item), qsort_compare);
512
513 for (k = 0; k < n; k++)
514 pack_file(pack, ordered[k].path, on_btrfs);
515
516 free(ordered);
517 }
518
519 log_debug("Finalizing...");
520
521 fflush(pack);
522
523 if (ferror(pack)) {
524 log_error("Failed to write pack file.");
525 r = -EIO;
526 goto finish;
527 }
528
529 if (rename(pack_fn_new, pack_fn) < 0) {
530 log_error("Failed to rename readahead file: %m");
531 r = -errno;
532 goto finish;
533 }
534
535 fclose(pack);
536 pack = NULL;
537
538 log_debug("Done.");
539
540 finish:
541 if (fanotify_fd >= 0)
542 close_nointr_nofail(fanotify_fd);
543
544 if (signal_fd >= 0)
545 close_nointr_nofail(signal_fd);
546
547 if (inotify_fd >= 0)
548 close_nointr_nofail(inotify_fd);
549
550 if (pack) {
551 fclose(pack);
552 unlink(pack_fn_new);
553 }
554
555 free(pack_fn_new);
556 free(pack_fn);
557
558 while ((p = hashmap_steal_first_key(files)))
559 free(p);
560
561 hashmap_free(files);
562
563 return r;
564 }
565
566 static int help(void) {
567
568 printf("%s [OPTIONS...] [DIRECTORY]\n\n"
569 "Collect read-ahead data on early boot.\n\n"
570 " -h --help Show this help\n"
571 " --max-files=INT Maximum number of files to read ahead\n"
572 " --max-file-size=BYTES Maximum size of files to read ahead\n"
573 " --timeout=USEC Maximum time to spend collecting data\n",
574 program_invocation_short_name);
575
576 return 0;
577 }
578
579 static int parse_argv(int argc, char *argv[]) {
580
581 enum {
582 ARG_FILES_MAX = 0x100,
583 ARG_FILE_SIZE_MAX,
584 ARG_TIMEOUT
585 };
586
587 static const struct option options[] = {
588 { "help", no_argument, NULL, 'h' },
589 { "files-max", required_argument, NULL, ARG_FILES_MAX },
590 { "file-size-max", required_argument, NULL, ARG_FILE_SIZE_MAX },
591 { "timeout", required_argument, NULL, ARG_TIMEOUT },
592 { NULL, 0, NULL, 0 }
593 };
594
595 int c;
596
597 assert(argc >= 0);
598 assert(argv);
599
600 while ((c = getopt_long(argc, argv, "h", options, NULL)) >= 0) {
601
602 switch (c) {
603
604 case 'h':
605 help();
606 return 0;
607
608 case ARG_FILES_MAX:
609 if (safe_atou(optarg, &arg_files_max) < 0 || arg_files_max <= 0) {
610 log_error("Failed to parse maximum number of files %s.", optarg);
611 return -EINVAL;
612 }
613 break;
614
615 case ARG_FILE_SIZE_MAX: {
616 unsigned long long ull;
617
618 if (safe_atollu(optarg, &ull) < 0 || ull <= 0) {
619 log_error("Failed to parse maximum file size %s.", optarg);
620 return -EINVAL;
621 }
622
623 arg_file_size_max = (off_t) ull;
624 break;
625 }
626
627 case ARG_TIMEOUT:
628 if (parse_usec(optarg, &arg_timeout) < 0 || arg_timeout <= 0) {
629 log_error("Failed to parse timeout %s.", optarg);
630 return -EINVAL;
631 }
632
633 break;
634
635 case '?':
636 return -EINVAL;
637
638 default:
639 log_error("Unknown option code %c", c);
640 return -EINVAL;
641 }
642 }
643
644 if (optind != argc &&
645 optind != argc-1) {
646 help();
647 return -EINVAL;
648 }
649
650 return 1;
651 }
652
653 int main(int argc, char *argv[]) {
654 int r;
655 const char *root;
656
657 log_set_target(LOG_TARGET_AUTO);
658 log_parse_environment();
659 log_open();
660
661 umask(0022);
662
663 if ((r = parse_argv(argc, argv)) <= 0)
664 return r < 0 ? EXIT_FAILURE : EXIT_SUCCESS;
665
666 root = optind < argc ? argv[optind] : "/";
667
668 if (fs_on_read_only(root) > 0) {
669 log_info("Disabling readahead collector due to read-only media.");
670 return 0;
671 }
672
673 if (!enough_ram()) {
674 log_info("Disabling readahead collector due to low memory.");
675 return 0;
676 }
677
678 if (detect_virtualization(NULL) > 0) {
679 log_info("Disabling readahead collector due to execution in virtualized environment.");
680 return 0;
681 }
682
683 if (!(shared = shared_get()))
684 return 1;
685
686 shared->collect = getpid();
687 __sync_synchronize();
688
689 if (collect(root) < 0)
690 return 1;
691
692 return 0;
693 }