]> git.ipfire.org Git - thirdparty/systemd.git/blame - src/readahead/readahead-collect.c
relicense to LGPLv2.1 (with exceptions)
[thirdparty/systemd.git] / src / readahead / readahead-collect.c
CommitLineData
22be093f
LP
/*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/

/***
  This file is part of systemd.

  Copyright 2010 Lennart Poettering

  systemd is free software; you can redistribute it and/or modify it
  under the terms of the GNU Lesser General Public License as published by
  the Free Software Foundation; either version 2.1 of the License, or
  (at your option) any later version.

  systemd is distributed in the hope that it will be useful, but
  WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  Lesser General Public License for more details.

  You should have received a copy of the GNU Lesser General Public License
  along with systemd; If not, see <http://www.gnu.org/licenses/>.
***/
21
#include <errno.h>
#include <inttypes.h>
#include <fcntl.h>
#include <limits.h>
#include <linux/limits.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/select.h>
#include <sys/time.h>
#include <sys/types.h>
#include <sys/stat.h>
#include <unistd.h>
#include <linux/fanotify.h>
#include <sys/signalfd.h>
#include <sys/poll.h>
#include <sys/mman.h>
#include <linux/fs.h>
#include <linux/fiemap.h>
#include <sys/ioctl.h>
#include <sys/vfs.h>
#include <getopt.h>
#include <sys/inotify.h>

#include <systemd/sd-daemon.h>

#include "missing.h"
#include "util.h"
#include "set.h"
#include "ioprio.h"
#include "readahead-common.h"
#include "virt.h"
22be093f 54
41a598e2
LP
/* fixme:
 *
 * - detect ssd on btrfs/lvm...
 * - read ahead directories
 * - gzip?
 * - remount rw?
 * - handle files where nothing is in mincore
 * - does ioprio_set work with fadvise()?
 */
64
8260358d
LP
65static unsigned arg_files_max = 16*1024;
66static off_t arg_file_size_max = READAHEAD_FILE_SIZE_MAX;
67static usec_t arg_timeout = 2*USEC_PER_MINUTE;
68
d9c7a87b
LP
69static ReadaheadShared *shared = NULL;
70
2e7485f0
LP
71/* Avoid collisions with the NULL pointer */
72#define SECTOR_TO_PTR(s) ULONG_TO_PTR((s)+1)
73#define PTR_TO_SECTOR(p) (PTR_TO_ULONG(p)-1)
74
746f8906
LP
75static int btrfs_defrag(int fd) {
76 struct btrfs_ioctl_vol_args data;
22be093f 77
746f8906
LP
78 zero(data);
79 data.fd = fd;
22be093f 80
746f8906
LP
81 return ioctl(fd, BTRFS_IOC_DEFRAG, &data);
82}
83
84static int pack_file(FILE *pack, const char *fn, bool on_btrfs) {
22be093f
LP
85 struct stat st;
86 void *start = MAP_FAILED;
8260358d 87 uint8_t *vec;
22be093f
LP
88 uint32_t b, c;
89 size_t l, pages;
90 bool mapped;
91 int r = 0, fd = -1, k;
92
93 assert(pack);
94 assert(fn);
95
96 if ((fd = open(fn, O_RDONLY|O_CLOEXEC|O_NOATIME|O_NOCTTY|O_NOFOLLOW)) < 0) {
a78899f5
LP
97
98 if (errno == ENOENT)
99 return 0;
100
a76fad09
LP
101 if (errno == EPERM || errno == EACCES)
102 return 0;
103
22be093f
LP
104 log_warning("open(%s) failed: %m", fn);
105 r = -errno;
106 goto finish;
107 }
108
8260358d 109 if ((k = file_verify(fd, fn, arg_file_size_max, &st)) <= 0) {
22be093f
LP
110 r = k;
111 goto finish;
112 }
113
746f8906
LP
114 if (on_btrfs)
115 btrfs_defrag(fd);
116
22be093f
LP
117 l = PAGE_ALIGN(st.st_size);
118 if ((start = mmap(NULL, l, PROT_READ, MAP_SHARED, fd, 0)) == MAP_FAILED) {
119 log_warning("mmap(%s) failed: %m", fn);
120 r = -errno;
121 goto finish;
122 }
123
37f85e66 124 pages = l / page_size();
8260358d
LP
125
126 vec = alloca(pages);
37f85e66 127 memset(vec, 0, pages);
22be093f
LP
128 if (mincore(start, l, vec) < 0) {
129 log_warning("mincore(%s) failed: %m", fn);
130 r = -errno;
131 goto finish;
132 }
133
134 fputs(fn, pack);
135 fputc('\n', pack);
136
22be093f
LP
137 mapped = false;
138 for (c = 0; c < pages; c++) {
408b85df 139 bool new_mapped = !!(vec[c] & 1);
22be093f
LP
140
141 if (!mapped && new_mapped)
142 b = c;
143 else if (mapped && !new_mapped) {
144 fwrite(&b, sizeof(b), 1, pack);
145 fwrite(&c, sizeof(c), 1, pack);
146
147 log_debug("%s: page %u to %u", fn, b, c);
148 }
149
150 mapped = new_mapped;
151 }
152
153 /* We don't write any range data if we should read the entire file */
154 if (mapped && b > 0) {
155 fwrite(&b, sizeof(b), 1, pack);
156 fwrite(&c, sizeof(c), 1, pack);
157
158 log_debug("%s: page %u to %u", fn, b, c);
159 }
160
161 /* End marker */
162 b = 0;
163 fwrite(&b, sizeof(b), 1, pack);
164 fwrite(&b, sizeof(b), 1, pack);
165
166finish:
167 if (start != MAP_FAILED)
168 munmap(start, l);
169
170 if (fd >= 0)
171 close_nointr_nofail(fd);
172
173 return r;
174}
175
/* Queries the first extent of the file referred to by fd via
 * FS_IOC_FIEMAP and returns its fe_physical value, converted to
 * unsigned long.
 *
 * Returns 0 if the ioctl fails, if no extent is mapped, or if the extent
 * is flagged FIEMAP_EXTENT_UNKNOWN. */
static unsigned long fd_first_block(int fd) {
        /* struct fiemap ends in an extent array; the member following it
         * provides the storage for the single extent requested below. */
        struct {
                struct fiemap fiemap;
                struct fiemap_extent extent;
        } data;

        zero(data);
        data.fiemap.fm_length = ~0ULL;
        data.fiemap.fm_extent_count = 1;

        if (ioctl(fd, FS_IOC_FIEMAP, &data) < 0)
                return 0;

        if (data.fiemap.fm_mapped_extents == 0)
                return 0;

        if (data.fiemap.fm_extents[0].fe_flags & FIEMAP_EXTENT_UNKNOWN)
                return 0;

        return (unsigned long) data.fiemap.fm_extents[0].fe_physical;
}
197
/* One collected file, as sorted by collect() on rotating media. */
struct item {
        const char *path;       /* file name; points at the hashmap key, not owned by the item */
        unsigned long block;    /* value stored for the file, as obtained from fd_first_block() */
};

/* qsort() comparator for struct item: orders by ascending block number
 * and, for equal block numbers, by path (strcmp() order). */
static int qsort_compare(const void *a, const void *b) {
        const struct item *i, *j;

        i = a;
        j = b;

        if (i->block < j->block)
                return -1;
        if (i->block > j->block)
                return 1;

        return strcmp(i->path, j->path);
}
216
217static int collect(const char *root) {
218 enum {
858209c5 219 FD_FANOTIFY, /* Get the actual fs events */
22be093f 220 FD_SIGNAL,
6624768c 221 FD_INOTIFY, /* We get notifications to quit early via this fd */
22be093f
LP
222 _FD_MAX
223 };
224 struct pollfd pollfd[_FD_MAX];
6624768c 225 int fanotify_fd = -1, signal_fd = -1, inotify_fd = -1, r = 0;
22be093f
LP
226 pid_t my_pid;
227 Hashmap *files = NULL;
228 Iterator i;
229 char *p, *q;
230 sigset_t mask;
231 FILE *pack = NULL;
232 char *pack_fn_new = NULL, *pack_fn = NULL;
746f8906
LP
233 bool on_ssd, on_btrfs;
234 struct statfs sfs;
408b85df 235 usec_t not_after;
22be093f
LP
236
237 assert(root);
238
75a010e0
LP
239 write_one_line_file("/proc/self/oom_score_adj", "1000");
240
22be093f
LP
241 if (ioprio_set(IOPRIO_WHO_PROCESS, getpid(), IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0)) < 0)
242 log_warning("Failed to set IDLE IO priority class: %m");
243
244 assert_se(sigemptyset(&mask) == 0);
245 sigset_add_many(&mask, SIGINT, SIGTERM, -1);
246 assert_se(sigprocmask(SIG_SETMASK, &mask, NULL) == 0);
247
248 if ((signal_fd = signalfd(-1, &mask, SFD_NONBLOCK|SFD_CLOEXEC)) < 0) {
249 log_error("signalfd(): %m");
250 r = -errno;
251 goto finish;
252 }
253
254 if (!(files = hashmap_new(string_hash_func, string_compare_func))) {
255 log_error("Failed to allocate set.");
256 r = -ENOMEM;
257 goto finish;
258 }
259
408b85df 260 if ((fanotify_fd = fanotify_init(FAN_CLOEXEC|FAN_NONBLOCK, O_RDONLY|O_LARGEFILE|O_CLOEXEC|O_NOATIME)) < 0) {
22be093f
LP
261 log_error("Failed to create fanotify object: %m");
262 r = -errno;
263 goto finish;
264 }
265
266 if (fanotify_mark(fanotify_fd, FAN_MARK_ADD|FAN_MARK_MOUNT, FAN_OPEN, AT_FDCWD, root) < 0) {
267 log_error("Failed to mark %s: %m", root);
268 r = -errno;
269 goto finish;
270 }
271
6624768c
LP
272 if ((inotify_fd = open_inotify()) < 0) {
273 r = inotify_fd;
274 goto finish;
275 }
276
8260358d 277 not_after = now(CLOCK_MONOTONIC) + arg_timeout;
408b85df 278
22be093f
LP
279 my_pid = getpid();
280
281 zero(pollfd);
282 pollfd[FD_FANOTIFY].fd = fanotify_fd;
283 pollfd[FD_FANOTIFY].events = POLLIN;
284 pollfd[FD_SIGNAL].fd = signal_fd;
285 pollfd[FD_SIGNAL].events = POLLIN;
6624768c
LP
286 pollfd[FD_INOTIFY].fd = inotify_fd;
287 pollfd[FD_INOTIFY].events = POLLIN;
22be093f
LP
288
289 sd_notify(0,
290 "READY=1\n"
291 "STATUS=Collecting readahead data");
292
293 log_debug("Collecting...");
294
2b583ce6 295 if (access("/run/systemd/readahead/cancel", F_OK) >= 0) {
6624768c
LP
296 log_debug("Collection canceled");
297 r = -ECANCELED;
298 goto finish;
299 }
300
2b583ce6 301 if (access("/run/systemd/readahead/done", F_OK) >= 0) {
6624768c
LP
302 log_debug("Got termination request");
303 goto done;
304 }
305
22be093f
LP
306 for (;;) {
307 union {
308 struct fanotify_event_metadata metadata;
309 char buffer[4096];
310 } data;
311 ssize_t n;
312 struct fanotify_event_metadata *m;
408b85df
LP
313 usec_t t;
314 int h;
22be093f 315
8260358d 316 if (hashmap_size(files) > arg_files_max) {
408b85df 317 log_debug("Reached maximum number of read ahead files, ending collection.");
6e3eb5ba 318 break;
408b85df
LP
319 }
320
321 t = now(CLOCK_MONOTONIC);
322 if (t >= not_after) {
323 log_debug("Reached maximum collection time, ending collection.");
324 break;
325 }
6e3eb5ba 326
408b85df 327 if ((h = poll(pollfd, _FD_MAX, (int) ((not_after - t) / USEC_PER_MSEC))) < 0) {
22be093f
LP
328
329 if (errno == EINTR)
330 continue;
331
332 log_error("poll(): %m");
333 r = -errno;
334 goto finish;
335 }
336
408b85df
LP
337 if (h == 0) {
338 log_debug("Reached maximum collection time, ending collection.");
339 break;
340 }
341
6624768c
LP
342 if (pollfd[FD_SIGNAL].revents) {
343 log_debug("Got signal.");
344 break;
345 }
346
347 if (pollfd[FD_INOTIFY].revents) {
348 uint8_t inotify_buffer[sizeof(struct inotify_event) + FILENAME_MAX];
349 struct inotify_event *e;
350
351 if ((n = read(inotify_fd, &inotify_buffer, sizeof(inotify_buffer))) < 0) {
352 if (errno == EINTR || errno == EAGAIN)
353 continue;
354
355 log_error("Failed to read inotify event: %m");
356 r = -errno;
357 goto finish;
358 }
359
360 e = (struct inotify_event*) inotify_buffer;
361 while (n > 0) {
362 size_t step;
363
364 if ((e->mask & IN_CREATE) && streq(e->name, "cancel")) {
365 log_debug("Collection canceled");
366 r = -ECANCELED;
367 goto finish;
368 }
369
370 if ((e->mask & IN_CREATE) && streq(e->name, "done")) {
371 log_debug("Got termination request");
372 goto done;
373 }
374
375 step = sizeof(struct inotify_event) + e->len;
376 assert(step <= (size_t) n);
377
378 e = (struct inotify_event*) ((uint8_t*) e + step);
379 n -= step;
380 }
381 }
382
22be093f
LP
383 if ((n = read(fanotify_fd, &data, sizeof(data))) < 0) {
384
cf37e246
LP
385 if (errno == EINTR || errno == EAGAIN)
386 continue;
387
388 /* fanotify sometimes returns EACCES on read()
389 * where it shouldn't. For now let's just
390 * ignore it here (which is safe), but
391 * eventually this should be
392 * dropped when the kernel is fixed.
393 *
394 * https://bugzilla.redhat.com/show_bug.cgi?id=707577 */
395 if (errno == EACCES)
22be093f
LP
396 continue;
397
398 log_error("Failed to read event: %m");
399 r = -errno;
400 goto finish;
401 }
402
408b85df 403 for (m = &data.metadata; FAN_EVENT_OK(m, n); m = FAN_EVENT_NEXT(m, n)) {
d9c7a87b
LP
404 char fn[PATH_MAX];
405 int k;
22be093f 406
d9c7a87b
LP
407 if (m->fd < 0)
408 goto next_iteration;
22be093f 409
d9c7a87b
LP
410 if (m->pid == my_pid)
411 goto next_iteration;
22be093f 412
d9c7a87b
LP
413 __sync_synchronize();
414 if (m->pid == shared->replay)
415 goto next_iteration;
22be093f 416
d9c7a87b
LP
417 snprintf(fn, sizeof(fn), "/proc/self/fd/%i", m->fd);
418 char_array_0(fn);
419
420 if ((k = readlink_malloc(fn, &p)) >= 0) {
d9c7a87b 421 if (startswith(p, "/tmp") ||
0840ce2d 422 endswith(p, " (deleted)") ||
d9c7a87b
LP
423 hashmap_get(files, p))
424 /* Not interesting, or
425 * already read */
426 free(p);
427 else {
428 unsigned long ul;
22be093f 429
d9c7a87b
LP
430 ul = fd_first_block(m->fd);
431
432 if ((k = hashmap_put(files, p, SECTOR_TO_PTR(ul))) < 0) {
433 log_warning("set_put() failed: %s", strerror(-k));
434 free(p);
22be093f 435 }
d9c7a87b 436 }
22be093f 437
d9c7a87b
LP
438 } else
439 log_warning("readlink(%s) failed: %s", fn, strerror(-k));
22be093f 440
d9c7a87b 441 next_iteration:
22be093f
LP
442 if (m->fd)
443 close_nointr_nofail(m->fd);
22be093f 444 }
22be093f
LP
445 }
446
6624768c 447done:
22be093f
LP
448 if (fanotify_fd >= 0) {
449 close_nointr_nofail(fanotify_fd);
450 fanotify_fd = -1;
451 }
452
453 log_debug("Writing Pack File...");
454
55888fa4 455 on_ssd = fs_on_ssd(root) > 0;
22be093f
LP
456 log_debug("On SSD: %s", yes_no(on_ssd));
457
5b61848d 458 on_btrfs = statfs(root, &sfs) >= 0 && (long) sfs.f_type == (long) BTRFS_SUPER_MAGIC;
746f8906
LP
459 log_debug("On btrfs: %s", yes_no(on_btrfs));
460
22be093f
LP
461 asprintf(&pack_fn, "%s/.readahead", root);
462 asprintf(&pack_fn_new, "%s/.readahead.new", root);
463
464 if (!pack_fn || !pack_fn_new) {
465 log_error("Out of memory");
466 r = -ENOMEM;
467 goto finish;
468 }
469
470 if (!(pack = fopen(pack_fn_new, "we"))) {
471 log_error("Failed to open pack file: %m");
472 r = -errno;
473 goto finish;
474 }
475
476 fputs(CANONICAL_HOST "\n", pack);
477 putc(on_ssd ? 'S' : 'R', pack);
478
746f8906 479 if (on_ssd || on_btrfs) {
22be093f 480
746f8906 481 /* On SSD or on btrfs, just write things out in the
41a598e2 482 * order the files were accessed. */
22be093f
LP
483
484 HASHMAP_FOREACH_KEY(q, p, files, i)
746f8906 485 pack_file(pack, p, on_btrfs);
22be093f
LP
486 } else {
487 struct item *ordered, *j;
488 unsigned k, n;
489
490 /* On rotating media, order things by the block
491 * numbers */
492
493 log_debug("Ordering...");
494
495 n = hashmap_size(files);
496 if (!(ordered = new(struct item, n))) {
497 log_error("Out of memory");
498 r = -ENOMEM;
499 goto finish;
500 }
501
502 j = ordered;
503 HASHMAP_FOREACH_KEY(q, p, files, i) {
504 j->path = p;
2e7485f0 505 j->block = PTR_TO_SECTOR(q);
22be093f
LP
506 j++;
507 }
508
509 assert(ordered + n == j);
510
511 qsort(ordered, n, sizeof(struct item), qsort_compare);
512
513 for (k = 0; k < n; k++)
746f8906 514 pack_file(pack, ordered[k].path, on_btrfs);
22be093f
LP
515
516 free(ordered);
517 }
518
519 log_debug("Finalizing...");
520
521 fflush(pack);
522
523 if (ferror(pack)) {
524 log_error("Failed to write pack file.");
525 r = -EIO;
526 goto finish;
527 }
528
529 if (rename(pack_fn_new, pack_fn) < 0) {
530 log_error("Failed to rename readahead file: %m");
531 r = -errno;
532 goto finish;
533 }
534
535 fclose(pack);
536 pack = NULL;
537
538 log_debug("Done.");
539
540finish:
541 if (fanotify_fd >= 0)
542 close_nointr_nofail(fanotify_fd);
543
544 if (signal_fd >= 0)
545 close_nointr_nofail(signal_fd);
546
6624768c
LP
547 if (inotify_fd >= 0)
548 close_nointr_nofail(inotify_fd);
549
22be093f
LP
550 if (pack) {
551 fclose(pack);
552 unlink(pack_fn_new);
553 }
554
555 free(pack_fn_new);
556 free(pack_fn);
557
558 while ((p = hashmap_steal_first_key(files)))
f0cf061e 559 free(p);
22be093f
LP
560
561 hashmap_free(files);
562
563 return r;
564}
565
8260358d
LP
566static int help(void) {
567
568 printf("%s [OPTIONS...] [DIRECTORY]\n\n"
569 "Collect read-ahead data on early boot.\n\n"
570 " -h --help Show this help\n"
571 " --max-files=INT Maximum number of files to read ahead\n"
572 " --max-file-size=BYTES Maximum size of files to read ahead\n"
573 " --timeout=USEC Maximum time to spend collecting data\n",
574 program_invocation_short_name);
575
576 return 0;
577}
578
579static int parse_argv(int argc, char *argv[]) {
580
581 enum {
582 ARG_FILES_MAX = 0x100,
583 ARG_FILE_SIZE_MAX,
584 ARG_TIMEOUT
585 };
586
587 static const struct option options[] = {
588 { "help", no_argument, NULL, 'h' },
589 { "files-max", required_argument, NULL, ARG_FILES_MAX },
590 { "file-size-max", required_argument, NULL, ARG_FILE_SIZE_MAX },
591 { "timeout", required_argument, NULL, ARG_TIMEOUT },
592 { NULL, 0, NULL, 0 }
593 };
594
595 int c;
596
597 assert(argc >= 0);
598 assert(argv);
599
600 while ((c = getopt_long(argc, argv, "h", options, NULL)) >= 0) {
601
602 switch (c) {
603
604 case 'h':
605 help();
606 return 0;
607
608 case ARG_FILES_MAX:
609 if (safe_atou(optarg, &arg_files_max) < 0 || arg_files_max <= 0) {
610 log_error("Failed to parse maximum number of files %s.", optarg);
611 return -EINVAL;
612 }
613 break;
614
615 case ARG_FILE_SIZE_MAX: {
616 unsigned long long ull;
617
618 if (safe_atollu(optarg, &ull) < 0 || ull <= 0) {
619 log_error("Failed to parse maximum file size %s.", optarg);
620 return -EINVAL;
621 }
622
623 arg_file_size_max = (off_t) ull;
624 break;
625 }
626
627 case ARG_TIMEOUT:
628 if (parse_usec(optarg, &arg_timeout) < 0 || arg_timeout <= 0) {
629 log_error("Failed to parse timeout %s.", optarg);
630 return -EINVAL;
631 }
632
633 break;
634
635 case '?':
636 return -EINVAL;
637
638 default:
639 log_error("Unknown option code %c", c);
640 return -EINVAL;
641 }
642 }
643
644 if (optind != argc &&
645 optind != argc-1) {
646 help();
647 return -EINVAL;
648 }
649
650 return 1;
651}
652
22be093f 653int main(int argc, char *argv[]) {
8260358d 654 int r;
2b590e13 655 const char *root;
4030d7a9 656
4cfa2c99 657 log_set_target(LOG_TARGET_AUTO);
22be093f
LP
658 log_parse_environment();
659 log_open();
660
4c12626c
LP
661 umask(0022);
662
8260358d
LP
663 if ((r = parse_argv(argc, argv)) <= 0)
664 return r < 0 ? EXIT_FAILURE : EXIT_SUCCESS;
665
2b590e13
LP
666 root = optind < argc ? argv[optind] : "/";
667
668 if (fs_on_read_only(root) > 0) {
669 log_info("Disabling readahead collector due to read-only media.");
670 return 0;
671 }
672
41a598e2
LP
673 if (!enough_ram()) {
674 log_info("Disabling readahead collector due to low memory.");
675 return 0;
676 }
677
07faed4f
LP
678 if (detect_virtualization(NULL) > 0) {
679 log_info("Disabling readahead collector due to execution in virtualized environment.");
46a08e38
LP
680 return 0;
681 }
682
d9c7a87b
LP
683 if (!(shared = shared_get()))
684 return 1;
685
686 shared->collect = getpid();
687 __sync_synchronize();
688
2b590e13 689 if (collect(root) < 0)
22be093f
LP
690 return 1;
691
692 return 0;
693}