]> git.ipfire.org Git - thirdparty/systemd.git/blame - src/readahead/readahead-collect.c
use memzero(foo, length); for all memset(foo, 0, length); calls
[thirdparty/systemd.git] / src / readahead / readahead-collect.c
CommitLineData
22be093f
LP
1/*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3/***
4 This file is part of systemd.
5
6 Copyright 2010 Lennart Poettering
7
8 systemd is free software; you can redistribute it and/or modify it
5430f7f2
LP
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
22be093f
LP
11 (at your option) any later version.
12
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
5430f7f2 16 Lesser General Public License for more details.
22be093f 17
5430f7f2 18 You should have received a copy of the GNU Lesser General Public License
22be093f
LP
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
20***/
21
22#include <errno.h>
23#include <inttypes.h>
24#include <fcntl.h>
25#include <linux/limits.h>
26#include <stdbool.h>
27#include <stdio.h>
28#include <stdlib.h>
29#include <string.h>
30#include <sys/select.h>
31#include <sys/time.h>
32#include <sys/types.h>
33#include <sys/stat.h>
34#include <unistd.h>
35#include <linux/fanotify.h>
36#include <sys/signalfd.h>
37#include <sys/poll.h>
38#include <sys/mman.h>
39#include <linux/fs.h>
40#include <linux/fiemap.h>
41#include <sys/ioctl.h>
746f8906 42#include <sys/vfs.h>
8260358d 43#include <getopt.h>
6624768c 44#include <sys/inotify.h>
94243ef2 45#include <math.h>
22be093f 46
4b357e15
MM
47#ifdef HAVE_LINUX_BTRFS_H
48#include <linux/btrfs.h>
49#endif
50
a8348796
LP
51#ifdef HAVE_FANOTIFY_INIT
52#include <sys/fanotify.h>
53#endif
54
81527be1
LP
55#include <systemd/sd-daemon.h>
56
22be093f
LP
57#include "missing.h"
58#include "util.h"
59#include "set.h"
22be093f
LP
60#include "ioprio.h"
61#include "readahead-common.h"
b52aae1d 62#include "virt.h"
22be093f 63
41a598e2
LP
/* fixme:
 *
 * - detect ssd on btrfs/lvm...
 * - read ahead directories
 * - gzip?
 * - remount rw?
 * - handle files where nothing is in mincore
 * - does ioprio_set work with fadvise()?
 */
73
d9c7a87b 74static ReadaheadShared *shared = NULL;
b0640287 75static usec_t starttime;
d9c7a87b 76
2e7485f0
LP
77/* Avoid collisions with the NULL pointer */
78#define SECTOR_TO_PTR(s) ULONG_TO_PTR((s)+1)
79#define PTR_TO_SECTOR(p) (PTR_TO_ULONG(p)-1)
80
/* Issues the BTRFS_IOC_DEFRAG ioctl on fd, passing fd itself in the
 * argument structure. Returns the result of ioctl() unmodified. */
static int btrfs_defrag(int fd) {
        struct btrfs_ioctl_vol_args args = {};

        args.fd = fd;

        return ioctl(fd, BTRFS_IOC_DEFRAG, &args);
}
86
87static int pack_file(FILE *pack, const char *fn, bool on_btrfs) {
22be093f
LP
88 struct stat st;
89 void *start = MAP_FAILED;
8260358d 90 uint8_t *vec;
22be093f 91 uint32_t b, c;
189455ab 92 uint64_t inode;
22be093f
LP
93 size_t l, pages;
94 bool mapped;
95 int r = 0, fd = -1, k;
96
97 assert(pack);
98 assert(fn);
99
189455ab
LP
100 fd = open(fn, O_RDONLY|O_CLOEXEC|O_NOATIME|O_NOCTTY|O_NOFOLLOW);
101 if (fd < 0) {
a78899f5
LP
102
103 if (errno == ENOENT)
104 return 0;
105
a76fad09
LP
106 if (errno == EPERM || errno == EACCES)
107 return 0;
108
22be093f
LP
109 log_warning("open(%s) failed: %m", fn);
110 r = -errno;
111 goto finish;
112 }
113
189455ab
LP
114 k = file_verify(fd, fn, arg_file_size_max, &st);
115 if (k <= 0) {
22be093f
LP
116 r = k;
117 goto finish;
118 }
119
746f8906
LP
120 if (on_btrfs)
121 btrfs_defrag(fd);
122
22be093f 123 l = PAGE_ALIGN(st.st_size);
189455ab
LP
124 start = mmap(NULL, l, PROT_READ, MAP_SHARED, fd, 0);
125 if (start == MAP_FAILED) {
22be093f
LP
126 log_warning("mmap(%s) failed: %m", fn);
127 r = -errno;
128 goto finish;
129 }
130
37f85e66 131 pages = l / page_size();
8260358d 132 vec = alloca(pages);
29804cc1 133 memzero(vec, pages);
22be093f
LP
134 if (mincore(start, l, vec) < 0) {
135 log_warning("mincore(%s) failed: %m", fn);
136 r = -errno;
137 goto finish;
138 }
139
140 fputs(fn, pack);
141 fputc('\n', pack);
142
189455ab
LP
143 /* Store the inode, so that we notice when the file is deleted */
144 inode = (uint64_t) st.st_ino;
145 fwrite(&inode, sizeof(inode), 1, pack);
146
22be093f
LP
147 mapped = false;
148 for (c = 0; c < pages; c++) {
408b85df 149 bool new_mapped = !!(vec[c] & 1);
22be093f
LP
150
151 if (!mapped && new_mapped)
152 b = c;
153 else if (mapped && !new_mapped) {
154 fwrite(&b, sizeof(b), 1, pack);
155 fwrite(&c, sizeof(c), 1, pack);
156
157 log_debug("%s: page %u to %u", fn, b, c);
158 }
159
160 mapped = new_mapped;
161 }
162
163 /* We don't write any range data if we should read the entire file */
164 if (mapped && b > 0) {
165 fwrite(&b, sizeof(b), 1, pack);
166 fwrite(&c, sizeof(c), 1, pack);
167
168 log_debug("%s: page %u to %u", fn, b, c);
169 }
170
171 /* End marker */
172 b = 0;
173 fwrite(&b, sizeof(b), 1, pack);
174 fwrite(&b, sizeof(b), 1, pack);
175
176finish:
177 if (start != MAP_FAILED)
178 munmap(start, l);
179
180 if (fd >= 0)
181 close_nointr_nofail(fd);
182
183 return r;
184}
185
/* Queries the first extent of the file behind fd via FS_IOC_FIEMAP
 * and returns its fe_physical value, converted to unsigned long.
 *
 * Returns 0 if the ioctl fails, if no extent is reported as mapped,
 * or if the extent carries FIEMAP_EXTENT_UNKNOWN. */
static unsigned long fd_first_block(int fd) {
        const struct fiemap_extent *first;
        struct {
                struct fiemap fiemap;
                /* Provides the storage for fiemap.fm_extents[0] */
                struct fiemap_extent extent;
        } request = {
                .fiemap.fm_length = ~0ULL,
                .fiemap.fm_extent_count = 1,
        };

        if (ioctl(fd, FS_IOC_FIEMAP, &request) < 0 ||
            request.fiemap.fm_mapped_extents <= 0)
                return 0;

        first = &request.fiemap.fm_extents[0];
        if (first->fe_flags & FIEMAP_EXTENT_UNKNOWN)
                return 0;

        return (unsigned long) first->fe_physical;
}
206
/* Record kept for every file noticed by collect(). qsort_compare()
 * orders these records by bin, then by block, then by path. */
struct item {
        /* Path of the file; this is what gets handed to pack_file() */
        const char *path;

        /* Result of fd_first_block() for the file */
        unsigned long block;

        /* (time the file was first seen - starttime) / 2000000 */
        unsigned long bin;
};
212
213static int qsort_compare(const void *a, const void *b) {
214 const struct item *i, *j;
215
216 i = a;
217 j = b;
218
94243ef2
AK
219 /* sort by bin first */
220 if (i->bin < j->bin)
221 return -1;
222 if (i->bin > j->bin)
223 return 1;
224
225 /* then sort by sector */
22be093f
LP
226 if (i->block < j->block)
227 return -1;
228 if (i->block > j->block)
229 return 1;
230
231 return strcmp(i->path, j->path);
232}
233
234static int collect(const char *root) {
235 enum {
858209c5 236 FD_FANOTIFY, /* Get the actual fs events */
22be093f 237 FD_SIGNAL,
6624768c 238 FD_INOTIFY, /* We get notifications to quit early via this fd */
22be093f
LP
239 _FD_MAX
240 };
b92bea5d 241 struct pollfd pollfd[_FD_MAX] = {};
6624768c 242 int fanotify_fd = -1, signal_fd = -1, inotify_fd = -1, r = 0;
22be093f
LP
243 pid_t my_pid;
244 Hashmap *files = NULL;
245 Iterator i;
246 char *p, *q;
247 sigset_t mask;
248 FILE *pack = NULL;
249 char *pack_fn_new = NULL, *pack_fn = NULL;
746f8906
LP
250 bool on_ssd, on_btrfs;
251 struct statfs sfs;
408b85df 252 usec_t not_after;
6de338a2
LP
253 uint64_t previous_block_readahead;
254 bool previous_block_readahead_set = false;
22be093f
LP
255
256 assert(root);
257
6de338a2 258 if (asprintf(&pack_fn, "%s/.readahead", root) < 0) {
0d0f0c50 259 r = log_oom();
6de338a2
LP
260 goto finish;
261 }
262
b0640287 263 starttime = now(CLOCK_MONOTONIC);
94243ef2 264
6de338a2
LP
265 /* If there's no pack file yet we lower the kernel readahead
266 * so that mincore() is accurate. If there is a pack file
267 * already we assume it is accurate enough so that kernel
268 * readahead is never triggered. */
269 previous_block_readahead_set =
270 access(pack_fn, F_OK) < 0 &&
271 block_get_readahead(root, &previous_block_readahead) >= 0 &&
272 block_set_readahead(root, 8*1024) >= 0;
273
22be093f
LP
274 if (ioprio_set(IOPRIO_WHO_PROCESS, getpid(), IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0)) < 0)
275 log_warning("Failed to set IDLE IO priority class: %m");
276
277 assert_se(sigemptyset(&mask) == 0);
278 sigset_add_many(&mask, SIGINT, SIGTERM, -1);
279 assert_se(sigprocmask(SIG_SETMASK, &mask, NULL) == 0);
280
281 if ((signal_fd = signalfd(-1, &mask, SFD_NONBLOCK|SFD_CLOEXEC)) < 0) {
282 log_error("signalfd(): %m");
283 r = -errno;
284 goto finish;
285 }
286
e62d8c39
ZJS
287 files = hashmap_new(string_hash_func, string_compare_func);
288 if (!files) {
22be093f
LP
289 log_error("Failed to allocate set.");
290 r = -ENOMEM;
291 goto finish;
292 }
293
7989e1f2 294 fanotify_fd = fanotify_init(FAN_CLOEXEC|FAN_NONBLOCK, O_RDONLY|O_LARGEFILE|O_CLOEXEC|O_NOATIME);
e62d8c39 295 if (fanotify_fd < 0) {
22be093f
LP
296 log_error("Failed to create fanotify object: %m");
297 r = -errno;
298 goto finish;
299 }
300
301 if (fanotify_mark(fanotify_fd, FAN_MARK_ADD|FAN_MARK_MOUNT, FAN_OPEN, AT_FDCWD, root) < 0) {
302 log_error("Failed to mark %s: %m", root);
303 r = -errno;
304 goto finish;
305 }
306
e62d8c39
ZJS
307 inotify_fd = open_inotify();
308 if (inotify_fd < 0) {
6624768c
LP
309 r = inotify_fd;
310 goto finish;
311 }
312
8260358d 313 not_after = now(CLOCK_MONOTONIC) + arg_timeout;
408b85df 314
22be093f
LP
315 my_pid = getpid();
316
22be093f
LP
317 pollfd[FD_FANOTIFY].fd = fanotify_fd;
318 pollfd[FD_FANOTIFY].events = POLLIN;
319 pollfd[FD_SIGNAL].fd = signal_fd;
320 pollfd[FD_SIGNAL].events = POLLIN;
6624768c
LP
321 pollfd[FD_INOTIFY].fd = inotify_fd;
322 pollfd[FD_INOTIFY].events = POLLIN;
22be093f
LP
323
324 sd_notify(0,
325 "READY=1\n"
326 "STATUS=Collecting readahead data");
327
328 log_debug("Collecting...");
329
2b583ce6 330 if (access("/run/systemd/readahead/cancel", F_OK) >= 0) {
6624768c
LP
331 log_debug("Collection canceled");
332 r = -ECANCELED;
333 goto finish;
334 }
335
2b583ce6 336 if (access("/run/systemd/readahead/done", F_OK) >= 0) {
6624768c
LP
337 log_debug("Got termination request");
338 goto done;
339 }
340
22be093f
LP
341 for (;;) {
342 union {
343 struct fanotify_event_metadata metadata;
344 char buffer[4096];
345 } data;
346 ssize_t n;
347 struct fanotify_event_metadata *m;
408b85df
LP
348 usec_t t;
349 int h;
22be093f 350
8260358d 351 if (hashmap_size(files) > arg_files_max) {
408b85df 352 log_debug("Reached maximum number of read ahead files, ending collection.");
6e3eb5ba 353 break;
408b85df
LP
354 }
355
356 t = now(CLOCK_MONOTONIC);
357 if (t >= not_after) {
358 log_debug("Reached maximum collection time, ending collection.");
359 break;
360 }
6e3eb5ba 361
408b85df 362 if ((h = poll(pollfd, _FD_MAX, (int) ((not_after - t) / USEC_PER_MSEC))) < 0) {
22be093f
LP
363
364 if (errno == EINTR)
365 continue;
366
367 log_error("poll(): %m");
368 r = -errno;
369 goto finish;
370 }
371
408b85df
LP
372 if (h == 0) {
373 log_debug("Reached maximum collection time, ending collection.");
374 break;
375 }
376
6624768c
LP
377 if (pollfd[FD_SIGNAL].revents) {
378 log_debug("Got signal.");
379 break;
380 }
381
382 if (pollfd[FD_INOTIFY].revents) {
383 uint8_t inotify_buffer[sizeof(struct inotify_event) + FILENAME_MAX];
384 struct inotify_event *e;
385
386 if ((n = read(inotify_fd, &inotify_buffer, sizeof(inotify_buffer))) < 0) {
387 if (errno == EINTR || errno == EAGAIN)
388 continue;
389
390 log_error("Failed to read inotify event: %m");
391 r = -errno;
392 goto finish;
393 }
394
395 e = (struct inotify_event*) inotify_buffer;
396 while (n > 0) {
397 size_t step;
398
399 if ((e->mask & IN_CREATE) && streq(e->name, "cancel")) {
400 log_debug("Collection canceled");
401 r = -ECANCELED;
402 goto finish;
403 }
404
405 if ((e->mask & IN_CREATE) && streq(e->name, "done")) {
406 log_debug("Got termination request");
407 goto done;
408 }
409
410 step = sizeof(struct inotify_event) + e->len;
411 assert(step <= (size_t) n);
412
413 e = (struct inotify_event*) ((uint8_t*) e + step);
414 n -= step;
415 }
416 }
417
b47d419c
ZJS
418 n = read(fanotify_fd, &data, sizeof(data));
419 if (n < 0) {
22be093f 420
cf37e246
LP
421 if (errno == EINTR || errno == EAGAIN)
422 continue;
423
424 /* fanotify sometimes returns EACCES on read()
425 * where it shouldn't. For now let's just
426 * ignore it here (which is safe), but
427 * eventually this should be
428 * dropped when the kernel is fixed.
429 *
430 * https://bugzilla.redhat.com/show_bug.cgi?id=707577 */
431 if (errno == EACCES)
22be093f
LP
432 continue;
433
434 log_error("Failed to read event: %m");
435 r = -errno;
436 goto finish;
437 }
438
408b85df 439 for (m = &data.metadata; FAN_EVENT_OK(m, n); m = FAN_EVENT_NEXT(m, n)) {
b47d419c 440 char fn[sizeof("/proc/self/fd/") + DECIMAL_STR_MAX(int)];
d9c7a87b 441 int k;
22be093f 442
d9c7a87b
LP
443 if (m->fd < 0)
444 goto next_iteration;
22be093f 445
d9c7a87b
LP
446 if (m->pid == my_pid)
447 goto next_iteration;
22be093f 448
d9c7a87b
LP
449 __sync_synchronize();
450 if (m->pid == shared->replay)
451 goto next_iteration;
22be093f 452
d9c7a87b 453 snprintf(fn, sizeof(fn), "/proc/self/fd/%i", m->fd);
b47d419c
ZJS
454 k = readlink_malloc(fn, &p);
455 if (k >= 0) {
d9c7a87b 456 if (startswith(p, "/tmp") ||
0840ce2d 457 endswith(p, " (deleted)") ||
d9c7a87b
LP
458 hashmap_get(files, p))
459 /* Not interesting, or
460 * already read */
461 free(p);
462 else {
463 unsigned long ul;
b0640287 464 usec_t entrytime;
94243ef2
AK
465 struct item *entry;
466
467 entry = new0(struct item, 1);
b0640287
AK
468 if (!entry) {
469 r = log_oom();
470 goto finish;
471 }
22be093f 472
d9c7a87b
LP
473 ul = fd_first_block(m->fd);
474
b0640287 475 entrytime = now(CLOCK_MONOTONIC);
94243ef2
AK
476
477 entry->block = ul;
478 entry->path = strdup(p);
b0640287
AK
479 if (!entry->path) {
480 free(entry);
481 r = log_oom();
482 goto finish;
483 }
484 entry->bin = (entrytime - starttime) / 2000000;
94243ef2 485
ef42202a
ZJS
486 k = hashmap_put(files, p, entry);
487 if (k < 0) {
488 log_warning("hashmap_put() failed: %s", strerror(-k));
d9c7a87b 489 free(p);
22be093f 490 }
d9c7a87b 491 }
22be093f 492
d9c7a87b
LP
493 } else
494 log_warning("readlink(%s) failed: %s", fn, strerror(-k));
22be093f 495
d9c7a87b 496 next_iteration:
c4b996bd 497 if (m->fd >= 0)
22be093f 498 close_nointr_nofail(m->fd);
22be093f 499 }
22be093f
LP
500 }
501
6624768c 502done:
22be093f
LP
503 if (fanotify_fd >= 0) {
504 close_nointr_nofail(fanotify_fd);
505 fanotify_fd = -1;
506 }
507
508 log_debug("Writing Pack File...");
509
55888fa4 510 on_ssd = fs_on_ssd(root) > 0;
22be093f
LP
511 log_debug("On SSD: %s", yes_no(on_ssd));
512
c51cf056 513 on_btrfs = statfs(root, &sfs) >= 0 && F_TYPE_EQUAL(sfs.f_type, BTRFS_SUPER_MAGIC);
746f8906
LP
514 log_debug("On btrfs: %s", yes_no(on_btrfs));
515
6de338a2 516 if (asprintf(&pack_fn_new, "%s/.readahead.new", root) < 0) {
0d0f0c50 517 r = log_oom();
22be093f
LP
518 goto finish;
519 }
520
189455ab
LP
521 pack = fopen(pack_fn_new, "we");
522 if (!pack) {
22be093f
LP
523 log_error("Failed to open pack file: %m");
524 r = -errno;
525 goto finish;
526 }
527
cae544bc 528 fputs(CANONICAL_HOST READAHEAD_PACK_FILE_VERSION, pack);
22be093f
LP
529 putc(on_ssd ? 'S' : 'R', pack);
530
746f8906 531 if (on_ssd || on_btrfs) {
22be093f 532
746f8906 533 /* On SSD or on btrfs, just write things out in the
41a598e2 534 * order the files were accessed. */
22be093f
LP
535
536 HASHMAP_FOREACH_KEY(q, p, files, i)
746f8906 537 pack_file(pack, p, on_btrfs);
22be093f 538 } else {
7ff7394d 539 unsigned n;
22be093f
LP
540
541 /* On rotating media, order things by the block
542 * numbers */
543
544 log_debug("Ordering...");
545
546 n = hashmap_size(files);
7ff7394d
ZJS
547 if (n) {
548 _cleanup_free_ struct item *ordered;
549 struct item *j;
550 unsigned k;
551
552 ordered = new(struct item, n);
553 if (!ordered) {
554 r = log_oom();
555 goto finish;
556 }
22be093f 557
7ff7394d
ZJS
558 j = ordered;
559 HASHMAP_FOREACH_KEY(q, p, files, i) {
560 memcpy(j, q, sizeof(struct item));
561 j++;
562 }
22be093f 563
7ff7394d 564 assert(ordered + n == j);
22be093f 565
7ff7394d 566 qsort(ordered, n, sizeof(struct item), qsort_compare);
22be093f 567
7ff7394d
ZJS
568 for (k = 0; k < n; k++)
569 pack_file(pack, ordered[k].path, on_btrfs);
570 } else
571 log_warning("No pack files");
22be093f
LP
572 }
573
574 log_debug("Finalizing...");
575
576 fflush(pack);
577
578 if (ferror(pack)) {
579 log_error("Failed to write pack file.");
580 r = -EIO;
581 goto finish;
582 }
583
584 if (rename(pack_fn_new, pack_fn) < 0) {
585 log_error("Failed to rename readahead file: %m");
586 r = -errno;
587 goto finish;
588 }
589
590 fclose(pack);
591 pack = NULL;
592
593 log_debug("Done.");
594
595finish:
596 if (fanotify_fd >= 0)
597 close_nointr_nofail(fanotify_fd);
598
599 if (signal_fd >= 0)
600 close_nointr_nofail(signal_fd);
601
6624768c
LP
602 if (inotify_fd >= 0)
603 close_nointr_nofail(inotify_fd);
604
22be093f
LP
605 if (pack) {
606 fclose(pack);
607 unlink(pack_fn_new);
608 }
22be093f
LP
609 free(pack_fn_new);
610 free(pack_fn);
611
612 while ((p = hashmap_steal_first_key(files)))
f0cf061e 613 free(p);
22be093f
LP
614
615 hashmap_free(files);
616
6de338a2
LP
617 if (previous_block_readahead_set) {
618 uint64_t bytes;
619
620 /* Restore the original kernel readahead setting if we
621 * changed it, and nobody has overwritten it since
622 * yet. */
623 if (block_get_readahead(root, &bytes) >= 0 && bytes == 8*1024)
624 block_set_readahead(root, previous_block_readahead);
625 }
626
22be093f
LP
627 return r;
628}
629
87ce22cc 630int main_collect(const char *root) {
8260358d 631
87ce22cc
LP
632 if (!root)
633 root = "/";
2b590e13 634
3b2d5b02
LP
635 /* Skip this step on read-only media. Note that we check the
636 * underlying block device here, not he read-only flag of the
637 * file system on top, since that one is most likely mounted
638 * read-only anyway at boot, even if the underlying block
639 * device is theoretically writable. */
2b590e13
LP
640 if (fs_on_read_only(root) > 0) {
641 log_info("Disabling readahead collector due to read-only media.");
87ce22cc 642 return EXIT_SUCCESS;
2b590e13
LP
643 }
644
41a598e2
LP
645 if (!enough_ram()) {
646 log_info("Disabling readahead collector due to low memory.");
87ce22cc 647 return EXIT_SUCCESS;
41a598e2
LP
648 }
649
3b2d5b02
LP
650 shared = shared_get();
651 if (!shared)
87ce22cc 652 return EXIT_FAILURE;
d9c7a87b
LP
653
654 shared->collect = getpid();
655 __sync_synchronize();
656
2b590e13 657 if (collect(root) < 0)
87ce22cc 658 return EXIT_FAILURE;
22be093f 659
87ce22cc 660 return EXIT_SUCCESS;
22be093f 661}