]> git.ipfire.org Git - thirdparty/systemd.git/blame - src/readahead/readahead-collect.c
Use initalization instead of explicit zeroing
[thirdparty/systemd.git] / src / readahead / readahead-collect.c
CommitLineData
22be093f
LP
1/*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3/***
4 This file is part of systemd.
5
6 Copyright 2010 Lennart Poettering
7
8 systemd is free software; you can redistribute it and/or modify it
5430f7f2
LP
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
22be093f
LP
11 (at your option) any later version.
12
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
5430f7f2 16 Lesser General Public License for more details.
22be093f 17
5430f7f2 18 You should have received a copy of the GNU Lesser General Public License
22be093f
LP
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
20***/
21
22#include <errno.h>
23#include <inttypes.h>
24#include <fcntl.h>
25#include <linux/limits.h>
26#include <stdbool.h>
27#include <stdio.h>
28#include <stdlib.h>
29#include <string.h>
30#include <sys/select.h>
31#include <sys/time.h>
32#include <sys/types.h>
33#include <sys/stat.h>
34#include <unistd.h>
35#include <linux/fanotify.h>
36#include <sys/signalfd.h>
37#include <sys/poll.h>
38#include <sys/mman.h>
39#include <linux/fs.h>
40#include <linux/fiemap.h>
41#include <sys/ioctl.h>
746f8906 42#include <sys/vfs.h>
8260358d 43#include <getopt.h>
6624768c 44#include <sys/inotify.h>
94243ef2 45#include <math.h>
22be093f 46
a8348796
LP
47#ifdef HAVE_FANOTIFY_INIT
48#include <sys/fanotify.h>
49#endif
50
81527be1
LP
51#include <systemd/sd-daemon.h>
52
22be093f
LP
53#include "missing.h"
54#include "util.h"
55#include "set.h"
22be093f
LP
56#include "ioprio.h"
57#include "readahead-common.h"
b52aae1d 58#include "virt.h"
22be093f 59
41a598e2
LP
60/* fixme:
61 *
408b85df 62 * - detect ssd on btrfs/lvm...
41a598e2 63 * - read ahead directories
408b85df 64 * - gzip?
8260358d 65 * - remount rw?
6624768c 66 * - handle files where nothing is in mincore
408b85df 67 * - does ioprio_set work with fadvise()?
41a598e2
LP
68 */
69
d9c7a87b 70static ReadaheadShared *shared = NULL;
b0640287 71static usec_t starttime;
d9c7a87b 72
2e7485f0
LP
73/* Avoid collisions with the NULL pointer */
74#define SECTOR_TO_PTR(s) ULONG_TO_PTR((s)+1)
75#define PTR_TO_SECTOR(p) (PTR_TO_ULONG(p)-1)
76
746f8906 77static int btrfs_defrag(int fd) {
b92bea5d 78 struct btrfs_ioctl_vol_args data = { .fd = fd };
22be093f 79
746f8906
LP
80 return ioctl(fd, BTRFS_IOC_DEFRAG, &data);
81}
82
83static int pack_file(FILE *pack, const char *fn, bool on_btrfs) {
22be093f
LP
84 struct stat st;
85 void *start = MAP_FAILED;
8260358d 86 uint8_t *vec;
22be093f 87 uint32_t b, c;
189455ab 88 uint64_t inode;
22be093f
LP
89 size_t l, pages;
90 bool mapped;
91 int r = 0, fd = -1, k;
92
93 assert(pack);
94 assert(fn);
95
189455ab
LP
96 fd = open(fn, O_RDONLY|O_CLOEXEC|O_NOATIME|O_NOCTTY|O_NOFOLLOW);
97 if (fd < 0) {
a78899f5
LP
98
99 if (errno == ENOENT)
100 return 0;
101
a76fad09
LP
102 if (errno == EPERM || errno == EACCES)
103 return 0;
104
22be093f
LP
105 log_warning("open(%s) failed: %m", fn);
106 r = -errno;
107 goto finish;
108 }
109
189455ab
LP
110 k = file_verify(fd, fn, arg_file_size_max, &st);
111 if (k <= 0) {
22be093f
LP
112 r = k;
113 goto finish;
114 }
115
746f8906
LP
116 if (on_btrfs)
117 btrfs_defrag(fd);
118
22be093f 119 l = PAGE_ALIGN(st.st_size);
189455ab
LP
120 start = mmap(NULL, l, PROT_READ, MAP_SHARED, fd, 0);
121 if (start == MAP_FAILED) {
22be093f
LP
122 log_warning("mmap(%s) failed: %m", fn);
123 r = -errno;
124 goto finish;
125 }
126
37f85e66 127 pages = l / page_size();
8260358d 128 vec = alloca(pages);
37f85e66 129 memset(vec, 0, pages);
22be093f
LP
130 if (mincore(start, l, vec) < 0) {
131 log_warning("mincore(%s) failed: %m", fn);
132 r = -errno;
133 goto finish;
134 }
135
136 fputs(fn, pack);
137 fputc('\n', pack);
138
189455ab
LP
139 /* Store the inode, so that we notice when the file is deleted */
140 inode = (uint64_t) st.st_ino;
141 fwrite(&inode, sizeof(inode), 1, pack);
142
22be093f
LP
143 mapped = false;
144 for (c = 0; c < pages; c++) {
408b85df 145 bool new_mapped = !!(vec[c] & 1);
22be093f
LP
146
147 if (!mapped && new_mapped)
148 b = c;
149 else if (mapped && !new_mapped) {
150 fwrite(&b, sizeof(b), 1, pack);
151 fwrite(&c, sizeof(c), 1, pack);
152
153 log_debug("%s: page %u to %u", fn, b, c);
154 }
155
156 mapped = new_mapped;
157 }
158
159 /* We don't write any range data if we should read the entire file */
160 if (mapped && b > 0) {
161 fwrite(&b, sizeof(b), 1, pack);
162 fwrite(&c, sizeof(c), 1, pack);
163
164 log_debug("%s: page %u to %u", fn, b, c);
165 }
166
167 /* End marker */
168 b = 0;
169 fwrite(&b, sizeof(b), 1, pack);
170 fwrite(&b, sizeof(b), 1, pack);
171
172finish:
173 if (start != MAP_FAILED)
174 munmap(start, l);
175
176 if (fd >= 0)
177 close_nointr_nofail(fd);
178
179 return r;
180}
181
/* Query the first extent of the file behind fd with FS_IOC_FIEMAP and return
 * its fe_physical value. Returns 0 if the ioctl fails, if no extent was
 * mapped, or if the extent is flagged FIEMAP_EXTENT_UNKNOWN. */
static unsigned long fd_first_block(int fd) {
        /* struct fiemap ends in its extent array; the extra member provides
         * the storage for the single extent we request. */
        struct {
                struct fiemap fiemap;
                struct fiemap_extent extent;
        } req = {
                .fiemap.fm_length = ~0ULL,
                .fiemap.fm_extent_count = 1,
        };
        const struct fiemap_extent *first;

        if (ioctl(fd, FS_IOC_FIEMAP, &req) < 0 ||
            req.fiemap.fm_mapped_extents <= 0)
                return 0;

        first = &req.fiemap.fm_extents[0];
        if (first->fe_flags & FIEMAP_EXTENT_UNKNOWN)
                return 0;

        return (unsigned long) first->fe_physical;
}
202
/* One file seen during collection. */
struct item {
        const char *path;       /* path of the file */
        unsigned long block;    /* value returned by fd_first_block() */
        unsigned long bin;      /* time bin in which the file was first seen */
};

/* qsort() comparator for struct item: orders by bin, then by block, and
 * finally by path so that the order is total. */
static int qsort_compare(const void *a, const void *b) {
        const struct item *x = a, *y = b;

        /* sort by bin first */
        if (x->bin != y->bin)
                return x->bin < y->bin ? -1 : 1;

        /* then sort by sector */
        if (x->block != y->block)
                return x->block < y->block ? -1 : 1;

        return strcmp(x->path, y->path);
}
229
230static int collect(const char *root) {
231 enum {
858209c5 232 FD_FANOTIFY, /* Get the actual fs events */
22be093f 233 FD_SIGNAL,
6624768c 234 FD_INOTIFY, /* We get notifications to quit early via this fd */
22be093f
LP
235 _FD_MAX
236 };
b92bea5d 237 struct pollfd pollfd[_FD_MAX] = {};
6624768c 238 int fanotify_fd = -1, signal_fd = -1, inotify_fd = -1, r = 0;
22be093f
LP
239 pid_t my_pid;
240 Hashmap *files = NULL;
241 Iterator i;
242 char *p, *q;
243 sigset_t mask;
244 FILE *pack = NULL;
245 char *pack_fn_new = NULL, *pack_fn = NULL;
746f8906
LP
246 bool on_ssd, on_btrfs;
247 struct statfs sfs;
408b85df 248 usec_t not_after;
6de338a2
LP
249 uint64_t previous_block_readahead;
250 bool previous_block_readahead_set = false;
22be093f
LP
251
252 assert(root);
253
6de338a2 254 if (asprintf(&pack_fn, "%s/.readahead", root) < 0) {
0d0f0c50 255 r = log_oom();
6de338a2
LP
256 goto finish;
257 }
258
b0640287 259 starttime = now(CLOCK_MONOTONIC);
94243ef2 260
6de338a2
LP
261 /* If there's no pack file yet we lower the kernel readahead
262 * so that mincore() is accurate. If there is a pack file
263 * already we assume it is accurate enough so that kernel
264 * readahead is never triggered. */
265 previous_block_readahead_set =
266 access(pack_fn, F_OK) < 0 &&
267 block_get_readahead(root, &previous_block_readahead) >= 0 &&
268 block_set_readahead(root, 8*1024) >= 0;
269
22be093f
LP
270 if (ioprio_set(IOPRIO_WHO_PROCESS, getpid(), IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0)) < 0)
271 log_warning("Failed to set IDLE IO priority class: %m");
272
273 assert_se(sigemptyset(&mask) == 0);
274 sigset_add_many(&mask, SIGINT, SIGTERM, -1);
275 assert_se(sigprocmask(SIG_SETMASK, &mask, NULL) == 0);
276
277 if ((signal_fd = signalfd(-1, &mask, SFD_NONBLOCK|SFD_CLOEXEC)) < 0) {
278 log_error("signalfd(): %m");
279 r = -errno;
280 goto finish;
281 }
282
e62d8c39
ZJS
283 files = hashmap_new(string_hash_func, string_compare_func);
284 if (!files) {
22be093f
LP
285 log_error("Failed to allocate set.");
286 r = -ENOMEM;
287 goto finish;
288 }
289
7989e1f2 290 fanotify_fd = fanotify_init(FAN_CLOEXEC|FAN_NONBLOCK, O_RDONLY|O_LARGEFILE|O_CLOEXEC|O_NOATIME);
e62d8c39 291 if (fanotify_fd < 0) {
22be093f
LP
292 log_error("Failed to create fanotify object: %m");
293 r = -errno;
294 goto finish;
295 }
296
297 if (fanotify_mark(fanotify_fd, FAN_MARK_ADD|FAN_MARK_MOUNT, FAN_OPEN, AT_FDCWD, root) < 0) {
298 log_error("Failed to mark %s: %m", root);
299 r = -errno;
300 goto finish;
301 }
302
e62d8c39
ZJS
303 inotify_fd = open_inotify();
304 if (inotify_fd < 0) {
6624768c
LP
305 r = inotify_fd;
306 goto finish;
307 }
308
8260358d 309 not_after = now(CLOCK_MONOTONIC) + arg_timeout;
408b85df 310
22be093f
LP
311 my_pid = getpid();
312
22be093f
LP
313 pollfd[FD_FANOTIFY].fd = fanotify_fd;
314 pollfd[FD_FANOTIFY].events = POLLIN;
315 pollfd[FD_SIGNAL].fd = signal_fd;
316 pollfd[FD_SIGNAL].events = POLLIN;
6624768c
LP
317 pollfd[FD_INOTIFY].fd = inotify_fd;
318 pollfd[FD_INOTIFY].events = POLLIN;
22be093f
LP
319
320 sd_notify(0,
321 "READY=1\n"
322 "STATUS=Collecting readahead data");
323
324 log_debug("Collecting...");
325
2b583ce6 326 if (access("/run/systemd/readahead/cancel", F_OK) >= 0) {
6624768c
LP
327 log_debug("Collection canceled");
328 r = -ECANCELED;
329 goto finish;
330 }
331
2b583ce6 332 if (access("/run/systemd/readahead/done", F_OK) >= 0) {
6624768c
LP
333 log_debug("Got termination request");
334 goto done;
335 }
336
22be093f
LP
337 for (;;) {
338 union {
339 struct fanotify_event_metadata metadata;
340 char buffer[4096];
341 } data;
342 ssize_t n;
343 struct fanotify_event_metadata *m;
408b85df
LP
344 usec_t t;
345 int h;
22be093f 346
8260358d 347 if (hashmap_size(files) > arg_files_max) {
408b85df 348 log_debug("Reached maximum number of read ahead files, ending collection.");
6e3eb5ba 349 break;
408b85df
LP
350 }
351
352 t = now(CLOCK_MONOTONIC);
353 if (t >= not_after) {
354 log_debug("Reached maximum collection time, ending collection.");
355 break;
356 }
6e3eb5ba 357
408b85df 358 if ((h = poll(pollfd, _FD_MAX, (int) ((not_after - t) / USEC_PER_MSEC))) < 0) {
22be093f
LP
359
360 if (errno == EINTR)
361 continue;
362
363 log_error("poll(): %m");
364 r = -errno;
365 goto finish;
366 }
367
408b85df
LP
368 if (h == 0) {
369 log_debug("Reached maximum collection time, ending collection.");
370 break;
371 }
372
6624768c
LP
373 if (pollfd[FD_SIGNAL].revents) {
374 log_debug("Got signal.");
375 break;
376 }
377
378 if (pollfd[FD_INOTIFY].revents) {
379 uint8_t inotify_buffer[sizeof(struct inotify_event) + FILENAME_MAX];
380 struct inotify_event *e;
381
382 if ((n = read(inotify_fd, &inotify_buffer, sizeof(inotify_buffer))) < 0) {
383 if (errno == EINTR || errno == EAGAIN)
384 continue;
385
386 log_error("Failed to read inotify event: %m");
387 r = -errno;
388 goto finish;
389 }
390
391 e = (struct inotify_event*) inotify_buffer;
392 while (n > 0) {
393 size_t step;
394
395 if ((e->mask & IN_CREATE) && streq(e->name, "cancel")) {
396 log_debug("Collection canceled");
397 r = -ECANCELED;
398 goto finish;
399 }
400
401 if ((e->mask & IN_CREATE) && streq(e->name, "done")) {
402 log_debug("Got termination request");
403 goto done;
404 }
405
406 step = sizeof(struct inotify_event) + e->len;
407 assert(step <= (size_t) n);
408
409 e = (struct inotify_event*) ((uint8_t*) e + step);
410 n -= step;
411 }
412 }
413
22be093f
LP
414 if ((n = read(fanotify_fd, &data, sizeof(data))) < 0) {
415
cf37e246
LP
416 if (errno == EINTR || errno == EAGAIN)
417 continue;
418
419 /* fanotify sometimes returns EACCES on read()
420 * where it shouldn't. For now let's just
421 * ignore it here (which is safe), but
422 * eventually this should be
423 * dropped when the kernel is fixed.
424 *
425 * https://bugzilla.redhat.com/show_bug.cgi?id=707577 */
426 if (errno == EACCES)
22be093f
LP
427 continue;
428
429 log_error("Failed to read event: %m");
430 r = -errno;
431 goto finish;
432 }
433
408b85df 434 for (m = &data.metadata; FAN_EVENT_OK(m, n); m = FAN_EVENT_NEXT(m, n)) {
d9c7a87b
LP
435 char fn[PATH_MAX];
436 int k;
22be093f 437
d9c7a87b
LP
438 if (m->fd < 0)
439 goto next_iteration;
22be093f 440
d9c7a87b
LP
441 if (m->pid == my_pid)
442 goto next_iteration;
22be093f 443
d9c7a87b
LP
444 __sync_synchronize();
445 if (m->pid == shared->replay)
446 goto next_iteration;
22be093f 447
d9c7a87b
LP
448 snprintf(fn, sizeof(fn), "/proc/self/fd/%i", m->fd);
449 char_array_0(fn);
450
451 if ((k = readlink_malloc(fn, &p)) >= 0) {
d9c7a87b 452 if (startswith(p, "/tmp") ||
0840ce2d 453 endswith(p, " (deleted)") ||
d9c7a87b
LP
454 hashmap_get(files, p))
455 /* Not interesting, or
456 * already read */
457 free(p);
458 else {
459 unsigned long ul;
b0640287 460 usec_t entrytime;
94243ef2
AK
461 struct item *entry;
462
463 entry = new0(struct item, 1);
b0640287
AK
464 if (!entry) {
465 r = log_oom();
466 goto finish;
467 }
22be093f 468
d9c7a87b
LP
469 ul = fd_first_block(m->fd);
470
b0640287 471 entrytime = now(CLOCK_MONOTONIC);
94243ef2
AK
472
473 entry->block = ul;
474 entry->path = strdup(p);
b0640287
AK
475 if (!entry->path) {
476 free(entry);
477 r = log_oom();
478 goto finish;
479 }
480 entry->bin = (entrytime - starttime) / 2000000;
94243ef2
AK
481
482 if ((k = hashmap_put(files, p, entry)) < 0) {
d9c7a87b
LP
483 log_warning("set_put() failed: %s", strerror(-k));
484 free(p);
22be093f 485 }
d9c7a87b 486 }
22be093f 487
d9c7a87b
LP
488 } else
489 log_warning("readlink(%s) failed: %s", fn, strerror(-k));
22be093f 490
d9c7a87b 491 next_iteration:
c4b996bd 492 if (m->fd >= 0)
22be093f 493 close_nointr_nofail(m->fd);
22be093f 494 }
22be093f
LP
495 }
496
6624768c 497done:
22be093f
LP
498 if (fanotify_fd >= 0) {
499 close_nointr_nofail(fanotify_fd);
500 fanotify_fd = -1;
501 }
502
503 log_debug("Writing Pack File...");
504
55888fa4 505 on_ssd = fs_on_ssd(root) > 0;
22be093f
LP
506 log_debug("On SSD: %s", yes_no(on_ssd));
507
5b61848d 508 on_btrfs = statfs(root, &sfs) >= 0 && (long) sfs.f_type == (long) BTRFS_SUPER_MAGIC;
746f8906
LP
509 log_debug("On btrfs: %s", yes_no(on_btrfs));
510
6de338a2 511 if (asprintf(&pack_fn_new, "%s/.readahead.new", root) < 0) {
0d0f0c50 512 r = log_oom();
22be093f
LP
513 goto finish;
514 }
515
189455ab
LP
516 pack = fopen(pack_fn_new, "we");
517 if (!pack) {
22be093f
LP
518 log_error("Failed to open pack file: %m");
519 r = -errno;
520 goto finish;
521 }
522
cae544bc 523 fputs(CANONICAL_HOST READAHEAD_PACK_FILE_VERSION, pack);
22be093f
LP
524 putc(on_ssd ? 'S' : 'R', pack);
525
746f8906 526 if (on_ssd || on_btrfs) {
22be093f 527
746f8906 528 /* On SSD or on btrfs, just write things out in the
41a598e2 529 * order the files were accessed. */
22be093f
LP
530
531 HASHMAP_FOREACH_KEY(q, p, files, i)
746f8906 532 pack_file(pack, p, on_btrfs);
22be093f
LP
533 } else {
534 struct item *ordered, *j;
535 unsigned k, n;
536
537 /* On rotating media, order things by the block
538 * numbers */
539
540 log_debug("Ordering...");
541
542 n = hashmap_size(files);
543 if (!(ordered = new(struct item, n))) {
0d0f0c50 544 r = log_oom();
22be093f
LP
545 goto finish;
546 }
547
548 j = ordered;
549 HASHMAP_FOREACH_KEY(q, p, files, i) {
94243ef2 550 memcpy(j, q, sizeof(struct item));
22be093f
LP
551 j++;
552 }
553
554 assert(ordered + n == j);
555
556 qsort(ordered, n, sizeof(struct item), qsort_compare);
557
558 for (k = 0; k < n; k++)
746f8906 559 pack_file(pack, ordered[k].path, on_btrfs);
22be093f
LP
560
561 free(ordered);
562 }
563
564 log_debug("Finalizing...");
565
566 fflush(pack);
567
568 if (ferror(pack)) {
569 log_error("Failed to write pack file.");
570 r = -EIO;
571 goto finish;
572 }
573
574 if (rename(pack_fn_new, pack_fn) < 0) {
575 log_error("Failed to rename readahead file: %m");
576 r = -errno;
577 goto finish;
578 }
579
580 fclose(pack);
581 pack = NULL;
582
583 log_debug("Done.");
584
585finish:
586 if (fanotify_fd >= 0)
587 close_nointr_nofail(fanotify_fd);
588
589 if (signal_fd >= 0)
590 close_nointr_nofail(signal_fd);
591
6624768c
LP
592 if (inotify_fd >= 0)
593 close_nointr_nofail(inotify_fd);
594
22be093f
LP
595 if (pack) {
596 fclose(pack);
597 unlink(pack_fn_new);
598 }
22be093f
LP
599 free(pack_fn_new);
600 free(pack_fn);
601
602 while ((p = hashmap_steal_first_key(files)))
f0cf061e 603 free(p);
22be093f
LP
604
605 hashmap_free(files);
606
6de338a2
LP
607 if (previous_block_readahead_set) {
608 uint64_t bytes;
609
610 /* Restore the original kernel readahead setting if we
611 * changed it, and nobody has overwritten it since
612 * yet. */
613 if (block_get_readahead(root, &bytes) >= 0 && bytes == 8*1024)
614 block_set_readahead(root, previous_block_readahead);
615 }
616
22be093f
LP
617 return r;
618}
619
87ce22cc 620int main_collect(const char *root) {
8260358d 621
87ce22cc
LP
622 if (!root)
623 root = "/";
2b590e13 624
3b2d5b02
LP
625 /* Skip this step on read-only media. Note that we check the
626 * underlying block device here, not he read-only flag of the
627 * file system on top, since that one is most likely mounted
628 * read-only anyway at boot, even if the underlying block
629 * device is theoretically writable. */
2b590e13
LP
630 if (fs_on_read_only(root) > 0) {
631 log_info("Disabling readahead collector due to read-only media.");
87ce22cc 632 return EXIT_SUCCESS;
2b590e13
LP
633 }
634
41a598e2
LP
635 if (!enough_ram()) {
636 log_info("Disabling readahead collector due to low memory.");
87ce22cc 637 return EXIT_SUCCESS;
41a598e2
LP
638 }
639
3b2d5b02
LP
640 shared = shared_get();
641 if (!shared)
87ce22cc 642 return EXIT_FAILURE;
d9c7a87b
LP
643
644 shared->collect = getpid();
645 __sync_synchronize();
646
2b590e13 647 if (collect(root) < 0)
87ce22cc 648 return EXIT_FAILURE;
22be093f 649
87ce22cc 650 return EXIT_SUCCESS;
22be093f 651}