]> git.ipfire.org Git - thirdparty/systemd.git/blob - src/readahead/readahead-collect.c
readahead: cleanups
[thirdparty/systemd.git] / src / readahead / readahead-collect.c
1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3 /***
4 This file is part of systemd.
5
6 Copyright 2010 Lennart Poettering
7
8 systemd is free software; you can redistribute it and/or modify it
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
11 (at your option) any later version.
12
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Lesser General Public License for more details.
17
18 You should have received a copy of the GNU Lesser General Public License
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
20 ***/
21
22 #include <errno.h>
23 #include <inttypes.h>
24 #include <fcntl.h>
25 #include <linux/limits.h>
26 #include <stdbool.h>
27 #include <stdio.h>
28 #include <stdlib.h>
29 #include <string.h>
30 #include <sys/select.h>
31 #include <sys/time.h>
32 #include <sys/types.h>
33 #include <sys/stat.h>
34 #include <unistd.h>
35 #include <linux/fanotify.h>
36 #include <sys/signalfd.h>
37 #include <sys/poll.h>
38 #include <sys/mman.h>
39 #include <linux/fs.h>
40 #include <linux/fiemap.h>
41 #include <sys/ioctl.h>
42 #include <sys/vfs.h>
43 #include <getopt.h>
44 #include <sys/inotify.h>
45 #include <math.h>
46
47 #ifdef HAVE_FANOTIFY_INIT
48 #include <sys/fanotify.h>
49 #endif
50
51 #include <systemd/sd-daemon.h>
52
53 #include "missing.h"
54 #include "util.h"
55 #include "set.h"
56 #include "ioprio.h"
57 #include "readahead-common.h"
58 #include "virt.h"
59
60 /* fixme:
61 *
62 * - detect ssd on btrfs/lvm...
63 * - read ahead directories
64 * - gzip?
65 * - remount rw?
66 * - handle files where nothing is in mincore
67 * - does ioprio_set work with fadvise()?
68 */
69
/* State shared with the replay process: collect() reads the replay
 * side's PID from here to ignore its accesses, and main_collect()
 * publishes our own PID the same way. */
static ReadaheadShared *shared = NULL;

/* CLOCK_MONOTONIC timestamp taken when collection starts; used to
 * assign each accessed file to a 2s time bin (see collect()) */
static usec_t starttime;

/* Avoid collisions with the NULL pointer */
/* NOTE(review): these two macros appear unused in this file now that
 * entries are stored as struct item — candidates for removal, verify
 * against the rest of the source tree first */
#define SECTOR_TO_PTR(s) ULONG_TO_PTR((s)+1)
#define PTR_TO_SECTOR(p) (PTR_TO_ULONG(p)-1)
76
77 static int btrfs_defrag(int fd) {
78 struct btrfs_ioctl_vol_args data;
79
80 zero(data);
81 data.fd = fd;
82
83 return ioctl(fd, BTRFS_IOC_DEFRAG, &data);
84 }
85
/* Appends one file's readahead record to the pack file: the path, its
 * inode number, then a list of (begin, end) page-range pairs covering
 * the pages currently resident in the page cache (per mincore()),
 * terminated by a 0/0 marker. An empty range list means "read the
 * whole file". Returns 0 on success or when the file is skipped,
 * a negative errno-style value on failure. */
static int pack_file(FILE *pack, const char *fn, bool on_btrfs) {
        struct stat st;
        void *start = MAP_FAILED;
        uint8_t *vec;
        uint32_t b, c;
        uint64_t inode;
        size_t l, pages;
        bool mapped;
        int r = 0, fd = -1, k;

        assert(pack);
        assert(fn);

        fd = open(fn, O_RDONLY|O_CLOEXEC|O_NOATIME|O_NOCTTY|O_NOFOLLOW);
        if (fd < 0) {

                /* The file may have vanished since we saw it opened,
                 * or we may simply lack access — neither is worth
                 * reporting */
                if (errno == ENOENT)
                        return 0;

                if (errno == EPERM || errno == EACCES)
                        return 0;

                log_warning("open(%s) failed: %m", fn);
                r = -errno;
                goto finish;
        }

        /* Skip files that don't qualify for readahead (at least the
         * arg_file_size_max limit is enforced here; other criteria
         * presumably live in file_verify() — defined elsewhere) */
        k = file_verify(fd, fn, arg_file_size_max, &st);
        if (k <= 0) {
                r = k;
                goto finish;
        }

        /* On btrfs, defragment first so later replay reads are more
         * sequential */
        if (on_btrfs)
                btrfs_defrag(fd);

        l = PAGE_ALIGN(st.st_size);
        start = mmap(NULL, l, PROT_READ, MAP_SHARED, fd, 0);
        if (start == MAP_FAILED) {
                log_warning("mmap(%s) failed: %m", fn);
                r = -errno;
                goto finish;
        }

        /* Ask the kernel which pages of the mapping are in the page
         * cache right now; one byte per page in vec */
        pages = l / page_size();
        vec = alloca(pages);
        memset(vec, 0, pages);
        if (mincore(start, l, vec) < 0) {
                log_warning("mincore(%s) failed: %m", fn);
                r = -errno;
                goto finish;
        }

        fputs(fn, pack);
        fputc('\n', pack);

        /* Store the inode, so that we notice when the file is deleted */
        inode = (uint64_t) st.st_ino;
        fwrite(&inode, sizeof(inode), 1, pack);

        /* Run-length encode the mincore bitmap: emit one (begin, end)
         * pair per contiguous run of resident pages */
        mapped = false;
        for (c = 0; c < pages; c++) {
                bool new_mapped = !!(vec[c] & 1);

                if (!mapped && new_mapped)
                        b = c; /* a run starts at page c */
                else if (mapped && !new_mapped) {
                        /* run ended just before page c */
                        fwrite(&b, sizeof(b), 1, pack);
                        fwrite(&c, sizeof(c), 1, pack);

                        log_debug("%s: page %u to %u", fn, b, c);
                }

                mapped = new_mapped;
        }

        /* We don't write any range data if we should read the entire file */
        /* (a final run still open at loop end is flushed here with
         * c == pages, unless it started at page 0, i.e. covers
         * everything) */
        if (mapped && b > 0) {
                fwrite(&b, sizeof(b), 1, pack);
                fwrite(&c, sizeof(c), 1, pack);

                log_debug("%s: page %u to %u", fn, b, c);
        }

        /* End marker */
        b = 0;
        fwrite(&b, sizeof(b), 1, pack);
        fwrite(&b, sizeof(b), 1, pack);

finish:
        if (start != MAP_FAILED)
                munmap(start, l);

        if (fd >= 0)
                close_nointr_nofail(fd);

        return r;
}
184
/* Returns the physical (on-device) byte offset of the first extent of
 * the file referenced by fd, as reported by the FIEMAP ioctl, or 0 if
 * it cannot be determined. Used to sort files by on-disk location on
 * rotating media. */
static unsigned long fd_first_block(int fd) {
        struct {
                struct fiemap fiemap;
                struct fiemap_extent extent;
        } data;
        const struct fiemap_extent *e;

        memset(&data, 0, sizeof(data));
        data.fiemap.fm_length = ~0ULL;       /* map the entire file */
        data.fiemap.fm_extent_count = 1;     /* only the first extent matters */

        if (ioctl(fd, FS_IOC_FIEMAP, &data) < 0)
                return 0;

        if (data.fiemap.fm_mapped_extents <= 0)
                return 0;

        e = &data.fiemap.fm_extents[0];
        if (e->fe_flags & FIEMAP_EXTENT_UNKNOWN)
                return 0;

        return (unsigned long) e->fe_physical;
}
206
/* One file queued for the pack file, annotated with the data we sort by. */
struct item {
        const char *path;        /* file path, heap-allocated */
        unsigned long block;     /* physical offset of first extent, 0 if unknown */
        unsigned long bin;       /* 2s time slot of first access (see collect()) */
};

/* qsort() callback: order by access-time bin, then by on-disk
 * location, and finally by path so the ordering is total. */
static int qsort_compare(const void *a, const void *b) {
        const struct item *x = a, *y = b;

        /* sort by bin first */
        if (x->bin != y->bin)
                return x->bin < y->bin ? -1 : 1;

        /* then sort by sector */
        if (x->block != y->block)
                return x->block < y->block ? -1 : 1;

        return strcmp(x->path, y->path);
}
233
234 static int collect(const char *root) {
235 enum {
236 FD_FANOTIFY, /* Get the actual fs events */
237 FD_SIGNAL,
238 FD_INOTIFY, /* We get notifications to quit early via this fd */
239 _FD_MAX
240 };
241 struct pollfd pollfd[_FD_MAX];
242 int fanotify_fd = -1, signal_fd = -1, inotify_fd = -1, r = 0;
243 pid_t my_pid;
244 Hashmap *files = NULL;
245 Iterator i;
246 char *p, *q;
247 sigset_t mask;
248 FILE *pack = NULL;
249 char *pack_fn_new = NULL, *pack_fn = NULL;
250 bool on_ssd, on_btrfs;
251 struct statfs sfs;
252 usec_t not_after;
253 uint64_t previous_block_readahead;
254 bool previous_block_readahead_set = false;
255
256 assert(root);
257
258 if (asprintf(&pack_fn, "%s/.readahead", root) < 0) {
259 r = log_oom();
260 goto finish;
261 }
262
263 starttime = now(CLOCK_MONOTONIC);
264
265 /* If there's no pack file yet we lower the kernel readahead
266 * so that mincore() is accurate. If there is a pack file
267 * already we assume it is accurate enough so that kernel
268 * readahead is never triggered. */
269 previous_block_readahead_set =
270 access(pack_fn, F_OK) < 0 &&
271 block_get_readahead(root, &previous_block_readahead) >= 0 &&
272 block_set_readahead(root, 8*1024) >= 0;
273
274 if (ioprio_set(IOPRIO_WHO_PROCESS, getpid(), IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0)) < 0)
275 log_warning("Failed to set IDLE IO priority class: %m");
276
277 assert_se(sigemptyset(&mask) == 0);
278 sigset_add_many(&mask, SIGINT, SIGTERM, -1);
279 assert_se(sigprocmask(SIG_SETMASK, &mask, NULL) == 0);
280
281 if ((signal_fd = signalfd(-1, &mask, SFD_NONBLOCK|SFD_CLOEXEC)) < 0) {
282 log_error("signalfd(): %m");
283 r = -errno;
284 goto finish;
285 }
286
287 if (!(files = hashmap_new(string_hash_func, string_compare_func))) {
288 log_error("Failed to allocate set.");
289 r = -ENOMEM;
290 goto finish;
291 }
292
293 if ((fanotify_fd = fanotify_init(FAN_CLOEXEC|FAN_NONBLOCK, O_RDONLY|O_LARGEFILE|O_CLOEXEC|O_NOATIME)) < 0) {
294 log_error("Failed to create fanotify object: %m");
295 r = -errno;
296 goto finish;
297 }
298
299 if (fanotify_mark(fanotify_fd, FAN_MARK_ADD|FAN_MARK_MOUNT, FAN_OPEN, AT_FDCWD, root) < 0) {
300 log_error("Failed to mark %s: %m", root);
301 r = -errno;
302 goto finish;
303 }
304
305 if ((inotify_fd = open_inotify()) < 0) {
306 r = inotify_fd;
307 goto finish;
308 }
309
310 not_after = now(CLOCK_MONOTONIC) + arg_timeout;
311
312 my_pid = getpid();
313
314 zero(pollfd);
315 pollfd[FD_FANOTIFY].fd = fanotify_fd;
316 pollfd[FD_FANOTIFY].events = POLLIN;
317 pollfd[FD_SIGNAL].fd = signal_fd;
318 pollfd[FD_SIGNAL].events = POLLIN;
319 pollfd[FD_INOTIFY].fd = inotify_fd;
320 pollfd[FD_INOTIFY].events = POLLIN;
321
322 sd_notify(0,
323 "READY=1\n"
324 "STATUS=Collecting readahead data");
325
326 log_debug("Collecting...");
327
328 if (access("/run/systemd/readahead/cancel", F_OK) >= 0) {
329 log_debug("Collection canceled");
330 r = -ECANCELED;
331 goto finish;
332 }
333
334 if (access("/run/systemd/readahead/done", F_OK) >= 0) {
335 log_debug("Got termination request");
336 goto done;
337 }
338
339 for (;;) {
340 union {
341 struct fanotify_event_metadata metadata;
342 char buffer[4096];
343 } data;
344 ssize_t n;
345 struct fanotify_event_metadata *m;
346 usec_t t;
347 int h;
348
349 if (hashmap_size(files) > arg_files_max) {
350 log_debug("Reached maximum number of read ahead files, ending collection.");
351 break;
352 }
353
354 t = now(CLOCK_MONOTONIC);
355 if (t >= not_after) {
356 log_debug("Reached maximum collection time, ending collection.");
357 break;
358 }
359
360 if ((h = poll(pollfd, _FD_MAX, (int) ((not_after - t) / USEC_PER_MSEC))) < 0) {
361
362 if (errno == EINTR)
363 continue;
364
365 log_error("poll(): %m");
366 r = -errno;
367 goto finish;
368 }
369
370 if (h == 0) {
371 log_debug("Reached maximum collection time, ending collection.");
372 break;
373 }
374
375 if (pollfd[FD_SIGNAL].revents) {
376 log_debug("Got signal.");
377 break;
378 }
379
380 if (pollfd[FD_INOTIFY].revents) {
381 uint8_t inotify_buffer[sizeof(struct inotify_event) + FILENAME_MAX];
382 struct inotify_event *e;
383
384 if ((n = read(inotify_fd, &inotify_buffer, sizeof(inotify_buffer))) < 0) {
385 if (errno == EINTR || errno == EAGAIN)
386 continue;
387
388 log_error("Failed to read inotify event: %m");
389 r = -errno;
390 goto finish;
391 }
392
393 e = (struct inotify_event*) inotify_buffer;
394 while (n > 0) {
395 size_t step;
396
397 if ((e->mask & IN_CREATE) && streq(e->name, "cancel")) {
398 log_debug("Collection canceled");
399 r = -ECANCELED;
400 goto finish;
401 }
402
403 if ((e->mask & IN_CREATE) && streq(e->name, "done")) {
404 log_debug("Got termination request");
405 goto done;
406 }
407
408 step = sizeof(struct inotify_event) + e->len;
409 assert(step <= (size_t) n);
410
411 e = (struct inotify_event*) ((uint8_t*) e + step);
412 n -= step;
413 }
414 }
415
416 if ((n = read(fanotify_fd, &data, sizeof(data))) < 0) {
417
418 if (errno == EINTR || errno == EAGAIN)
419 continue;
420
421 /* fanotify sometimes returns EACCES on read()
422 * where it shouldn't. For now let's just
423 * ignore it here (which is safe), but
424 * eventually this should be
425 * dropped when the kernel is fixed.
426 *
427 * https://bugzilla.redhat.com/show_bug.cgi?id=707577 */
428 if (errno == EACCES)
429 continue;
430
431 log_error("Failed to read event: %m");
432 r = -errno;
433 goto finish;
434 }
435
436 for (m = &data.metadata; FAN_EVENT_OK(m, n); m = FAN_EVENT_NEXT(m, n)) {
437 char fn[PATH_MAX];
438 int k;
439
440 if (m->fd < 0)
441 goto next_iteration;
442
443 if (m->pid == my_pid)
444 goto next_iteration;
445
446 __sync_synchronize();
447 if (m->pid == shared->replay)
448 goto next_iteration;
449
450 snprintf(fn, sizeof(fn), "/proc/self/fd/%i", m->fd);
451 char_array_0(fn);
452
453 if ((k = readlink_malloc(fn, &p)) >= 0) {
454 if (startswith(p, "/tmp") ||
455 endswith(p, " (deleted)") ||
456 hashmap_get(files, p))
457 /* Not interesting, or
458 * already read */
459 free(p);
460 else {
461 unsigned long ul;
462 usec_t entrytime;
463 struct item *entry;
464
465 entry = new0(struct item, 1);
466 if (!entry) {
467 r = log_oom();
468 goto finish;
469 }
470
471 ul = fd_first_block(m->fd);
472
473 entrytime = now(CLOCK_MONOTONIC);
474
475 entry->block = ul;
476 entry->path = strdup(p);
477 if (!entry->path) {
478 free(entry);
479 r = log_oom();
480 goto finish;
481 }
482 entry->bin = (entrytime - starttime) / 2000000;
483
484 if ((k = hashmap_put(files, p, entry)) < 0) {
485 log_warning("set_put() failed: %s", strerror(-k));
486 free(p);
487 }
488 }
489
490 } else
491 log_warning("readlink(%s) failed: %s", fn, strerror(-k));
492
493 next_iteration:
494 if (m->fd >= 0)
495 close_nointr_nofail(m->fd);
496 }
497 }
498
499 done:
500 if (fanotify_fd >= 0) {
501 close_nointr_nofail(fanotify_fd);
502 fanotify_fd = -1;
503 }
504
505 log_debug("Writing Pack File...");
506
507 on_ssd = fs_on_ssd(root) > 0;
508 log_debug("On SSD: %s", yes_no(on_ssd));
509
510 on_btrfs = statfs(root, &sfs) >= 0 && (long) sfs.f_type == (long) BTRFS_SUPER_MAGIC;
511 log_debug("On btrfs: %s", yes_no(on_btrfs));
512
513 if (asprintf(&pack_fn_new, "%s/.readahead.new", root) < 0) {
514 r = log_oom();
515 goto finish;
516 }
517
518 pack = fopen(pack_fn_new, "we");
519 if (!pack) {
520 log_error("Failed to open pack file: %m");
521 r = -errno;
522 goto finish;
523 }
524
525 fputs(CANONICAL_HOST READAHEAD_PACK_FILE_VERSION, pack);
526 putc(on_ssd ? 'S' : 'R', pack);
527
528 if (on_ssd || on_btrfs) {
529
530 /* On SSD or on btrfs, just write things out in the
531 * order the files were accessed. */
532
533 HASHMAP_FOREACH_KEY(q, p, files, i)
534 pack_file(pack, p, on_btrfs);
535 } else {
536 struct item *ordered, *j;
537 unsigned k, n;
538
539 /* On rotating media, order things by the block
540 * numbers */
541
542 log_debug("Ordering...");
543
544 n = hashmap_size(files);
545 if (!(ordered = new(struct item, n))) {
546 r = log_oom();
547 goto finish;
548 }
549
550 j = ordered;
551 HASHMAP_FOREACH_KEY(q, p, files, i) {
552 memcpy(j, q, sizeof(struct item));
553 j++;
554 }
555
556 assert(ordered + n == j);
557
558 qsort(ordered, n, sizeof(struct item), qsort_compare);
559
560 for (k = 0; k < n; k++)
561 pack_file(pack, ordered[k].path, on_btrfs);
562
563 free(ordered);
564 }
565
566 log_debug("Finalizing...");
567
568 fflush(pack);
569
570 if (ferror(pack)) {
571 log_error("Failed to write pack file.");
572 r = -EIO;
573 goto finish;
574 }
575
576 if (rename(pack_fn_new, pack_fn) < 0) {
577 log_error("Failed to rename readahead file: %m");
578 r = -errno;
579 goto finish;
580 }
581
582 fclose(pack);
583 pack = NULL;
584
585 log_debug("Done.");
586
587 finish:
588 if (fanotify_fd >= 0)
589 close_nointr_nofail(fanotify_fd);
590
591 if (signal_fd >= 0)
592 close_nointr_nofail(signal_fd);
593
594 if (inotify_fd >= 0)
595 close_nointr_nofail(inotify_fd);
596
597 if (pack) {
598 fclose(pack);
599 unlink(pack_fn_new);
600 }
601 free(pack_fn_new);
602 free(pack_fn);
603
604 while ((p = hashmap_steal_first_key(files)))
605 free(p);
606
607 hashmap_free(files);
608
609 if (previous_block_readahead_set) {
610 uint64_t bytes;
611
612 /* Restore the original kernel readahead setting if we
613 * changed it, and nobody has overwritten it since
614 * yet. */
615 if (block_get_readahead(root, &bytes) >= 0 && bytes == 8*1024)
616 block_set_readahead(root, previous_block_readahead);
617 }
618
619 return r;
620 }
621
622 int main_collect(const char *root) {
623
624 if (!root)
625 root = "/";
626
627 /* Skip this step on read-only media. Note that we check the
628 * underlying block device here, not he read-only flag of the
629 * file system on top, since that one is most likely mounted
630 * read-only anyway at boot, even if the underlying block
631 * device is theoretically writable. */
632 if (fs_on_read_only(root) > 0) {
633 log_info("Disabling readahead collector due to read-only media.");
634 return EXIT_SUCCESS;
635 }
636
637 if (!enough_ram()) {
638 log_info("Disabling readahead collector due to low memory.");
639 return EXIT_SUCCESS;
640 }
641
642 shared = shared_get();
643 if (!shared)
644 return EXIT_FAILURE;
645
646 shared->collect = getpid();
647 __sync_synchronize();
648
649 if (collect(root) < 0)
650 return EXIT_FAILURE;
651
652 return EXIT_SUCCESS;
653 }