]> git.ipfire.org Git - thirdparty/systemd.git/blob - src/readahead/readahead-collect.c
Modernization
[thirdparty/systemd.git] / src / readahead / readahead-collect.c
1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3 /***
4 This file is part of systemd.
5
6 Copyright 2010 Lennart Poettering
7
8 systemd is free software; you can redistribute it and/or modify it
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
11 (at your option) any later version.
12
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Lesser General Public License for more details.
17
18 You should have received a copy of the GNU Lesser General Public License
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
20 ***/
21
22 #include <errno.h>
23 #include <inttypes.h>
24 #include <fcntl.h>
25 #include <linux/limits.h>
26 #include <stdbool.h>
27 #include <stdio.h>
28 #include <stdlib.h>
29 #include <string.h>
30 #include <sys/select.h>
31 #include <sys/time.h>
32 #include <sys/types.h>
33 #include <sys/stat.h>
34 #include <unistd.h>
35 #include <linux/fanotify.h>
36 #include <sys/signalfd.h>
37 #include <sys/poll.h>
38 #include <sys/mman.h>
39 #include <linux/fs.h>
40 #include <linux/fiemap.h>
41 #include <sys/ioctl.h>
42 #include <sys/vfs.h>
43 #include <getopt.h>
44 #include <sys/inotify.h>
45 #include <math.h>
46
47 #ifdef HAVE_FANOTIFY_INIT
48 #include <sys/fanotify.h>
49 #endif
50
51 #include <systemd/sd-daemon.h>
52
53 #include "missing.h"
54 #include "util.h"
55 #include "set.h"
56 #include "ioprio.h"
57 #include "readahead-common.h"
58 #include "virt.h"
59
60 /* fixme:
61 *
62 * - detect ssd on btrfs/lvm...
63 * - read ahead directories
64 * - gzip?
65 * - remount rw?
66 * - handle files where nothing is in mincore
67 * - does ioprio_set work with fadvise()?
68 */
69
/* Shared-memory area used to exchange PIDs with the replay side
 * (this side publishes shared->collect in main_collect(); the event
 * loop below skips events coming from shared->replay). */
70 static ReadaheadShared *shared = NULL;
/* CLOCK_MONOTONIC timestamp taken when collection starts; used below
 * to assign each accessed file to a 2s time bin (see the /2000000). */
71 static usec_t starttime;
72
/* Convert sector numbers to/from pointers for storage in pointer-keyed
 * containers; the +1/-1 keeps sector 0 distinct from NULL.
 * NOTE(review): not referenced anywhere in this chunk -- presumably
 * used elsewhere in the readahead sources; confirm before removing. */
73 /* Avoid collisions with the NULL pointer */
74 #define SECTOR_TO_PTR(s) ULONG_TO_PTR((s)+1)
75 #define PTR_TO_SECTOR(p) (PTR_TO_ULONG(p)-1)
76
77 static int btrfs_defrag(int fd) {
78 struct btrfs_ioctl_vol_args data;
79
80 zero(data);
81 data.fd = fd;
82
83 return ioctl(fd, BTRFS_IOC_DEFRAG, &data);
84 }
85
/* pack_file() -- append one file's readahead record to the open pack
 * stream.
 *
 * Record layout, exactly as emitted below: the file path terminated by
 * '\n'; the inode number as a raw uint64_t (stored so a later replay
 * can notice the file was deleted/replaced); then zero or more
 * (start, end) page-index pairs as raw uint32_t values -- the ranges
 * mincore() reported as resident -- terminated by the pair (0, 0).
 * An empty range list means "read the entire file".
 *
 * Returns 0 on success and also when the file vanished or is not
 * accessible (ENOENT/EPERM/EACCES are deliberately not errors here);
 * a negative errno-style code on real failures.  fwrite() results are
 * not checked here: the caller checks ferror() on the stream once at
 * the end. */
86 static int pack_file(FILE *pack, const char *fn, bool on_btrfs) {
87 struct stat st;
88 void *start = MAP_FAILED;
89 uint8_t *vec;
90 uint32_t b, c;
91 uint64_t inode;
92 size_t l, pages;
93 bool mapped;
94 int r = 0, fd = -1, k;
95
96 assert(pack);
97 assert(fn);
98
/* O_NOATIME so this pass does not perturb access times;
 * O_NOFOLLOW to avoid following (possibly hostile) symlinks. */
99 fd = open(fn, O_RDONLY|O_CLOEXEC|O_NOATIME|O_NOCTTY|O_NOFOLLOW);
100 if (fd < 0) {
101
102 if (errno == ENOENT)
103 return 0;
104
105 if (errno == EPERM || errno == EACCES)
106 return 0;
107
108 log_warning("open(%s) failed: %m", fn);
109 r = -errno;
110 goto finish;
111 }
112
/* Validates the fd (regular file, size limit); <= 0 means skip/error. */
113 k = file_verify(fd, fn, arg_file_size_max, &st);
114 if (k <= 0) {
115 r = k;
116 goto finish;
117 }
118
119 if (on_btrfs)
120 btrfs_defrag(fd);
121
/* Map the whole file and query which pages are in the page cache. */
122 l = PAGE_ALIGN(st.st_size);
123 start = mmap(NULL, l, PROT_READ, MAP_SHARED, fd, 0);
124 if (start == MAP_FAILED) {
125 log_warning("mmap(%s) failed: %m", fn);
126 r = -errno;
127 goto finish;
128 }
129
/* NOTE(review): alloca() of one byte per page -- assumed small because
 * file_verify() capped st_size at arg_file_size_max above; confirm the
 * configured limit keeps this stack allocation bounded. */
130 pages = l / page_size();
131 vec = alloca(pages);
132 memset(vec, 0, pages);
133 if (mincore(start, l, vec) < 0) {
134 log_warning("mincore(%s) failed: %m", fn);
135 r = -errno;
136 goto finish;
137 }
138
139 fputs(fn, pack);
140 fputc('\n', pack);
141
142 /* Store the inode, so that we notice when the file is deleted */
143 inode = (uint64_t) st.st_ino;
144 fwrite(&inode, sizeof(inode), 1, pack);
145
/* Walk the mincore vector and emit half-open [b, c) ranges of
 * resident pages; bit 0 of each vector byte is the residency flag. */
146 mapped = false;
147 for (c = 0; c < pages; c++) {
148 bool new_mapped = !!(vec[c] & 1);
149
150 if (!mapped && new_mapped)
151 b = c;
152 else if (mapped && !new_mapped) {
153 fwrite(&b, sizeof(b), 1, pack);
154 fwrite(&c, sizeof(c), 1, pack);
155
156 log_debug("%s: page %u to %u", fn, b, c);
157 }
158
159 mapped = new_mapped;
160 }
161
162 /* We don't write any range data if we should read the entire file */
163 if (mapped && b > 0) {
164 fwrite(&b, sizeof(b), 1, pack);
165 fwrite(&c, sizeof(c), 1, pack);
166
167 log_debug("%s: page %u to %u", fn, b, c);
168 }
169
170 /* End marker */
171 b = 0;
172 fwrite(&b, sizeof(b), 1, pack);
173 fwrite(&b, sizeof(b), 1, pack);
174
175 finish:
176 if (start != MAP_FAILED)
177 munmap(start, l);
178
179 if (fd >= 0)
180 close_nointr_nofail(fd);
181
182 return r;
183 }
184
/* Return the physical (on-disk) byte address of the first extent of
 * the file behind 'fd', as reported by the FIEMAP ioctl.  Returns 0
 * when the ioctl fails, no extents are mapped, or the first extent's
 * location is unknown -- callers treat 0 as "no ordering info". */
static unsigned long fd_first_block(int fd) {
        struct {
                struct fiemap fiemap;
                struct fiemap_extent extent;    /* room for one extent record */
        } data;

        memset(&data, 0, sizeof(data));
        data.fiemap.fm_length = ~0ULL;          /* whole file */
        data.fiemap.fm_extent_count = 1;        /* we only need the first one */

        if (ioctl(fd, FS_IOC_FIEMAP, &data) < 0 ||
            data.fiemap.fm_mapped_extents <= 0 ||
            (data.fiemap.fm_extents[0].fe_flags & FIEMAP_EXTENT_UNKNOWN))
                return 0;

        return (unsigned long) data.fiemap.fm_extents[0].fe_physical;
}
206
/* One collected file: its path, the physical location of its first
 * extent, and the 2s time bin in which it was first accessed. */
struct item {
        const char *path;
        unsigned long block;
        unsigned long bin;
};

/* qsort() comparator: order by access-time bin, then by first disk
 * sector within a bin, finally by path for a deterministic total
 * order.  Returns <0, 0 or >0 in the usual qsort convention. */
static int qsort_compare(const void *a, const void *b) {
        const struct item *x = a, *y = b;

        if (x->bin != y->bin)
                return x->bin < y->bin ? -1 : 1;

        if (x->block != y->block)
                return x->block < y->block ? -1 : 1;

        return strcmp(x->path, y->path);
}
233
/* collect() -- watch the whole mount below 'root' with fanotify and
 * record every file opened (skipping our own accesses and those of the
 * replay process) until the timeout elapses, SIGINT/SIGTERM arrives,
 * the configured file limit is reached, or a "cancel"/"done" trigger
 * file appears in /run/systemd/readahead.  The collected set is then
 * written out as <root>/.readahead via pack_file() -- in access order
 * on SSD/btrfs, sorted by time bin and first disk block otherwise.
 *
 * Returns 0 on success, -ECANCELED when cancelled externally, or a
 * negative errno-style code on failure. */
234 static int collect(const char *root) {
235 enum {
236 FD_FANOTIFY, /* Get the actual fs events */
237 FD_SIGNAL,
238 FD_INOTIFY, /* We get notifications to quit early via this fd */
239 _FD_MAX
240 };
241 struct pollfd pollfd[_FD_MAX];
242 int fanotify_fd = -1, signal_fd = -1, inotify_fd = -1, r = 0;
243 pid_t my_pid;
244 Hashmap *files = NULL;
245 Iterator i;
246 char *p, *q;
247 sigset_t mask;
248 FILE *pack = NULL;
249 char *pack_fn_new = NULL, *pack_fn = NULL;
250 bool on_ssd, on_btrfs;
251 struct statfs sfs;
252 usec_t not_after;
253 uint64_t previous_block_readahead;
254 bool previous_block_readahead_set = false;
255
256 assert(root);
257
258 if (asprintf(&pack_fn, "%s/.readahead", root) < 0) {
259 r = log_oom();
260 goto finish;
261 }
262
263 starttime = now(CLOCK_MONOTONIC);
264
265 /* If there's no pack file yet we lower the kernel readahead
266 * so that mincore() is accurate. If there is a pack file
267 * already we assume it is accurate enough so that kernel
268 * readahead is never triggered. */
269 previous_block_readahead_set =
270 access(pack_fn, F_OK) < 0 &&
271 block_get_readahead(root, &previous_block_readahead) >= 0 &&
272 block_set_readahead(root, 8*1024) >= 0;
273
/* Run with idle IO priority so collection does not slow down boot. */
274 if (ioprio_set(IOPRIO_WHO_PROCESS, getpid(), IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0)) < 0)
275 log_warning("Failed to set IDLE IO priority class: %m");
276
/* Block SIGINT/SIGTERM and receive them through a signalfd instead,
 * so they can be multiplexed with the event fds in poll() below. */
277 assert_se(sigemptyset(&mask) == 0);
278 sigset_add_many(&mask, SIGINT, SIGTERM, -1);
279 assert_se(sigprocmask(SIG_SETMASK, &mask, NULL) == 0);
280
281 if ((signal_fd = signalfd(-1, &mask, SFD_NONBLOCK|SFD_CLOEXEC)) < 0) {
282 log_error("signalfd(): %m");
283 r = -errno;
284 goto finish;
285 }
286
287 files = hashmap_new(string_hash_func, string_compare_func);
288 if (!files) {
289 log_error("Failed to allocate set.");
290 r = -ENOMEM;
291 goto finish;
292 }
293
294 fanotify_fd = fanotify_init(FAN_CLOEXEC|FAN_NONBLOCK,
295 O_RDONLY|O_LARGEFILE|O_CLOEXEC|O_NOATIME);
296 if (fanotify_fd < 0) {
297 log_error("Failed to create fanotify object: %m");
298 r = -errno;
299 goto finish;
300 }
301
/* Watch FAN_OPEN events for the entire mount containing 'root'. */
302 if (fanotify_mark(fanotify_fd, FAN_MARK_ADD|FAN_MARK_MOUNT, FAN_OPEN, AT_FDCWD, root) < 0) {
303 log_error("Failed to mark %s: %m", root);
304 r = -errno;
305 goto finish;
306 }
307
308 inotify_fd = open_inotify();
309 if (inotify_fd < 0) {
310 r = inotify_fd;
311 goto finish;
312 }
313
314 not_after = now(CLOCK_MONOTONIC) + arg_timeout;
315
316 my_pid = getpid();
317
318 zero(pollfd);
319 pollfd[FD_FANOTIFY].fd = fanotify_fd;
320 pollfd[FD_FANOTIFY].events = POLLIN;
321 pollfd[FD_SIGNAL].fd = signal_fd;
322 pollfd[FD_SIGNAL].events = POLLIN;
323 pollfd[FD_INOTIFY].fd = inotify_fd;
324 pollfd[FD_INOTIFY].events = POLLIN;
325
326 sd_notify(0,
327 "READY=1\n"
328 "STATUS=Collecting readahead data");
329
330 log_debug("Collecting...");
331
/* Catch triggers that were created before our inotify watch existed. */
332 if (access("/run/systemd/readahead/cancel", F_OK) >= 0) {
333 log_debug("Collection canceled");
334 r = -ECANCELED;
335 goto finish;
336 }
337
338 if (access("/run/systemd/readahead/done", F_OK) >= 0) {
339 log_debug("Got termination request");
340 goto done;
341 }
342
/* Event loop: poll the fanotify fd (file accesses), the signalfd and
 * the inotify fd (cancel/done triggers) until a stop condition fires. */
343 for (;;) {
344 union {
345 struct fanotify_event_metadata metadata;
346 char buffer[4096];
347 } data;
348 ssize_t n;
349 struct fanotify_event_metadata *m;
350 usec_t t;
351 int h;
352
353 if (hashmap_size(files) > arg_files_max) {
354 log_debug("Reached maximum number of read ahead files, ending collection.");
355 break;
356 }
357
358 t = now(CLOCK_MONOTONIC);
359 if (t >= not_after) {
360 log_debug("Reached maximum collection time, ending collection.");
361 break;
362 }
363
364 if ((h = poll(pollfd, _FD_MAX, (int) ((not_after - t) / USEC_PER_MSEC))) < 0) {
365
366 if (errno == EINTR)
367 continue;
368
369 log_error("poll(): %m");
370 r = -errno;
371 goto finish;
372 }
373
374 if (h == 0) {
375 log_debug("Reached maximum collection time, ending collection.");
376 break;
377 }
378
379 if (pollfd[FD_SIGNAL].revents) {
380 log_debug("Got signal.");
381 break;
382 }
383
384 if (pollfd[FD_INOTIFY].revents) {
385 uint8_t inotify_buffer[sizeof(struct inotify_event) + FILENAME_MAX];
386 struct inotify_event *e;
387
388 if ((n = read(inotify_fd, &inotify_buffer, sizeof(inotify_buffer))) < 0) {
389 if (errno == EINTR || errno == EAGAIN)
390 continue;
391
392 log_error("Failed to read inotify event: %m");
393 r = -errno;
394 goto finish;
395 }
396
/* Walk all inotify events in the buffer, looking for the trigger files. */
397 e = (struct inotify_event*) inotify_buffer;
398 while (n > 0) {
399 size_t step;
400
401 if ((e->mask & IN_CREATE) && streq(e->name, "cancel")) {
402 log_debug("Collection canceled");
403 r = -ECANCELED;
404 goto finish;
405 }
406
407 if ((e->mask & IN_CREATE) && streq(e->name, "done")) {
408 log_debug("Got termination request");
409 goto done;
410 }
411
412 step = sizeof(struct inotify_event) + e->len;
413 assert(step <= (size_t) n);
414
415 e = (struct inotify_event*) ((uint8_t*) e + step);
416 n -= step;
417 }
418 }
419
420 if ((n = read(fanotify_fd, &data, sizeof(data))) < 0) {
421
422 if (errno == EINTR || errno == EAGAIN)
423 continue;
424
425 /* fanotify sometimes returns EACCES on read()
426 * where it shouldn't. For now let's just
427 * ignore it here (which is safe), but
428 * eventually this should be
429 * dropped when the kernel is fixed.
430 *
431 * https://bugzilla.redhat.com/show_bug.cgi?id=707577 */
432 if (errno == EACCES)
433 continue;
434
435 log_error("Failed to read event: %m");
436 r = -errno;
437 goto finish;
438 }
439
/* Each fanotify event carries an open fd for the accessed file; we
 * resolve it to a path via /proc/self/fd and must close it below. */
440 for (m = &data.metadata; FAN_EVENT_OK(m, n); m = FAN_EVENT_NEXT(m, n)) {
441 char fn[PATH_MAX];
442 int k;
443
444 if (m->fd < 0)
445 goto next_iteration;
446
447 if (m->pid == my_pid)
448 goto next_iteration;
449
/* Memory barrier before reading shared->replay, which the replay
 * process updates from another process via the shared mapping. */
450 __sync_synchronize();
451 if (m->pid == shared->replay)
452 goto next_iteration;
453
454 snprintf(fn, sizeof(fn), "/proc/self/fd/%i", m->fd);
455 char_array_0(fn);
456
457 if ((k = readlink_malloc(fn, &p)) >= 0) {
458 if (startswith(p, "/tmp") ||
459 endswith(p, " (deleted)") ||
460 hashmap_get(files, p))
461 /* Not interesting, or
462 * already read */
463 free(p);
464 else {
465 unsigned long ul;
466 usec_t entrytime;
467 struct item *entry;
468
469 entry = new0(struct item, 1);
470 if (!entry) {
471 r = log_oom();
472 goto finish;
473 }
474
475 ul = fd_first_block(m->fd);
476
477 entrytime = now(CLOCK_MONOTONIC);
478
479 entry->block = ul;
480 entry->path = strdup(p);
481 if (!entry->path) {
482 free(entry);
483 r = log_oom();
484 goto finish;
485 }
486 entry->bin = (entrytime - starttime) / 2000000;
487
/* The hashmap takes ownership of key 'p' on success; on failure we
 * free it here (the 'entry' value leaks in that unlikely case). */
488 if ((k = hashmap_put(files, p, entry)) < 0) {
489 log_warning("set_put() failed: %s", strerror(-k));
490 free(p);
491 }
492 }
493
494 } else
495 log_warning("readlink(%s) failed: %s", fn, strerror(-k));
496
497 next_iteration:
498 if (m->fd >= 0)
499 close_nointr_nofail(m->fd);
500 }
501 }
502
/* Collection finished.  Close the fanotify fd first -- presumably so
 * our own file accesses while writing the pack file below are no
 * longer tracked; confirm against the replay-side expectations. */
503 done:
504 if (fanotify_fd >= 0) {
505 close_nointr_nofail(fanotify_fd);
506 fanotify_fd = -1;
507 }
508
509 log_debug("Writing Pack File...");
510
511 on_ssd = fs_on_ssd(root) > 0;
512 log_debug("On SSD: %s", yes_no(on_ssd));
513
514 on_btrfs = statfs(root, &sfs) >= 0 && (long) sfs.f_type == (long) BTRFS_SUPER_MAGIC;
515 log_debug("On btrfs: %s", yes_no(on_btrfs));
516
/* Write to a temp file and rename() it into place below, so readers
 * never see a partially written pack file. */
517 if (asprintf(&pack_fn_new, "%s/.readahead.new", root) < 0) {
518 r = log_oom();
519 goto finish;
520 }
521
522 pack = fopen(pack_fn_new, "we");
523 if (!pack) {
524 log_error("Failed to open pack file: %m");
525 r = -errno;
526 goto finish;
527 }
528
529 fputs(CANONICAL_HOST READAHEAD_PACK_FILE_VERSION, pack);
530 putc(on_ssd ? 'S' : 'R', pack);
531
532 if (on_ssd || on_btrfs) {
533
534 /* On SSD or on btrfs, just write things out in the
535 * order the files were accessed. */
536
537 HASHMAP_FOREACH_KEY(q, p, files, i)
538 pack_file(pack, p, on_btrfs);
539 } else {
540 struct item *ordered, *j;
541 unsigned k, n;
542
543 /* On rotating media, order things by the block
544 * numbers */
545
546 log_debug("Ordering...");
547
548 n = hashmap_size(files);
549 if (!(ordered = new(struct item, n))) {
550 r = log_oom();
551 goto finish;
552 }
553
554 j = ordered;
555 HASHMAP_FOREACH_KEY(q, p, files, i) {
556 memcpy(j, q, sizeof(struct item));
557 j++;
558 }
559
560 assert(ordered + n == j);
561
562 qsort(ordered, n, sizeof(struct item), qsort_compare);
563
564 for (k = 0; k < n; k++)
565 pack_file(pack, ordered[k].path, on_btrfs);
566
567 free(ordered);
568 }
569
570 log_debug("Finalizing...");
571
572 fflush(pack);
573
574 if (ferror(pack)) {
575 log_error("Failed to write pack file.");
576 r = -EIO;
577 goto finish;
578 }
579
580 if (rename(pack_fn_new, pack_fn) < 0) {
581 log_error("Failed to rename readahead file: %m");
582 r = -errno;
583 goto finish;
584 }
585
586 fclose(pack);
587 pack = NULL;
588
589 log_debug("Done.");
590
591 finish:
592 if (fanotify_fd >= 0)
593 close_nointr_nofail(fanotify_fd);
594
595 if (signal_fd >= 0)
596 close_nointr_nofail(signal_fd);
597
598 if (inotify_fd >= 0)
599 close_nointr_nofail(inotify_fd);
600
601 if (pack) {
602 fclose(pack);
603 unlink(pack_fn_new);
604 }
605 free(pack_fn_new);
606 free(pack_fn);
607
/* NOTE(review): only the hashmap keys are freed here; the struct item
 * values (and their strdup'd paths) appear to leak -- tolerable for a
 * one-shot process, but confirm if collect() is ever called twice. */
608 while ((p = hashmap_steal_first_key(files)))
609 free(p);
610
611 hashmap_free(files);
612
613 if (previous_block_readahead_set) {
614 uint64_t bytes;
615
616 /* Restore the original kernel readahead setting if we
617 * changed it, and nobody has overwritten it since
618 * yet. */
619 if (block_get_readahead(root, &bytes) >= 0 && bytes == 8*1024)
620 block_set_readahead(root, previous_block_readahead);
621 }
622
623 return r;
624 }
625
626 int main_collect(const char *root) {
627
628 if (!root)
629 root = "/";
630
631 /* Skip this step on read-only media. Note that we check the
632 * underlying block device here, not he read-only flag of the
633 * file system on top, since that one is most likely mounted
634 * read-only anyway at boot, even if the underlying block
635 * device is theoretically writable. */
636 if (fs_on_read_only(root) > 0) {
637 log_info("Disabling readahead collector due to read-only media.");
638 return EXIT_SUCCESS;
639 }
640
641 if (!enough_ram()) {
642 log_info("Disabling readahead collector due to low memory.");
643 return EXIT_SUCCESS;
644 }
645
646 shared = shared_get();
647 if (!shared)
648 return EXIT_FAILURE;
649
650 shared->collect = getpid();
651 __sync_synchronize();
652
653 if (collect(root) < 0)
654 return EXIT_FAILURE;
655
656 return EXIT_SUCCESS;
657 }