Use initialization instead of explicit zeroing
[thirdparty/systemd.git] / src / readahead / readahead-collect.c
1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3 /***
4 This file is part of systemd.
5
6 Copyright 2010 Lennart Poettering
7
8 systemd is free software; you can redistribute it and/or modify it
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
11 (at your option) any later version.
12
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Lesser General Public License for more details.
17
18 You should have received a copy of the GNU Lesser General Public License
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
20 ***/
21
22 #include <errno.h>
23 #include <inttypes.h>
24 #include <fcntl.h>
25 #include <linux/limits.h>
26 #include <stdbool.h>
27 #include <stdio.h>
28 #include <stdlib.h>
29 #include <string.h>
30 #include <sys/select.h>
31 #include <sys/time.h>
32 #include <sys/types.h>
33 #include <sys/stat.h>
34 #include <unistd.h>
35 #include <linux/fanotify.h>
36 #include <sys/signalfd.h>
37 #include <sys/poll.h>
38 #include <sys/mman.h>
39 #include <linux/fs.h>
40 #include <linux/fiemap.h>
41 #include <sys/ioctl.h>
42 #include <sys/vfs.h>
43 #include <getopt.h>
44 #include <sys/inotify.h>
45 #include <math.h>
46
47 #ifdef HAVE_FANOTIFY_INIT
48 #include <sys/fanotify.h>
49 #endif
50
51 #include <systemd/sd-daemon.h>
52
53 #include "missing.h"
54 #include "util.h"
55 #include "set.h"
56 #include "ioprio.h"
57 #include "readahead-common.h"
58 #include "virt.h"
59
60 /* fixme:
61 *
62 * - detect ssd on btrfs/lvm...
63 * - read ahead directories
64 * - gzip?
65 * - remount rw?
66 * - handle files where nothing is in mincore
67 * - does ioprio_set work with fadvise()?
68 */
69
70 static ReadaheadShared *shared = NULL;
71 static usec_t starttime;
72
73 /* Avoid collisions with the NULL pointer */
74 #define SECTOR_TO_PTR(s) ULONG_TO_PTR((s)+1)
75 #define PTR_TO_SECTOR(p) (PTR_TO_ULONG(p)-1)
76
77 static int btrfs_defrag(int fd) {
78 struct btrfs_ioctl_vol_args data = { .fd = fd };
79
80 return ioctl(fd, BTRFS_IOC_DEFRAG, &data);
81 }
82
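/* Note on the record layout produced by pack_file() below: each record is
 * the file path terminated by '\n', the 64-bit inode number, then pairs of
 * 32-bit page indices [b, c) for the ranges that mincore() reports as
 * resident in the page cache, terminated by a 0/0 pair. A record carrying
 * no ranges means the replay side should read the whole file. */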
83 static int pack_file(FILE *pack, const char *fn, bool on_btrfs) {
84 struct stat st;
85 void *start = MAP_FAILED;
86 uint8_t *vec;
87 uint32_t b, c;
88 uint64_t inode;
89 size_t l, pages;
90 bool mapped;
91 int r = 0, fd = -1, k;
92
93 assert(pack);
94 assert(fn);
95
96 fd = open(fn, O_RDONLY|O_CLOEXEC|O_NOATIME|O_NOCTTY|O_NOFOLLOW);
97 if (fd < 0) {
98
99 if (errno == ENOENT)
100 return 0;
101
102 if (errno == EPERM || errno == EACCES)
103 return 0;
104
105 log_warning("open(%s) failed: %m", fn);
106 r = -errno;
107 goto finish;
108 }
109
110 k = file_verify(fd, fn, arg_file_size_max, &st);
111 if (k <= 0) {
112 r = k;
113 goto finish;
114 }
115
116 if (on_btrfs)
117 btrfs_defrag(fd);
118
119 l = PAGE_ALIGN(st.st_size);
120 start = mmap(NULL, l, PROT_READ, MAP_SHARED, fd, 0);
121 if (start == MAP_FAILED) {
122 log_warning("mmap(%s) failed: %m", fn);
123 r = -errno;
124 goto finish;
125 }
126
127 pages = l / page_size();
128 vec = alloca(pages);
129 memset(vec, 0, pages);
130 if (mincore(start, l, vec) < 0) {
131 log_warning("mincore(%s) failed: %m", fn);
132 r = -errno;
133 goto finish;
134 }
135
136 fputs(fn, pack);
137 fputc('\n', pack);
138
139 /* Store the inode, so that we notice when the file is deleted */
140 inode = (uint64_t) st.st_ino;
141 fwrite(&inode, sizeof(inode), 1, pack);
142
143 mapped = false;
144 for (c = 0; c < pages; c++) {
145 bool new_mapped = !!(vec[c] & 1);
146
147 if (!mapped && new_mapped)
148 b = c;
149 else if (mapped && !new_mapped) {
150 fwrite(&b, sizeof(b), 1, pack);
151 fwrite(&c, sizeof(c), 1, pack);
152
153 log_debug("%s: page %u to %u", fn, b, c);
154 }
155
156 mapped = new_mapped;
157 }
158
159 /* We don't write any range data if we should read the entire file */
160 if (mapped && b > 0) {
161 fwrite(&b, sizeof(b), 1, pack);
162 fwrite(&c, sizeof(c), 1, pack);
163
164 log_debug("%s: page %u to %u", fn, b, c);
165 }
166
167 /* End marker */
168 b = 0;
169 fwrite(&b, sizeof(b), 1, pack);
170 fwrite(&b, sizeof(b), 1, pack);
171
172 finish:
173 if (start != MAP_FAILED)
174 munmap(start, l);
175
176 if (fd >= 0)
177 close_nointr_nofail(fd);
178
179 return r;
180 }
181
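/* Ask the kernel via FIEMAP for the physical byte offset of the file's
 * first extent; 0 is returned when it cannot be determined. collect()
 * uses this as a cheap sort key to approximate on-disk order when the
 * pack file is written for rotating media. */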
182 static unsigned long fd_first_block(int fd) {
183 struct {
184 struct fiemap fiemap;
185 struct fiemap_extent extent;
186 } data = {
187 .fiemap.fm_length = ~0ULL,
188 .fiemap.fm_extent_count = 1,
189 };
190
191 if (ioctl(fd, FS_IOC_FIEMAP, &data) < 0)
192 return 0;
193
194 if (data.fiemap.fm_mapped_extents <= 0)
195 return 0;
196
197 if (data.fiemap.fm_extents[0].fe_flags & FIEMAP_EXTENT_UNKNOWN)
198 return 0;
199
200 return (unsigned long) data.fiemap.fm_extents[0].fe_physical;
201 }
202
203 struct item {
204 const char *path;
205 unsigned long block;
206 unsigned long bin;
207 };
208
209 static int qsort_compare(const void *a, const void *b) {
210 const struct item *i, *j;
211
212 i = a;
213 j = b;
214
215 /* sort by bin first */
216 if (i->bin < j->bin)
217 return -1;
218 if (i->bin > j->bin)
219 return 1;
220
221 /* then sort by sector */
222 if (i->block < j->block)
223 return -1;
224 if (i->block > j->block)
225 return 1;
226
227 return strcmp(i->path, j->path);
228 }
229
230 static int collect(const char *root) {
231 enum {
232 FD_FANOTIFY, /* Get the actual fs events */
233 FD_SIGNAL,
234 FD_INOTIFY, /* We get notifications to quit early via this fd */
235 _FD_MAX
236 };
237 struct pollfd pollfd[_FD_MAX] = {};
238 int fanotify_fd = -1, signal_fd = -1, inotify_fd = -1, r = 0;
239 pid_t my_pid;
240 Hashmap *files = NULL;
241 Iterator i;
242 char *p, *q;
243 sigset_t mask;
244 FILE *pack = NULL;
245 char *pack_fn_new = NULL, *pack_fn = NULL;
246 bool on_ssd, on_btrfs;
247 struct statfs sfs;
248 usec_t not_after;
249 uint64_t previous_block_readahead;
250 bool previous_block_readahead_set = false;
251
252 assert(root);
253
254 if (asprintf(&pack_fn, "%s/.readahead", root) < 0) {
255 r = log_oom();
256 goto finish;
257 }
258
259 starttime = now(CLOCK_MONOTONIC);
260
261 /* If there's no pack file yet we lower the kernel readahead
262 * so that mincore() is accurate. If there is a pack file
263 * already we assume it is accurate enough so that kernel
264 * readahead is never triggered. */
265 previous_block_readahead_set =
266 access(pack_fn, F_OK) < 0 &&
267 block_get_readahead(root, &previous_block_readahead) >= 0 &&
268 block_set_readahead(root, 8*1024) >= 0;
269
270 if (ioprio_set(IOPRIO_WHO_PROCESS, getpid(), IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0)) < 0)
271 log_warning("Failed to set IDLE IO priority class: %m");
272
273 assert_se(sigemptyset(&mask) == 0);
274 sigset_add_many(&mask, SIGINT, SIGTERM, -1);
275 assert_se(sigprocmask(SIG_SETMASK, &mask, NULL) == 0);
276
277 if ((signal_fd = signalfd(-1, &mask, SFD_NONBLOCK|SFD_CLOEXEC)) < 0) {
278 log_error("signalfd(): %m");
279 r = -errno;
280 goto finish;
281 }
282
283 files = hashmap_new(string_hash_func, string_compare_func);
284 if (!files) {
285                 log_error("Failed to allocate hashmap.");
286 r = -ENOMEM;
287 goto finish;
288 }
289
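/* Watch the whole mount: with FAN_MARK_MOUNT|FAN_OPEN the kernel delivers
 * an event, including an already opened fd, for every file opened anywhere
 * below root, which is what feeds the collection loop below. */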
290 fanotify_fd = fanotify_init(FAN_CLOEXEC|FAN_NONBLOCK, O_RDONLY|O_LARGEFILE|O_CLOEXEC|O_NOATIME);
291 if (fanotify_fd < 0) {
292 log_error("Failed to create fanotify object: %m");
293 r = -errno;
294 goto finish;
295 }
296
297 if (fanotify_mark(fanotify_fd, FAN_MARK_ADD|FAN_MARK_MOUNT, FAN_OPEN, AT_FDCWD, root) < 0) {
298 log_error("Failed to mark %s: %m", root);
299 r = -errno;
300 goto finish;
301 }
302
303 inotify_fd = open_inotify();
304 if (inotify_fd < 0) {
305 r = inotify_fd;
306 goto finish;
307 }
308
309 not_after = now(CLOCK_MONOTONIC) + arg_timeout;
310
311 my_pid = getpid();
312
313 pollfd[FD_FANOTIFY].fd = fanotify_fd;
314 pollfd[FD_FANOTIFY].events = POLLIN;
315 pollfd[FD_SIGNAL].fd = signal_fd;
316 pollfd[FD_SIGNAL].events = POLLIN;
317 pollfd[FD_INOTIFY].fd = inotify_fd;
318 pollfd[FD_INOTIFY].events = POLLIN;
319
320 sd_notify(0,
321 "READY=1\n"
322 "STATUS=Collecting readahead data");
323
324 log_debug("Collecting...");
325
326 if (access("/run/systemd/readahead/cancel", F_OK) >= 0) {
327 log_debug("Collection canceled");
328 r = -ECANCELED;
329 goto finish;
330 }
331
332 if (access("/run/systemd/readahead/done", F_OK) >= 0) {
333 log_debug("Got termination request");
334 goto done;
335 }
336
337 for (;;) {
338 union {
339 struct fanotify_event_metadata metadata;
340 char buffer[4096];
341 } data;
342 ssize_t n;
343 struct fanotify_event_metadata *m;
344 usec_t t;
345 int h;
346
347 if (hashmap_size(files) > arg_files_max) {
348 log_debug("Reached maximum number of read ahead files, ending collection.");
349 break;
350 }
351
352 t = now(CLOCK_MONOTONIC);
353 if (t >= not_after) {
354 log_debug("Reached maximum collection time, ending collection.");
355 break;
356 }
357
358 if ((h = poll(pollfd, _FD_MAX, (int) ((not_after - t) / USEC_PER_MSEC))) < 0) {
359
360 if (errno == EINTR)
361 continue;
362
363 log_error("poll(): %m");
364 r = -errno;
365 goto finish;
366 }
367
368 if (h == 0) {
369 log_debug("Reached maximum collection time, ending collection.");
370 break;
371 }
372
373 if (pollfd[FD_SIGNAL].revents) {
374 log_debug("Got signal.");
375 break;
376 }
377
378 if (pollfd[FD_INOTIFY].revents) {
379 uint8_t inotify_buffer[sizeof(struct inotify_event) + FILENAME_MAX];
380 struct inotify_event *e;
381
382 if ((n = read(inotify_fd, &inotify_buffer, sizeof(inotify_buffer))) < 0) {
383 if (errno == EINTR || errno == EAGAIN)
384 continue;
385
386 log_error("Failed to read inotify event: %m");
387 r = -errno;
388 goto finish;
389 }
390
391 e = (struct inotify_event*) inotify_buffer;
392 while (n > 0) {
393 size_t step;
394
395 if ((e->mask & IN_CREATE) && streq(e->name, "cancel")) {
396 log_debug("Collection canceled");
397 r = -ECANCELED;
398 goto finish;
399 }
400
401 if ((e->mask & IN_CREATE) && streq(e->name, "done")) {
402 log_debug("Got termination request");
403 goto done;
404 }
405
406 step = sizeof(struct inotify_event) + e->len;
407 assert(step <= (size_t) n);
408
409 e = (struct inotify_event*) ((uint8_t*) e + step);
410 n -= step;
411 }
412 }
413
414 if ((n = read(fanotify_fd, &data, sizeof(data))) < 0) {
415
416 if (errno == EINTR || errno == EAGAIN)
417 continue;
418
419 /* fanotify sometimes returns EACCES on read()
420 * where it shouldn't. For now let's just
421 * ignore it here (which is safe), but
422 * eventually this should be
423 * dropped when the kernel is fixed.
424 *
425 * https://bugzilla.redhat.com/show_bug.cgi?id=707577 */
426 if (errno == EACCES)
427 continue;
428
429 log_error("Failed to read event: %m");
430 r = -errno;
431 goto finish;
432 }
433
434 for (m = &data.metadata; FAN_EVENT_OK(m, n); m = FAN_EVENT_NEXT(m, n)) {
435 char fn[PATH_MAX];
436 int k;
437
438 if (m->fd < 0)
439 goto next_iteration;
440
441 if (m->pid == my_pid)
442 goto next_iteration;
443
444 __sync_synchronize();
445 if (m->pid == shared->replay)
446 goto next_iteration;
447
448 snprintf(fn, sizeof(fn), "/proc/self/fd/%i", m->fd);
449 char_array_0(fn);
450
451 if ((k = readlink_malloc(fn, &p)) >= 0) {
452 if (startswith(p, "/tmp") ||
453 endswith(p, " (deleted)") ||
454 hashmap_get(files, p))
455 /* Not interesting, or
456 * already read */
457 free(p);
458 else {
459 unsigned long ul;
460 usec_t entrytime;
461 struct item *entry;
462
463 entry = new0(struct item, 1);
464 if (!entry) {
465 r = log_oom();
466 goto finish;
467 }
468
469 ul = fd_first_block(m->fd);
470
471 entrytime = now(CLOCK_MONOTONIC);
472
473 entry->block = ul;
474 entry->path = strdup(p);
475 if (!entry->path) {
476 free(entry);
477 r = log_oom();
478 goto finish;
479 }
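/* Bucket files into 2 s intervals (2000000 us) since collection started;
 * qsort_compare() sorts by this bin first and by physical block within
 * each bin, preserving rough access order on rotating media. */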
480 entry->bin = (entrytime - starttime) / 2000000;
481
483                                         if ((k = hashmap_put(files, p, entry)) < 0) {
484                                                 log_warning("hashmap_put() failed: %s", strerror(-k));
484 free(p);
485 }
486 }
487
488 } else
489 log_warning("readlink(%s) failed: %s", fn, strerror(-k));
490
491 next_iteration:
492 if (m->fd >= 0)
493 close_nointr_nofail(m->fd);
494 }
495 }
496
497 done:
498 if (fanotify_fd >= 0) {
499 close_nointr_nofail(fanotify_fd);
500 fanotify_fd = -1;
501 }
502
503 log_debug("Writing Pack File...");
504
505 on_ssd = fs_on_ssd(root) > 0;
506 log_debug("On SSD: %s", yes_no(on_ssd));
507
508 on_btrfs = statfs(root, &sfs) >= 0 && (long) sfs.f_type == (long) BTRFS_SUPER_MAGIC;
509 log_debug("On btrfs: %s", yes_no(on_btrfs));
510
511 if (asprintf(&pack_fn_new, "%s/.readahead.new", root) < 0) {
512 r = log_oom();
513 goto finish;
514 }
515
516 pack = fopen(pack_fn_new, "we");
517 if (!pack) {
518 log_error("Failed to open pack file: %m");
519 r = -errno;
520 goto finish;
521 }
522
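/* Pack file header: host/version identifier followed by a single flag byte
 * recording whether the data was collected on an SSD ('S') or on rotating
 * media ('R'). */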
523 fputs(CANONICAL_HOST READAHEAD_PACK_FILE_VERSION, pack);
524 putc(on_ssd ? 'S' : 'R', pack);
525
526 if (on_ssd || on_btrfs) {
527
528 /* On SSD or on btrfs, just write things out in the
529 * order the files were accessed. */
530
531 HASHMAP_FOREACH_KEY(q, p, files, i)
532 pack_file(pack, p, on_btrfs);
533 } else {
534 struct item *ordered, *j;
535 unsigned k, n;
536
537 /* On rotating media, order things by the block
538 * numbers */
539
540 log_debug("Ordering...");
541
542 n = hashmap_size(files);
543 if (!(ordered = new(struct item, n))) {
544 r = log_oom();
545 goto finish;
546 }
547
548 j = ordered;
549 HASHMAP_FOREACH_KEY(q, p, files, i) {
550 memcpy(j, q, sizeof(struct item));
551 j++;
552 }
553
554 assert(ordered + n == j);
555
556 qsort(ordered, n, sizeof(struct item), qsort_compare);
557
558 for (k = 0; k < n; k++)
559 pack_file(pack, ordered[k].path, on_btrfs);
560
561 free(ordered);
562 }
563
564 log_debug("Finalizing...");
565
566 fflush(pack);
567
568 if (ferror(pack)) {
569 log_error("Failed to write pack file.");
570 r = -EIO;
571 goto finish;
572 }
573
574 if (rename(pack_fn_new, pack_fn) < 0) {
575 log_error("Failed to rename readahead file: %m");
576 r = -errno;
577 goto finish;
578 }
579
580 fclose(pack);
581 pack = NULL;
582
583 log_debug("Done.");
584
585 finish:
586 if (fanotify_fd >= 0)
587 close_nointr_nofail(fanotify_fd);
588
589 if (signal_fd >= 0)
590 close_nointr_nofail(signal_fd);
591
592 if (inotify_fd >= 0)
593 close_nointr_nofail(inotify_fd);
594
595 if (pack) {
596 fclose(pack);
597 unlink(pack_fn_new);
598 }
599 free(pack_fn_new);
600 free(pack_fn);
601
602 while ((p = hashmap_steal_first_key(files)))
603 free(p);
604
605 hashmap_free(files);
606
607 if (previous_block_readahead_set) {
608 uint64_t bytes;
609
610 /* Restore the original kernel readahead setting if we
611                  * changed it, and nobody else has overwritten it in
612                  * the meantime. */
613 if (block_get_readahead(root, &bytes) >= 0 && bytes == 8*1024)
614 block_set_readahead(root, previous_block_readahead);
615 }
616
617 return r;
618 }
619
620 int main_collect(const char *root) {
621
622 if (!root)
623 root = "/";
624
625 /* Skip this step on read-only media. Note that we check the
626          * underlying block device here, not the read-only flag of the
627 * file system on top, since that one is most likely mounted
628 * read-only anyway at boot, even if the underlying block
629 * device is theoretically writable. */
630 if (fs_on_read_only(root) > 0) {
631 log_info("Disabling readahead collector due to read-only media.");
632 return EXIT_SUCCESS;
633 }
634
635 if (!enough_ram()) {
636 log_info("Disabling readahead collector due to low memory.");
637 return EXIT_SUCCESS;
638 }
639
640 shared = shared_get();
641 if (!shared)
642 return EXIT_FAILURE;
643
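/* Publish our PID in the shared ReadaheadShared area (and fence the write),
 * presumably so the replay process can recognize and skip events generated
 * by this collector. */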
644 shared->collect = getpid();
645 __sync_synchronize();
646
647 if (collect(root) < 0)
648 return EXIT_FAILURE;
649
650 return EXIT_SUCCESS;
651 }