]> git.ipfire.org Git - thirdparty/systemd.git/blob - src/readahead/readahead-collect.c
log.h: new log_oom() -> int -ENOMEM, use it
[thirdparty/systemd.git] / src / readahead / readahead-collect.c
1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3 /***
4 This file is part of systemd.
5
6 Copyright 2010 Lennart Poettering
7
8 systemd is free software; you can redistribute it and/or modify it
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
11 (at your option) any later version.
12
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Lesser General Public License for more details.
17
18 You should have received a copy of the GNU Lesser General Public License
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
20 ***/
21
22 #include <errno.h>
23 #include <inttypes.h>
24 #include <fcntl.h>
25 #include <linux/limits.h>
26 #include <stdbool.h>
27 #include <stdio.h>
28 #include <stdlib.h>
29 #include <string.h>
30 #include <sys/select.h>
31 #include <sys/time.h>
32 #include <sys/types.h>
33 #include <sys/stat.h>
34 #include <unistd.h>
35 #include <linux/fanotify.h>
36 #include <sys/signalfd.h>
37 #include <sys/poll.h>
38 #include <sys/mman.h>
39 #include <linux/fs.h>
40 #include <linux/fiemap.h>
41 #include <sys/ioctl.h>
42 #include <sys/vfs.h>
43 #include <getopt.h>
44 #include <sys/inotify.h>
45
46 #include <systemd/sd-daemon.h>
47
48 #include "missing.h"
49 #include "util.h"
50 #include "set.h"
51 #include "ioprio.h"
52 #include "readahead-common.h"
53 #include "virt.h"
54
55 /* fixme:
56 *
57 * - detect ssd on btrfs/lvm...
58 * - read ahead directories
59 * - gzip?
60 * - remount rw?
61 * - handle files where nothing is in mincore
62 * - does ioprio_set work with fadvise()?
63 */
64
/* State shared with the other readahead process. It is assigned in
 * main_collect() from shared_get(); collect() reads shared->replay to
 * skip fanotify events that were caused by that pid. */
65 static ReadaheadShared *shared = NULL;
66
67 /* Avoid collisions with the NULL pointer */
/* The first-block number of a file is stored as the hashmap value,
 * shifted up by one so that block 0 (which fd_first_block() also returns
 * when the block is unknown) is not encoded as a zero pointer value.
 * PTR_TO_SECTOR() reverses the shift.
 * NOTE(review): assumes ULONG_TO_PTR(0) yields NULL -- confirm in util.h. */
68 #define SECTOR_TO_PTR(s) ULONG_TO_PTR((s)+1)
69 #define PTR_TO_SECTOR(p) (PTR_TO_ULONG(p)-1)
70
71 static int btrfs_defrag(int fd) {
72 struct btrfs_ioctl_vol_args data;
73
74 zero(data);
75 data.fd = fd;
76
77 return ioctl(fd, BTRFS_IOC_DEFRAG, &data);
78 }
79
80 static int pack_file(FILE *pack, const char *fn, bool on_btrfs) {
81 struct stat st;
82 void *start = MAP_FAILED;
83 uint8_t *vec;
84 uint32_t b, c;
85 uint64_t inode;
86 size_t l, pages;
87 bool mapped;
88 int r = 0, fd = -1, k;
89
90 assert(pack);
91 assert(fn);
92
93 fd = open(fn, O_RDONLY|O_CLOEXEC|O_NOATIME|O_NOCTTY|O_NOFOLLOW);
94 if (fd < 0) {
95
96 if (errno == ENOENT)
97 return 0;
98
99 if (errno == EPERM || errno == EACCES)
100 return 0;
101
102 log_warning("open(%s) failed: %m", fn);
103 r = -errno;
104 goto finish;
105 }
106
107 k = file_verify(fd, fn, arg_file_size_max, &st);
108 if (k <= 0) {
109 r = k;
110 goto finish;
111 }
112
113 if (on_btrfs)
114 btrfs_defrag(fd);
115
116 l = PAGE_ALIGN(st.st_size);
117 start = mmap(NULL, l, PROT_READ, MAP_SHARED, fd, 0);
118 if (start == MAP_FAILED) {
119 log_warning("mmap(%s) failed: %m", fn);
120 r = -errno;
121 goto finish;
122 }
123
124 pages = l / page_size();
125 vec = alloca(pages);
126 memset(vec, 0, pages);
127 if (mincore(start, l, vec) < 0) {
128 log_warning("mincore(%s) failed: %m", fn);
129 r = -errno;
130 goto finish;
131 }
132
133 fputs(fn, pack);
134 fputc('\n', pack);
135
136 /* Store the inode, so that we notice when the file is deleted */
137 inode = (uint64_t) st.st_ino;
138 fwrite(&inode, sizeof(inode), 1, pack);
139
140 mapped = false;
141 for (c = 0; c < pages; c++) {
142 bool new_mapped = !!(vec[c] & 1);
143
144 if (!mapped && new_mapped)
145 b = c;
146 else if (mapped && !new_mapped) {
147 fwrite(&b, sizeof(b), 1, pack);
148 fwrite(&c, sizeof(c), 1, pack);
149
150 log_debug("%s: page %u to %u", fn, b, c);
151 }
152
153 mapped = new_mapped;
154 }
155
156 /* We don't write any range data if we should read the entire file */
157 if (mapped && b > 0) {
158 fwrite(&b, sizeof(b), 1, pack);
159 fwrite(&c, sizeof(c), 1, pack);
160
161 log_debug("%s: page %u to %u", fn, b, c);
162 }
163
164 /* End marker */
165 b = 0;
166 fwrite(&b, sizeof(b), 1, pack);
167 fwrite(&b, sizeof(b), 1, pack);
168
169 finish:
170 if (start != MAP_FAILED)
171 munmap(start, l);
172
173 if (fd >= 0)
174 close_nointr_nofail(fd);
175
176 return r;
177 }
178
/* Looks up the physical location of the first extent of the file behind
 * fd using the FS_IOC_FIEMAP ioctl.
 *
 * Returns the fe_physical value of the first extent converted to
 * unsigned long, or 0 if the ioctl fails, no extent is mapped, or the
 * extent is flagged FIEMAP_EXTENT_UNKNOWN. */
static unsigned long fd_first_block(int fd) {
        /* Request header immediately followed by room for exactly one extent */
        struct {
                struct fiemap fiemap;
                struct fiemap_extent extent;
        } request;
        const struct fiemap_extent *first;

        zero(request);
        request.fiemap.fm_length = ~0ULL;
        request.fiemap.fm_extent_count = 1;

        if (ioctl(fd, FS_IOC_FIEMAP, &request) < 0 ||
            request.fiemap.fm_mapped_extents <= 0)
                return 0;

        first = &request.fiemap.fm_extents[0];
        if (first->fe_flags & FIEMAP_EXTENT_UNKNOWN)
                return 0;

        return (unsigned long) first->fe_physical;
}
200
/* One entry of the array that collect() sorts for rotating media. */
struct item {
        const char *path;       /* file name; borrowed from the hashmap key, not owned */
        unsigned long block;    /* first-block number decoded with PTR_TO_SECTOR() */
};
205
206 static int qsort_compare(const void *a, const void *b) {
207 const struct item *i, *j;
208
209 i = a;
210 j = b;
211
212 if (i->block < j->block)
213 return -1;
214 if (i->block > j->block)
215 return 1;
216
217 return strcmp(i->path, j->path);
218 }
219
220 static int collect(const char *root) {
221 enum {
222 FD_FANOTIFY, /* Get the actual fs events */
223 FD_SIGNAL,
224 FD_INOTIFY, /* We get notifications to quit early via this fd */
225 _FD_MAX
226 };
227 struct pollfd pollfd[_FD_MAX];
228 int fanotify_fd = -1, signal_fd = -1, inotify_fd = -1, r = 0;
229 pid_t my_pid;
230 Hashmap *files = NULL;
231 Iterator i;
232 char *p, *q;
233 sigset_t mask;
234 FILE *pack = NULL;
235 char *pack_fn_new = NULL, *pack_fn = NULL;
236 bool on_ssd, on_btrfs;
237 struct statfs sfs;
238 usec_t not_after;
239 uint64_t previous_block_readahead;
240 bool previous_block_readahead_set = false;
241
242 assert(root);
243
244 if (asprintf(&pack_fn, "%s/.readahead", root) < 0) {
245 r = log_oom();
246 goto finish;
247 }
248
249 /* If there's no pack file yet we lower the kernel readahead
250 * so that mincore() is accurate. If there is a pack file
251 * already we assume it is accurate enough so that kernel
252 * readahead is never triggered. */
253 previous_block_readahead_set =
254 access(pack_fn, F_OK) < 0 &&
255 block_get_readahead(root, &previous_block_readahead) >= 0 &&
256 block_set_readahead(root, 8*1024) >= 0;
257
258 if (ioprio_set(IOPRIO_WHO_PROCESS, getpid(), IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0)) < 0)
259 log_warning("Failed to set IDLE IO priority class: %m");
260
261 assert_se(sigemptyset(&mask) == 0);
262 sigset_add_many(&mask, SIGINT, SIGTERM, -1);
263 assert_se(sigprocmask(SIG_SETMASK, &mask, NULL) == 0);
264
265 if ((signal_fd = signalfd(-1, &mask, SFD_NONBLOCK|SFD_CLOEXEC)) < 0) {
266 log_error("signalfd(): %m");
267 r = -errno;
268 goto finish;
269 }
270
271 if (!(files = hashmap_new(string_hash_func, string_compare_func))) {
272 log_error("Failed to allocate set.");
273 r = -ENOMEM;
274 goto finish;
275 }
276
277 if ((fanotify_fd = fanotify_init(FAN_CLOEXEC|FAN_NONBLOCK, O_RDONLY|O_LARGEFILE|O_CLOEXEC|O_NOATIME)) < 0) {
278 log_error("Failed to create fanotify object: %m");
279 r = -errno;
280 goto finish;
281 }
282
283 if (fanotify_mark(fanotify_fd, FAN_MARK_ADD|FAN_MARK_MOUNT, FAN_OPEN, AT_FDCWD, root) < 0) {
284 log_error("Failed to mark %s: %m", root);
285 r = -errno;
286 goto finish;
287 }
288
289 if ((inotify_fd = open_inotify()) < 0) {
290 r = inotify_fd;
291 goto finish;
292 }
293
294 not_after = now(CLOCK_MONOTONIC) + arg_timeout;
295
296 my_pid = getpid();
297
298 zero(pollfd);
299 pollfd[FD_FANOTIFY].fd = fanotify_fd;
300 pollfd[FD_FANOTIFY].events = POLLIN;
301 pollfd[FD_SIGNAL].fd = signal_fd;
302 pollfd[FD_SIGNAL].events = POLLIN;
303 pollfd[FD_INOTIFY].fd = inotify_fd;
304 pollfd[FD_INOTIFY].events = POLLIN;
305
306 sd_notify(0,
307 "READY=1\n"
308 "STATUS=Collecting readahead data");
309
310 log_debug("Collecting...");
311
312 if (access("/run/systemd/readahead/cancel", F_OK) >= 0) {
313 log_debug("Collection canceled");
314 r = -ECANCELED;
315 goto finish;
316 }
317
318 if (access("/run/systemd/readahead/done", F_OK) >= 0) {
319 log_debug("Got termination request");
320 goto done;
321 }
322
323 for (;;) {
324 union {
325 struct fanotify_event_metadata metadata;
326 char buffer[4096];
327 } data;
328 ssize_t n;
329 struct fanotify_event_metadata *m;
330 usec_t t;
331 int h;
332
333 if (hashmap_size(files) > arg_files_max) {
334 log_debug("Reached maximum number of read ahead files, ending collection.");
335 break;
336 }
337
338 t = now(CLOCK_MONOTONIC);
339 if (t >= not_after) {
340 log_debug("Reached maximum collection time, ending collection.");
341 break;
342 }
343
344 if ((h = poll(pollfd, _FD_MAX, (int) ((not_after - t) / USEC_PER_MSEC))) < 0) {
345
346 if (errno == EINTR)
347 continue;
348
349 log_error("poll(): %m");
350 r = -errno;
351 goto finish;
352 }
353
354 if (h == 0) {
355 log_debug("Reached maximum collection time, ending collection.");
356 break;
357 }
358
359 if (pollfd[FD_SIGNAL].revents) {
360 log_debug("Got signal.");
361 break;
362 }
363
364 if (pollfd[FD_INOTIFY].revents) {
365 uint8_t inotify_buffer[sizeof(struct inotify_event) + FILENAME_MAX];
366 struct inotify_event *e;
367
368 if ((n = read(inotify_fd, &inotify_buffer, sizeof(inotify_buffer))) < 0) {
369 if (errno == EINTR || errno == EAGAIN)
370 continue;
371
372 log_error("Failed to read inotify event: %m");
373 r = -errno;
374 goto finish;
375 }
376
377 e = (struct inotify_event*) inotify_buffer;
378 while (n > 0) {
379 size_t step;
380
381 if ((e->mask & IN_CREATE) && streq(e->name, "cancel")) {
382 log_debug("Collection canceled");
383 r = -ECANCELED;
384 goto finish;
385 }
386
387 if ((e->mask & IN_CREATE) && streq(e->name, "done")) {
388 log_debug("Got termination request");
389 goto done;
390 }
391
392 step = sizeof(struct inotify_event) + e->len;
393 assert(step <= (size_t) n);
394
395 e = (struct inotify_event*) ((uint8_t*) e + step);
396 n -= step;
397 }
398 }
399
400 if ((n = read(fanotify_fd, &data, sizeof(data))) < 0) {
401
402 if (errno == EINTR || errno == EAGAIN)
403 continue;
404
405 /* fanotify sometimes returns EACCES on read()
406 * where it shouldn't. For now let's just
407 * ignore it here (which is safe), but
408 * eventually this should be
409 * dropped when the kernel is fixed.
410 *
411 * https://bugzilla.redhat.com/show_bug.cgi?id=707577 */
412 if (errno == EACCES)
413 continue;
414
415 log_error("Failed to read event: %m");
416 r = -errno;
417 goto finish;
418 }
419
420 for (m = &data.metadata; FAN_EVENT_OK(m, n); m = FAN_EVENT_NEXT(m, n)) {
421 char fn[PATH_MAX];
422 int k;
423
424 if (m->fd < 0)
425 goto next_iteration;
426
427 if (m->pid == my_pid)
428 goto next_iteration;
429
430 __sync_synchronize();
431 if (m->pid == shared->replay)
432 goto next_iteration;
433
434 snprintf(fn, sizeof(fn), "/proc/self/fd/%i", m->fd);
435 char_array_0(fn);
436
437 if ((k = readlink_malloc(fn, &p)) >= 0) {
438 if (startswith(p, "/tmp") ||
439 endswith(p, " (deleted)") ||
440 hashmap_get(files, p))
441 /* Not interesting, or
442 * already read */
443 free(p);
444 else {
445 unsigned long ul;
446
447 ul = fd_first_block(m->fd);
448
449 if ((k = hashmap_put(files, p, SECTOR_TO_PTR(ul))) < 0) {
450 log_warning("set_put() failed: %s", strerror(-k));
451 free(p);
452 }
453 }
454
455 } else
456 log_warning("readlink(%s) failed: %s", fn, strerror(-k));
457
458 next_iteration:
459 if (m->fd)
460 close_nointr_nofail(m->fd);
461 }
462 }
463
464 done:
465 if (fanotify_fd >= 0) {
466 close_nointr_nofail(fanotify_fd);
467 fanotify_fd = -1;
468 }
469
470 log_debug("Writing Pack File...");
471
472 on_ssd = fs_on_ssd(root) > 0;
473 log_debug("On SSD: %s", yes_no(on_ssd));
474
475 on_btrfs = statfs(root, &sfs) >= 0 && (long) sfs.f_type == (long) BTRFS_SUPER_MAGIC;
476 log_debug("On btrfs: %s", yes_no(on_btrfs));
477
478 if (asprintf(&pack_fn_new, "%s/.readahead.new", root) < 0) {
479 r = log_oom();
480 goto finish;
481 }
482
483 pack = fopen(pack_fn_new, "we");
484 if (!pack) {
485 log_error("Failed to open pack file: %m");
486 r = -errno;
487 goto finish;
488 }
489
490 fputs(CANONICAL_HOST READAHEAD_PACK_FILE_VERSION, pack);
491 putc(on_ssd ? 'S' : 'R', pack);
492
493 if (on_ssd || on_btrfs) {
494
495 /* On SSD or on btrfs, just write things out in the
496 * order the files were accessed. */
497
498 HASHMAP_FOREACH_KEY(q, p, files, i)
499 pack_file(pack, p, on_btrfs);
500 } else {
501 struct item *ordered, *j;
502 unsigned k, n;
503
504 /* On rotating media, order things by the block
505 * numbers */
506
507 log_debug("Ordering...");
508
509 n = hashmap_size(files);
510 if (!(ordered = new(struct item, n))) {
511 r = log_oom();
512 goto finish;
513 }
514
515 j = ordered;
516 HASHMAP_FOREACH_KEY(q, p, files, i) {
517 j->path = p;
518 j->block = PTR_TO_SECTOR(q);
519 j++;
520 }
521
522 assert(ordered + n == j);
523
524 qsort(ordered, n, sizeof(struct item), qsort_compare);
525
526 for (k = 0; k < n; k++)
527 pack_file(pack, ordered[k].path, on_btrfs);
528
529 free(ordered);
530 }
531
532 log_debug("Finalizing...");
533
534 fflush(pack);
535
536 if (ferror(pack)) {
537 log_error("Failed to write pack file.");
538 r = -EIO;
539 goto finish;
540 }
541
542 if (rename(pack_fn_new, pack_fn) < 0) {
543 log_error("Failed to rename readahead file: %m");
544 r = -errno;
545 goto finish;
546 }
547
548 fclose(pack);
549 pack = NULL;
550
551 log_debug("Done.");
552
553 finish:
554 if (fanotify_fd >= 0)
555 close_nointr_nofail(fanotify_fd);
556
557 if (signal_fd >= 0)
558 close_nointr_nofail(signal_fd);
559
560 if (inotify_fd >= 0)
561 close_nointr_nofail(inotify_fd);
562
563 if (pack) {
564 fclose(pack);
565 unlink(pack_fn_new);
566 }
567 free(pack_fn_new);
568 free(pack_fn);
569
570 while ((p = hashmap_steal_first_key(files)))
571 free(p);
572
573 hashmap_free(files);
574
575 if (previous_block_readahead_set) {
576 uint64_t bytes;
577
578 /* Restore the original kernel readahead setting if we
579 * changed it, and nobody has overwritten it since
580 * yet. */
581 if (block_get_readahead(root, &bytes) >= 0 && bytes == 8*1024)
582 block_set_readahead(root, previous_block_readahead);
583 }
584
585 return r;
586 }
587
588 int main_collect(const char *root) {
589
590 if (!root)
591 root = "/";
592
593 /* Skip this step on read-only media. Note that we check the
594 * underlying block device here, not he read-only flag of the
595 * file system on top, since that one is most likely mounted
596 * read-only anyway at boot, even if the underlying block
597 * device is theoretically writable. */
598 if (fs_on_read_only(root) > 0) {
599 log_info("Disabling readahead collector due to read-only media.");
600 return EXIT_SUCCESS;
601 }
602
603 if (!enough_ram()) {
604 log_info("Disabling readahead collector due to low memory.");
605 return EXIT_SUCCESS;
606 }
607
608 shared = shared_get();
609 if (!shared)
610 return EXIT_FAILURE;
611
612 shared->collect = getpid();
613 __sync_synchronize();
614
615 if (collect(root) < 0)
616 return EXIT_FAILURE;
617
618 return EXIT_SUCCESS;
619 }