]> git.ipfire.org Git - thirdparty/systemd.git/blame - src/readahead/readahead-collect.c
util: replace close_nointr_nofail() by a more useful safe_close()
[thirdparty/systemd.git] / src / readahead / readahead-collect.c
CommitLineData
22be093f
LP
1/*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3/***
4 This file is part of systemd.
5
6 Copyright 2010 Lennart Poettering
7
8 systemd is free software; you can redistribute it and/or modify it
5430f7f2
LP
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
22be093f
LP
11 (at your option) any later version.
12
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
5430f7f2 16 Lesser General Public License for more details.
22be093f 17
5430f7f2 18 You should have received a copy of the GNU Lesser General Public License
22be093f
LP
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
20***/
21
22#include <errno.h>
23#include <inttypes.h>
24#include <fcntl.h>
25#include <linux/limits.h>
26#include <stdbool.h>
27#include <stdio.h>
28#include <stdlib.h>
29#include <string.h>
30#include <sys/select.h>
31#include <sys/time.h>
32#include <sys/types.h>
33#include <sys/stat.h>
34#include <unistd.h>
35#include <linux/fanotify.h>
36#include <sys/signalfd.h>
37#include <sys/poll.h>
38#include <sys/mman.h>
39#include <linux/fs.h>
40#include <linux/fiemap.h>
41#include <sys/ioctl.h>
746f8906 42#include <sys/vfs.h>
8260358d 43#include <getopt.h>
6624768c 44#include <sys/inotify.h>
94243ef2 45#include <math.h>
22be093f 46
4b357e15
MM
47#ifdef HAVE_LINUX_BTRFS_H
48#include <linux/btrfs.h>
49#endif
50
a8348796
LP
51#ifdef HAVE_FANOTIFY_INIT
52#include <sys/fanotify.h>
53#endif
54
81527be1
LP
55#include <systemd/sd-daemon.h>
56
22be093f
LP
57#include "missing.h"
58#include "util.h"
59#include "set.h"
22be093f
LP
60#include "ioprio.h"
61#include "readahead-common.h"
b52aae1d 62#include "virt.h"
22be093f 63
41a598e2
LP
/* fixme:
 *
 * - detect ssd on btrfs/lvm...
 * - read ahead directories
 * - gzip?
 * - remount rw?
 * - handle files where nothing is in mincore
 * - does ioprio_set work with fadvise()?
 */

73
/* Shared readahead state; assigned from shared_get() in main_collect().
 * collect() reads its 'replay' PID to skip events caused by the replayer. */
static ReadaheadShared *shared = NULL;

/* CLOCK_MONOTONIC timestamp taken at the start of collect(); used as the
 * reference point when sorting accessed files into time bins. */
static usec_t starttime;

/* Avoid collisions with the NULL pointer */
#define SECTOR_TO_PTR(s) ULONG_TO_PTR((s)+1)
#define PTR_TO_SECTOR(p) (PTR_TO_ULONG(p)-1)
80
/* Issues the BTRFS_IOC_DEFRAG ioctl on the given file descriptor.
 *
 * Returns whatever ioctl() returns (negative on failure, errno set). */
static int btrfs_defrag(int fd) {
        struct btrfs_ioctl_vol_args args = { 0 };

        args.fd = fd;

        return ioctl(fd, BTRFS_IOC_DEFRAG, &args);
}
86
87static int pack_file(FILE *pack, const char *fn, bool on_btrfs) {
22be093f
LP
88 struct stat st;
89 void *start = MAP_FAILED;
8260358d 90 uint8_t *vec;
22be093f 91 uint32_t b, c;
189455ab 92 uint64_t inode;
22be093f
LP
93 size_t l, pages;
94 bool mapped;
95 int r = 0, fd = -1, k;
96
97 assert(pack);
98 assert(fn);
99
189455ab
LP
100 fd = open(fn, O_RDONLY|O_CLOEXEC|O_NOATIME|O_NOCTTY|O_NOFOLLOW);
101 if (fd < 0) {
a78899f5
LP
102
103 if (errno == ENOENT)
104 return 0;
105
a76fad09
LP
106 if (errno == EPERM || errno == EACCES)
107 return 0;
108
22be093f
LP
109 log_warning("open(%s) failed: %m", fn);
110 r = -errno;
111 goto finish;
112 }
113
189455ab
LP
114 k = file_verify(fd, fn, arg_file_size_max, &st);
115 if (k <= 0) {
22be093f
LP
116 r = k;
117 goto finish;
118 }
119
746f8906
LP
120 if (on_btrfs)
121 btrfs_defrag(fd);
122
22be093f 123 l = PAGE_ALIGN(st.st_size);
189455ab
LP
124 start = mmap(NULL, l, PROT_READ, MAP_SHARED, fd, 0);
125 if (start == MAP_FAILED) {
22be093f
LP
126 log_warning("mmap(%s) failed: %m", fn);
127 r = -errno;
128 goto finish;
129 }
130
37f85e66 131 pages = l / page_size();
fb818b2e 132 vec = alloca0(pages);
22be093f
LP
133 if (mincore(start, l, vec) < 0) {
134 log_warning("mincore(%s) failed: %m", fn);
135 r = -errno;
136 goto finish;
137 }
138
139 fputs(fn, pack);
140 fputc('\n', pack);
141
189455ab
LP
142 /* Store the inode, so that we notice when the file is deleted */
143 inode = (uint64_t) st.st_ino;
144 fwrite(&inode, sizeof(inode), 1, pack);
145
22be093f
LP
146 mapped = false;
147 for (c = 0; c < pages; c++) {
408b85df 148 bool new_mapped = !!(vec[c] & 1);
22be093f
LP
149
150 if (!mapped && new_mapped)
151 b = c;
152 else if (mapped && !new_mapped) {
153 fwrite(&b, sizeof(b), 1, pack);
154 fwrite(&c, sizeof(c), 1, pack);
155
156 log_debug("%s: page %u to %u", fn, b, c);
157 }
158
159 mapped = new_mapped;
160 }
161
162 /* We don't write any range data if we should read the entire file */
163 if (mapped && b > 0) {
164 fwrite(&b, sizeof(b), 1, pack);
165 fwrite(&c, sizeof(c), 1, pack);
166
167 log_debug("%s: page %u to %u", fn, b, c);
168 }
169
170 /* End marker */
171 b = 0;
172 fwrite(&b, sizeof(b), 1, pack);
173 fwrite(&b, sizeof(b), 1, pack);
174
175finish:
176 if (start != MAP_FAILED)
177 munmap(start, l);
178
03e334a1 179 safe_close(fd);
22be093f
LP
180
181 return r;
182}
183
/* Looks up the physical location of the first extent of an open file using
 * the FS_IOC_FIEMAP ioctl.
 *
 * Returns the fe_physical value of the first extent, or 0 if the ioctl
 * fails, no extent is mapped, or the extent's location is flagged as
 * unknown. */
static unsigned long fd_first_block(int fd) {
        struct {
                struct fiemap fiemap;
                struct fiemap_extent extent;
        } request = {
                .fiemap.fm_length = ~0ULL,
                .fiemap.fm_extent_count = 1,
        };
        const struct fiemap_extent *first = request.fiemap.fm_extents;

        if (ioctl(fd, FS_IOC_FIEMAP, &request) < 0 ||
            request.fiemap.fm_mapped_extents == 0 ||
            (first->fe_flags & FIEMAP_EXTENT_UNKNOWN))
                return 0;

        return (unsigned long) first->fe_physical;
}
204
/* One file observed during collection; stored as the value in the 'files'
 * hashmap of collect() and copied into an array for sorting. */
struct item {
        const char *path;    /* strdup()ed copy of the file's path */
        unsigned long block; /* result of fd_first_block() for the file, 0 if unknown */
        unsigned long bin;   /* (access time - starttime) / 2000000, i.e. a coarse time bucket */
};
210
211static int qsort_compare(const void *a, const void *b) {
212 const struct item *i, *j;
213
214 i = a;
215 j = b;
216
94243ef2
AK
217 /* sort by bin first */
218 if (i->bin < j->bin)
219 return -1;
220 if (i->bin > j->bin)
221 return 1;
222
223 /* then sort by sector */
22be093f
LP
224 if (i->block < j->block)
225 return -1;
226 if (i->block > j->block)
227 return 1;
228
229 return strcmp(i->path, j->path);
230}
231
232static int collect(const char *root) {
233 enum {
858209c5 234 FD_FANOTIFY, /* Get the actual fs events */
22be093f 235 FD_SIGNAL,
6624768c 236 FD_INOTIFY, /* We get notifications to quit early via this fd */
22be093f
LP
237 _FD_MAX
238 };
b92bea5d 239 struct pollfd pollfd[_FD_MAX] = {};
6624768c 240 int fanotify_fd = -1, signal_fd = -1, inotify_fd = -1, r = 0;
22be093f
LP
241 pid_t my_pid;
242 Hashmap *files = NULL;
243 Iterator i;
244 char *p, *q;
245 sigset_t mask;
246 FILE *pack = NULL;
247 char *pack_fn_new = NULL, *pack_fn = NULL;
746f8906
LP
248 bool on_ssd, on_btrfs;
249 struct statfs sfs;
408b85df 250 usec_t not_after;
6de338a2
LP
251 uint64_t previous_block_readahead;
252 bool previous_block_readahead_set = false;
22be093f
LP
253
254 assert(root);
255
6de338a2 256 if (asprintf(&pack_fn, "%s/.readahead", root) < 0) {
0d0f0c50 257 r = log_oom();
6de338a2
LP
258 goto finish;
259 }
260
b0640287 261 starttime = now(CLOCK_MONOTONIC);
94243ef2 262
6de338a2
LP
263 /* If there's no pack file yet we lower the kernel readahead
264 * so that mincore() is accurate. If there is a pack file
265 * already we assume it is accurate enough so that kernel
266 * readahead is never triggered. */
267 previous_block_readahead_set =
268 access(pack_fn, F_OK) < 0 &&
269 block_get_readahead(root, &previous_block_readahead) >= 0 &&
270 block_set_readahead(root, 8*1024) >= 0;
271
22be093f
LP
272 if (ioprio_set(IOPRIO_WHO_PROCESS, getpid(), IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0)) < 0)
273 log_warning("Failed to set IDLE IO priority class: %m");
274
275 assert_se(sigemptyset(&mask) == 0);
276 sigset_add_many(&mask, SIGINT, SIGTERM, -1);
277 assert_se(sigprocmask(SIG_SETMASK, &mask, NULL) == 0);
278
279 if ((signal_fd = signalfd(-1, &mask, SFD_NONBLOCK|SFD_CLOEXEC)) < 0) {
280 log_error("signalfd(): %m");
281 r = -errno;
282 goto finish;
283 }
284
e62d8c39
ZJS
285 files = hashmap_new(string_hash_func, string_compare_func);
286 if (!files) {
22be093f
LP
287 log_error("Failed to allocate set.");
288 r = -ENOMEM;
289 goto finish;
290 }
291
7989e1f2 292 fanotify_fd = fanotify_init(FAN_CLOEXEC|FAN_NONBLOCK, O_RDONLY|O_LARGEFILE|O_CLOEXEC|O_NOATIME);
e62d8c39 293 if (fanotify_fd < 0) {
22be093f
LP
294 log_error("Failed to create fanotify object: %m");
295 r = -errno;
296 goto finish;
297 }
298
299 if (fanotify_mark(fanotify_fd, FAN_MARK_ADD|FAN_MARK_MOUNT, FAN_OPEN, AT_FDCWD, root) < 0) {
300 log_error("Failed to mark %s: %m", root);
301 r = -errno;
302 goto finish;
303 }
304
e62d8c39
ZJS
305 inotify_fd = open_inotify();
306 if (inotify_fd < 0) {
6624768c
LP
307 r = inotify_fd;
308 goto finish;
309 }
310
8260358d 311 not_after = now(CLOCK_MONOTONIC) + arg_timeout;
408b85df 312
22be093f
LP
313 my_pid = getpid();
314
22be093f
LP
315 pollfd[FD_FANOTIFY].fd = fanotify_fd;
316 pollfd[FD_FANOTIFY].events = POLLIN;
317 pollfd[FD_SIGNAL].fd = signal_fd;
318 pollfd[FD_SIGNAL].events = POLLIN;
6624768c
LP
319 pollfd[FD_INOTIFY].fd = inotify_fd;
320 pollfd[FD_INOTIFY].events = POLLIN;
22be093f
LP
321
322 sd_notify(0,
323 "READY=1\n"
324 "STATUS=Collecting readahead data");
325
326 log_debug("Collecting...");
327
2b583ce6 328 if (access("/run/systemd/readahead/cancel", F_OK) >= 0) {
6624768c
LP
329 log_debug("Collection canceled");
330 r = -ECANCELED;
331 goto finish;
332 }
333
2b583ce6 334 if (access("/run/systemd/readahead/done", F_OK) >= 0) {
6624768c
LP
335 log_debug("Got termination request");
336 goto done;
337 }
338
22be093f
LP
339 for (;;) {
340 union {
341 struct fanotify_event_metadata metadata;
342 char buffer[4096];
343 } data;
344 ssize_t n;
345 struct fanotify_event_metadata *m;
408b85df
LP
346 usec_t t;
347 int h;
22be093f 348
8260358d 349 if (hashmap_size(files) > arg_files_max) {
408b85df 350 log_debug("Reached maximum number of read ahead files, ending collection.");
6e3eb5ba 351 break;
408b85df
LP
352 }
353
354 t = now(CLOCK_MONOTONIC);
355 if (t >= not_after) {
356 log_debug("Reached maximum collection time, ending collection.");
357 break;
358 }
6e3eb5ba 359
408b85df 360 if ((h = poll(pollfd, _FD_MAX, (int) ((not_after - t) / USEC_PER_MSEC))) < 0) {
22be093f
LP
361
362 if (errno == EINTR)
363 continue;
364
365 log_error("poll(): %m");
366 r = -errno;
367 goto finish;
368 }
369
408b85df
LP
370 if (h == 0) {
371 log_debug("Reached maximum collection time, ending collection.");
372 break;
373 }
374
6624768c
LP
375 if (pollfd[FD_SIGNAL].revents) {
376 log_debug("Got signal.");
377 break;
378 }
379
380 if (pollfd[FD_INOTIFY].revents) {
381 uint8_t inotify_buffer[sizeof(struct inotify_event) + FILENAME_MAX];
382 struct inotify_event *e;
383
384 if ((n = read(inotify_fd, &inotify_buffer, sizeof(inotify_buffer))) < 0) {
385 if (errno == EINTR || errno == EAGAIN)
386 continue;
387
388 log_error("Failed to read inotify event: %m");
389 r = -errno;
390 goto finish;
391 }
392
393 e = (struct inotify_event*) inotify_buffer;
394 while (n > 0) {
395 size_t step;
396
397 if ((e->mask & IN_CREATE) && streq(e->name, "cancel")) {
398 log_debug("Collection canceled");
399 r = -ECANCELED;
400 goto finish;
401 }
402
403 if ((e->mask & IN_CREATE) && streq(e->name, "done")) {
404 log_debug("Got termination request");
405 goto done;
406 }
407
408 step = sizeof(struct inotify_event) + e->len;
409 assert(step <= (size_t) n);
410
411 e = (struct inotify_event*) ((uint8_t*) e + step);
412 n -= step;
413 }
414 }
415
b47d419c
ZJS
416 n = read(fanotify_fd, &data, sizeof(data));
417 if (n < 0) {
22be093f 418
cf37e246
LP
419 if (errno == EINTR || errno == EAGAIN)
420 continue;
421
422 /* fanotify sometimes returns EACCES on read()
423 * where it shouldn't. For now let's just
424 * ignore it here (which is safe), but
425 * eventually this should be
426 * dropped when the kernel is fixed.
427 *
428 * https://bugzilla.redhat.com/show_bug.cgi?id=707577 */
429 if (errno == EACCES)
22be093f
LP
430 continue;
431
432 log_error("Failed to read event: %m");
433 r = -errno;
434 goto finish;
435 }
436
408b85df 437 for (m = &data.metadata; FAN_EVENT_OK(m, n); m = FAN_EVENT_NEXT(m, n)) {
b47d419c 438 char fn[sizeof("/proc/self/fd/") + DECIMAL_STR_MAX(int)];
d9c7a87b 439 int k;
22be093f 440
d9c7a87b
LP
441 if (m->fd < 0)
442 goto next_iteration;
22be093f 443
d9c7a87b
LP
444 if (m->pid == my_pid)
445 goto next_iteration;
22be093f 446
d9c7a87b
LP
447 __sync_synchronize();
448 if (m->pid == shared->replay)
449 goto next_iteration;
22be093f 450
d9c7a87b 451 snprintf(fn, sizeof(fn), "/proc/self/fd/%i", m->fd);
b47d419c
ZJS
452 k = readlink_malloc(fn, &p);
453 if (k >= 0) {
d9c7a87b 454 if (startswith(p, "/tmp") ||
0840ce2d 455 endswith(p, " (deleted)") ||
d9c7a87b
LP
456 hashmap_get(files, p))
457 /* Not interesting, or
458 * already read */
459 free(p);
460 else {
461 unsigned long ul;
b0640287 462 usec_t entrytime;
94243ef2
AK
463 struct item *entry;
464
465 entry = new0(struct item, 1);
b0640287
AK
466 if (!entry) {
467 r = log_oom();
468 goto finish;
469 }
22be093f 470
d9c7a87b
LP
471 ul = fd_first_block(m->fd);
472
b0640287 473 entrytime = now(CLOCK_MONOTONIC);
94243ef2
AK
474
475 entry->block = ul;
476 entry->path = strdup(p);
b0640287
AK
477 if (!entry->path) {
478 free(entry);
479 r = log_oom();
480 goto finish;
481 }
482 entry->bin = (entrytime - starttime) / 2000000;
94243ef2 483
ef42202a
ZJS
484 k = hashmap_put(files, p, entry);
485 if (k < 0) {
486 log_warning("hashmap_put() failed: %s", strerror(-k));
d9c7a87b 487 free(p);
22be093f 488 }
d9c7a87b 489 }
22be093f 490
d9c7a87b
LP
491 } else
492 log_warning("readlink(%s) failed: %s", fn, strerror(-k));
22be093f 493
d9c7a87b 494 next_iteration:
03e334a1 495 safe_close(m->fd);
22be093f 496 }
22be093f
LP
497 }
498
6624768c 499done:
03e334a1 500 fanotify_fd = safe_close(fanotify_fd);
22be093f
LP
501
502 log_debug("Writing Pack File...");
503
55888fa4 504 on_ssd = fs_on_ssd(root) > 0;
22be093f
LP
505 log_debug("On SSD: %s", yes_no(on_ssd));
506
c51cf056 507 on_btrfs = statfs(root, &sfs) >= 0 && F_TYPE_EQUAL(sfs.f_type, BTRFS_SUPER_MAGIC);
746f8906
LP
508 log_debug("On btrfs: %s", yes_no(on_btrfs));
509
6de338a2 510 if (asprintf(&pack_fn_new, "%s/.readahead.new", root) < 0) {
0d0f0c50 511 r = log_oom();
22be093f
LP
512 goto finish;
513 }
514
189455ab
LP
515 pack = fopen(pack_fn_new, "we");
516 if (!pack) {
22be093f
LP
517 log_error("Failed to open pack file: %m");
518 r = -errno;
519 goto finish;
520 }
521
cae544bc 522 fputs(CANONICAL_HOST READAHEAD_PACK_FILE_VERSION, pack);
22be093f
LP
523 putc(on_ssd ? 'S' : 'R', pack);
524
746f8906 525 if (on_ssd || on_btrfs) {
22be093f 526
746f8906 527 /* On SSD or on btrfs, just write things out in the
41a598e2 528 * order the files were accessed. */
22be093f
LP
529
530 HASHMAP_FOREACH_KEY(q, p, files, i)
746f8906 531 pack_file(pack, p, on_btrfs);
22be093f 532 } else {
7ff7394d 533 unsigned n;
22be093f
LP
534
535 /* On rotating media, order things by the block
536 * numbers */
537
538 log_debug("Ordering...");
539
540 n = hashmap_size(files);
7ff7394d
ZJS
541 if (n) {
542 _cleanup_free_ struct item *ordered;
543 struct item *j;
544 unsigned k;
545
546 ordered = new(struct item, n);
547 if (!ordered) {
548 r = log_oom();
549 goto finish;
550 }
22be093f 551
7ff7394d
ZJS
552 j = ordered;
553 HASHMAP_FOREACH_KEY(q, p, files, i) {
554 memcpy(j, q, sizeof(struct item));
555 j++;
556 }
22be093f 557
7ff7394d 558 assert(ordered + n == j);
22be093f 559
7ff7394d 560 qsort(ordered, n, sizeof(struct item), qsort_compare);
22be093f 561
7ff7394d
ZJS
562 for (k = 0; k < n; k++)
563 pack_file(pack, ordered[k].path, on_btrfs);
564 } else
565 log_warning("No pack files");
22be093f
LP
566 }
567
568 log_debug("Finalizing...");
569
570 fflush(pack);
571
572 if (ferror(pack)) {
573 log_error("Failed to write pack file.");
574 r = -EIO;
575 goto finish;
576 }
577
578 if (rename(pack_fn_new, pack_fn) < 0) {
579 log_error("Failed to rename readahead file: %m");
580 r = -errno;
581 goto finish;
582 }
583
584 fclose(pack);
585 pack = NULL;
586
587 log_debug("Done.");
588
589finish:
03e334a1
LP
590 safe_close(fanotify_fd);
591 safe_close(signal_fd);
592 safe_close(inotify_fd);
6624768c 593
22be093f
LP
594 if (pack) {
595 fclose(pack);
596 unlink(pack_fn_new);
597 }
22be093f
LP
598 free(pack_fn_new);
599 free(pack_fn);
600
601 while ((p = hashmap_steal_first_key(files)))
f0cf061e 602 free(p);
22be093f
LP
603
604 hashmap_free(files);
605
6de338a2
LP
606 if (previous_block_readahead_set) {
607 uint64_t bytes;
608
609 /* Restore the original kernel readahead setting if we
610 * changed it, and nobody has overwritten it since
611 * yet. */
612 if (block_get_readahead(root, &bytes) >= 0 && bytes == 8*1024)
613 block_set_readahead(root, previous_block_readahead);
614 }
615
22be093f
LP
616 return r;
617}
618
87ce22cc 619int main_collect(const char *root) {
8260358d 620
87ce22cc
LP
621 if (!root)
622 root = "/";
2b590e13 623
3b2d5b02
LP
624 /* Skip this step on read-only media. Note that we check the
625 * underlying block device here, not he read-only flag of the
626 * file system on top, since that one is most likely mounted
627 * read-only anyway at boot, even if the underlying block
628 * device is theoretically writable. */
2b590e13
LP
629 if (fs_on_read_only(root) > 0) {
630 log_info("Disabling readahead collector due to read-only media.");
87ce22cc 631 return EXIT_SUCCESS;
2b590e13
LP
632 }
633
41a598e2
LP
634 if (!enough_ram()) {
635 log_info("Disabling readahead collector due to low memory.");
87ce22cc 636 return EXIT_SUCCESS;
41a598e2
LP
637 }
638
3b2d5b02
LP
639 shared = shared_get();
640 if (!shared)
87ce22cc 641 return EXIT_FAILURE;
d9c7a87b
LP
642
643 shared->collect = getpid();
644 __sync_synchronize();
645
2b590e13 646 if (collect(root) < 0)
87ce22cc 647 return EXIT_FAILURE;
22be093f 648
87ce22cc 649 return EXIT_SUCCESS;
22be093f 650}