]> git.ipfire.org Git - thirdparty/systemd.git/blame - src/readahead/readahead-collect.c
use "Out of memory." consistantly (or with "\n")
[thirdparty/systemd.git] / src / readahead / readahead-collect.c
CommitLineData
22be093f
LP
1/*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3/***
4 This file is part of systemd.
5
6 Copyright 2010 Lennart Poettering
7
8 systemd is free software; you can redistribute it and/or modify it
5430f7f2
LP
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
22be093f
LP
11 (at your option) any later version.
12
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
5430f7f2 16 Lesser General Public License for more details.
22be093f 17
5430f7f2 18 You should have received a copy of the GNU Lesser General Public License
22be093f
LP
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
20***/
21
22#include <errno.h>
23#include <inttypes.h>
24#include <fcntl.h>
25#include <linux/limits.h>
26#include <stdbool.h>
27#include <stdio.h>
28#include <stdlib.h>
29#include <string.h>
30#include <sys/select.h>
31#include <sys/time.h>
32#include <sys/types.h>
33#include <sys/stat.h>
34#include <unistd.h>
35#include <linux/fanotify.h>
36#include <sys/signalfd.h>
37#include <sys/poll.h>
38#include <sys/mman.h>
39#include <linux/fs.h>
40#include <linux/fiemap.h>
41#include <sys/ioctl.h>
746f8906 42#include <sys/vfs.h>
8260358d 43#include <getopt.h>
6624768c 44#include <sys/inotify.h>
22be093f 45
81527be1
LP
46#include <systemd/sd-daemon.h>
47
22be093f
LP
48#include "missing.h"
49#include "util.h"
50#include "set.h"
22be093f
LP
51#include "ioprio.h"
52#include "readahead-common.h"
b52aae1d 53#include "virt.h"
22be093f 54
41a598e2
LP
55/* fixme:
56 *
408b85df 57 * - detect ssd on btrfs/lvm...
41a598e2 58 * - read ahead directories
408b85df 59 * - gzip?
8260358d 60 * - remount rw?
6624768c 61 * - handle files where nothing is in mincore
408b85df 62 * - does ioprio_set work with fadvise()?
41a598e2
LP
63 */
64
d9c7a87b
LP
65static ReadaheadShared *shared = NULL;
66
2e7485f0
LP
/* Sector numbers are stored as pointer-sized hashmap values. Shift them by
 * one on the way in (and back on the way out) so that sector 0 never ends up
 * encoded as a NULL pointer. */
#define SECTOR_TO_PTR(s) ULONG_TO_PTR((s) + 1)
#define PTR_TO_SECTOR(p) (PTR_TO_ULONG(p) - 1)
70
746f8906
LP
/* Issue BTRFS_IOC_DEFRAG on the file behind fd.
 *
 * Returns whatever ioctl() returns, i.e. a negative value (with errno set)
 * on failure. */
static int btrfs_defrag(int fd) {
        struct btrfs_ioctl_vol_args args;

        /* All fields other than the fd are passed zeroed */
        memset(&args, 0, sizeof(args));
        args.fd = fd;

        return ioctl(fd, BTRFS_IOC_DEFRAG, &args);
}
79
80static int pack_file(FILE *pack, const char *fn, bool on_btrfs) {
22be093f
LP
81 struct stat st;
82 void *start = MAP_FAILED;
8260358d 83 uint8_t *vec;
22be093f 84 uint32_t b, c;
189455ab 85 uint64_t inode;
22be093f
LP
86 size_t l, pages;
87 bool mapped;
88 int r = 0, fd = -1, k;
89
90 assert(pack);
91 assert(fn);
92
189455ab
LP
93 fd = open(fn, O_RDONLY|O_CLOEXEC|O_NOATIME|O_NOCTTY|O_NOFOLLOW);
94 if (fd < 0) {
a78899f5
LP
95
96 if (errno == ENOENT)
97 return 0;
98
a76fad09
LP
99 if (errno == EPERM || errno == EACCES)
100 return 0;
101
22be093f
LP
102 log_warning("open(%s) failed: %m", fn);
103 r = -errno;
104 goto finish;
105 }
106
189455ab
LP
107 k = file_verify(fd, fn, arg_file_size_max, &st);
108 if (k <= 0) {
22be093f
LP
109 r = k;
110 goto finish;
111 }
112
746f8906
LP
113 if (on_btrfs)
114 btrfs_defrag(fd);
115
22be093f 116 l = PAGE_ALIGN(st.st_size);
189455ab
LP
117 start = mmap(NULL, l, PROT_READ, MAP_SHARED, fd, 0);
118 if (start == MAP_FAILED) {
22be093f
LP
119 log_warning("mmap(%s) failed: %m", fn);
120 r = -errno;
121 goto finish;
122 }
123
37f85e66 124 pages = l / page_size();
8260358d 125 vec = alloca(pages);
37f85e66 126 memset(vec, 0, pages);
22be093f
LP
127 if (mincore(start, l, vec) < 0) {
128 log_warning("mincore(%s) failed: %m", fn);
129 r = -errno;
130 goto finish;
131 }
132
133 fputs(fn, pack);
134 fputc('\n', pack);
135
189455ab
LP
136 /* Store the inode, so that we notice when the file is deleted */
137 inode = (uint64_t) st.st_ino;
138 fwrite(&inode, sizeof(inode), 1, pack);
139
22be093f
LP
140 mapped = false;
141 for (c = 0; c < pages; c++) {
408b85df 142 bool new_mapped = !!(vec[c] & 1);
22be093f
LP
143
144 if (!mapped && new_mapped)
145 b = c;
146 else if (mapped && !new_mapped) {
147 fwrite(&b, sizeof(b), 1, pack);
148 fwrite(&c, sizeof(c), 1, pack);
149
150 log_debug("%s: page %u to %u", fn, b, c);
151 }
152
153 mapped = new_mapped;
154 }
155
156 /* We don't write any range data if we should read the entire file */
157 if (mapped && b > 0) {
158 fwrite(&b, sizeof(b), 1, pack);
159 fwrite(&c, sizeof(c), 1, pack);
160
161 log_debug("%s: page %u to %u", fn, b, c);
162 }
163
164 /* End marker */
165 b = 0;
166 fwrite(&b, sizeof(b), 1, pack);
167 fwrite(&b, sizeof(b), 1, pack);
168
169finish:
170 if (start != MAP_FAILED)
171 munmap(start, l);
172
173 if (fd >= 0)
174 close_nointr_nofail(fd);
175
176 return r;
177}
178
/* Query the first extent of the file behind fd via the FIEMAP ioctl.
 *
 * Returns the fe_physical value of that extent, truncated to unsigned long,
 * or 0 if the ioctl fails, no extent is mapped, or the extent is flagged
 * FIEMAP_EXTENT_UNKNOWN. */
static unsigned long fd_first_block(int fd) {
        /* Request header directly followed by room for one extent record */
        struct {
                struct fiemap fiemap;
                struct fiemap_extent extent;
        } req;

        memset(&req, 0, sizeof(req));
        req.fiemap.fm_length = ~0ULL;       /* cover the whole file */
        req.fiemap.fm_extent_count = 1;     /* only the first extent is wanted */

        if (ioctl(fd, FS_IOC_FIEMAP, &req) < 0 ||
            req.fiemap.fm_mapped_extents <= 0 ||
            (req.fiemap.fm_extents[0].fe_flags & FIEMAP_EXTENT_UNKNOWN))
                return 0;

        return (unsigned long) req.fiemap.fm_extents[0].fe_physical;
}
200
/* A collected file path together with the block value used to order it */
struct item {
        const char *path;
        unsigned long block;
};

/* qsort() comparator for struct item: ascending by block; entries with equal
 * block values are ordered by strcmp() on their paths. */
static int qsort_compare(const void *a, const void *b) {
        const struct item *lhs = a, *rhs = b;

        if (lhs->block != rhs->block)
                return lhs->block < rhs->block ? -1 : 1;

        return strcmp(lhs->path, rhs->path);
}
219
220static int collect(const char *root) {
221 enum {
858209c5 222 FD_FANOTIFY, /* Get the actual fs events */
22be093f 223 FD_SIGNAL,
6624768c 224 FD_INOTIFY, /* We get notifications to quit early via this fd */
22be093f
LP
225 _FD_MAX
226 };
227 struct pollfd pollfd[_FD_MAX];
6624768c 228 int fanotify_fd = -1, signal_fd = -1, inotify_fd = -1, r = 0;
22be093f
LP
229 pid_t my_pid;
230 Hashmap *files = NULL;
231 Iterator i;
232 char *p, *q;
233 sigset_t mask;
234 FILE *pack = NULL;
235 char *pack_fn_new = NULL, *pack_fn = NULL;
746f8906
LP
236 bool on_ssd, on_btrfs;
237 struct statfs sfs;
408b85df 238 usec_t not_after;
6de338a2
LP
239 uint64_t previous_block_readahead;
240 bool previous_block_readahead_set = false;
22be093f
LP
241
242 assert(root);
243
6de338a2 244 if (asprintf(&pack_fn, "%s/.readahead", root) < 0) {
669241a0 245 log_error("Out of memory.");
6de338a2
LP
246 r = -ENOMEM;
247 goto finish;
248 }
249
250 /* If there's no pack file yet we lower the kernel readahead
251 * so that mincore() is accurate. If there is a pack file
252 * already we assume it is accurate enough so that kernel
253 * readahead is never triggered. */
254 previous_block_readahead_set =
255 access(pack_fn, F_OK) < 0 &&
256 block_get_readahead(root, &previous_block_readahead) >= 0 &&
257 block_set_readahead(root, 8*1024) >= 0;
258
22be093f
LP
259 if (ioprio_set(IOPRIO_WHO_PROCESS, getpid(), IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0)) < 0)
260 log_warning("Failed to set IDLE IO priority class: %m");
261
262 assert_se(sigemptyset(&mask) == 0);
263 sigset_add_many(&mask, SIGINT, SIGTERM, -1);
264 assert_se(sigprocmask(SIG_SETMASK, &mask, NULL) == 0);
265
266 if ((signal_fd = signalfd(-1, &mask, SFD_NONBLOCK|SFD_CLOEXEC)) < 0) {
267 log_error("signalfd(): %m");
268 r = -errno;
269 goto finish;
270 }
271
272 if (!(files = hashmap_new(string_hash_func, string_compare_func))) {
273 log_error("Failed to allocate set.");
274 r = -ENOMEM;
275 goto finish;
276 }
277
408b85df 278 if ((fanotify_fd = fanotify_init(FAN_CLOEXEC|FAN_NONBLOCK, O_RDONLY|O_LARGEFILE|O_CLOEXEC|O_NOATIME)) < 0) {
22be093f
LP
279 log_error("Failed to create fanotify object: %m");
280 r = -errno;
281 goto finish;
282 }
283
284 if (fanotify_mark(fanotify_fd, FAN_MARK_ADD|FAN_MARK_MOUNT, FAN_OPEN, AT_FDCWD, root) < 0) {
285 log_error("Failed to mark %s: %m", root);
286 r = -errno;
287 goto finish;
288 }
289
6624768c
LP
290 if ((inotify_fd = open_inotify()) < 0) {
291 r = inotify_fd;
292 goto finish;
293 }
294
8260358d 295 not_after = now(CLOCK_MONOTONIC) + arg_timeout;
408b85df 296
22be093f
LP
297 my_pid = getpid();
298
299 zero(pollfd);
300 pollfd[FD_FANOTIFY].fd = fanotify_fd;
301 pollfd[FD_FANOTIFY].events = POLLIN;
302 pollfd[FD_SIGNAL].fd = signal_fd;
303 pollfd[FD_SIGNAL].events = POLLIN;
6624768c
LP
304 pollfd[FD_INOTIFY].fd = inotify_fd;
305 pollfd[FD_INOTIFY].events = POLLIN;
22be093f
LP
306
307 sd_notify(0,
308 "READY=1\n"
309 "STATUS=Collecting readahead data");
310
311 log_debug("Collecting...");
312
2b583ce6 313 if (access("/run/systemd/readahead/cancel", F_OK) >= 0) {
6624768c
LP
314 log_debug("Collection canceled");
315 r = -ECANCELED;
316 goto finish;
317 }
318
2b583ce6 319 if (access("/run/systemd/readahead/done", F_OK) >= 0) {
6624768c
LP
320 log_debug("Got termination request");
321 goto done;
322 }
323
22be093f
LP
324 for (;;) {
325 union {
326 struct fanotify_event_metadata metadata;
327 char buffer[4096];
328 } data;
329 ssize_t n;
330 struct fanotify_event_metadata *m;
408b85df
LP
331 usec_t t;
332 int h;
22be093f 333
8260358d 334 if (hashmap_size(files) > arg_files_max) {
408b85df 335 log_debug("Reached maximum number of read ahead files, ending collection.");
6e3eb5ba 336 break;
408b85df
LP
337 }
338
339 t = now(CLOCK_MONOTONIC);
340 if (t >= not_after) {
341 log_debug("Reached maximum collection time, ending collection.");
342 break;
343 }
6e3eb5ba 344
408b85df 345 if ((h = poll(pollfd, _FD_MAX, (int) ((not_after - t) / USEC_PER_MSEC))) < 0) {
22be093f
LP
346
347 if (errno == EINTR)
348 continue;
349
350 log_error("poll(): %m");
351 r = -errno;
352 goto finish;
353 }
354
408b85df
LP
355 if (h == 0) {
356 log_debug("Reached maximum collection time, ending collection.");
357 break;
358 }
359
6624768c
LP
360 if (pollfd[FD_SIGNAL].revents) {
361 log_debug("Got signal.");
362 break;
363 }
364
365 if (pollfd[FD_INOTIFY].revents) {
366 uint8_t inotify_buffer[sizeof(struct inotify_event) + FILENAME_MAX];
367 struct inotify_event *e;
368
369 if ((n = read(inotify_fd, &inotify_buffer, sizeof(inotify_buffer))) < 0) {
370 if (errno == EINTR || errno == EAGAIN)
371 continue;
372
373 log_error("Failed to read inotify event: %m");
374 r = -errno;
375 goto finish;
376 }
377
378 e = (struct inotify_event*) inotify_buffer;
379 while (n > 0) {
380 size_t step;
381
382 if ((e->mask & IN_CREATE) && streq(e->name, "cancel")) {
383 log_debug("Collection canceled");
384 r = -ECANCELED;
385 goto finish;
386 }
387
388 if ((e->mask & IN_CREATE) && streq(e->name, "done")) {
389 log_debug("Got termination request");
390 goto done;
391 }
392
393 step = sizeof(struct inotify_event) + e->len;
394 assert(step <= (size_t) n);
395
396 e = (struct inotify_event*) ((uint8_t*) e + step);
397 n -= step;
398 }
399 }
400
22be093f
LP
401 if ((n = read(fanotify_fd, &data, sizeof(data))) < 0) {
402
cf37e246
LP
403 if (errno == EINTR || errno == EAGAIN)
404 continue;
405
406 /* fanotify sometimes returns EACCES on read()
407 * where it shouldn't. For now let's just
408 * ignore it here (which is safe), but
409 * eventually this should be
410 * dropped when the kernel is fixed.
411 *
412 * https://bugzilla.redhat.com/show_bug.cgi?id=707577 */
413 if (errno == EACCES)
22be093f
LP
414 continue;
415
416 log_error("Failed to read event: %m");
417 r = -errno;
418 goto finish;
419 }
420
408b85df 421 for (m = &data.metadata; FAN_EVENT_OK(m, n); m = FAN_EVENT_NEXT(m, n)) {
d9c7a87b
LP
422 char fn[PATH_MAX];
423 int k;
22be093f 424
d9c7a87b
LP
425 if (m->fd < 0)
426 goto next_iteration;
22be093f 427
d9c7a87b
LP
428 if (m->pid == my_pid)
429 goto next_iteration;
22be093f 430
d9c7a87b
LP
431 __sync_synchronize();
432 if (m->pid == shared->replay)
433 goto next_iteration;
22be093f 434
d9c7a87b
LP
435 snprintf(fn, sizeof(fn), "/proc/self/fd/%i", m->fd);
436 char_array_0(fn);
437
438 if ((k = readlink_malloc(fn, &p)) >= 0) {
d9c7a87b 439 if (startswith(p, "/tmp") ||
0840ce2d 440 endswith(p, " (deleted)") ||
d9c7a87b
LP
441 hashmap_get(files, p))
442 /* Not interesting, or
443 * already read */
444 free(p);
445 else {
446 unsigned long ul;
22be093f 447
d9c7a87b
LP
448 ul = fd_first_block(m->fd);
449
450 if ((k = hashmap_put(files, p, SECTOR_TO_PTR(ul))) < 0) {
451 log_warning("set_put() failed: %s", strerror(-k));
452 free(p);
22be093f 453 }
d9c7a87b 454 }
22be093f 455
d9c7a87b
LP
456 } else
457 log_warning("readlink(%s) failed: %s", fn, strerror(-k));
22be093f 458
d9c7a87b 459 next_iteration:
22be093f
LP
460 if (m->fd)
461 close_nointr_nofail(m->fd);
22be093f 462 }
22be093f
LP
463 }
464
6624768c 465done:
22be093f
LP
466 if (fanotify_fd >= 0) {
467 close_nointr_nofail(fanotify_fd);
468 fanotify_fd = -1;
469 }
470
471 log_debug("Writing Pack File...");
472
55888fa4 473 on_ssd = fs_on_ssd(root) > 0;
22be093f
LP
474 log_debug("On SSD: %s", yes_no(on_ssd));
475
5b61848d 476 on_btrfs = statfs(root, &sfs) >= 0 && (long) sfs.f_type == (long) BTRFS_SUPER_MAGIC;
746f8906
LP
477 log_debug("On btrfs: %s", yes_no(on_btrfs));
478
6de338a2 479 if (asprintf(&pack_fn_new, "%s/.readahead.new", root) < 0) {
669241a0 480 log_error("Out of memory.");
22be093f
LP
481 r = -ENOMEM;
482 goto finish;
483 }
484
189455ab
LP
485 pack = fopen(pack_fn_new, "we");
486 if (!pack) {
22be093f
LP
487 log_error("Failed to open pack file: %m");
488 r = -errno;
489 goto finish;
490 }
491
cae544bc 492 fputs(CANONICAL_HOST READAHEAD_PACK_FILE_VERSION, pack);
22be093f
LP
493 putc(on_ssd ? 'S' : 'R', pack);
494
746f8906 495 if (on_ssd || on_btrfs) {
22be093f 496
746f8906 497 /* On SSD or on btrfs, just write things out in the
41a598e2 498 * order the files were accessed. */
22be093f
LP
499
500 HASHMAP_FOREACH_KEY(q, p, files, i)
746f8906 501 pack_file(pack, p, on_btrfs);
22be093f
LP
502 } else {
503 struct item *ordered, *j;
504 unsigned k, n;
505
506 /* On rotating media, order things by the block
507 * numbers */
508
509 log_debug("Ordering...");
510
511 n = hashmap_size(files);
512 if (!(ordered = new(struct item, n))) {
669241a0 513 log_error("Out of memory.");
22be093f
LP
514 r = -ENOMEM;
515 goto finish;
516 }
517
518 j = ordered;
519 HASHMAP_FOREACH_KEY(q, p, files, i) {
520 j->path = p;
2e7485f0 521 j->block = PTR_TO_SECTOR(q);
22be093f
LP
522 j++;
523 }
524
525 assert(ordered + n == j);
526
527 qsort(ordered, n, sizeof(struct item), qsort_compare);
528
529 for (k = 0; k < n; k++)
746f8906 530 pack_file(pack, ordered[k].path, on_btrfs);
22be093f
LP
531
532 free(ordered);
533 }
534
535 log_debug("Finalizing...");
536
537 fflush(pack);
538
539 if (ferror(pack)) {
540 log_error("Failed to write pack file.");
541 r = -EIO;
542 goto finish;
543 }
544
545 if (rename(pack_fn_new, pack_fn) < 0) {
546 log_error("Failed to rename readahead file: %m");
547 r = -errno;
548 goto finish;
549 }
550
551 fclose(pack);
552 pack = NULL;
553
554 log_debug("Done.");
555
556finish:
557 if (fanotify_fd >= 0)
558 close_nointr_nofail(fanotify_fd);
559
560 if (signal_fd >= 0)
561 close_nointr_nofail(signal_fd);
562
6624768c
LP
563 if (inotify_fd >= 0)
564 close_nointr_nofail(inotify_fd);
565
22be093f
LP
566 if (pack) {
567 fclose(pack);
568 unlink(pack_fn_new);
569 }
22be093f
LP
570 free(pack_fn_new);
571 free(pack_fn);
572
573 while ((p = hashmap_steal_first_key(files)))
f0cf061e 574 free(p);
22be093f
LP
575
576 hashmap_free(files);
577
6de338a2
LP
578 if (previous_block_readahead_set) {
579 uint64_t bytes;
580
581 /* Restore the original kernel readahead setting if we
582 * changed it, and nobody has overwritten it since
583 * yet. */
584 if (block_get_readahead(root, &bytes) >= 0 && bytes == 8*1024)
585 block_set_readahead(root, previous_block_readahead);
586 }
587
22be093f
LP
588 return r;
589}
590
87ce22cc 591int main_collect(const char *root) {
8260358d 592
87ce22cc
LP
593 if (!root)
594 root = "/";
2b590e13 595
3b2d5b02
LP
596 /* Skip this step on read-only media. Note that we check the
597 * underlying block device here, not he read-only flag of the
598 * file system on top, since that one is most likely mounted
599 * read-only anyway at boot, even if the underlying block
600 * device is theoretically writable. */
2b590e13
LP
601 if (fs_on_read_only(root) > 0) {
602 log_info("Disabling readahead collector due to read-only media.");
87ce22cc 603 return EXIT_SUCCESS;
2b590e13
LP
604 }
605
41a598e2
LP
606 if (!enough_ram()) {
607 log_info("Disabling readahead collector due to low memory.");
87ce22cc 608 return EXIT_SUCCESS;
41a598e2
LP
609 }
610
3b2d5b02
LP
611 shared = shared_get();
612 if (!shared)
87ce22cc 613 return EXIT_FAILURE;
d9c7a87b
LP
614
615 shared->collect = getpid();
616 __sync_synchronize();
617
2b590e13 618 if (collect(root) < 0)
87ce22cc 619 return EXIT_FAILURE;
22be093f 620
87ce22cc 621 return EXIT_SUCCESS;
22be093f 622}