1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
4 This file is part of systemd.
6 Copyright 2010 Lennart Poettering
8 systemd is free software; you can redistribute it and/or modify it
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
11 (at your option) any later version.
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Lesser General Public License for more details.
18 You should have received a copy of the GNU Lesser General Public License
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
25 #include <linux/limits.h>
30 #include <sys/select.h>
32 #include <sys/types.h>
35 #include <linux/fanotify.h>
36 #include <sys/signalfd.h>
40 #include <linux/fiemap.h>
41 #include <sys/ioctl.h>
44 #include <sys/inotify.h>
46 #include <systemd/sd-daemon.h>
52 #include "readahead-common.h"
57 * - detect ssd on btrfs/lvm...
58 * - read ahead directories
61 * - handle files where nothing is in mincore
62 * - does ioprio_set work with fadvise()?
65 static ReadaheadShared
*shared
= NULL
;
67 /* Avoid collisions with the NULL pointer */
68 #define SECTOR_TO_PTR(s) ULONG_TO_PTR((s)+1)
69 #define PTR_TO_SECTOR(p) (PTR_TO_ULONG(p)-1)
/* Ask btrfs to defragment the file referenced by fd, so that a later
 * sequential readahead of it touches fewer extents.  Best effort:
 * returns the raw ioctl() result (0 on success, -1 with errno set on
 * failure); callers may ignore the result.
 *
 * Fix: the ioctl argument must be fully initialized before being
 * handed to the kernel — passing uninitialized stack memory is
 * undefined behavior — and the fd member must be set. */
static int btrfs_defrag(int fd) {
        struct btrfs_ioctl_vol_args data;

        memset(&data, 0, sizeof(data));
        data.fd = fd;

        return ioctl(fd, BTRFS_IOC_DEFRAG, &data);
}
/* pack_file(): append one file's readahead record (its path, inode and
 * the list of page ranges currently resident in the page cache) to the
 * pack file.
 *
 * NOTE(review): this extract is elided — original lines are missing
 * between the fragments below (declarations of st, l, pages, vec, b,
 * c, mapped; the early-return and goto/cleanup paths; the final
 * return).  Comments state only what the visible fragments show. */
80 static int pack_file(FILE *pack
, const char *fn
, bool on_btrfs
) {
82 void *start
= MAP_FAILED
;
88 int r
= 0, fd
= -1, k
;
/* Open read-only; O_NOATIME so collection does not perturb access
 * times, O_NOFOLLOW so symlinks are refused. */
93 fd
= open(fn
, O_RDONLY
|O_CLOEXEC
|O_NOATIME
|O_NOCTTY
|O_NOFOLLOW
);
/* EPERM/EACCES from open() are treated specially (branch body elided);
 * other open() failures are logged. */
99 if (errno
== EPERM
|| errno
== EACCES
)
102 log_warning("open(%s) failed: %m", fn
);
/* Verify the file against the size limit and fetch its stat data. */
107 k
= file_verify(fd
, fn
, arg_file_size_max
, &st
);
/* Map the whole file (length rounded up to a page) so that mincore()
 * can report which pages are resident. */
116 l
= PAGE_ALIGN(st
.st_size
);
117 start
= mmap(NULL
, l
, PROT_READ
, MAP_SHARED
, fd
, 0);
118 if (start
== MAP_FAILED
) {
119 log_warning("mmap(%s) failed: %m", fn
);
124 pages
= l
/ page_size();
126 memset(vec
, 0, pages
);
127 if (mincore(start
, l
, vec
) < 0) {
128 log_warning("mincore(%s) failed: %m", fn
);
136 /* Store the inode, so that we notice when the file is deleted */
137 inode
= (uint64_t) st
.st_ino
;
138 fwrite(&inode
, sizeof(inode
), 1, pack
);
/* Scan the mincore vector and emit a [b, c) pair for every run of
 * resident pages (bit 0 of each vec entry = page is resident). */
141 for (c
= 0; c
< pages
; c
++) {
142 bool new_mapped
= !!(vec
[c
] & 1);
144 if (!mapped
&& new_mapped
)
146 else if (mapped
&& !new_mapped
) {
147 fwrite(&b
, sizeof(b
), 1, pack
);
148 fwrite(&c
, sizeof(c
), 1, pack
);
150 log_debug("%s: page %u to %u", fn
, b
, c
);
156 /* We don't write any range data if we should read the entire file */
157 if (mapped
&& b
> 0) {
158 fwrite(&b
, sizeof(b
), 1, pack
);
159 fwrite(&c
, sizeof(c
), 1, pack
);
161 log_debug("%s: page %u to %u", fn
, b
, c
);
/* Record terminator: b is written twice (presumably b == 0 here —
 * the assignment is elided; confirm against upstream). */
166 fwrite(&b
, sizeof(b
), 1, pack
);
167 fwrite(&b
, sizeof(b
), 1, pack
);
/* Cleanup path: unmap if the mapping succeeded, then close the fd. */
170 if (start
!= MAP_FAILED
)
174 close_nointr_nofail(fd
);
/* Return the physical byte offset on disk of the first extent of the
 * file referenced by fd, or 0 when it cannot be determined (ioctl
 * failure, no extents, or extent location unknown).  Used to sort
 * files by on-disk position on rotating media.
 *
 * Fix: the ioctl buffer was used without being declared/zeroed in
 * the visible code; FS_IOC_FIEMAP reads fm_start/fm_flags from the
 * buffer, so it must be zero-initialized, and the fiemap header must
 * be immediately followed by room for the one extent we request. */
static unsigned long fd_first_block(int fd) {
        struct {
                struct fiemap fiemap;
                struct fiemap_extent extent;
        } data;

        memset(&data, 0, sizeof(data));
        data.fiemap.fm_length = ~0ULL;    /* whole file */
        data.fiemap.fm_extent_count = 1;  /* we only need the first extent */

        if (ioctl(fd, FS_IOC_FIEMAP, &data) < 0)
                return 0;

        if (data.fiemap.fm_mapped_extents <= 0)
                return 0;

        if (data.fiemap.fm_extents[0].fe_flags & FIEMAP_EXTENT_UNKNOWN)
                return 0;

        return (unsigned long) data.fiemap.fm_extents[0].fe_physical;
}
206 static int qsort_compare(const void *a
, const void *b
) {
207 const struct item
*i
, *j
;
212 if (i
->block
< j
->block
)
214 if (i
->block
> j
->block
)
217 return strcmp(i
->path
, j
->path
);
/* collect(): the main collection pass.  Watches the mount below `root`
 * via fanotify, records every file opened during boot (keyed by path,
 * value = SECTOR_TO_PTR(first disk block)), then writes the
 * .readahead pack file — in access order on SSD/btrfs, sorted by disk
 * block on rotating media.
 *
 * NOTE(review): this extract is elided — many original lines are
 * missing between the fragments below (the fd-index enum wrapper,
 * several variable declarations, braces, goto finish paths, the
 * finish: label and final return).  Comments state only what the
 * visible fragments show. */
220 static int collect(const char *root
) {
222 FD_FANOTIFY
, /* Get the actual fs events */
224 FD_INOTIFY
, /* We get notifications to quit early via this fd */
227 struct pollfd pollfd
[_FD_MAX
];
228 int fanotify_fd
= -1, signal_fd
= -1, inotify_fd
= -1, r
= 0;
230 Hashmap
*files
= NULL
;
235 char *pack_fn_new
= NULL
, *pack_fn
= NULL
;
236 bool on_ssd
, on_btrfs
;
239 uint64_t previous_block_readahead
;
240 bool previous_block_readahead_set
= false;
244 if (asprintf(&pack_fn
, "%s/.readahead", root
) < 0) {
245 log_error("Out of memory.");
250 /* If there's no pack file yet we lower the kernel readahead
251 * so that mincore() is accurate. If there is a pack file
252 * already we assume it is accurate enough so that kernel
253 * readahead is never triggered. */
254 previous_block_readahead_set
=
255 access(pack_fn
, F_OK
) < 0 &&
256 block_get_readahead(root
, &previous_block_readahead
) >= 0 &&
257 block_set_readahead(root
, 8*1024) >= 0;
/* Run with idle I/O priority so collection does not compete with boot. */
259 if (ioprio_set(IOPRIO_WHO_PROCESS
, getpid(), IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE
, 0)) < 0)
260 log_warning("Failed to set IDLE IO priority class: %m");
/* Block SIGINT/SIGTERM and take delivery through a signalfd instead,
 * so termination is handled inside the poll() loop below. */
262 assert_se(sigemptyset(&mask
) == 0);
263 sigset_add_many(&mask
, SIGINT
, SIGTERM
, -1);
264 assert_se(sigprocmask(SIG_SETMASK
, &mask
, NULL
) == 0);
266 if ((signal_fd
= signalfd(-1, &mask
, SFD_NONBLOCK
|SFD_CLOEXEC
)) < 0) {
267 log_error("signalfd(): %m");
/* path -> SECTOR_TO_PTR(first block) map of everything observed. */
272 if (!(files
= hashmap_new(string_hash_func
, string_compare_func
))) {
273 log_error("Failed to allocate set.");
/* Watch every open() on the entire mount below root. */
278 if ((fanotify_fd
= fanotify_init(FAN_CLOEXEC
|FAN_NONBLOCK
, O_RDONLY
|O_LARGEFILE
|O_CLOEXEC
|O_NOATIME
)) < 0) {
279 log_error("Failed to create fanotify object: %m");
284 if (fanotify_mark(fanotify_fd
, FAN_MARK_ADD
|FAN_MARK_MOUNT
, FAN_OPEN
, AT_FDCWD
, root
) < 0) {
285 log_error("Failed to mark %s: %m", root
);
290 if ((inotify_fd
= open_inotify()) < 0) {
/* Collection deadline. */
295 not_after
= now(CLOCK_MONOTONIC
) + arg_timeout
;
300 pollfd
[FD_FANOTIFY
].fd
= fanotify_fd
;
301 pollfd
[FD_FANOTIFY
].events
= POLLIN
;
302 pollfd
[FD_SIGNAL
].fd
= signal_fd
;
303 pollfd
[FD_SIGNAL
].events
= POLLIN
;
304 pollfd
[FD_INOTIFY
].fd
= inotify_fd
;
305 pollfd
[FD_INOTIFY
].events
= POLLIN
;
309 "STATUS=Collecting readahead data");
311 log_debug("Collecting...");
/* Out-of-band requests created before we started watching:
 * "cancel" aborts collection, "done" ends it early. */
313 if (access("/run/systemd/readahead/cancel", F_OK
) >= 0) {
314 log_debug("Collection canceled");
319 if (access("/run/systemd/readahead/done", F_OK
) >= 0) {
320 log_debug("Got termination request");
326 struct fanotify_event_metadata metadata
;
330 struct fanotify_event_metadata
*m
;
/* Stop once the map outgrows arg_files_max or the deadline passes. */
334 if (hashmap_size(files
) > arg_files_max
) {
335 log_debug("Reached maximum number of read ahead files, ending collection.");
339 t
= now(CLOCK_MONOTONIC
);
340 if (t
>= not_after
) {
341 log_debug("Reached maximum collection time, ending collection.");
/* Wait for fanotify/signal/inotify activity until the deadline. */
345 if ((h
= poll(pollfd
, _FD_MAX
, (int) ((not_after
- t
) / USEC_PER_MSEC
))) < 0) {
350 log_error("poll(): %m");
356 log_debug("Reached maximum collection time, ending collection.");
360 if (pollfd
[FD_SIGNAL
].revents
) {
361 log_debug("Got signal.");
/* Drain inotify: creation of "cancel"/"done" in the runtime dir
 * terminates the loop while it is running. */
365 if (pollfd
[FD_INOTIFY
].revents
) {
366 uint8_t inotify_buffer
[sizeof(struct inotify_event
) + FILENAME_MAX
];
367 struct inotify_event
*e
;
369 if ((n
= read(inotify_fd
, &inotify_buffer
, sizeof(inotify_buffer
))) < 0) {
370 if (errno
== EINTR
|| errno
== EAGAIN
)
373 log_error("Failed to read inotify event: %m");
378 e
= (struct inotify_event
*) inotify_buffer
;
382 if ((e
->mask
& IN_CREATE
) && streq(e
->name
, "cancel")) {
383 log_debug("Collection canceled");
388 if ((e
->mask
& IN_CREATE
) && streq(e
->name
, "done")) {
389 log_debug("Got termination request");
/* Advance to the next variable-length inotify event in the buffer. */
393 step
= sizeof(struct inotify_event
) + e
->len
;
394 assert(step
<= (size_t) n
);
396 e
= (struct inotify_event
*) ((uint8_t*) e
+ step
);
/* Read a batch of fanotify event metadata records. */
401 if ((n
= read(fanotify_fd
, &data
, sizeof(data
))) < 0) {
403 if (errno
== EINTR
|| errno
== EAGAIN
)
406 /* fanotify sometimes returns EACCES on read()
407 * where it shouldn't. For now let's just
408 * ignore it here (which is safe), but
409 * eventually this should be
410 * dropped when the kernel is fixed.
412 * https://bugzilla.redhat.com/show_bug.cgi?id=707577 */
416 log_error("Failed to read event: %m");
421 for (m
= &data
.metadata
; FAN_EVENT_OK(m
, n
); m
= FAN_EVENT_NEXT(m
, n
)) {
/* Skip events caused by ourselves and by the replay process
 * (shared->replay is read after a full memory barrier). */
428 if (m
->pid
== my_pid
)
431 __sync_synchronize();
432 if (m
->pid
== shared
->replay
)
/* Resolve the event's fd back to a path via /proc/self/fd. */
435 snprintf(fn
, sizeof(fn
), "/proc/self/fd/%i", m
->fd
);
438 if ((k
= readlink_malloc(fn
, &p
)) >= 0) {
439 if (startswith(p
, "/tmp") ||
440 endswith(p
, " (deleted)") ||
441 hashmap_get(files
, p
))
442 /* Not interesting, or
/* Record the file: key = path, value = encoded first disk block. */
448 ul
= fd_first_block(m
->fd
);
450 if ((k
= hashmap_put(files
, p
, SECTOR_TO_PTR(ul
))) < 0) {
451 log_warning("set_put() failed: %s", strerror(-k
));
457 log_warning("readlink(%s) failed: %s", fn
, strerror(-k
));
/* Always close the fd fanotify handed us. */
461 close_nointr_nofail(m
->fd
);
/* Collection finished: stop watching before writing the pack file so
 * our own reads do not feed back into the event stream. */
466 if (fanotify_fd
>= 0) {
467 close_nointr_nofail(fanotify_fd
);
471 log_debug("Writing Pack File...");
473 on_ssd
= fs_on_ssd(root
) > 0;
474 log_debug("On SSD: %s", yes_no(on_ssd
));
476 on_btrfs
= statfs(root
, &sfs
) >= 0 && (long) sfs
.f_type
== (long) BTRFS_SUPER_MAGIC
;
477 log_debug("On btrfs: %s", yes_no(on_btrfs
));
/* Write to .readahead.new, then rename over .readahead at the end. */
479 if (asprintf(&pack_fn_new
, "%s/.readahead.new", root
) < 0) {
480 log_error("Out of memory.");
485 pack
= fopen(pack_fn_new
, "we");
487 log_error("Failed to open pack file: %m");
/* Header: version string plus 'S' (SSD) or 'R' (rotating) marker. */
492 fputs(CANONICAL_HOST READAHEAD_PACK_FILE_VERSION
, pack
);
493 putc(on_ssd
? 'S' : 'R', pack
);
495 if (on_ssd
|| on_btrfs
) {
497 /* On SSD or on btrfs, just write things out in the
498 * order the files were accessed. */
500 HASHMAP_FOREACH_KEY(q
, p
, files
, i
)
501 pack_file(pack
, p
, on_btrfs
);
503 struct item
*ordered
, *j
;
506 /* On rotating media, order things by the block
509 log_debug("Ordering...");
511 n
= hashmap_size(files
);
512 if (!(ordered
= new(struct item
, n
))) {
513 log_error("Out of memory.");
/* Flatten the hashmap into the ordered[] array (path assignment and
 * the j increment are elided here — confirm against upstream). */
519 HASHMAP_FOREACH_KEY(q
, p
, files
, i
) {
521 j
->block
= PTR_TO_SECTOR(q
);
525 assert(ordered
+ n
== j
);
/* Sort by disk block (path as tie breaker), emit in disk order. */
527 qsort(ordered
, n
, sizeof(struct item
), qsort_compare
);
529 for (k
= 0; k
< n
; k
++)
530 pack_file(pack
, ordered
[k
].path
, on_btrfs
);
535 log_debug("Finalizing...");
540 log_error("Failed to write pack file.");
/* Atomically replace any previous pack file. */
545 if (rename(pack_fn_new
, pack_fn
) < 0) {
546 log_error("Failed to rename readahead file: %m");
/* Cleanup: close remaining fds, free all hashmap keys. */
557 if (fanotify_fd
>= 0)
558 close_nointr_nofail(fanotify_fd
);
561 close_nointr_nofail(signal_fd
);
564 close_nointr_nofail(inotify_fd
);
573 while ((p
= hashmap_steal_first_key(files
)))
578 if (previous_block_readahead_set
) {
581 /* Restore the original kernel readahead setting if we
582 * changed it, and nobody has overwritten it since
584 if (block_get_readahead(root
, &bytes
) >= 0 && bytes
== 8*1024)
585 block_set_readahead(root
, previous_block_readahead
);
591 int main_collect(const char *root
) {
596 /* Skip this step on read-only media. Note that we check the
597 * underlying block device here, not he read-only flag of the
598 * file system on top, since that one is most likely mounted
599 * read-only anyway at boot, even if the underlying block
600 * device is theoretically writable. */
601 if (fs_on_read_only(root
) > 0) {
602 log_info("Disabling readahead collector due to read-only media.");
607 log_info("Disabling readahead collector due to low memory.");
611 shared
= shared_get();
615 shared
->collect
= getpid();
616 __sync_synchronize();
618 if (collect(root
) < 0)