]> git.ipfire.org Git - thirdparty/systemd.git/blob - src/readahead/readahead-collect.c
readahead: cleanups
[thirdparty/systemd.git] / src / readahead / readahead-collect.c
1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3 /***
4 This file is part of systemd.
5
6 Copyright 2010 Lennart Poettering
7
8 systemd is free software; you can redistribute it and/or modify it
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
11 (at your option) any later version.
12
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Lesser General Public License for more details.
17
18 You should have received a copy of the GNU Lesser General Public License
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
20 ***/
21
22 #include <errno.h>
23 #include <inttypes.h>
24 #include <fcntl.h>
25 #include <linux/limits.h>
26 #include <stdbool.h>
27 #include <stdio.h>
28 #include <stdlib.h>
29 #include <string.h>
30 #include <sys/select.h>
31 #include <sys/time.h>
32 #include <sys/types.h>
33 #include <sys/stat.h>
34 #include <unistd.h>
35 #include <linux/fanotify.h>
36 #include <sys/signalfd.h>
37 #include <sys/poll.h>
38 #include <sys/mman.h>
39 #include <linux/fs.h>
40 #include <linux/fiemap.h>
41 #include <sys/ioctl.h>
42 #include <sys/vfs.h>
43 #include <getopt.h>
44 #include <sys/inotify.h>
45 #include <math.h>
46
47 #ifdef HAVE_FANOTIFY_INIT
48 #include <sys/fanotify.h>
49 #endif
50
51 #include <systemd/sd-daemon.h>
52
53 #include "missing.h"
54 #include "util.h"
55 #include "set.h"
56 #include "ioprio.h"
57 #include "readahead-common.h"
58 #include "virt.h"
59
60 /* fixme:
61 *
62 * - detect ssd on btrfs/lvm...
63 * - read ahead directories
64 * - gzip?
65 * - remount rw?
66 * - handle files where nothing is in mincore
67 * - does ioprio_set work with fadvise()?
68 */
69
/* State shared with the replay process: collect() reads the replay
 * side's PID from here to ignore its accesses, and main_collect()
 * publishes our own PID the same way. */
static ReadaheadShared *shared = NULL;

/* CLOCK_MONOTONIC timestamp taken when collection starts; used to
 * assign each accessed file to a 2s time bin (see collect()) */
static usec_t starttime;

/* Avoid collisions with the NULL pointer */
/* NOTE(review): these two macros appear unused in this file now that
 * entries are stored as struct item — candidates for removal, verify
 * against the rest of the source tree first */
#define SECTOR_TO_PTR(s) ULONG_TO_PTR((s)+1)
#define PTR_TO_SECTOR(p) (PTR_TO_ULONG(p)-1)
76
77 static int btrfs_defrag(int fd) {
78 struct btrfs_ioctl_vol_args data;
79
80 zero(data);
81 data.fd = fd;
82
83 return ioctl(fd, BTRFS_IOC_DEFRAG, &data);
84 }
85
/* Appends one file's readahead record to the pack file: the path, its
 * inode number, then a list of (begin, end) page-range pairs covering
 * the pages currently resident in the page cache (per mincore()),
 * terminated by a 0/0 marker. An empty range list means "read the
 * whole file". Returns 0 on success or when the file is skipped,
 * a negative errno-style value on failure. */
static int pack_file(FILE *pack, const char *fn, bool on_btrfs) {
        struct stat st;
        void *start = MAP_FAILED;
        uint8_t *vec;
        uint32_t b, c;
        uint64_t inode;
        size_t l, pages;
        bool mapped;
        int r = 0, fd = -1, k;

        assert(pack);
        assert(fn);

        fd = open(fn, O_RDONLY|O_CLOEXEC|O_NOATIME|O_NOCTTY|O_NOFOLLOW);
        if (fd < 0) {

                /* The file may have vanished since we saw it opened,
                 * or we may simply lack access — neither is worth
                 * reporting */
                if (errno == ENOENT)
                        return 0;

                if (errno == EPERM || errno == EACCES)
                        return 0;

                log_warning("open(%s) failed: %m", fn);
                r = -errno;
                goto finish;
        }

        /* Skip files that don't qualify for readahead (at least the
         * arg_file_size_max limit is enforced here; other criteria
         * presumably live in file_verify() — defined elsewhere) */
        k = file_verify(fd, fn, arg_file_size_max, &st);
        if (k <= 0) {
                r = k;
                goto finish;
        }

        /* On btrfs, defragment first so later replay reads are more
         * sequential */
        if (on_btrfs)
                btrfs_defrag(fd);

        l = PAGE_ALIGN(st.st_size);
        start = mmap(NULL, l, PROT_READ, MAP_SHARED, fd, 0);
        if (start == MAP_FAILED) {
                log_warning("mmap(%s) failed: %m", fn);
                r = -errno;
                goto finish;
        }

        /* Ask the kernel which pages of the mapping are in the page
         * cache right now; one byte per page in vec */
        pages = l / page_size();
        vec = alloca(pages);
        memset(vec, 0, pages);
        if (mincore(start, l, vec) < 0) {
                log_warning("mincore(%s) failed: %m", fn);
                r = -errno;
                goto finish;
        }

        fputs(fn, pack);
        fputc('\n', pack);

        /* Store the inode, so that we notice when the file is deleted */
        inode = (uint64_t) st.st_ino;
        fwrite(&inode, sizeof(inode), 1, pack);

        /* Run-length encode the mincore bitmap: emit one (begin, end)
         * pair per contiguous run of resident pages */
        mapped = false;
        for (c = 0; c < pages; c++) {
                bool new_mapped = !!(vec[c] & 1);

                if (!mapped && new_mapped)
                        b = c; /* a run starts at page c */
                else if (mapped && !new_mapped) {
                        /* run ended just before page c */
                        fwrite(&b, sizeof(b), 1, pack);
                        fwrite(&c, sizeof(c), 1, pack);

                        log_debug("%s: page %u to %u", fn, b, c);
                }

                mapped = new_mapped;
        }

        /* We don't write any range data if we should read the entire file */
        /* (a final run still open at loop end is flushed here with
         * c == pages, unless it started at page 0, i.e. covers
         * everything) */
        if (mapped && b > 0) {
                fwrite(&b, sizeof(b), 1, pack);
                fwrite(&c, sizeof(c), 1, pack);

                log_debug("%s: page %u to %u", fn, b, c);
        }

        /* End marker */
        b = 0;
        fwrite(&b, sizeof(b), 1, pack);
        fwrite(&b, sizeof(b), 1, pack);

finish:
        if (start != MAP_FAILED)
                munmap(start, l);

        if (fd >= 0)
                close_nointr_nofail(fd);

        return r;
}
184
/* Returns the physical (on-device) byte offset of the first extent of
 * the file referenced by fd, as reported by the FIEMAP ioctl, or 0 if
 * it cannot be determined. Used to sort files by on-disk location on
 * rotating media. */
static unsigned long fd_first_block(int fd) {
        struct {
                struct fiemap fiemap;
                struct fiemap_extent extent;
        } data;
        const struct fiemap_extent *e;

        memset(&data, 0, sizeof(data));
        data.fiemap.fm_length = ~0ULL;       /* map the entire file */
        data.fiemap.fm_extent_count = 1;     /* only the first extent matters */

        if (ioctl(fd, FS_IOC_FIEMAP, &data) < 0)
                return 0;

        if (data.fiemap.fm_mapped_extents <= 0)
                return 0;

        e = &data.fiemap.fm_extents[0];
        if (e->fe_flags & FIEMAP_EXTENT_UNKNOWN)
                return 0;

        return (unsigned long) e->fe_physical;
}
206
/* One file queued for the pack file, annotated with the data we sort by. */
struct item {
        const char *path;        /* file path, heap-allocated */
        unsigned long block;     /* physical offset of first extent, 0 if unknown */
        unsigned long bin;       /* 2s time slot of first access (see collect()) */
};

/* qsort() callback: order by access-time bin, then by on-disk
 * location, and finally by path so the ordering is total. */
static int qsort_compare(const void *a, const void *b) {
        const struct item *x = a, *y = b;

        /* sort by bin first */
        if (x->bin != y->bin)
                return x->bin < y->bin ? -1 : 1;

        /* then sort by sector */
        if (x->block != y->block)
                return x->block < y->block ? -1 : 1;

        return strcmp(x->path, y->path);
}
233
234 static int collect(const char *root) {
235 enum {
236 FD_FANOTIFY, /* Get the actual fs events */
237 FD_SIGNAL,
238 FD_INOTIFY, /* We get notifications to quit early via this fd */
239 _FD_MAX
240 };
241 struct pollfd pollfd[_FD_MAX];
242 int fanotify_fd = -1, signal_fd = -1, inotify_fd = -1, r = 0;
243 pid_t my_pid;
244 Hashmap *files = NULL;
245 Iterator i;
246 char *p, *q;
247 sigset_t mask;
248 FILE *pack = NULL;
249 char *pack_fn_new = NULL, *pack_fn = NULL;
250 bool on_ssd, on_btrfs;
251 struct statfs sfs;
252 usec_t not_after;
253 uint64_t previous_block_readahead;
254 bool previous_block_readahead_set = false;
255
256 assert(root);
257
258 if (asprintf(&pack_fn, "%s/.readahead", root) < 0) {
259 r = log_oom();
260 goto finish;
261 }
262
263 starttime = now(CLOCK_MONOTONIC);
264
265 /* If there's no pack file yet we lower the kernel readahead
266 * so that mincore() is accurate. If there is a pack file
267 * already we assume it is accurate enough so that kernel
268 * readahead is never triggered. */
269 previous_block_readahead_set =
270 access(pack_fn, F_OK) < 0 &&
271 block_get_readahead(root, &previous_block_readahead) >= 0 &&
272 block_set_readahead(root, 8*1024) >= 0;
273
274 if (ioprio_set(IOPRIO_WHO_PROCESS, getpid(), IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0)) < 0)
275 log_warning("Failed to set IDLE IO priority class: %m");
276
277 assert_se(sigemptyset(&mask) == 0);
278 sigset_add_many(&mask, SIGINT, SIGTERM, -1);
279 assert_se(sigprocmask(SIG_SETMASK, &mask, NULL) == 0);
280
281 if ((signal_fd = signalfd(-1, &mask, SFD_NONBLOCK|SFD_CLOEXEC)) < 0) {
282 log_error("signalfd(): %m");
283 r = -errno;
284 goto finish;
285 }
286
287 if (!(files = hashmap_new(string_hash_func, string_compare_func))) {
288 log_error("Failed to allocate set.");
289 r = -ENOMEM;
290 goto finish;
291 }
292
293 if ((fanotify_fd = fanotify_init(FAN_CLOEXEC|FAN_NONBLOCK, O_RDONLY|O_LARGEFILE|O_CLOEXEC|O_NOATIME)) < 0) {
294 log_error("Failed to create fanotify object: %m");
295 r = -errno;
296 goto finish;
297 }
298
299 if (fanotify_mark(fanotify_fd, FAN_MARK_ADD|FAN_MARK_MOUNT, FAN_OPEN, AT_FDCWD, root) < 0) {
300 log_error("Failed to mark %s: %m", root);
301 r = -errno;
302 goto finish;
303 }
304
305 if ((inotify_fd = open_inotify()) < 0) {
306 r = inotify_fd;
307 goto finish;
308 }
309
310 not_after = now(CLOCK_MONOTONIC) + arg_timeout;
311
312 my_pid = getpid();
313
314 zero(pollfd);
315 pollfd[FD_FANOTIFY].fd = fanotify_fd;
316 pollfd[FD_FANOTIFY].events = POLLIN;
317 pollfd[FD_SIGNAL].fd = signal_fd;
318 pollfd[FD_SIGNAL].events = POLLIN;
319 pollfd[FD_INOTIFY].fd = inotify_fd;
320 pollfd[FD_INOTIFY].events = POLLIN;
321
322 sd_notify(0,
323 "READY=1\n"
324 "STATUS=Collecting readahead data");
325
326 log_debug("Collecting...");
327
328 if (access("/run/systemd/readahead/cancel", F_OK) >= 0) {
329 log_debug("Collection canceled");
330 r = -ECANCELED;
331 goto finish;
332 }
333
334 if (access("/run/systemd/readahead/done", F_OK) >= 0) {
335 log_debug("Got termination request");
336 goto done;
337 }
338
339 for (;;) {
340 union {
341 struct fanotify_event_metadata metadata;
342 char buffer[4096];
343 } data;
344 ssize_t n;
345 struct fanotify_event_metadata *m;
346 usec_t t;
347 int h;
348
349 if (hashmap_size(files) > arg_files_max) {
350 log_debug("Reached maximum number of read ahead files, ending collection.");
351 break;
352 }
353
354 t = now(CLOCK_MONOTONIC);
355 if (t >= not_after) {
356 log_debug("Reached maximum collection time, ending collection.");
357 break;
358 }
359
360 if ((h = poll(pollfd, _FD_MAX, (int) ((not_after - t) / USEC_PER_MSEC))) < 0) {
361
362 if (errno == EINTR)
363 continue;
364
365 log_error("poll(): %m");
366 r = -errno;
367 goto finish;
368 }
369
370 if (h == 0) {
371 log_debug("Reached maximum collection time, ending collection.");
372 break;
373 }
374
375 if (pollfd[FD_SIGNAL].revents) {
376 log_debug("Got signal.");
377 break;
378 }
379
380 if (pollfd[FD_INOTIFY].revents) {
381 uint8_t inotify_buffer[sizeof(struct inotify_event) + FILENAME_MAX];
382 struct inotify_event *e;
383
384 if ((n = read(inotify_fd, &inotify_buffer, sizeof(inotify_buffer))) < 0) {
385 if (errno == EINTR || errno == EAGAIN)
386 continue;
387
388 log_error("Failed to read inotify event: %m");
389 r = -errno;
390 goto finish;
391 }
392
393 e = (struct inotify_event*) inotify_buffer;
394 while (n > 0) {
395 size_t step;
396
397 if ((e->mask & IN_CREATE) && streq(e->name, "cancel")) {
398 log_debug("Collection canceled");
399 r = -ECANCELED;
400 goto finish;
401 }
402
403 if ((e->mask & IN_CREATE) && streq(e->name, "done")) {
404 log_debug("Got termination request");
405 goto done;
406 }
407
408 step = sizeof(struct inotify_event) + e->len;
409 assert(step <= (size_t) n);
410
411 e = (struct inotify_event*) ((uint8_t*) e + step);
412 n -= step;
413 }
414 }
415
416 if ((n = read(fanotify_fd, &data, sizeof(data))) < 0) {
417
418 if (errno == EINTR || errno == EAGAIN)
419 continue;
420
421 /* fanotify sometimes returns EACCES on read()
422 * where it shouldn't. For now let's just
423 * ignore it here (which is safe), but
424 * eventually this should be
425 * dropped when the kernel is fixed.
426 *
427 * https://bugzilla.redhat.com/show_bug.cgi?id=707577 */
428 if (errno == EACCES)
429 continue;
430
431 log_error("Failed to read event: %m");
432 r = -errno;
433 goto finish;
434 }
435
436 for (m = &data.metadata; FAN_EVENT_OK(m, n); m = FAN_EVENT_NEXT(m, n)) {
437 char fn[PATH_MAX];
438 int k;
439
440 if (m->fd < 0)
441 goto next_iteration;
442
443 if (m->pid == my_pid)
444 goto next_iteration;
445
446 __sync_synchronize();
447 if (m->pid == shared->replay)
448 goto next_iteration;
449
450 snprintf(fn, sizeof(fn), "/proc/self/fd/%i", m->fd);
451 char_array_0(fn);
452
453 if ((k = readlink_malloc(fn, &p)) >= 0) {
454 if (startswith(p, "/tmp") ||
455 endswith(p, " (deleted)") ||
456 hashmap_get(files, p))
457 /* Not interesting, or
458 * already read */
459 free(p);
460 else {
461 unsigned long ul;
462 usec_t entrytime;
463 struct item *entry;
464
465 entry = new0(struct item, 1);
466 if (!entry) {
467 r = log_oom();
468 goto finish;
469 }
470
471 ul = fd_first_block(m->fd);
472
473 entrytime = now(CLOCK_MONOTONIC);
474
475 entry->block = ul;
476 entry->path = strdup(p);
477 if (!entry->path) {
478 free(entry);
479 r = log_oom();
480 goto finish;
481 }
482 entry->bin = (entrytime - starttime) / 2000000;
483
484 if ((k = hashmap_put(files, p, entry)) < 0) {
485 log_warning("set_put() failed: %s", strerror(-k));
486 free(p);
487 }
488 }
489
490 } else
491 log_warning("readlink(%s) failed: %s", fn, strerror(-k));
492
493 next_iteration:
494 if (m->fd >= 0)
495 close_nointr_nofail(m->fd);
496 }
497 }
498
499 done:
500 if (fanotify_fd >= 0) {
501 close_nointr_nofail(fanotify_fd);
502 fanotify_fd = -1;
503 }
504
505 log_debug("Writing Pack File...");
506
507 on_ssd = fs_on_ssd(root) > 0;
508 log_debug("On SSD: %s", yes_no(on_ssd));
509
510 on_btrfs = statfs(root, &sfs) >= 0 && (long) sfs.f_type == (long) BTRFS_SUPER_MAGIC;
511 log_debug("On btrfs: %s", yes_no(on_btrfs));
512
513 if (asprintf(&pack_fn_new, "%s/.readahead.new", root) < 0) {
514 r = log_oom();
515 goto finish;
516 }
517
518 pack = fopen(pack_fn_new, "we");
519 if (!pack) {
520 log_error("Failed to open pack file: %m");
521 r = -errno;
522 goto finish;
523 }
524
525 fputs(CANONICAL_HOST READAHEAD_PACK_FILE_VERSION, pack);
526 putc(on_ssd ? 'S' : 'R', pack);
527
528 if (on_ssd || on_btrfs) {
529
530 /* On SSD or on btrfs, just write things out in the
531 * order the files were accessed. */
532
533 HASHMAP_FOREACH_KEY(q, p, files, i)
534 pack_file(pack, p, on_btrfs);
535 } else {
536 struct item *ordered, *j;
537 unsigned k, n;
538
539 /* On rotating media, order things by the block
540 * numbers */
541
542 log_debug("Ordering...");
543
544 n = hashmap_size(files);
545 if (!(ordered = new(struct item, n))) {
546 r = log_oom();
547 goto finish;
548 }
549
550 j = ordered;
551 HASHMAP_FOREACH_KEY(q, p, files, i) {
552 memcpy(j, q, sizeof(struct item));
553 j++;
554 }
555
556 assert(ordered + n == j);
557
558 qsort(ordered, n, sizeof(struct item), qsort_compare);
559
560 for (k = 0; k < n; k++)
561 pack_file(pack, ordered[k].path, on_btrfs);
562
563 free(ordered);
564 }
565
566 log_debug("Finalizing...");
567
568 fflush(pack);
569
570 if (ferror(pack)) {
571 log_error("Failed to write pack file.");
572 r = -EIO;
573 goto finish;
574 }
575
576 if (rename(pack_fn_new, pack_fn) < 0) {
577 log_error("Failed to rename readahead file: %m");
578 r = -errno;
579 goto finish;
580 }
581
582 fclose(pack);
583 pack = NULL;
584
585 log_debug("Done.");
586
587 finish:
588 if (fanotify_fd >= 0)
589 close_nointr_nofail(fanotify_fd);
590
591 if (signal_fd >= 0)
592 close_nointr_nofail(signal_fd);
593
594 if (inotify_fd >= 0)
595 close_nointr_nofail(inotify_fd);
596
597 if (pack) {
598 fclose(pack);
599 unlink(pack_fn_new);
600 }
601 free(pack_fn_new);
602 free(pack_fn);
603
604 while ((p = hashmap_steal_first_key(files)))
605 free(p);
606
607 hashmap_free(files);
608
609 if (previous_block_readahead_set) {
610 uint64_t bytes;
611
612 /* Restore the original kernel readahead setting if we
613 * changed it, and nobody has overwritten it since
614 * yet. */
615 if (block_get_readahead(root, &bytes) >= 0 && bytes == 8*1024)
616 block_set_readahead(root, previous_block_readahead);
617 }
618
619 return r;
620 }
621
622 int main_collect(const char *root) {
623
624 if (!root)
625 root = "/";
626
627 /* Skip this step on read-only media. Note that we check the
628 * underlying block device here, not he read-only flag of the
629 * file system on top, since that one is most likely mounted
630 * read-only anyway at boot, even if the underlying block
631 * device is theoretically writable. */
632 if (fs_on_read_only(root) > 0) {
633 log_info("Disabling readahead collector due to read-only media.");
634 return EXIT_SUCCESS;
635 }
636
637 if (!enough_ram()) {
638 log_info("Disabling readahead collector due to low memory.");
639 return EXIT_SUCCESS;
640 }
641
642 shared = shared_get();
643 if (!shared)
644 return EXIT_FAILURE;
645
646 shared->collect = getpid();
647 __sync_synchronize();
648
649 if (collect(root) < 0)
650 return EXIT_FAILURE;
651
652 return EXIT_SUCCESS;
653 }