1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
4 This file is part of systemd.
6 Copyright 2010 Lennart Poettering
8 systemd is free software; you can redistribute it and/or modify it
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
11 (at your option) any later version.
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Lesser General Public License for more details.
18 You should have received a copy of the GNU Lesser General Public License
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
25 #include <linux/limits.h>
30 #include <sys/select.h>
32 #include <sys/types.h>
35 #include <linux/fanotify.h>
36 #include <sys/signalfd.h>
40 #include <linux/fiemap.h>
41 #include <sys/ioctl.h>
44 #include <sys/inotify.h>
46 #include <systemd/sd-daemon.h>
52 #include "readahead-common.h"
57 * - detect ssd on btrfs/lvm...
58 * - read ahead directories
61 * - handle files where nothing is in mincore
62 * - does ioprio_set work with fadvise()?
65 static ReadaheadShared
*shared
= NULL
;
67 /* Avoid collisions with the NULL pointer */
68 #define SECTOR_TO_PTR(s) ULONG_TO_PTR((s)+1)
69 #define PTR_TO_SECTOR(p) (PTR_TO_ULONG(p)-1)
/* Ask btrfs to defragment the file referenced by fd, so that a later
 * sequential readahead of it touches fewer extents.  Best effort:
 * returns the raw ioctl() result (0 on success, -1 with errno set on
 * failure); callers may ignore the result.
 *
 * Fix: the ioctl argument must be fully initialized before being
 * handed to the kernel — passing uninitialized stack memory is
 * undefined behavior — and the fd member must be set. */
static int btrfs_defrag(int fd) {
        struct btrfs_ioctl_vol_args data;

        memset(&data, 0, sizeof(data));
        data.fd = fd;

        return ioctl(fd, BTRFS_IOC_DEFRAG, &data);
}
/* pack_file(): append one file's readahead record (its path, inode and
 * the list of page ranges currently resident in the page cache) to the
 * pack file.
 *
 * NOTE(review): this extract is elided — original lines are missing
 * between the fragments below (declarations of st, l, pages, vec, b,
 * c, mapped; the early-return and goto/cleanup paths; the final
 * return).  Comments state only what the visible fragments show. */
80 static int pack_file(FILE *pack
, const char *fn
, bool on_btrfs
) {
82 void *start
= MAP_FAILED
;
88 int r
= 0, fd
= -1, k
;
/* Open read-only; O_NOATIME so collection does not perturb access
 * times, O_NOFOLLOW so symlinks are refused. */
93 fd
= open(fn
, O_RDONLY
|O_CLOEXEC
|O_NOATIME
|O_NOCTTY
|O_NOFOLLOW
);
/* EPERM/EACCES from open() are treated specially (branch body elided);
 * other open() failures are logged. */
99 if (errno
== EPERM
|| errno
== EACCES
)
102 log_warning("open(%s) failed: %m", fn
);
/* Verify the file against the size limit and fetch its stat data. */
107 k
= file_verify(fd
, fn
, arg_file_size_max
, &st
);
/* Map the whole file (length rounded up to a page) so that mincore()
 * can report which pages are resident. */
116 l
= PAGE_ALIGN(st
.st_size
);
117 start
= mmap(NULL
, l
, PROT_READ
, MAP_SHARED
, fd
, 0);
118 if (start
== MAP_FAILED
) {
119 log_warning("mmap(%s) failed: %m", fn
);
124 pages
= l
/ page_size();
126 memset(vec
, 0, pages
);
127 if (mincore(start
, l
, vec
) < 0) {
128 log_warning("mincore(%s) failed: %m", fn
);
136 /* Store the inode, so that we notice when the file is deleted */
137 inode
= (uint64_t) st
.st_ino
;
138 fwrite(&inode
, sizeof(inode
), 1, pack
);
/* Scan the mincore vector and emit a [b, c) pair for every run of
 * resident pages (bit 0 of each vec entry = page is resident). */
141 for (c
= 0; c
< pages
; c
++) {
142 bool new_mapped
= !!(vec
[c
] & 1);
144 if (!mapped
&& new_mapped
)
146 else if (mapped
&& !new_mapped
) {
147 fwrite(&b
, sizeof(b
), 1, pack
);
148 fwrite(&c
, sizeof(c
), 1, pack
);
150 log_debug("%s: page %u to %u", fn
, b
, c
);
156 /* We don't write any range data if we should read the entire file */
157 if (mapped
&& b
> 0) {
158 fwrite(&b
, sizeof(b
), 1, pack
);
159 fwrite(&c
, sizeof(c
), 1, pack
);
161 log_debug("%s: page %u to %u", fn
, b
, c
);
/* Record terminator: b is written twice (presumably b == 0 here —
 * the assignment is elided; confirm against upstream). */
166 fwrite(&b
, sizeof(b
), 1, pack
);
167 fwrite(&b
, sizeof(b
), 1, pack
);
/* Cleanup path: unmap if the mapping succeeded, then close the fd. */
170 if (start
!= MAP_FAILED
)
174 close_nointr_nofail(fd
);
/* Return the physical byte offset on disk of the first extent of the
 * file referenced by fd, or 0 when it cannot be determined (ioctl
 * failure, no extents, or extent location unknown).  Used to sort
 * files by on-disk position on rotating media.
 *
 * Fix: the ioctl buffer was used without being declared/zeroed in
 * the visible code; FS_IOC_FIEMAP reads fm_start/fm_flags from the
 * buffer, so it must be zero-initialized, and the fiemap header must
 * be immediately followed by room for the one extent we request. */
static unsigned long fd_first_block(int fd) {
        struct {
                struct fiemap fiemap;
                struct fiemap_extent extent;
        } data;

        memset(&data, 0, sizeof(data));
        data.fiemap.fm_length = ~0ULL;    /* whole file */
        data.fiemap.fm_extent_count = 1;  /* we only need the first extent */

        if (ioctl(fd, FS_IOC_FIEMAP, &data) < 0)
                return 0;

        if (data.fiemap.fm_mapped_extents <= 0)
                return 0;

        if (data.fiemap.fm_extents[0].fe_flags & FIEMAP_EXTENT_UNKNOWN)
                return 0;

        return (unsigned long) data.fiemap.fm_extents[0].fe_physical;
}
206 static int qsort_compare(const void *a
, const void *b
) {
207 const struct item
*i
, *j
;
212 if (i
->block
< j
->block
)
214 if (i
->block
> j
->block
)
217 return strcmp(i
->path
, j
->path
);
/* collect(): the main collection pass.  Watches the mount below `root`
 * via fanotify, records every file opened during boot (keyed by path,
 * value = SECTOR_TO_PTR(first disk block)), then writes the
 * .readahead pack file — in access order on SSD/btrfs, sorted by disk
 * block on rotating media.
 *
 * NOTE(review): this extract is elided — many original lines are
 * missing between the fragments below (the fd-index enum wrapper,
 * several variable declarations, braces, goto finish paths, the
 * finish: label and final return).  Comments state only what the
 * visible fragments show. */
220 static int collect(const char *root
) {
222 FD_FANOTIFY
, /* Get the actual fs events */
224 FD_INOTIFY
, /* We get notifications to quit early via this fd */
227 struct pollfd pollfd
[_FD_MAX
];
228 int fanotify_fd
= -1, signal_fd
= -1, inotify_fd
= -1, r
= 0;
230 Hashmap
*files
= NULL
;
235 char *pack_fn_new
= NULL
, *pack_fn
= NULL
;
236 bool on_ssd
, on_btrfs
;
239 uint64_t previous_block_readahead
;
240 bool previous_block_readahead_set
= false;
244 if (asprintf(&pack_fn
, "%s/.readahead", root
) < 0) {
245 log_error("Out of memory.");
250 /* If there's no pack file yet we lower the kernel readahead
251 * so that mincore() is accurate. If there is a pack file
252 * already we assume it is accurate enough so that kernel
253 * readahead is never triggered. */
254 previous_block_readahead_set
=
255 access(pack_fn
, F_OK
) < 0 &&
256 block_get_readahead(root
, &previous_block_readahead
) >= 0 &&
257 block_set_readahead(root
, 8*1024) >= 0;
/* Run with idle I/O priority so collection does not compete with boot. */
259 if (ioprio_set(IOPRIO_WHO_PROCESS
, getpid(), IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE
, 0)) < 0)
260 log_warning("Failed to set IDLE IO priority class: %m");
/* Block SIGINT/SIGTERM and take delivery through a signalfd instead,
 * so termination is handled inside the poll() loop below. */
262 assert_se(sigemptyset(&mask
) == 0);
263 sigset_add_many(&mask
, SIGINT
, SIGTERM
, -1);
264 assert_se(sigprocmask(SIG_SETMASK
, &mask
, NULL
) == 0);
266 if ((signal_fd
= signalfd(-1, &mask
, SFD_NONBLOCK
|SFD_CLOEXEC
)) < 0) {
267 log_error("signalfd(): %m");
/* path -> SECTOR_TO_PTR(first block) map of everything observed. */
272 if (!(files
= hashmap_new(string_hash_func
, string_compare_func
))) {
273 log_error("Failed to allocate set.");
/* Watch every open() on the entire mount below root. */
278 if ((fanotify_fd
= fanotify_init(FAN_CLOEXEC
|FAN_NONBLOCK
, O_RDONLY
|O_LARGEFILE
|O_CLOEXEC
|O_NOATIME
)) < 0) {
279 log_error("Failed to create fanotify object: %m");
284 if (fanotify_mark(fanotify_fd
, FAN_MARK_ADD
|FAN_MARK_MOUNT
, FAN_OPEN
, AT_FDCWD
, root
) < 0) {
285 log_error("Failed to mark %s: %m", root
);
290 if ((inotify_fd
= open_inotify()) < 0) {
/* Collection deadline. */
295 not_after
= now(CLOCK_MONOTONIC
) + arg_timeout
;
300 pollfd
[FD_FANOTIFY
].fd
= fanotify_fd
;
301 pollfd
[FD_FANOTIFY
].events
= POLLIN
;
302 pollfd
[FD_SIGNAL
].fd
= signal_fd
;
303 pollfd
[FD_SIGNAL
].events
= POLLIN
;
304 pollfd
[FD_INOTIFY
].fd
= inotify_fd
;
305 pollfd
[FD_INOTIFY
].events
= POLLIN
;
309 "STATUS=Collecting readahead data");
311 log_debug("Collecting...");
/* Out-of-band requests created before we started watching:
 * "cancel" aborts collection, "done" ends it early. */
313 if (access("/run/systemd/readahead/cancel", F_OK
) >= 0) {
314 log_debug("Collection canceled");
319 if (access("/run/systemd/readahead/done", F_OK
) >= 0) {
320 log_debug("Got termination request");
326 struct fanotify_event_metadata metadata
;
330 struct fanotify_event_metadata
*m
;
/* Stop once the map outgrows arg_files_max or the deadline passes. */
334 if (hashmap_size(files
) > arg_files_max
) {
335 log_debug("Reached maximum number of read ahead files, ending collection.");
339 t
= now(CLOCK_MONOTONIC
);
340 if (t
>= not_after
) {
341 log_debug("Reached maximum collection time, ending collection.");
/* Wait for fanotify/signal/inotify activity until the deadline. */
345 if ((h
= poll(pollfd
, _FD_MAX
, (int) ((not_after
- t
) / USEC_PER_MSEC
))) < 0) {
350 log_error("poll(): %m");
356 log_debug("Reached maximum collection time, ending collection.");
360 if (pollfd
[FD_SIGNAL
].revents
) {
361 log_debug("Got signal.");
/* Drain inotify: creation of "cancel"/"done" in the runtime dir
 * terminates the loop while it is running. */
365 if (pollfd
[FD_INOTIFY
].revents
) {
366 uint8_t inotify_buffer
[sizeof(struct inotify_event
) + FILENAME_MAX
];
367 struct inotify_event
*e
;
369 if ((n
= read(inotify_fd
, &inotify_buffer
, sizeof(inotify_buffer
))) < 0) {
370 if (errno
== EINTR
|| errno
== EAGAIN
)
373 log_error("Failed to read inotify event: %m");
378 e
= (struct inotify_event
*) inotify_buffer
;
382 if ((e
->mask
& IN_CREATE
) && streq(e
->name
, "cancel")) {
383 log_debug("Collection canceled");
388 if ((e
->mask
& IN_CREATE
) && streq(e
->name
, "done")) {
389 log_debug("Got termination request");
/* Advance to the next variable-length inotify event in the buffer. */
393 step
= sizeof(struct inotify_event
) + e
->len
;
394 assert(step
<= (size_t) n
);
396 e
= (struct inotify_event
*) ((uint8_t*) e
+ step
);
/* Read a batch of fanotify event metadata records. */
401 if ((n
= read(fanotify_fd
, &data
, sizeof(data
))) < 0) {
403 if (errno
== EINTR
|| errno
== EAGAIN
)
406 /* fanotify sometimes returns EACCES on read()
407 * where it shouldn't. For now let's just
408 * ignore it here (which is safe), but
409 * eventually this should be
410 * dropped when the kernel is fixed.
412 * https://bugzilla.redhat.com/show_bug.cgi?id=707577 */
416 log_error("Failed to read event: %m");
421 for (m
= &data
.metadata
; FAN_EVENT_OK(m
, n
); m
= FAN_EVENT_NEXT(m
, n
)) {
/* Skip events caused by ourselves and by the replay process
 * (shared->replay is read after a full memory barrier). */
428 if (m
->pid
== my_pid
)
431 __sync_synchronize();
432 if (m
->pid
== shared
->replay
)
/* Resolve the event's fd back to a path via /proc/self/fd. */
435 snprintf(fn
, sizeof(fn
), "/proc/self/fd/%i", m
->fd
);
438 if ((k
= readlink_malloc(fn
, &p
)) >= 0) {
439 if (startswith(p
, "/tmp") ||
440 endswith(p
, " (deleted)") ||
441 hashmap_get(files
, p
))
442 /* Not interesting, or
/* Record the file: key = path, value = encoded first disk block. */
448 ul
= fd_first_block(m
->fd
);
450 if ((k
= hashmap_put(files
, p
, SECTOR_TO_PTR(ul
))) < 0) {
451 log_warning("set_put() failed: %s", strerror(-k
));
457 log_warning("readlink(%s) failed: %s", fn
, strerror(-k
));
/* Always close the fd fanotify handed us. */
461 close_nointr_nofail(m
->fd
);
/* Collection finished: stop watching before writing the pack file so
 * our own reads do not feed back into the event stream. */
466 if (fanotify_fd
>= 0) {
467 close_nointr_nofail(fanotify_fd
);
471 log_debug("Writing Pack File...");
473 on_ssd
= fs_on_ssd(root
) > 0;
474 log_debug("On SSD: %s", yes_no(on_ssd
));
476 on_btrfs
= statfs(root
, &sfs
) >= 0 && (long) sfs
.f_type
== (long) BTRFS_SUPER_MAGIC
;
477 log_debug("On btrfs: %s", yes_no(on_btrfs
));
/* Write to .readahead.new, then rename over .readahead at the end. */
479 if (asprintf(&pack_fn_new
, "%s/.readahead.new", root
) < 0) {
480 log_error("Out of memory.");
485 pack
= fopen(pack_fn_new
, "we");
487 log_error("Failed to open pack file: %m");
/* Header: version string plus 'S' (SSD) or 'R' (rotating) marker. */
492 fputs(CANONICAL_HOST READAHEAD_PACK_FILE_VERSION
, pack
);
493 putc(on_ssd
? 'S' : 'R', pack
);
495 if (on_ssd
|| on_btrfs
) {
497 /* On SSD or on btrfs, just write things out in the
498 * order the files were accessed. */
500 HASHMAP_FOREACH_KEY(q
, p
, files
, i
)
501 pack_file(pack
, p
, on_btrfs
);
503 struct item
*ordered
, *j
;
506 /* On rotating media, order things by the block
509 log_debug("Ordering...");
511 n
= hashmap_size(files
);
512 if (!(ordered
= new(struct item
, n
))) {
513 log_error("Out of memory.");
/* Flatten the hashmap into the ordered[] array (path assignment and
 * the j increment are elided here — confirm against upstream). */
519 HASHMAP_FOREACH_KEY(q
, p
, files
, i
) {
521 j
->block
= PTR_TO_SECTOR(q
);
525 assert(ordered
+ n
== j
);
/* Sort by disk block (path as tie breaker), emit in disk order. */
527 qsort(ordered
, n
, sizeof(struct item
), qsort_compare
);
529 for (k
= 0; k
< n
; k
++)
530 pack_file(pack
, ordered
[k
].path
, on_btrfs
);
535 log_debug("Finalizing...");
540 log_error("Failed to write pack file.");
/* Atomically replace any previous pack file. */
545 if (rename(pack_fn_new
, pack_fn
) < 0) {
546 log_error("Failed to rename readahead file: %m");
/* Cleanup: close remaining fds, free all hashmap keys. */
557 if (fanotify_fd
>= 0)
558 close_nointr_nofail(fanotify_fd
);
561 close_nointr_nofail(signal_fd
);
564 close_nointr_nofail(inotify_fd
);
573 while ((p
= hashmap_steal_first_key(files
)))
578 if (previous_block_readahead_set
) {
581 /* Restore the original kernel readahead setting if we
582 * changed it, and nobody has overwritten it since
584 if (block_get_readahead(root
, &bytes
) >= 0 && bytes
== 8*1024)
585 block_set_readahead(root
, previous_block_readahead
);
591 int main_collect(const char *root
) {
596 /* Skip this step on read-only media. Note that we check the
597 * underlying block device here, not he read-only flag of the
598 * file system on top, since that one is most likely mounted
599 * read-only anyway at boot, even if the underlying block
600 * device is theoretically writable. */
601 if (fs_on_read_only(root
) > 0) {
602 log_info("Disabling readahead collector due to read-only media.");
607 log_info("Disabling readahead collector due to low memory.");
611 shared
= shared_get();
615 shared
->collect
= getpid();
616 __sync_synchronize();
618 if (collect(root
) < 0)