]> git.ipfire.org Git - thirdparty/systemd.git/blob - src/readahead/readahead-collect.c
Modernization
[thirdparty/systemd.git] / src / readahead / readahead-collect.c
1 /*-*- Mode: C; c-basic-offset: 8; indent-tabs-mode: nil -*-*/
2
3 /***
4 This file is part of systemd.
5
6 Copyright 2010 Lennart Poettering
7
8 systemd is free software; you can redistribute it and/or modify it
9 under the terms of the GNU Lesser General Public License as published by
10 the Free Software Foundation; either version 2.1 of the License, or
11 (at your option) any later version.
12
13 systemd is distributed in the hope that it will be useful, but
14 WITHOUT ANY WARRANTY; without even the implied warranty of
15 MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
16 Lesser General Public License for more details.
17
18 You should have received a copy of the GNU Lesser General Public License
19 along with systemd; If not, see <http://www.gnu.org/licenses/>.
20 ***/
21
22 #include <errno.h>
23 #include <inttypes.h>
24 #include <fcntl.h>
25 #include <linux/limits.h>
26 #include <stdbool.h>
27 #include <stdio.h>
28 #include <stdlib.h>
29 #include <string.h>
30 #include <sys/select.h>
31 #include <sys/time.h>
32 #include <sys/types.h>
33 #include <sys/stat.h>
34 #include <unistd.h>
35 #include <linux/fanotify.h>
36 #include <sys/signalfd.h>
37 #include <sys/poll.h>
38 #include <sys/mman.h>
39 #include <linux/fs.h>
40 #include <linux/fiemap.h>
41 #include <sys/ioctl.h>
42 #include <sys/vfs.h>
43 #include <getopt.h>
44 #include <sys/inotify.h>
45 #include <math.h>
46
47 #ifdef HAVE_FANOTIFY_INIT
48 #include <sys/fanotify.h>
49 #endif
50
51 #include <systemd/sd-daemon.h>
52
53 #include "missing.h"
54 #include "util.h"
55 #include "set.h"
56 #include "ioprio.h"
57 #include "readahead-common.h"
58 #include "virt.h"
59
60 /* fixme:
61 *
62 * - detect ssd on btrfs/lvm...
63 * - read ahead directories
64 * - gzip?
65 * - remount rw?
66 * - handle files where nothing is in mincore
67 * - does ioprio_set work with fadvise()?
68 */
69
/* Shared-memory area used to exchange PIDs with the replay side
 * (this side publishes shared->collect in main_collect(); the event
 * loop below skips events coming from shared->replay). */
70 static ReadaheadShared *shared = NULL;
/* CLOCK_MONOTONIC timestamp taken when collection starts; used below
 * to assign each accessed file to a 2s time bin (see the /2000000). */
71 static usec_t starttime;
72
/* Convert sector numbers to/from pointers for storage in pointer-keyed
 * containers; the +1/-1 keeps sector 0 distinct from NULL.
 * NOTE(review): not referenced anywhere in this chunk -- presumably
 * used elsewhere in the readahead sources; confirm before removing. */
73 /* Avoid collisions with the NULL pointer */
74 #define SECTOR_TO_PTR(s) ULONG_TO_PTR((s)+1)
75 #define PTR_TO_SECTOR(p) (PTR_TO_ULONG(p)-1)
76
77 static int btrfs_defrag(int fd) {
78 struct btrfs_ioctl_vol_args data;
79
80 zero(data);
81 data.fd = fd;
82
83 return ioctl(fd, BTRFS_IOC_DEFRAG, &data);
84 }
85
/* pack_file() -- append one file's readahead record to the open pack
 * stream.
 *
 * Record layout, exactly as emitted below: the file path terminated by
 * '\n'; the inode number as a raw uint64_t (stored so a later replay
 * can notice the file was deleted/replaced); then zero or more
 * (start, end) page-index pairs as raw uint32_t values -- the ranges
 * mincore() reported as resident -- terminated by the pair (0, 0).
 * An empty range list means "read the entire file".
 *
 * Returns 0 on success and also when the file vanished or is not
 * accessible (ENOENT/EPERM/EACCES are deliberately not errors here);
 * a negative errno-style code on real failures.  fwrite() results are
 * not checked here: the caller checks ferror() on the stream once at
 * the end. */
86 static int pack_file(FILE *pack, const char *fn, bool on_btrfs) {
87 struct stat st;
88 void *start = MAP_FAILED;
89 uint8_t *vec;
90 uint32_t b, c;
91 uint64_t inode;
92 size_t l, pages;
93 bool mapped;
94 int r = 0, fd = -1, k;
95
96 assert(pack);
97 assert(fn);
98
/* O_NOATIME so this pass does not perturb access times;
 * O_NOFOLLOW to avoid following (possibly hostile) symlinks. */
99 fd = open(fn, O_RDONLY|O_CLOEXEC|O_NOATIME|O_NOCTTY|O_NOFOLLOW);
100 if (fd < 0) {
101
102 if (errno == ENOENT)
103 return 0;
104
105 if (errno == EPERM || errno == EACCES)
106 return 0;
107
108 log_warning("open(%s) failed: %m", fn);
109 r = -errno;
110 goto finish;
111 }
112
/* Validates the fd (regular file, size limit); <= 0 means skip/error. */
113 k = file_verify(fd, fn, arg_file_size_max, &st);
114 if (k <= 0) {
115 r = k;
116 goto finish;
117 }
118
119 if (on_btrfs)
120 btrfs_defrag(fd);
121
/* Map the whole file and query which pages are in the page cache. */
122 l = PAGE_ALIGN(st.st_size);
123 start = mmap(NULL, l, PROT_READ, MAP_SHARED, fd, 0);
124 if (start == MAP_FAILED) {
125 log_warning("mmap(%s) failed: %m", fn);
126 r = -errno;
127 goto finish;
128 }
129
/* NOTE(review): alloca() of one byte per page -- assumed small because
 * file_verify() capped st_size at arg_file_size_max above; confirm the
 * configured limit keeps this stack allocation bounded. */
130 pages = l / page_size();
131 vec = alloca(pages);
132 memset(vec, 0, pages);
133 if (mincore(start, l, vec) < 0) {
134 log_warning("mincore(%s) failed: %m", fn);
135 r = -errno;
136 goto finish;
137 }
138
139 fputs(fn, pack);
140 fputc('\n', pack);
141
142 /* Store the inode, so that we notice when the file is deleted */
143 inode = (uint64_t) st.st_ino;
144 fwrite(&inode, sizeof(inode), 1, pack);
145
/* Walk the mincore vector and emit half-open [b, c) ranges of
 * resident pages; bit 0 of each vector byte is the residency flag. */
146 mapped = false;
147 for (c = 0; c < pages; c++) {
148 bool new_mapped = !!(vec[c] & 1);
149
150 if (!mapped && new_mapped)
151 b = c;
152 else if (mapped && !new_mapped) {
153 fwrite(&b, sizeof(b), 1, pack);
154 fwrite(&c, sizeof(c), 1, pack);
155
156 log_debug("%s: page %u to %u", fn, b, c);
157 }
158
159 mapped = new_mapped;
160 }
161
162 /* We don't write any range data if we should read the entire file */
163 if (mapped && b > 0) {
164 fwrite(&b, sizeof(b), 1, pack);
165 fwrite(&c, sizeof(c), 1, pack);
166
167 log_debug("%s: page %u to %u", fn, b, c);
168 }
169
170 /* End marker */
171 b = 0;
172 fwrite(&b, sizeof(b), 1, pack);
173 fwrite(&b, sizeof(b), 1, pack);
174
175 finish:
176 if (start != MAP_FAILED)
177 munmap(start, l);
178
179 if (fd >= 0)
180 close_nointr_nofail(fd);
181
182 return r;
183 }
184
/* Return the physical (on-disk) byte address of the first extent of
 * the file behind 'fd', as reported by the FIEMAP ioctl.  Returns 0
 * when the ioctl fails, no extents are mapped, or the first extent's
 * location is unknown -- callers treat 0 as "no ordering info". */
static unsigned long fd_first_block(int fd) {
        struct {
                struct fiemap fiemap;
                struct fiemap_extent extent;    /* room for one extent record */
        } data;

        memset(&data, 0, sizeof(data));
        data.fiemap.fm_length = ~0ULL;          /* whole file */
        data.fiemap.fm_extent_count = 1;        /* we only need the first one */

        if (ioctl(fd, FS_IOC_FIEMAP, &data) < 0 ||
            data.fiemap.fm_mapped_extents <= 0 ||
            (data.fiemap.fm_extents[0].fe_flags & FIEMAP_EXTENT_UNKNOWN))
                return 0;

        return (unsigned long) data.fiemap.fm_extents[0].fe_physical;
}
206
/* One collected file: its path, the physical location of its first
 * extent, and the 2s time bin in which it was first accessed. */
struct item {
        const char *path;
        unsigned long block;
        unsigned long bin;
};

/* qsort() comparator: order by access-time bin, then by first disk
 * sector within a bin, finally by path for a deterministic total
 * order.  Returns <0, 0 or >0 in the usual qsort convention. */
static int qsort_compare(const void *a, const void *b) {
        const struct item *x = a, *y = b;

        if (x->bin != y->bin)
                return x->bin < y->bin ? -1 : 1;

        if (x->block != y->block)
                return x->block < y->block ? -1 : 1;

        return strcmp(x->path, y->path);
}
233
/* collect() -- watch the whole mount below 'root' with fanotify and
 * record every file opened (skipping our own accesses and those of the
 * replay process) until the timeout elapses, SIGINT/SIGTERM arrives,
 * the configured file limit is reached, or a "cancel"/"done" trigger
 * file appears in /run/systemd/readahead.  The collected set is then
 * written out as <root>/.readahead via pack_file() -- in access order
 * on SSD/btrfs, sorted by time bin and first disk block otherwise.
 *
 * Returns 0 on success, -ECANCELED when cancelled externally, or a
 * negative errno-style code on failure. */
234 static int collect(const char *root) {
235 enum {
236 FD_FANOTIFY, /* Get the actual fs events */
237 FD_SIGNAL,
238 FD_INOTIFY, /* We get notifications to quit early via this fd */
239 _FD_MAX
240 };
241 struct pollfd pollfd[_FD_MAX];
242 int fanotify_fd = -1, signal_fd = -1, inotify_fd = -1, r = 0;
243 pid_t my_pid;
244 Hashmap *files = NULL;
245 Iterator i;
246 char *p, *q;
247 sigset_t mask;
248 FILE *pack = NULL;
249 char *pack_fn_new = NULL, *pack_fn = NULL;
250 bool on_ssd, on_btrfs;
251 struct statfs sfs;
252 usec_t not_after;
253 uint64_t previous_block_readahead;
254 bool previous_block_readahead_set = false;
255
256 assert(root);
257
258 if (asprintf(&pack_fn, "%s/.readahead", root) < 0) {
259 r = log_oom();
260 goto finish;
261 }
262
263 starttime = now(CLOCK_MONOTONIC);
264
265 /* If there's no pack file yet we lower the kernel readahead
266 * so that mincore() is accurate. If there is a pack file
267 * already we assume it is accurate enough so that kernel
268 * readahead is never triggered. */
269 previous_block_readahead_set =
270 access(pack_fn, F_OK) < 0 &&
271 block_get_readahead(root, &previous_block_readahead) >= 0 &&
272 block_set_readahead(root, 8*1024) >= 0;
273
/* Run with idle IO priority so collection does not slow down boot. */
274 if (ioprio_set(IOPRIO_WHO_PROCESS, getpid(), IOPRIO_PRIO_VALUE(IOPRIO_CLASS_IDLE, 0)) < 0)
275 log_warning("Failed to set IDLE IO priority class: %m");
276
/* Block SIGINT/SIGTERM and receive them through a signalfd instead,
 * so they can be multiplexed with the event fds in poll() below. */
277 assert_se(sigemptyset(&mask) == 0);
278 sigset_add_many(&mask, SIGINT, SIGTERM, -1);
279 assert_se(sigprocmask(SIG_SETMASK, &mask, NULL) == 0);
280
281 if ((signal_fd = signalfd(-1, &mask, SFD_NONBLOCK|SFD_CLOEXEC)) < 0) {
282 log_error("signalfd(): %m");
283 r = -errno;
284 goto finish;
285 }
286
287 files = hashmap_new(string_hash_func, string_compare_func);
288 if (!files) {
289 log_error("Failed to allocate set.");
290 r = -ENOMEM;
291 goto finish;
292 }
293
294 fanotify_fd = fanotify_init(FAN_CLOEXEC|FAN_NONBLOCK,
295 O_RDONLY|O_LARGEFILE|O_CLOEXEC|O_NOATIME);
296 if (fanotify_fd < 0) {
297 log_error("Failed to create fanotify object: %m");
298 r = -errno;
299 goto finish;
300 }
301
/* Watch FAN_OPEN events for the entire mount containing 'root'. */
302 if (fanotify_mark(fanotify_fd, FAN_MARK_ADD|FAN_MARK_MOUNT, FAN_OPEN, AT_FDCWD, root) < 0) {
303 log_error("Failed to mark %s: %m", root);
304 r = -errno;
305 goto finish;
306 }
307
308 inotify_fd = open_inotify();
309 if (inotify_fd < 0) {
310 r = inotify_fd;
311 goto finish;
312 }
313
314 not_after = now(CLOCK_MONOTONIC) + arg_timeout;
315
316 my_pid = getpid();
317
318 zero(pollfd);
319 pollfd[FD_FANOTIFY].fd = fanotify_fd;
320 pollfd[FD_FANOTIFY].events = POLLIN;
321 pollfd[FD_SIGNAL].fd = signal_fd;
322 pollfd[FD_SIGNAL].events = POLLIN;
323 pollfd[FD_INOTIFY].fd = inotify_fd;
324 pollfd[FD_INOTIFY].events = POLLIN;
325
326 sd_notify(0,
327 "READY=1\n"
328 "STATUS=Collecting readahead data");
329
330 log_debug("Collecting...");
331
/* Catch triggers that were created before our inotify watch existed. */
332 if (access("/run/systemd/readahead/cancel", F_OK) >= 0) {
333 log_debug("Collection canceled");
334 r = -ECANCELED;
335 goto finish;
336 }
337
338 if (access("/run/systemd/readahead/done", F_OK) >= 0) {
339 log_debug("Got termination request");
340 goto done;
341 }
342
/* Event loop: poll the fanotify fd (file accesses), the signalfd and
 * the inotify fd (cancel/done triggers) until a stop condition fires. */
343 for (;;) {
344 union {
345 struct fanotify_event_metadata metadata;
346 char buffer[4096];
347 } data;
348 ssize_t n;
349 struct fanotify_event_metadata *m;
350 usec_t t;
351 int h;
352
353 if (hashmap_size(files) > arg_files_max) {
354 log_debug("Reached maximum number of read ahead files, ending collection.");
355 break;
356 }
357
358 t = now(CLOCK_MONOTONIC);
359 if (t >= not_after) {
360 log_debug("Reached maximum collection time, ending collection.");
361 break;
362 }
363
364 if ((h = poll(pollfd, _FD_MAX, (int) ((not_after - t) / USEC_PER_MSEC))) < 0) {
365
366 if (errno == EINTR)
367 continue;
368
369 log_error("poll(): %m");
370 r = -errno;
371 goto finish;
372 }
373
374 if (h == 0) {
375 log_debug("Reached maximum collection time, ending collection.");
376 break;
377 }
378
379 if (pollfd[FD_SIGNAL].revents) {
380 log_debug("Got signal.");
381 break;
382 }
383
384 if (pollfd[FD_INOTIFY].revents) {
385 uint8_t inotify_buffer[sizeof(struct inotify_event) + FILENAME_MAX];
386 struct inotify_event *e;
387
388 if ((n = read(inotify_fd, &inotify_buffer, sizeof(inotify_buffer))) < 0) {
389 if (errno == EINTR || errno == EAGAIN)
390 continue;
391
392 log_error("Failed to read inotify event: %m");
393 r = -errno;
394 goto finish;
395 }
396
/* Walk all inotify events in the buffer, looking for the trigger files. */
397 e = (struct inotify_event*) inotify_buffer;
398 while (n > 0) {
399 size_t step;
400
401 if ((e->mask & IN_CREATE) && streq(e->name, "cancel")) {
402 log_debug("Collection canceled");
403 r = -ECANCELED;
404 goto finish;
405 }
406
407 if ((e->mask & IN_CREATE) && streq(e->name, "done")) {
408 log_debug("Got termination request");
409 goto done;
410 }
411
412 step = sizeof(struct inotify_event) + e->len;
413 assert(step <= (size_t) n);
414
415 e = (struct inotify_event*) ((uint8_t*) e + step);
416 n -= step;
417 }
418 }
419
420 if ((n = read(fanotify_fd, &data, sizeof(data))) < 0) {
421
422 if (errno == EINTR || errno == EAGAIN)
423 continue;
424
425 /* fanotify sometimes returns EACCES on read()
426 * where it shouldn't. For now let's just
427 * ignore it here (which is safe), but
428 * eventually this should be
429 * dropped when the kernel is fixed.
430 *
431 * https://bugzilla.redhat.com/show_bug.cgi?id=707577 */
432 if (errno == EACCES)
433 continue;
434
435 log_error("Failed to read event: %m");
436 r = -errno;
437 goto finish;
438 }
439
/* Each fanotify event carries an open fd for the accessed file; we
 * resolve it to a path via /proc/self/fd and must close it below. */
440 for (m = &data.metadata; FAN_EVENT_OK(m, n); m = FAN_EVENT_NEXT(m, n)) {
441 char fn[PATH_MAX];
442 int k;
443
444 if (m->fd < 0)
445 goto next_iteration;
446
447 if (m->pid == my_pid)
448 goto next_iteration;
449
/* Memory barrier before reading shared->replay, which the replay
 * process updates from another process via the shared mapping. */
450 __sync_synchronize();
451 if (m->pid == shared->replay)
452 goto next_iteration;
453
454 snprintf(fn, sizeof(fn), "/proc/self/fd/%i", m->fd);
455 char_array_0(fn);
456
457 if ((k = readlink_malloc(fn, &p)) >= 0) {
458 if (startswith(p, "/tmp") ||
459 endswith(p, " (deleted)") ||
460 hashmap_get(files, p))
461 /* Not interesting, or
462 * already read */
463 free(p);
464 else {
465 unsigned long ul;
466 usec_t entrytime;
467 struct item *entry;
468
469 entry = new0(struct item, 1);
470 if (!entry) {
471 r = log_oom();
472 goto finish;
473 }
474
475 ul = fd_first_block(m->fd);
476
477 entrytime = now(CLOCK_MONOTONIC);
478
479 entry->block = ul;
480 entry->path = strdup(p);
481 if (!entry->path) {
482 free(entry);
483 r = log_oom();
484 goto finish;
485 }
486 entry->bin = (entrytime - starttime) / 2000000;
487
/* The hashmap takes ownership of key 'p' on success; on failure we
 * free it here (the 'entry' value leaks in that unlikely case). */
488 if ((k = hashmap_put(files, p, entry)) < 0) {
489 log_warning("set_put() failed: %s", strerror(-k));
490 free(p);
491 }
492 }
493
494 } else
495 log_warning("readlink(%s) failed: %s", fn, strerror(-k));
496
497 next_iteration:
498 if (m->fd >= 0)
499 close_nointr_nofail(m->fd);
500 }
501 }
502
/* Collection finished.  Close the fanotify fd first -- presumably so
 * our own file accesses while writing the pack file below are no
 * longer tracked; confirm against the replay-side expectations. */
503 done:
504 if (fanotify_fd >= 0) {
505 close_nointr_nofail(fanotify_fd);
506 fanotify_fd = -1;
507 }
508
509 log_debug("Writing Pack File...");
510
511 on_ssd = fs_on_ssd(root) > 0;
512 log_debug("On SSD: %s", yes_no(on_ssd));
513
514 on_btrfs = statfs(root, &sfs) >= 0 && (long) sfs.f_type == (long) BTRFS_SUPER_MAGIC;
515 log_debug("On btrfs: %s", yes_no(on_btrfs));
516
/* Write to a temp file and rename() it into place below, so readers
 * never see a partially written pack file. */
517 if (asprintf(&pack_fn_new, "%s/.readahead.new", root) < 0) {
518 r = log_oom();
519 goto finish;
520 }
521
522 pack = fopen(pack_fn_new, "we");
523 if (!pack) {
524 log_error("Failed to open pack file: %m");
525 r = -errno;
526 goto finish;
527 }
528
529 fputs(CANONICAL_HOST READAHEAD_PACK_FILE_VERSION, pack);
530 putc(on_ssd ? 'S' : 'R', pack);
531
532 if (on_ssd || on_btrfs) {
533
534 /* On SSD or on btrfs, just write things out in the
535 * order the files were accessed. */
536
537 HASHMAP_FOREACH_KEY(q, p, files, i)
538 pack_file(pack, p, on_btrfs);
539 } else {
540 struct item *ordered, *j;
541 unsigned k, n;
542
543 /* On rotating media, order things by the block
544 * numbers */
545
546 log_debug("Ordering...");
547
548 n = hashmap_size(files);
549 if (!(ordered = new(struct item, n))) {
550 r = log_oom();
551 goto finish;
552 }
553
554 j = ordered;
555 HASHMAP_FOREACH_KEY(q, p, files, i) {
556 memcpy(j, q, sizeof(struct item));
557 j++;
558 }
559
560 assert(ordered + n == j);
561
562 qsort(ordered, n, sizeof(struct item), qsort_compare);
563
564 for (k = 0; k < n; k++)
565 pack_file(pack, ordered[k].path, on_btrfs);
566
567 free(ordered);
568 }
569
570 log_debug("Finalizing...");
571
572 fflush(pack);
573
574 if (ferror(pack)) {
575 log_error("Failed to write pack file.");
576 r = -EIO;
577 goto finish;
578 }
579
580 if (rename(pack_fn_new, pack_fn) < 0) {
581 log_error("Failed to rename readahead file: %m");
582 r = -errno;
583 goto finish;
584 }
585
586 fclose(pack);
587 pack = NULL;
588
589 log_debug("Done.");
590
591 finish:
592 if (fanotify_fd >= 0)
593 close_nointr_nofail(fanotify_fd);
594
595 if (signal_fd >= 0)
596 close_nointr_nofail(signal_fd);
597
598 if (inotify_fd >= 0)
599 close_nointr_nofail(inotify_fd);
600
601 if (pack) {
602 fclose(pack);
603 unlink(pack_fn_new);
604 }
605 free(pack_fn_new);
606 free(pack_fn);
607
/* NOTE(review): only the hashmap keys are freed here; the struct item
 * values (and their strdup'd paths) appear to leak -- tolerable for a
 * one-shot process, but confirm if collect() is ever called twice. */
608 while ((p = hashmap_steal_first_key(files)))
609 free(p);
610
611 hashmap_free(files);
612
613 if (previous_block_readahead_set) {
614 uint64_t bytes;
615
616 /* Restore the original kernel readahead setting if we
617 * changed it, and nobody has overwritten it since
618 * yet. */
619 if (block_get_readahead(root, &bytes) >= 0 && bytes == 8*1024)
620 block_set_readahead(root, previous_block_readahead);
621 }
622
623 return r;
624 }
625
626 int main_collect(const char *root) {
627
628 if (!root)
629 root = "/";
630
631 /* Skip this step on read-only media. Note that we check the
632 * underlying block device here, not he read-only flag of the
633 * file system on top, since that one is most likely mounted
634 * read-only anyway at boot, even if the underlying block
635 * device is theoretically writable. */
636 if (fs_on_read_only(root) > 0) {
637 log_info("Disabling readahead collector due to read-only media.");
638 return EXIT_SUCCESS;
639 }
640
641 if (!enough_ram()) {
642 log_info("Disabling readahead collector due to low memory.");
643 return EXIT_SUCCESS;
644 }
645
646 shared = shared_get();
647 if (!shared)
648 return EXIT_FAILURE;
649
650 shared->collect = getpid();
651 __sync_synchronize();
652
653 if (collect(root) < 0)
654 return EXIT_FAILURE;
655
656 return EXIT_SUCCESS;
657 }