src/journal/journald-server.c
1 /* SPDX-License-Identifier: LGPL-2.1+ */
2
3 #if HAVE_SELINUX
4 #include <selinux/selinux.h>
5 #endif
6 #include <sys/ioctl.h>
7 #include <sys/mman.h>
8 #include <sys/signalfd.h>
9 #include <sys/statvfs.h>
10 #include <linux/sockios.h>
11
12 #include "sd-daemon.h"
13 #include "sd-journal.h"
14 #include "sd-messages.h"
15
16 #include "acl-util.h"
17 #include "alloc-util.h"
18 #include "audit-util.h"
19 #include "cgroup-util.h"
20 #include "conf-parser.h"
21 #include "dirent-util.h"
22 #include "extract-word.h"
23 #include "fd-util.h"
24 #include "fileio.h"
25 #include "format-util.h"
26 #include "fs-util.h"
27 #include "hashmap.h"
28 #include "hostname-util.h"
29 #include "id128-util.h"
30 #include "io-util.h"
31 #include "journal-authenticate.h"
32 #include "journal-file.h"
33 #include "journal-internal.h"
34 #include "journal-vacuum.h"
35 #include "journald-audit.h"
36 #include "journald-context.h"
37 #include "journald-kmsg.h"
38 #include "journald-native.h"
39 #include "journald-rate-limit.h"
40 #include "journald-server.h"
41 #include "journald-stream.h"
42 #include "journald-syslog.h"
43 #include "log.h"
44 #include "missing.h"
45 #include "mkdir.h"
46 #include "parse-util.h"
47 #include "proc-cmdline.h"
48 #include "process-util.h"
49 #include "rm-rf.h"
50 #include "selinux-util.h"
51 #include "signal-util.h"
52 #include "socket-util.h"
53 #include "stdio-util.h"
54 #include "string-table.h"
55 #include "string-util.h"
56 #include "syslog-util.h"
57 #include "user-util.h"
58
59 #define USER_JOURNALS_MAX 1024
60
61 #define DEFAULT_SYNC_INTERVAL_USEC (5*USEC_PER_MINUTE)
62 #define DEFAULT_RATE_LIMIT_INTERVAL (30*USEC_PER_SEC)
63 #define DEFAULT_RATE_LIMIT_BURST 10000
64 #define DEFAULT_MAX_FILE_USEC USEC_PER_MONTH
65
66 #define RECHECK_SPACE_USEC (30*USEC_PER_SEC)
67
68 #define NOTIFY_SNDBUF_SIZE (8*1024*1024)
69
70 /* The period to insert between posting changes for coalescing */
71 #define POST_CHANGE_TIMER_INTERVAL_USEC (250*USEC_PER_MSEC)
72
73 /* Pick a good default that is likely to fit into AF_UNIX and AF_INET SOCK_DGRAM datagrams, and even leaves some room
74 * for a bit of additional metadata. */
75 #define DEFAULT_LINE_MAX (48*1024)
76
77 #define DEFERRED_CLOSES_MAX (4096)
78
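/* Sum up the on-disk size of all journal files (*.journal, *.journal~) in the given directory and
 * report it together with the space still available on the backing file system. */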
79 static int determine_path_usage(Server *s, const char *path, uint64_t *ret_used, uint64_t *ret_free) {
80 _cleanup_closedir_ DIR *d = NULL;
81 struct dirent *de;
82 struct statvfs ss;
83
84 assert(ret_used);
85 assert(ret_free);
86
87 d = opendir(path);
88 if (!d)
89 return log_full_errno(errno == ENOENT ? LOG_DEBUG : LOG_ERR,
90 errno, "Failed to open %s: %m", path);
91
92 if (fstatvfs(dirfd(d), &ss) < 0)
93 return log_error_errno(errno, "Failed to fstatvfs(%s): %m", path);
94
95 *ret_free = ss.f_bsize * ss.f_bavail;
96 *ret_used = 0;
97 FOREACH_DIRENT_ALL(de, d, break) {
98 struct stat st;
99
100 if (!endswith(de->d_name, ".journal") &&
101 !endswith(de->d_name, ".journal~"))
102 continue;
103
104 if (fstatat(dirfd(d), de->d_name, &st, AT_SYMLINK_NOFOLLOW) < 0) {
105 log_debug_errno(errno, "Failed to stat %s/%s, ignoring: %m", path, de->d_name);
106 continue;
107 }
108
109 if (!S_ISREG(st.st_mode))
110 continue;
111
112 *ret_used += (uint64_t) st.st_blocks * 512UL;
113 }
114
115 return 0;
116 }
117
118 static void cache_space_invalidate(JournalStorageSpace *space) {
119 zero(*space);
120 }
121
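/* Refresh the cached disk usage and limit figures for this storage, but at most once every
 * RECHECK_SPACE_USEC. Returns 0 if the cached values were still current, 1 if they were refreshed. */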
122 static int cache_space_refresh(Server *s, JournalStorage *storage) {
123 JournalStorageSpace *space;
124 JournalMetrics *metrics;
125 uint64_t vfs_used, vfs_avail, avail;
126 usec_t ts;
127 int r;
128
129 assert(s);
130
131 metrics = &storage->metrics;
132 space = &storage->space;
133
134 ts = now(CLOCK_MONOTONIC);
135
136 if (space->timestamp != 0 && space->timestamp + RECHECK_SPACE_USEC > ts)
137 return 0;
138
139 r = determine_path_usage(s, storage->path, &vfs_used, &vfs_avail);
140 if (r < 0)
141 return r;
142
143 space->vfs_used = vfs_used;
144 space->vfs_available = vfs_avail;
145
146 avail = LESS_BY(vfs_avail, metrics->keep_free);
147
148 space->limit = MIN(MAX(vfs_used + avail, metrics->min_use), metrics->max_use);
149 space->available = LESS_BY(space->limit, vfs_used);
150 space->timestamp = ts;
151 return 1;
152 }
153
154 static void patch_min_use(JournalStorage *storage) {
155 assert(storage);
156
157 /* Let's bump the min_use limit to the current usage on disk. We do
158 * this when starting up and first opening the journal files. This way
160 * sudden spikes in disk usage will not cause journald to vacuum files
161 * without bounds. Note that this means that only a restart of journald
161 * will make it reset this value. */
162
163 storage->metrics.min_use = MAX(storage->metrics.min_use, storage->space.vfs_used);
164 }
165
166 static int determine_space(Server *s, uint64_t *available, uint64_t *limit) {
167 JournalStorage *js;
168 int r;
169
170 assert(s);
171
172 js = s->system_journal ? &s->system_storage : &s->runtime_storage;
173
174 r = cache_space_refresh(s, js);
175 if (r >= 0) {
176 if (available)
177 *available = js->space.available;
178 if (limit)
179 *limit = js->space.limit;
180 }
181 return r;
182 }
183
184 void server_space_usage_message(Server *s, JournalStorage *storage) {
185 char fb1[FORMAT_BYTES_MAX], fb2[FORMAT_BYTES_MAX], fb3[FORMAT_BYTES_MAX],
186 fb4[FORMAT_BYTES_MAX], fb5[FORMAT_BYTES_MAX], fb6[FORMAT_BYTES_MAX];
187 JournalMetrics *metrics;
188
189 assert(s);
190
191 if (!storage)
192 storage = s->system_journal ? &s->system_storage : &s->runtime_storage;
193
194 if (cache_space_refresh(s, storage) < 0)
195 return;
196
197 metrics = &storage->metrics;
198 format_bytes(fb1, sizeof(fb1), storage->space.vfs_used);
199 format_bytes(fb2, sizeof(fb2), metrics->max_use);
200 format_bytes(fb3, sizeof(fb3), metrics->keep_free);
201 format_bytes(fb4, sizeof(fb4), storage->space.vfs_available);
202 format_bytes(fb5, sizeof(fb5), storage->space.limit);
203 format_bytes(fb6, sizeof(fb6), storage->space.available);
204
205 server_driver_message(s, 0,
206 "MESSAGE_ID=" SD_MESSAGE_JOURNAL_USAGE_STR,
207 LOG_MESSAGE("%s (%s) is %s, max %s, %s free.",
208 storage->name, storage->path, fb1, fb5, fb6),
209 "JOURNAL_NAME=%s", storage->name,
210 "JOURNAL_PATH=%s", storage->path,
211 "CURRENT_USE=%"PRIu64, storage->space.vfs_used,
212 "CURRENT_USE_PRETTY=%s", fb1,
213 "MAX_USE=%"PRIu64, metrics->max_use,
214 "MAX_USE_PRETTY=%s", fb2,
215 "DISK_KEEP_FREE=%"PRIu64, metrics->keep_free,
216 "DISK_KEEP_FREE_PRETTY=%s", fb3,
217 "DISK_AVAILABLE=%"PRIu64, storage->space.vfs_available,
218 "DISK_AVAILABLE_PRETTY=%s", fb4,
219 "LIMIT=%"PRIu64, storage->space.limit,
220 "LIMIT_PRETTY=%s", fb5,
221 "AVAILABLE=%"PRIu64, storage->space.available,
222 "AVAILABLE_PRETTY=%s", fb6,
223 NULL);
224 }
225
226 static bool uid_for_system_journal(uid_t uid) {
227
228 /* Returns true if the specified UID shall get its data stored in the system journal. */
229
230 return uid_is_system(uid) || uid_is_dynamic(uid) || uid == UID_NOBODY;
231 }
232
233 static void server_add_acls(JournalFile *f, uid_t uid) {
234 #if HAVE_ACL
235 int r;
236 #endif
237 assert(f);
238
239 #if HAVE_ACL
240 if (uid_for_system_journal(uid))
241 return;
242
243 r = add_acls_for_user(f->fd, uid);
244 if (r < 0)
245 log_warning_errno(r, "Failed to set ACL on %s, ignoring: %m", f->path);
246 #endif
247 }
248
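/* Wrapper around journal_file_open()/journal_file_open_reliably() that also arranges for change
 * notifications on the new file to be coalesced via the post-change timer (see above). */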
249 static int open_journal(
250 Server *s,
251 bool reliably,
252 const char *fname,
253 int flags,
254 bool seal,
255 JournalMetrics *metrics,
256 JournalFile **ret) {
257
258 JournalFile *f;
259 int r;
260
261 assert(s);
262 assert(fname);
263 assert(ret);
264
265 if (reliably)
266 r = journal_file_open_reliably(fname, flags, 0640, s->compress.enabled, s->compress.threshold_bytes,
267 seal, metrics, s->mmap, s->deferred_closes, NULL, &f);
268 else
269 r = journal_file_open(-1, fname, flags, 0640, s->compress.enabled, s->compress.threshold_bytes, seal,
270 metrics, s->mmap, s->deferred_closes, NULL, &f);
271
272 if (r < 0)
273 return r;
274
275 r = journal_file_enable_post_change_timer(f, s->event, POST_CHANGE_TIMER_INTERVAL_USEC);
276 if (r < 0) {
277 (void) journal_file_close(f);
278 return r;
279 }
280
281 *ret = f;
282 return r;
283 }
284
285 static bool flushed_flag_is_set(void) {
286 return access("/run/systemd/journal/flushed", F_OK) >= 0;
287 }
288
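/* Open the persistent journal in /var and/or the volatile journal in /run, as appropriate for the
 * configured storage mode, the flush state and whether /var has been relinquished. This is a no-op
 * for journals that are already open. */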
289 static int system_journal_open(Server *s, bool flush_requested, bool relinquish_requested) {
290 const char *fn;
291 int r = 0;
292
293 if (!s->system_journal &&
294 IN_SET(s->storage, STORAGE_PERSISTENT, STORAGE_AUTO) &&
295 (flush_requested || flushed_flag_is_set()) &&
296 !relinquish_requested) {
297
298 /* If in auto mode: first try to create the machine
299 * path, but not the prefix.
300 *
301 * If in persistent mode: create /var/log/journal and
302 * the machine path */
303
304 if (s->storage == STORAGE_PERSISTENT)
305 (void) mkdir_p("/var/log/journal/", 0755);
306
307 (void) mkdir(s->system_storage.path, 0755);
308
309 fn = strjoina(s->system_storage.path, "/system.journal");
310 r = open_journal(s, true, fn, O_RDWR|O_CREAT, s->seal, &s->system_storage.metrics, &s->system_journal);
311 if (r >= 0) {
312 server_add_acls(s->system_journal, 0);
313 (void) cache_space_refresh(s, &s->system_storage);
314 patch_min_use(&s->system_storage);
315 } else {
316 if (!IN_SET(r, -ENOENT, -EROFS))
317 log_warning_errno(r, "Failed to open system journal: %m");
318
319 r = 0;
320 }
321
322 /* If the runtime journal is open, and we're post-flush, we're
323 * recovering from a failed system journal rotate (ENOSPC)
324 * for which the runtime journal was reopened.
325 *
326 * Perform an implicit flush to var, leaving the runtime
327 * journal closed, now that the system journal is back.
328 */
329 if (!flush_requested)
330 (void) server_flush_to_var(s, true);
331 }
332
333 if (!s->runtime_journal &&
334 (s->storage != STORAGE_NONE)) {
335
336 fn = strjoina(s->runtime_storage.path, "/system.journal");
337
338 if (s->system_journal && !relinquish_requested) {
339
340 /* Try to open the runtime journal, but only
341 * if it already exists, so that we can flush
342 * it into the system journal */
343
344 r = open_journal(s, false, fn, O_RDWR, false, &s->runtime_storage.metrics, &s->runtime_journal);
345 if (r < 0) {
346 if (r != -ENOENT)
347 log_warning_errno(r, "Failed to open runtime journal: %m");
348
349 r = 0;
350 }
351
352 } else {
353
354 /* OK, we really need the runtime journal, so create
355 * it if necessary. */
356
357 (void) mkdir("/run/log", 0755);
358 (void) mkdir("/run/log/journal", 0755);
359 (void) mkdir_parents(fn, 0750);
360
361 r = open_journal(s, true, fn, O_RDWR|O_CREAT, false, &s->runtime_storage.metrics, &s->runtime_journal);
362 if (r < 0)
363 return log_error_errno(r, "Failed to open runtime journal: %m");
364 }
365
366 if (s->runtime_journal) {
367 server_add_acls(s->runtime_journal, 0);
368 (void) cache_space_refresh(s, &s->runtime_storage);
369 patch_min_use(&s->runtime_storage);
370 }
371 }
372
373 return r;
374 }
375
376 static JournalFile* find_journal(Server *s, uid_t uid) {
377 _cleanup_free_ char *p = NULL;
378 int r;
379 JournalFile *f;
380 sd_id128_t machine;
381
382 assert(s);
383
384 /* A rotate that fails to create the new journal (ENOSPC) leaves the
385 * rotated journal as NULL. Unless we revisit opening, even after
386 * space is made available we'll continue to return NULL indefinitely.
387 *
388 * system_journal_open() is a noop if the journals are already open, so
389 * we can just call it here to recover from failed rotates (or anything
390 * else that's left the journals as NULL).
391 *
392 * Fixes https://github.com/systemd/systemd/issues/3968 */
393 (void) system_journal_open(s, false, false);
394
395 /* We split up user logs only on /var, not on /run. If the
396 * runtime file is open, we write to it exclusively, in order
397 * to guarantee proper order as soon as we flush /run to
398 * /var and close the runtime file. */
399
400 if (s->runtime_journal)
401 return s->runtime_journal;
402
403 if (uid_for_system_journal(uid))
404 return s->system_journal;
405
406 f = ordered_hashmap_get(s->user_journals, UID_TO_PTR(uid));
407 if (f)
408 return f;
409
410 r = sd_id128_get_machine(&machine);
411 if (r < 0) {
412 log_debug_errno(r, "Failed to determine machine ID, using system log: %m");
413 return s->system_journal;
414 }
415
416 if (asprintf(&p, "/var/log/journal/" SD_ID128_FORMAT_STR "/user-"UID_FMT".journal",
417 SD_ID128_FORMAT_VAL(machine), uid) < 0) {
418 log_oom();
419 return s->system_journal;
420 }
421
422 while (ordered_hashmap_size(s->user_journals) >= USER_JOURNALS_MAX) {
423 /* Too many open? Then let's close one */
424 f = ordered_hashmap_steal_first(s->user_journals);
425 assert(f);
426 (void) journal_file_close(f);
427 }
428
429 r = open_journal(s, true, p, O_RDWR|O_CREAT, s->seal, &s->system_storage.metrics, &f);
430 if (r < 0)
431 return s->system_journal;
432
433 server_add_acls(f, uid);
434
435 r = ordered_hashmap_put(s->user_journals, UID_TO_PTR(uid), f);
436 if (r < 0) {
437 (void) journal_file_close(f);
438 return s->system_journal;
439 }
440
441 return f;
442 }
443
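/* Rotate a single journal file: archive the current file and open a fresh one in its place. On
 * failure *f may be left NULL if the old file had to be closed. */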
444 static int do_rotate(
445 Server *s,
446 JournalFile **f,
447 const char* name,
448 bool seal,
449 uint32_t uid) {
450
451 int r;
452 assert(s);
453
454 if (!*f)
455 return -EINVAL;
456
457 r = journal_file_rotate(f, s->compress.enabled, s->compress.threshold_bytes, seal, s->deferred_closes);
458 if (r < 0) {
459 if (*f)
460 return log_error_errno(r, "Failed to rotate %s: %m", (*f)->path);
461 else
462 return log_error_errno(r, "Failed to create new %s journal: %m", name);
463 }
464
465 server_add_acls(*f, uid);
466
467 return r;
468 }
469
470 static void server_process_deferred_closes(Server *s) {
471 JournalFile *f;
472 Iterator i;
473
474 /* Perform any deferred closes which aren't still offlining. */
475 SET_FOREACH(f, s->deferred_closes, i) {
476 if (journal_file_is_offlining(f))
477 continue;
478
479 (void) set_remove(s->deferred_closes, f);
480 (void) journal_file_close(f);
481 }
482 }
483
484 static void server_vacuum_deferred_closes(Server *s) {
485 assert(s);
486
487 /* Make some room in the deferred closes list, so that it doesn't grow without bounds */
488 if (set_size(s->deferred_closes) < DEFERRED_CLOSES_MAX)
489 return;
490
491 /* Let's first remove all journal files that might already have completed closing */
492 server_process_deferred_closes(s);
493
494 /* And now, let's close some more until we reach the limit again. */
495 while (set_size(s->deferred_closes) >= DEFERRED_CLOSES_MAX) {
496 JournalFile *f;
497
498 assert_se(f = set_steal_first(s->deferred_closes));
499 journal_file_close(f);
500 }
501 }
502
503 static int open_user_journal_directory(Server *s, DIR **ret_dir, char **ret_path) {
504 _cleanup_closedir_ DIR *dir = NULL;
505 _cleanup_free_ char *path = NULL;
506 sd_id128_t machine;
507 int r;
508
509 assert(s);
510
511 r = sd_id128_get_machine(&machine);
512 if (r < 0)
513 return log_error_errno(r, "Failed to determine machine ID, ignoring: %m");
514
515 if (asprintf(&path, "/var/log/journal/" SD_ID128_FORMAT_STR "/", SD_ID128_FORMAT_VAL(machine)) < 0)
516 return log_oom();
517
518 dir = opendir(path);
519 if (!dir)
520 return log_error_errno(errno, "Failed to open user journal directory '%s': %m", path);
521
522 if (ret_dir)
523 *ret_dir = TAKE_PTR(dir);
524 if (ret_path)
525 *ret_path = TAKE_PTR(path);
526
527 return 0;
528 }
529
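/* Rotate all journal files: the system and runtime journals, every open user journal, and (when
 * /var is accessible) also the user journals that are currently not open. */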
530 void server_rotate(Server *s) {
531 _cleanup_free_ char *path = NULL;
532 _cleanup_closedir_ DIR *d = NULL;
533 JournalFile *f;
534 Iterator i;
535 void *k;
536 int r;
537
538 log_debug("Rotating...");
539
540 /* First, rotate the system journal (either in its runtime flavour or in its system flavour) */
541 (void) do_rotate(s, &s->runtime_journal, "runtime", false, 0);
542 (void) do_rotate(s, &s->system_journal, "system", s->seal, 0);
543
544 /* Then, rotate all user journals we have open (keeping them open) */
545 ORDERED_HASHMAP_FOREACH_KEY(f, k, s->user_journals, i) {
546 r = do_rotate(s, &f, "user", s->seal, PTR_TO_UID(k));
547 if (r >= 0)
548 ordered_hashmap_replace(s->user_journals, k, f);
549 else if (!f)
550 /* Old file has been closed and deallocated */
551 ordered_hashmap_remove(s->user_journals, k);
552 }
553
554 /* Finally, also rotate all user journals we currently do not have open. (But do so only if we actually have
555 * access to /var, i.e. are not in the log-to-runtime-journal mode). */
556 if (!s->runtime_journal &&
557 open_user_journal_directory(s, &d, &path) >= 0) {
558
559 struct dirent *de;
560
561 FOREACH_DIRENT(de, d, log_warning_errno(errno, "Failed to enumerate %s, ignoring: %m", path)) {
562 _cleanup_free_ char *u = NULL, *full = NULL;
563 _cleanup_close_ int fd = -1;
564 const char *a, *b;
565 uid_t uid;
566
567 a = startswith(de->d_name, "user-");
568 if (!a)
569 continue;
570 b = endswith(de->d_name, ".journal");
571 if (!b)
572 continue;
573
574 u = strndup(a, b-a);
575 if (!u) {
576 log_oom();
577 break;
578 }
579
580 r = parse_uid(u, &uid);
581 if (r < 0) {
582 log_debug_errno(r, "Failed to parse UID from file name '%s', ignoring: %m", de->d_name);
583 continue;
584 }
585
586 /* Already rotated in the above loop? i.e. is it an open user journal? */
587 if (ordered_hashmap_contains(s->user_journals, UID_TO_PTR(uid)))
588 continue;
589
590 full = strjoin(path, de->d_name);
591 if (!full) {
592 log_oom();
593 break;
594 }
595
596 fd = openat(dirfd(d), de->d_name, O_RDWR|O_CLOEXEC|O_NOCTTY|O_NOFOLLOW|O_NONBLOCK);
597 if (fd < 0) {
598 log_full_errno(IN_SET(errno, ELOOP, ENOENT) ? LOG_DEBUG : LOG_WARNING, errno,
599 "Failed to open journal file '%s' for rotation: %m", full);
600 continue;
601 }
602
603 /* Make some room in the set of deferred close()s */
604 server_vacuum_deferred_closes(s);
605
606 /* Open the file briefly, so that we can archive it */
607 r = journal_file_open(fd,
608 full,
609 O_RDWR,
610 0640,
611 s->compress.enabled,
612 s->compress.threshold_bytes,
613 s->seal,
614 &s->system_storage.metrics,
615 s->mmap,
616 s->deferred_closes,
617 NULL,
618 &f);
619 if (r < 0) {
620 log_warning_errno(r, "Failed to read journal file %s for rotation, trying to move it out of the way: %m", full);
621
622 r = journal_file_dispose(dirfd(d), de->d_name);
623 if (r < 0)
624 log_warning_errno(r, "Failed to move %s out of the way, ignoring: %m", full);
625 else
626 log_debug("Successfully moved %s out of the way.", full);
627
628 continue;
629 }
630
631 TAKE_FD(fd); /* Donated to journal_file_open() */
632
633 r = journal_file_archive(f);
634 if (r < 0)
635 log_debug_errno(r, "Failed to archive journal file '%s', ignoring: %m", full);
636
637 f = journal_initiate_close(f, s->deferred_closes);
638 }
639 }
640
641 server_process_deferred_closes(s);
642 }
643
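/* Ask all open journal files to go offline, i.e. to be synced to disk, without waiting for
 * completion, and disarm the scheduled sync timer. */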
644 void server_sync(Server *s) {
645 JournalFile *f;
646 Iterator i;
647 int r;
648
649 if (s->system_journal) {
650 r = journal_file_set_offline(s->system_journal, false);
651 if (r < 0)
652 log_warning_errno(r, "Failed to sync system journal, ignoring: %m");
653 }
654
655 ORDERED_HASHMAP_FOREACH(f, s->user_journals, i) {
656 r = journal_file_set_offline(f, false);
657 if (r < 0)
658 log_warning_errno(r, "Failed to sync user journal, ignoring: %m");
659 }
660
661 if (s->sync_event_source) {
662 r = sd_event_source_set_enabled(s->sync_event_source, SD_EVENT_OFF);
663 if (r < 0)
664 log_error_errno(r, "Failed to disable sync timer source: %m");
665 }
666
667 s->sync_scheduled = false;
668 }
669
670 static void do_vacuum(Server *s, JournalStorage *storage, bool verbose) {
671
672 int r;
673
674 assert(s);
675 assert(storage);
676
677 (void) cache_space_refresh(s, storage);
678
679 if (verbose)
680 server_space_usage_message(s, storage);
681
682 r = journal_directory_vacuum(storage->path, storage->space.limit,
683 storage->metrics.n_max_files, s->max_retention_usec,
684 &s->oldest_file_usec, verbose);
685 if (r < 0 && r != -ENOENT)
686 log_warning_errno(r, "Failed to vacuum %s, ignoring: %m", storage->path);
687
688 cache_space_invalidate(&storage->space);
689 }
690
691 int server_vacuum(Server *s, bool verbose) {
692 assert(s);
693
694 log_debug("Vacuuming...");
695
696 s->oldest_file_usec = 0;
697
698 if (s->system_journal)
699 do_vacuum(s, &s->system_storage, verbose);
700 if (s->runtime_journal)
701 do_vacuum(s, &s->runtime_storage, verbose);
702
703 return 0;
704 }
705
706 static void server_cache_machine_id(Server *s) {
707 sd_id128_t id;
708 int r;
709
710 assert(s);
711
712 r = sd_id128_get_machine(&id);
713 if (r < 0)
714 return;
715
716 sd_id128_to_string(id, stpcpy(s->machine_id_field, "_MACHINE_ID="));
717 }
718
719 static void server_cache_boot_id(Server *s) {
720 sd_id128_t id;
721 int r;
722
723 assert(s);
724
725 r = sd_id128_get_boot(&id);
726 if (r < 0)
727 return;
728
729 sd_id128_to_string(id, stpcpy(s->boot_id_field, "_BOOT_ID="));
730 }
731
732 static void server_cache_hostname(Server *s) {
733 _cleanup_free_ char *t = NULL;
734 char *x;
735
736 assert(s);
737
738 t = gethostname_malloc();
739 if (!t)
740 return;
741
742 x = strappend("_HOSTNAME=", t);
743 if (!x)
744 return;
745
746 free(s->hostname_field);
747 s->hostname_field = x;
748 }
749
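/* Given the error a journal append just failed with, decide whether rotating to a fresh journal
 * file and retrying the write is worthwhile. */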
750 static bool shall_try_append_again(JournalFile *f, int r) {
751 switch(r) {
752
753 case -E2BIG: /* Hit configured limit */
754 case -EFBIG: /* Hit fs limit */
755 case -EDQUOT: /* Quota limit hit */
756 case -ENOSPC: /* Disk full */
757 log_debug("%s: Allocation limit reached, rotating.", f->path);
758 return true;
759
760 case -EIO: /* I/O error of some kind (mmap) */
761 log_warning("%s: IO error, rotating.", f->path);
762 return true;
763
764 case -EHOSTDOWN: /* Other machine */
765 log_info("%s: Journal file from other machine, rotating.", f->path);
766 return true;
767
768 case -EBUSY: /* Unclean shutdown */
769 log_info("%s: Unclean shutdown, rotating.", f->path);
770 return true;
771
772 case -EPROTONOSUPPORT: /* Unsupported feature */
773 log_info("%s: Unsupported feature, rotating.", f->path);
774 return true;
775
776 case -EBADMSG: /* Corrupted */
777 case -ENODATA: /* Truncated */
778 case -ESHUTDOWN: /* Already archived */
779 log_warning("%s: Journal file corrupted, rotating.", f->path);
780 return true;
781
782 case -EIDRM: /* Journal file has been deleted */
783 log_warning("%s: Journal file has been deleted, rotating.", f->path);
784 return true;
785
786 case -ETXTBSY: /* Journal file is from the future */
787 log_warning("%s: Journal file is from the future, rotating.", f->path);
788 return true;
789
790 case -EAFNOSUPPORT:
791 log_warning("%s: underlying file system does not support memory mapping or another required file system feature.", f->path);
792 return false;
793
794 default:
795 return false;
796 }
797 }
798
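/* Append one entry (already assembled as an iovec array) to the journal file appropriate for the
 * given UID, rotating and vacuuming once and retrying if the first attempt fails with an error
 * that a rotation might fix. */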
799 static void write_to_journal(Server *s, uid_t uid, struct iovec *iovec, size_t n, int priority) {
800 bool vacuumed = false, rotate = false;
801 struct dual_timestamp ts;
802 JournalFile *f;
803 int r;
804
805 assert(s);
806 assert(iovec);
807 assert(n > 0);
808
809 /* Get the closest, linearized time we have for this log event from the event loop. (Note that we do not use
810 * the source time, and not even the time the event was originally seen, but instead simply the time we started
811 * processing it, as we want strictly linear ordering in what we write out.) */
812 assert_se(sd_event_now(s->event, CLOCK_REALTIME, &ts.realtime) >= 0);
813 assert_se(sd_event_now(s->event, CLOCK_MONOTONIC, &ts.monotonic) >= 0);
814
815 if (ts.realtime < s->last_realtime_clock) {
816 /* When the time jumps backwards, let's immediately rotate. Of course, this should not happen during
817 * regular operation. However, when it does happen, then we should make sure that we start fresh files
818 * to ensure that the entries in the journal files are strictly ordered by time, in order to ensure
819 * bisection works correctly. */
820
821 log_debug("Time jumped backwards, rotating.");
822 rotate = true;
823 } else {
824
825 f = find_journal(s, uid);
826 if (!f)
827 return;
828
829 if (journal_file_rotate_suggested(f, s->max_file_usec)) {
830 log_debug("%s: Journal header limits reached or header out-of-date, rotating.", f->path);
831 rotate = true;
832 }
833 }
834
835 if (rotate) {
836 server_rotate(s);
837 server_vacuum(s, false);
838 vacuumed = true;
839
840 f = find_journal(s, uid);
841 if (!f)
842 return;
843 }
844
845 s->last_realtime_clock = ts.realtime;
846
847 r = journal_file_append_entry(f, &ts, NULL, iovec, n, &s->seqnum, NULL, NULL);
848 if (r >= 0) {
849 server_schedule_sync(s, priority);
850 return;
851 }
852
853 if (vacuumed || !shall_try_append_again(f, r)) {
854 log_error_errno(r, "Failed to write entry (%zu items, %zu bytes), ignoring: %m", n, IOVEC_TOTAL_SIZE(iovec, n));
855 return;
856 }
857
858 server_rotate(s);
859 server_vacuum(s, false);
860
861 f = find_journal(s, uid);
862 if (!f)
863 return;
864
865 log_debug("Retrying write.");
866 r = journal_file_append_entry(f, &ts, NULL, iovec, n, &s->seqnum, NULL, NULL);
867 if (r < 0)
868 log_error_errno(r, "Failed to write entry (%zu items, %zu bytes) despite vacuuming, ignoring: %m", n, IOVEC_TOTAL_SIZE(iovec, n));
869 else
870 server_schedule_sync(s, priority);
871 }
872
873 #define IOVEC_ADD_NUMERIC_FIELD(iovec, n, value, type, isset, format, field) \
874 if (isset(value)) { \
875 char *k; \
876 k = newa(char, STRLEN(field "=") + DECIMAL_STR_MAX(type) + 1); \
877 sprintf(k, field "=" format, value); \
878 iovec[n++] = IOVEC_MAKE_STRING(k); \
879 }
880
881 #define IOVEC_ADD_STRING_FIELD(iovec, n, value, field) \
882 if (!isempty(value)) { \
883 char *k; \
884 k = strjoina(field "=", value); \
885 iovec[n++] = IOVEC_MAKE_STRING(k); \
886 }
887
888 #define IOVEC_ADD_ID128_FIELD(iovec, n, value, field) \
889 if (!sd_id128_is_null(value)) { \
890 char *k; \
891 k = newa(char, STRLEN(field "=") + SD_ID128_STRING_MAX); \
892 sd_id128_to_string(value, stpcpy(k, field "=")); \
893 iovec[n++] = IOVEC_MAKE_STRING(k); \
894 }
895
896 #define IOVEC_ADD_SIZED_FIELD(iovec, n, value, value_size, field) \
897 if (value_size > 0) { \
898 char *k; \
899 k = newa(char, STRLEN(field "=") + value_size + 1); \
900 *((char*) mempcpy(stpcpy(k, field "="), value, value_size)) = 0; \
901 iovec[n++] = IOVEC_MAKE_STRING(k); \
902 }
903
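/* Attach the implicit, trusted metadata fields (_PID=, _UID=, _SYSTEMD_UNIT=, _BOOT_ID=,
 * _HOSTNAME= and friends) derived from the client context to the iovec array and write the
 * resulting entry to the appropriate journal file. */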
904 static void dispatch_message_real(
905 Server *s,
906 struct iovec *iovec, size_t n, size_t m,
907 const ClientContext *c,
908 const struct timeval *tv,
909 int priority,
910 pid_t object_pid) {
911
912 char source_time[sizeof("_SOURCE_REALTIME_TIMESTAMP=") + DECIMAL_STR_MAX(usec_t)];
913 _cleanup_free_ char *cmdline1 = NULL, *cmdline2 = NULL;
914 uid_t journal_uid;
915 ClientContext *o;
916
917 assert(s);
918 assert(iovec);
919 assert(n > 0);
920 assert(n +
921 N_IOVEC_META_FIELDS +
922 (pid_is_valid(object_pid) ? N_IOVEC_OBJECT_FIELDS : 0) +
923 client_context_extra_fields_n_iovec(c) <= m);
924
925 if (c) {
926 IOVEC_ADD_NUMERIC_FIELD(iovec, n, c->pid, pid_t, pid_is_valid, PID_FMT, "_PID");
927 IOVEC_ADD_NUMERIC_FIELD(iovec, n, c->uid, uid_t, uid_is_valid, UID_FMT, "_UID");
928 IOVEC_ADD_NUMERIC_FIELD(iovec, n, c->gid, gid_t, gid_is_valid, GID_FMT, "_GID");
929
930 IOVEC_ADD_STRING_FIELD(iovec, n, c->comm, "_COMM"); /* At most TASK_COMM_LENGTH (16 bytes) */
931 IOVEC_ADD_STRING_FIELD(iovec, n, c->exe, "_EXE"); /* A path, so at most PATH_MAX (4096 bytes) */
932
933 if (c->cmdline)
934 /* At most _SC_ARG_MAX (2MB usually), which is too much to put on stack.
935 * Let's use a heap allocation for this one. */
936 cmdline1 = set_iovec_string_field(iovec, &n, "_CMDLINE=", c->cmdline);
937
938 IOVEC_ADD_STRING_FIELD(iovec, n, c->capeff, "_CAP_EFFECTIVE"); /* Read from /proc/.../status */
939 IOVEC_ADD_SIZED_FIELD(iovec, n, c->label, c->label_size, "_SELINUX_CONTEXT");
940 IOVEC_ADD_NUMERIC_FIELD(iovec, n, c->auditid, uint32_t, audit_session_is_valid, "%" PRIu32, "_AUDIT_SESSION");
941 IOVEC_ADD_NUMERIC_FIELD(iovec, n, c->loginuid, uid_t, uid_is_valid, UID_FMT, "_AUDIT_LOGINUID");
942
943 IOVEC_ADD_STRING_FIELD(iovec, n, c->cgroup, "_SYSTEMD_CGROUP"); /* A path */
944 IOVEC_ADD_STRING_FIELD(iovec, n, c->session, "_SYSTEMD_SESSION");
945 IOVEC_ADD_NUMERIC_FIELD(iovec, n, c->owner_uid, uid_t, uid_is_valid, UID_FMT, "_SYSTEMD_OWNER_UID");
946 IOVEC_ADD_STRING_FIELD(iovec, n, c->unit, "_SYSTEMD_UNIT"); /* Unit names are bounded by UNIT_NAME_MAX */
947 IOVEC_ADD_STRING_FIELD(iovec, n, c->user_unit, "_SYSTEMD_USER_UNIT");
948 IOVEC_ADD_STRING_FIELD(iovec, n, c->slice, "_SYSTEMD_SLICE");
949 IOVEC_ADD_STRING_FIELD(iovec, n, c->user_slice, "_SYSTEMD_USER_SLICE");
950
951 IOVEC_ADD_ID128_FIELD(iovec, n, c->invocation_id, "_SYSTEMD_INVOCATION_ID");
952
953 if (c->extra_fields_n_iovec > 0) {
954 memcpy(iovec + n, c->extra_fields_iovec, c->extra_fields_n_iovec * sizeof(struct iovec));
955 n += c->extra_fields_n_iovec;
956 }
957 }
958
959 assert(n <= m);
960
961 if (pid_is_valid(object_pid) && client_context_get(s, object_pid, NULL, NULL, 0, NULL, &o) >= 0) {
962
963 IOVEC_ADD_NUMERIC_FIELD(iovec, n, o->pid, pid_t, pid_is_valid, PID_FMT, "OBJECT_PID");
964 IOVEC_ADD_NUMERIC_FIELD(iovec, n, o->uid, uid_t, uid_is_valid, UID_FMT, "OBJECT_UID");
965 IOVEC_ADD_NUMERIC_FIELD(iovec, n, o->gid, gid_t, gid_is_valid, GID_FMT, "OBJECT_GID");
966
967 /* See above for size limits, only ->cmdline may be large, so use a heap allocation for it. */
968 IOVEC_ADD_STRING_FIELD(iovec, n, o->comm, "OBJECT_COMM");
969 IOVEC_ADD_STRING_FIELD(iovec, n, o->exe, "OBJECT_EXE");
970 if (o->cmdline)
971 cmdline2 = set_iovec_string_field(iovec, &n, "OBJECT_CMDLINE=", o->cmdline);
972
973 IOVEC_ADD_STRING_FIELD(iovec, n, o->capeff, "OBJECT_CAP_EFFECTIVE");
974 IOVEC_ADD_SIZED_FIELD(iovec, n, o->label, o->label_size, "OBJECT_SELINUX_CONTEXT");
975 IOVEC_ADD_NUMERIC_FIELD(iovec, n, o->auditid, uint32_t, audit_session_is_valid, "%" PRIu32, "OBJECT_AUDIT_SESSION");
976 IOVEC_ADD_NUMERIC_FIELD(iovec, n, o->loginuid, uid_t, uid_is_valid, UID_FMT, "OBJECT_AUDIT_LOGINUID");
977
978 IOVEC_ADD_STRING_FIELD(iovec, n, o->cgroup, "OBJECT_SYSTEMD_CGROUP");
979 IOVEC_ADD_STRING_FIELD(iovec, n, o->session, "OBJECT_SYSTEMD_SESSION");
980 IOVEC_ADD_NUMERIC_FIELD(iovec, n, o->owner_uid, uid_t, uid_is_valid, UID_FMT, "OBJECT_SYSTEMD_OWNER_UID");
981 IOVEC_ADD_STRING_FIELD(iovec, n, o->unit, "OBJECT_SYSTEMD_UNIT");
982 IOVEC_ADD_STRING_FIELD(iovec, n, o->user_unit, "OBJECT_SYSTEMD_USER_UNIT");
983 IOVEC_ADD_STRING_FIELD(iovec, n, o->slice, "OBJECT_SYSTEMD_SLICE");
984 IOVEC_ADD_STRING_FIELD(iovec, n, o->user_slice, "OBJECT_SYSTEMD_USER_SLICE");
985
986 IOVEC_ADD_ID128_FIELD(iovec, n, o->invocation_id, "OBJECT_SYSTEMD_INVOCATION_ID");
987 }
988
989 assert(n <= m);
990
991 if (tv) {
992 sprintf(source_time, "_SOURCE_REALTIME_TIMESTAMP=" USEC_FMT, timeval_load(tv));
993 iovec[n++] = IOVEC_MAKE_STRING(source_time);
994 }
995
996 /* Note that strictly speaking storing the boot id here is
997 * redundant since the entry includes this in-line
998 * anyway. However, we need this indexed, too. */
999 if (!isempty(s->boot_id_field))
1000 iovec[n++] = IOVEC_MAKE_STRING(s->boot_id_field);
1001
1002 if (!isempty(s->machine_id_field))
1003 iovec[n++] = IOVEC_MAKE_STRING(s->machine_id_field);
1004
1005 if (!isempty(s->hostname_field))
1006 iovec[n++] = IOVEC_MAKE_STRING(s->hostname_field);
1007
1008 assert(n <= m);
1009
1010 if (s->split_mode == SPLIT_UID && c && uid_is_valid(c->uid))
1011 /* Split up strictly by (non-root) UID */
1012 journal_uid = c->uid;
1013 else if (s->split_mode == SPLIT_LOGIN && c && c->uid > 0 && uid_is_valid(c->owner_uid))
1014 /* Split up by login UIDs. We do this only if the
1015 * realuid is not root, in order not to accidentally
1016 * leak privileged information to the user that is
1017 * logged by a privileged process that is part of an
1018 * unprivileged session. */
1019 journal_uid = c->owner_uid;
1020 else
1021 journal_uid = 0;
1022
1023 write_to_journal(s, journal_uid, iovec, n, priority);
1024 }
1025
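/* Emit a log entry that originates from journald itself (_TRANSPORT=driver). The variadic
 * arguments are a NULL-terminated sequence of printf-style field specifications, each of which
 * becomes one journal field. */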
1026 void server_driver_message(Server *s, pid_t object_pid, const char *message_id, const char *format, ...) {
1027
1028 struct iovec *iovec;
1029 size_t n = 0, k, m;
1030 va_list ap;
1031 int r;
1032
1033 assert(s);
1034 assert(format);
1035
1036 m = N_IOVEC_META_FIELDS + 5 + N_IOVEC_PAYLOAD_FIELDS + client_context_extra_fields_n_iovec(s->my_context) + N_IOVEC_OBJECT_FIELDS;
1037 iovec = newa(struct iovec, m);
1038
1039 assert_cc(3 == LOG_FAC(LOG_DAEMON));
1040 iovec[n++] = IOVEC_MAKE_STRING("SYSLOG_FACILITY=3");
1041 iovec[n++] = IOVEC_MAKE_STRING("SYSLOG_IDENTIFIER=systemd-journald");
1042
1043 iovec[n++] = IOVEC_MAKE_STRING("_TRANSPORT=driver");
1044 assert_cc(6 == LOG_INFO);
1045 iovec[n++] = IOVEC_MAKE_STRING("PRIORITY=6");
1046
1047 if (message_id)
1048 iovec[n++] = IOVEC_MAKE_STRING(message_id);
1049 k = n;
1050
1051 va_start(ap, format);
1052 r = log_format_iovec(iovec, m, &n, false, 0, format, ap);
1053 /* Error handling below */
1054 va_end(ap);
1055
1056 if (r >= 0)
1057 dispatch_message_real(s, iovec, n, m, s->my_context, NULL, LOG_INFO, object_pid);
1058
1059 while (k < n)
1060 free(iovec[k++].iov_base);
1061
1062 if (r < 0) {
1063 /* We failed to format the message. Emit a warning instead. */
1064 char buf[LINE_MAX];
1065
1066 xsprintf(buf, "MESSAGE=Entry printing failed: %s", strerror(-r));
1067
1068 n = 3;
1069 iovec[n++] = IOVEC_MAKE_STRING("PRIORITY=4");
1070 iovec[n++] = IOVEC_MAKE_STRING(buf);
1071 dispatch_message_real(s, iovec, n, m, s->my_context, NULL, LOG_INFO, object_pid);
1072 }
1073 }
1074
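/* Common entry point for incoming log messages: applies MaxLevelStore= and the per-unit rate
 * limit, emits a suppression notice if messages were dropped, and hands the entry on to
 * dispatch_message_real(). */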
1075 void server_dispatch_message(
1076 Server *s,
1077 struct iovec *iovec, size_t n, size_t m,
1078 ClientContext *c,
1079 const struct timeval *tv,
1080 int priority,
1081 pid_t object_pid) {
1082
1083 uint64_t available = 0;
1084 int rl;
1085
1086 assert(s);
1087 assert(iovec || n == 0);
1088
1089 if (n == 0)
1090 return;
1091
1092 if (LOG_PRI(priority) > s->max_level_store)
1093 return;
1094
1095 /* Stop early in case the information will not be stored
1096 * in a journal. */
1097 if (s->storage == STORAGE_NONE)
1098 return;
1099
1100 if (c && c->unit) {
1101 (void) determine_space(s, &available, NULL);
1102
1103 rl = journal_rate_limit_test(s->rate_limit, c->unit, c->log_rate_limit_interval, c->log_rate_limit_burst, priority & LOG_PRIMASK, available);
1104 if (rl == 0)
1105 return;
1106
1107 /* Write a suppression message if we suppressed something */
1108 if (rl > 1)
1109 server_driver_message(s, c->pid,
1110 "MESSAGE_ID=" SD_MESSAGE_JOURNAL_DROPPED_STR,
1111 LOG_MESSAGE("Suppressed %i messages from %s", rl - 1, c->unit),
1112 "N_DROPPED=%i", rl - 1,
1113 NULL);
1114 }
1115
1116 dispatch_message_real(s, iovec, n, m, c, tv, priority, object_pid);
1117 }
1118
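/* Copy all entries from the volatile journal in /run over to the persistent journal in /var, then
 * close and remove the runtime journal and mark the flush as done via
 * /run/systemd/journal/flushed. */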
1119 int server_flush_to_var(Server *s, bool require_flag_file) {
1120 sd_id128_t machine;
1121 sd_journal *j = NULL;
1122 char ts[FORMAT_TIMESPAN_MAX];
1123 usec_t start;
1124 unsigned n = 0;
1125 int r, k;
1126
1127 assert(s);
1128
1129 if (!IN_SET(s->storage, STORAGE_AUTO, STORAGE_PERSISTENT))
1130 return 0;
1131
1132 if (!s->runtime_journal)
1133 return 0;
1134
1135 if (require_flag_file && !flushed_flag_is_set())
1136 return 0;
1137
1138 (void) system_journal_open(s, true, false);
1139
1140 if (!s->system_journal)
1141 return 0;
1142
1143 log_debug("Flushing to /var...");
1144
1145 start = now(CLOCK_MONOTONIC);
1146
1147 r = sd_id128_get_machine(&machine);
1148 if (r < 0)
1149 return r;
1150
1151 r = sd_journal_open(&j, SD_JOURNAL_RUNTIME_ONLY);
1152 if (r < 0)
1153 return log_error_errno(r, "Failed to read runtime journal: %m");
1154
1155 sd_journal_set_data_threshold(j, 0);
1156
1157 SD_JOURNAL_FOREACH(j) {
1158 Object *o = NULL;
1159 JournalFile *f;
1160
1161 f = j->current_file;
1162 assert(f && f->current_offset > 0);
1163
1164 n++;
1165
1166 r = journal_file_move_to_object(f, OBJECT_ENTRY, f->current_offset, &o);
1167 if (r < 0) {
1168 log_error_errno(r, "Can't read entry: %m");
1169 goto finish;
1170 }
1171
1172 r = journal_file_copy_entry(f, s->system_journal, o, f->current_offset);
1173 if (r >= 0)
1174 continue;
1175
1176 if (!shall_try_append_again(s->system_journal, r)) {
1177 log_error_errno(r, "Can't write entry: %m");
1178 goto finish;
1179 }
1180
1181 server_rotate(s);
1182 server_vacuum(s, false);
1183
1184 if (!s->system_journal) {
1185 log_notice("Didn't flush runtime journal since rotation of system journal wasn't successful.");
1186 r = -EIO;
1187 goto finish;
1188 }
1189
1190 log_debug("Retrying write.");
1191 r = journal_file_copy_entry(f, s->system_journal, o, f->current_offset);
1192 if (r < 0) {
1193 log_error_errno(r, "Can't write entry: %m");
1194 goto finish;
1195 }
1196 }
1197
1198 r = 0;
1199
1200 finish:
1201 if (s->system_journal)
1202 journal_file_post_change(s->system_journal);
1203
1204 s->runtime_journal = journal_file_close(s->runtime_journal);
1205
1206 if (r >= 0)
1207 (void) rm_rf("/run/log/journal", REMOVE_ROOT);
1208
1209 sd_journal_close(j);
1210
1211 server_driver_message(s, 0, NULL,
1212 LOG_MESSAGE("Time spent on flushing to /var is %s for %u entries.",
1213 format_timespan(ts, sizeof(ts), now(CLOCK_MONOTONIC) - start, 0),
1214 n),
1215 NULL);
1216
1217 k = touch("/run/systemd/journal/flushed");
1218 if (k < 0)
1219 log_warning_errno(k, "Failed to touch /run/systemd/journal/flushed, ignoring: %m");
1220
1221 return r;
1222 }
1223
1224 static int server_relinquish_var(Server *s) {
1225 assert(s);
1226
1227 if (s->storage == STORAGE_NONE)
1228 return 0;
1229
1230 if (s->runtime_journal && !s->system_journal)
1231 return 0;
1232
1233 log_debug("Relinquishing /var...");
1234
1235 (void) system_journal_open(s, false, true);
1236
1237 s->system_journal = journal_file_close(s->system_journal);
1238 ordered_hashmap_clear_with_destructor(s->user_journals, journal_file_close);
1239 set_clear_with_destructor(s->deferred_closes, journal_file_close);
1240
1241 if (unlink("/run/systemd/journal/flushed") < 0 && errno != ENOENT)
1242 log_warning_errno(errno, "Failed to unlink /run/systemd/journal/flushed, ignoring: %m");
1243
1244 return 0;
1245 }
1246
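/* sd-event I/O handler shared by the native, syslog and audit sockets: receives one datagram
 * including its control data (credentials, timestamp, SELinux label, passed fds) and dispatches it
 * to the protocol-specific handler. */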
1247 int server_process_datagram(sd_event_source *es, int fd, uint32_t revents, void *userdata) {
1248 Server *s = userdata;
1249 struct ucred *ucred = NULL;
1250 struct timeval *tv = NULL;
1251 struct cmsghdr *cmsg;
1252 char *label = NULL;
1253 size_t label_len = 0, m;
1254 struct iovec iovec;
1255 ssize_t n;
1256 int *fds = NULL, v = 0;
1257 size_t n_fds = 0;
1258
1259 union {
1260 struct cmsghdr cmsghdr;
1261
1262 /* We use NAME_MAX space for the SELinux label
1263 * here. The kernel currently enforces no
1264 * limit, but according to suggestions from
1265 * the SELinux people this will change and it
1266 * will probably be identical to NAME_MAX. For
1267 * now we use that, but this should be updated
1268 * one day when the final limit is known. */
1269 uint8_t buf[CMSG_SPACE(sizeof(struct ucred)) +
1270 CMSG_SPACE(sizeof(struct timeval)) +
1271 CMSG_SPACE(sizeof(int)) + /* fd */
1272 CMSG_SPACE(NAME_MAX)]; /* selinux label */
1273 } control = {};
1274
1275 union sockaddr_union sa = {};
1276
1277 struct msghdr msghdr = {
1278 .msg_iov = &iovec,
1279 .msg_iovlen = 1,
1280 .msg_control = &control,
1281 .msg_controllen = sizeof(control),
1282 .msg_name = &sa,
1283 .msg_namelen = sizeof(sa),
1284 };
1285
1286 assert(s);
1287 assert(fd == s->native_fd || fd == s->syslog_fd || fd == s->audit_fd);
1288
1289 if (revents != EPOLLIN)
1290 return log_error_errno(SYNTHETIC_ERRNO(EIO),
1291 "Got invalid event from epoll for datagram fd: %" PRIx32,
1292 revents);
1293
1294 /* Try to get the right size, if we can. (Not all sockets support SIOCINQ, hence we just try, but don't rely on
1295 * it.) */
1296 (void) ioctl(fd, SIOCINQ, &v);
1297
1298 /* Fix it up, if it is too small. We use the same fixed value as auditd here. Awful! */
1299 m = PAGE_ALIGN(MAX3((size_t) v + 1,
1300 (size_t) LINE_MAX,
1301 ALIGN(sizeof(struct nlmsghdr)) + ALIGN((size_t) MAX_AUDIT_MESSAGE_LENGTH)) + 1);
1302
1303 if (!GREEDY_REALLOC(s->buffer, s->buffer_size, m))
1304 return log_oom();
1305
1306 iovec = IOVEC_MAKE(s->buffer, s->buffer_size - 1); /* Leave room for trailing NUL we add later */
1307
1308 n = recvmsg(fd, &msghdr, MSG_DONTWAIT|MSG_CMSG_CLOEXEC);
1309 if (n < 0) {
1310 if (IN_SET(errno, EINTR, EAGAIN))
1311 return 0;
1312
1313 return log_error_errno(errno, "recvmsg() failed: %m");
1314 }
1315
1316 CMSG_FOREACH(cmsg, &msghdr)
1317 if (cmsg->cmsg_level == SOL_SOCKET &&
1318 cmsg->cmsg_type == SCM_CREDENTIALS &&
1319 cmsg->cmsg_len == CMSG_LEN(sizeof(struct ucred)))
1320 ucred = (struct ucred*) CMSG_DATA(cmsg);
1321 else if (cmsg->cmsg_level == SOL_SOCKET &&
1322 cmsg->cmsg_type == SCM_SECURITY) {
1323 label = (char*) CMSG_DATA(cmsg);
1324 label_len = cmsg->cmsg_len - CMSG_LEN(0);
1325 } else if (cmsg->cmsg_level == SOL_SOCKET &&
1326 cmsg->cmsg_type == SO_TIMESTAMP &&
1327 cmsg->cmsg_len == CMSG_LEN(sizeof(struct timeval)))
1328 tv = (struct timeval*) CMSG_DATA(cmsg);
1329 else if (cmsg->cmsg_level == SOL_SOCKET &&
1330 cmsg->cmsg_type == SCM_RIGHTS) {
1331 fds = (int*) CMSG_DATA(cmsg);
1332 n_fds = (cmsg->cmsg_len - CMSG_LEN(0)) / sizeof(int);
1333 }
1334
1335 /* And a trailing NUL, just in case */
1336 s->buffer[n] = 0;
1337
1338 if (fd == s->syslog_fd) {
1339 if (n > 0 && n_fds == 0)
1340 server_process_syslog_message(s, s->buffer, n, ucred, tv, label, label_len);
1341 else if (n_fds > 0)
1342 log_warning("Got file descriptors via syslog socket. Ignoring.");
1343
1344 } else if (fd == s->native_fd) {
1345 if (n > 0 && n_fds == 0)
1346 server_process_native_message(s, s->buffer, n, ucred, tv, label, label_len);
1347 else if (n == 0 && n_fds == 1)
1348 server_process_native_file(s, fds[0], ucred, tv, label, label_len);
1349 else if (n_fds > 0)
1350 log_warning("Got too many file descriptors via native socket. Ignoring.");
1351
1352 } else {
1353 assert(fd == s->audit_fd);
1354
1355 if (n > 0 && n_fds == 0)
1356 server_process_audit_message(s, s->buffer, n, ucred, &sa, msghdr.msg_namelen);
1357 else if (n_fds > 0)
1358 log_warning("Got file descriptors via audit socket. Ignoring.");
1359 }
1360
1361 close_many(fds, n_fds);
1362 return 0;
1363 }
1364
1365 static void server_full_flush(Server *s) {
1366 assert(s);
1367
1368 (void) server_flush_to_var(s, false);
1369 server_sync(s);
1370 server_vacuum(s, false);
1371
1372 server_space_usage_message(s, NULL);
1373 }
1374
1375 static int dispatch_sigusr1(sd_event_source *es, const struct signalfd_siginfo *si, void *userdata) {
1376 Server *s = userdata;
1377
1378 assert(s);
1379
1380 log_info("Received SIGUSR1 signal from PID " PID_FMT ", as request to flush runtime journal.", si->ssi_pid);
1381 server_full_flush(s);
1382
1383 return 0;
1384 }
1385
1386 static void server_full_rotate(Server *s) {
1387 int r;
1388
1389 assert(s);
1390
1391 server_rotate(s);
1392 server_vacuum(s, true);
1393
1394 if (s->system_journal)
1395 patch_min_use(&s->system_storage);
1396 if (s->runtime_journal)
1397 patch_min_use(&s->runtime_storage);
1398
1399 /* Let clients know when the most recent rotation happened. */
1400 r = write_timestamp_file_atomic("/run/systemd/journal/rotated", now(CLOCK_MONOTONIC));
1401 if (r < 0)
1402 log_warning_errno(r, "Failed to write /run/systemd/journal/rotated, ignoring: %m");
1403 }
1404
1405 static int dispatch_sigusr2(sd_event_source *es, const struct signalfd_siginfo *si, void *userdata) {
1406 Server *s = userdata;
1407
1408 assert(s);
1409
1410 log_info("Received SIGUSR2 signal from PID " PID_FMT ", as request to rotate journal.", si->ssi_pid);
1411 server_full_rotate(s);
1412
1413 return 0;
1414 }
1415
1416 static int dispatch_sigterm(sd_event_source *es, const struct signalfd_siginfo *si, void *userdata) {
1417 Server *s = userdata;
1418
1419 assert(s);
1420
1421 log_received_signal(LOG_INFO, si);
1422
1423 sd_event_exit(s->event, 0);
1424 return 0;
1425 }
1426
1427 static void server_full_sync(Server *s) {
1428 int r;
1429
1430 assert(s);
1431
1432 server_sync(s);
1433
1434 /* Let clients know when the most recent sync happened. */
1435 r = write_timestamp_file_atomic("/run/systemd/journal/synced", now(CLOCK_MONOTONIC));
1436 if (r < 0)
1437 log_warning_errno(r, "Failed to write /run/systemd/journal/synced, ignoring: %m");
1438
1439 return;
1440 }
1441
1442 static int dispatch_sigrtmin1(sd_event_source *es, const struct signalfd_siginfo *si, void *userdata) {
1443 Server *s = userdata;
1444
1445 assert(s);
1446
1447 log_debug("Received SIGRTMIN1 signal from PID " PID_FMT ", as request to sync.", si->ssi_pid );
1448 server_full_sync(s);
1449
1450 return 0;
1451 }
1452
1453 static int setup_signals(Server *s) {
1454 int r;
1455
1456 assert(s);
1457
1458 assert_se(sigprocmask_many(SIG_SETMASK, NULL, SIGINT, SIGTERM, SIGUSR1, SIGUSR2, SIGRTMIN+1, -1) >= 0);
1459
1460 r = sd_event_add_signal(s->event, &s->sigusr1_event_source, SIGUSR1, dispatch_sigusr1, s);
1461 if (r < 0)
1462 return r;
1463
1464 r = sd_event_add_signal(s->event, &s->sigusr2_event_source, SIGUSR2, dispatch_sigusr2, s);
1465 if (r < 0)
1466 return r;
1467
1468 r = sd_event_add_signal(s->event, &s->sigterm_event_source, SIGTERM, dispatch_sigterm, s);
1469 if (r < 0)
1470 return r;
1471
1472 /* Let's process SIGTERM late, so that we flush all queued messages to disk before we exit */
1473 r = sd_event_source_set_priority(s->sigterm_event_source, SD_EVENT_PRIORITY_NORMAL+20);
1474 if (r < 0)
1475 return r;
1476
1477 /* When journald is invoked on the terminal (when debugging), it's useful if C-c is handled
1478 * equivalent to SIGTERM. */
1479 r = sd_event_add_signal(s->event, &s->sigint_event_source, SIGINT, dispatch_sigterm, s);
1480 if (r < 0)
1481 return r;
1482
1483 r = sd_event_source_set_priority(s->sigint_event_source, SD_EVENT_PRIORITY_NORMAL+20);
1484 if (r < 0)
1485 return r;
1486
1487 /* SIGRTMIN+1 causes an immediate sync. We process this very late, so that everything else queued at
1488 * this point is really written to disk. Clients can watch /run/systemd/journal/synced with inotify
1489 * until its mtime changes to see when a sync happened. */
1490 r = sd_event_add_signal(s->event, &s->sigrtmin1_event_source, SIGRTMIN+1, dispatch_sigrtmin1, s);
1491 if (r < 0)
1492 return r;
1493
1494 r = sd_event_source_set_priority(s->sigrtmin1_event_source, SD_EVENT_PRIORITY_NORMAL+15);
1495 if (r < 0)
1496 return r;
1497
1498 return 0;
1499 }
1500
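/* Handle the systemd.journald.* switches on the kernel command line (forwarding targets and
 * maximum log levels). */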
1501 static int parse_proc_cmdline_item(const char *key, const char *value, void *data) {
1502 Server *s = data;
1503 int r;
1504
1505 assert(s);
1506
1507 if (proc_cmdline_key_streq(key, "systemd.journald.forward_to_syslog")) {
1508
1509 r = value ? parse_boolean(value) : true;
1510 if (r < 0)
1511 log_warning("Failed to parse forward to syslog switch \"%s\". Ignoring.", value);
1512 else
1513 s->forward_to_syslog = r;
1514
1515 } else if (proc_cmdline_key_streq(key, "systemd.journald.forward_to_kmsg")) {
1516
1517 r = value ? parse_boolean(value) : true;
1518 if (r < 0)
1519 log_warning("Failed to parse forward to kmsg switch \"%s\". Ignoring.", value);
1520 else
1521 s->forward_to_kmsg = r;
1522
1523 } else if (proc_cmdline_key_streq(key, "systemd.journald.forward_to_console")) {
1524
1525 r = value ? parse_boolean(value) : true;
1526 if (r < 0)
1527 log_warning("Failed to parse forward to console switch \"%s\". Ignoring.", value);
1528 else
1529 s->forward_to_console = r;
1530
1531 } else if (proc_cmdline_key_streq(key, "systemd.journald.forward_to_wall")) {
1532
1533 r = value ? parse_boolean(value) : true;
1534 if (r < 0)
1535 log_warning("Failed to parse forward to wall switch \"%s\". Ignoring.", value);
1536 else
1537 s->forward_to_wall = r;
1538
1539 } else if (proc_cmdline_key_streq(key, "systemd.journald.max_level_console")) {
1540
1541 if (proc_cmdline_value_missing(key, value))
1542 return 0;
1543
1544 r = log_level_from_string(value);
1545 if (r < 0)
1546 log_warning("Failed to parse max level console value \"%s\". Ignoring.", value);
1547 else
1548 s->max_level_console = r;
1549
1550 } else if (proc_cmdline_key_streq(key, "systemd.journald.max_level_store")) {
1551
1552 if (proc_cmdline_value_missing(key, value))
1553 return 0;
1554
1555 r = log_level_from_string(value);
1556 if (r < 0)
1557 log_warning("Failed to parse max level store value \"%s\". Ignoring.", value);
1558 else
1559 s->max_level_store = r;
1560
1561 } else if (proc_cmdline_key_streq(key, "systemd.journald.max_level_syslog")) {
1562
1563 if (proc_cmdline_value_missing(key, value))
1564 return 0;
1565
1566 r = log_level_from_string(value);
1567 if (r < 0)
1568 log_warning("Failed to parse max level syslog value \"%s\". Ignoring.", value);
1569 else
1570 s->max_level_syslog = r;
1571
1572 } else if (proc_cmdline_key_streq(key, "systemd.journald.max_level_kmsg")) {
1573
1574 if (proc_cmdline_value_missing(key, value))
1575 return 0;
1576
1577 r = log_level_from_string(value);
1578 if (r < 0)
1579 log_warning("Failed to parse max level kmsg value \"%s\". Ignoring.", value);
1580 else
1581 s->max_level_kmsg = r;
1582
1583 } else if (proc_cmdline_key_streq(key, "systemd.journald.max_level_wall")) {
1584
1585 if (proc_cmdline_value_missing(key, value))
1586 return 0;
1587
1588 r = log_level_from_string(value);
1589 if (r < 0)
1590 log_warning("Failed to parse max level wall value \"%s\". Ignoring.", value);
1591 else
1592 s->max_level_wall = r;
1593
1594 } else if (startswith(key, "systemd.journald"))
1595 log_warning("Unknown journald kernel command line option \"%s\". Ignoring.", key);
1596
1597 /* do not warn about state here, since probably systemd already did */
1598 return 0;
1599 }
1600
1601 static int server_parse_config_file(Server *s) {
1602 assert(s);
1603
1604 return config_parse_many_nulstr(PKGSYSCONFDIR "/journald.conf",
1605 CONF_PATHS_NULSTR("systemd/journald.conf.d"),
1606 "Journal\0",
1607 config_item_perf_lookup, journald_gperf_lookup,
1608 CONFIG_PARSE_WARN, s);
1609 }
1610
1611 static int server_dispatch_sync(sd_event_source *es, usec_t t, void *userdata) {
1612 Server *s = userdata;
1613
1614 assert(s);
1615
1616 server_sync(s);
1617 return 0;
1618 }
1619
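/* Schedule a sync of the journal files: entries of priority LOG_CRIT or higher are synced right
 * away, everything else at the latest after the configured sync interval. */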
1620 int server_schedule_sync(Server *s, int priority) {
1621 int r;
1622
1623 assert(s);
1624
1625 if (priority <= LOG_CRIT) {
1626 /* Immediately sync to disk when this is of priority CRIT, ALERT, EMERG */
1627 server_sync(s);
1628 return 0;
1629 }
1630
1631 if (s->sync_scheduled)
1632 return 0;
1633
1634 if (s->sync_interval_usec > 0) {
1635 usec_t when;
1636
1637 r = sd_event_now(s->event, CLOCK_MONOTONIC, &when);
1638 if (r < 0)
1639 return r;
1640
1641 when += s->sync_interval_usec;
1642
1643 if (!s->sync_event_source) {
1644 r = sd_event_add_time(
1645 s->event,
1646 &s->sync_event_source,
1647 CLOCK_MONOTONIC,
1648 when, 0,
1649 server_dispatch_sync, s);
1650 if (r < 0)
1651 return r;
1652
1653 r = sd_event_source_set_priority(s->sync_event_source, SD_EVENT_PRIORITY_IMPORTANT);
1654 } else {
1655 r = sd_event_source_set_time(s->sync_event_source, when);
1656 if (r < 0)
1657 return r;
1658
1659 r = sd_event_source_set_enabled(s->sync_event_source, SD_EVENT_ONESHOT);
1660 }
1661 if (r < 0)
1662 return r;
1663
1664 s->sync_scheduled = true;
1665 }
1666
1667 return 0;
1668 }
1669
1670 static int dispatch_hostname_change(sd_event_source *es, int fd, uint32_t revents, void *userdata) {
1671 Server *s = userdata;
1672
1673 assert(s);
1674
1675 server_cache_hostname(s);
1676 return 0;
1677 }
1678
1679 static int server_open_hostname(Server *s) {
1680 int r;
1681
1682 assert(s);
1683
1684 s->hostname_fd = open("/proc/sys/kernel/hostname",
1685 O_RDONLY|O_CLOEXEC|O_NONBLOCK|O_NOCTTY);
1686 if (s->hostname_fd < 0)
1687 return log_error_errno(errno, "Failed to open /proc/sys/kernel/hostname: %m");
1688
1689 r = sd_event_add_io(s->event, &s->hostname_event_source, s->hostname_fd, 0, dispatch_hostname_change, s);
1690 if (r < 0) {
1691 /* kernels prior to 3.2 don't support polling this file. Ignore
1692 * the failure. */
1693 if (r == -EPERM) {
1694 log_warning_errno(r, "Failed to register hostname fd in event loop, ignoring: %m");
1695 s->hostname_fd = safe_close(s->hostname_fd);
1696 return 0;
1697 }
1698
1699 return log_error_errno(r, "Failed to register hostname fd in event loop: %m");
1700 }
1701
1702 r = sd_event_source_set_priority(s->hostname_event_source, SD_EVENT_PRIORITY_IMPORTANT-10);
1703 if (r < 0)
1704 return log_error_errno(r, "Failed to adjust priority of host name event source: %m");
1705
1706 return 0;
1707 }
1708
1709 static int dispatch_notify_event(sd_event_source *es, int fd, uint32_t revents, void *userdata) {
1710 Server *s = userdata;
1711 int r;
1712
1713 assert(s);
1714 assert(s->notify_event_source == es);
1715 assert(s->notify_fd == fd);
1716
1717 /* The $NOTIFY_SOCKET is writable again, now send exactly one
1718 * message on it. Either it's the watchdog event, the initial
1719 * READY=1 event or a stdout stream event. If there's nothing
1720 * to write anymore, turn our event source off. The next time
1721 * there's something to send it will be turned on again. */
1722
1723 if (!s->sent_notify_ready) {
1724 static const char p[] =
1725 "READY=1\n"
1726 "STATUS=Processing requests...";
1727 ssize_t l;
1728
1729 l = send(s->notify_fd, p, strlen(p), MSG_DONTWAIT);
1730 if (l < 0) {
1731 if (errno == EAGAIN)
1732 return 0;
1733
1734 return log_error_errno(errno, "Failed to send READY=1 notification message: %m");
1735 }
1736
1737 s->sent_notify_ready = true;
1738 log_debug("Sent READY=1 notification.");
1739
1740 } else if (s->send_watchdog) {
1741
1742 static const char p[] =
1743 "WATCHDOG=1";
1744
1745 ssize_t l;
1746
1747 l = send(s->notify_fd, p, strlen(p), MSG_DONTWAIT);
1748 if (l < 0) {
1749 if (errno == EAGAIN)
1750 return 0;
1751
1752 return log_error_errno(errno, "Failed to send WATCHDOG=1 notification message: %m");
1753 }
1754
1755 s->send_watchdog = false;
1756 log_debug("Sent WATCHDOG=1 notification.");
1757
1758 } else if (s->stdout_streams_notify_queue)
1759 /* Dispatch one stream notification event */
1760 stdout_stream_send_notify(s->stdout_streams_notify_queue);
1761
1762 /* Leave us enabled if there's still more to do. */
1763 if (s->send_watchdog || s->stdout_streams_notify_queue)
1764 return 0;
1765
1766 /* There was nothing to do anymore, let's turn ourselves off. */
1767 r = sd_event_source_set_enabled(es, SD_EVENT_OFF);
1768 if (r < 0)
1769 return log_error_errno(r, "Failed to turn off notify event source: %m");
1770
1771 return 0;
1772 }
1773
1774 static int dispatch_watchdog(sd_event_source *es, uint64_t usec, void *userdata) {
1775 Server *s = userdata;
1776 int r;
1777
1778 assert(s);
1779
1780 s->send_watchdog = true;
1781
1782 r = sd_event_source_set_enabled(s->notify_event_source, SD_EVENT_ON);
1783 if (r < 0)
1784 log_warning_errno(r, "Failed to turn on notify event source: %m");
1785
1786 r = sd_event_source_set_time(s->watchdog_event_source, usec + s->watchdog_usec / 2);
1787 if (r < 0)
1788 return log_error_errno(r, "Failed to restart watchdog event source: %m");
1789
1790 r = sd_event_source_set_enabled(s->watchdog_event_source, SD_EVENT_ON);
1791 if (r < 0)
1792 return log_error_errno(r, "Failed to enable watchdog event source: %m");
1793
1794 return 0;
1795 }
1796
1797 static int server_connect_notify(Server *s) {
1798 union sockaddr_union sa = {};
1799 const char *e;
1800 int r, salen;
1801
1802 assert(s);
1803 assert(s->notify_fd < 0);
1804 assert(!s->notify_event_source);
1805
1806 /*
1807 * So here's the problem: we'd like to send notification messages to PID 1, but we cannot do that via
1808 * sd_notify(), since that's synchronous, and we might end up blocking on it. Specifically: given
1809 * that PID 1 might block on dbus-daemon during IPC, and dbus-daemon is logging to us, and might
1810 * hence block on us, we might end up in a deadlock if we block on sending PID 1 notification
1811 * messages — by generating a full blocking circle. To avoid this, let's create a non-blocking
1812 * socket, and connect it to the notification socket, and then wait for POLLOUT before we send
1813 * anything. This should efficiently avoid any deadlocks, as we'll never block on PID 1, hence PID 1
1814 * can safely block on dbus-daemon which can safely block on us again.
1815 *
1816 * Don't think that this issue is real? It is, see: https://github.com/systemd/systemd/issues/1505
1817 */
1818
1819 e = getenv("NOTIFY_SOCKET");
1820 if (!e)
1821 return 0;
1822
1823 salen = sockaddr_un_set_path(&sa.un, e);
1824 if (salen < 0)
1825 return log_error_errno(salen, "NOTIFY_SOCKET set to invalid value '%s': %m", e);
1826
1827 s->notify_fd = socket(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC|SOCK_NONBLOCK, 0);
1828 if (s->notify_fd < 0)
1829 return log_error_errno(errno, "Failed to create notify socket: %m");
1830
1831 (void) fd_inc_sndbuf(s->notify_fd, NOTIFY_SNDBUF_SIZE);
1832
1833 r = connect(s->notify_fd, &sa.sa, salen);
1834 if (r < 0)
1835 return log_error_errno(errno, "Failed to connect to notify socket: %m");
1836
1837 r = sd_event_add_io(s->event, &s->notify_event_source, s->notify_fd, EPOLLOUT, dispatch_notify_event, s);
1838 if (r < 0)
1839 return log_error_errno(r, "Failed to watch notification socket: %m");
1840
1841 if (sd_watchdog_enabled(false, &s->watchdog_usec) > 0) {
1842 s->send_watchdog = true;
1843
1844 r = sd_event_add_time(s->event, &s->watchdog_event_source, CLOCK_MONOTONIC, now(CLOCK_MONOTONIC) + s->watchdog_usec/2, s->watchdog_usec/4, dispatch_watchdog, s);
1845 if (r < 0)
1846 return log_error_errno(r, "Failed to add watchdog time event: %m");
1847 }
1848
1849 /* The notify event source added above should fire pretty soon, and we'll use that to send the READY=1 message. */
1850
1851 return 0;
1852 }
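/* Illustration only, never compiled: a minimal, self-contained sketch of the non-blocking notification
 * scheme described above, written against plain POSIX calls instead of the helpers used in this file.
 * Error handling is reduced to the bare minimum, and abstract socket addresses (NOTIFY_SOCKET values
 * starting with '@') are not handled. */
#if 0
#include <errno.h>
#include <poll.h>
#include <stdlib.h>
#include <string.h>
#include <sys/socket.h>
#include <sys/un.h>
#include <unistd.h>

static int notify_ready_nonblocking(void) {
        struct sockaddr_un sa = { .sun_family = AF_UNIX };
        struct pollfd p;
        const char *e;
        int fd;

        e = getenv("NOTIFY_SOCKET");
        if (!e || strlen(e) >= sizeof(sa.sun_path))
                return -EINVAL;
        strncpy(sa.sun_path, e, sizeof(sa.sun_path) - 1);

        /* Non-blocking, so that we never wait on the service manager and hence cannot deadlock with it */
        fd = socket(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC|SOCK_NONBLOCK, 0);
        if (fd < 0)
                return -errno;

        if (connect(fd, (struct sockaddr*) &sa, sizeof(sa)) < 0) {
                close(fd);
                return -errno;
        }

        /* Wait until the socket is writable before sending anything, just like journald waits for EPOLLOUT */
        p = (struct pollfd) { .fd = fd, .events = POLLOUT };
        (void) poll(&p, 1, -1);

        if (send(fd, "READY=1", 7, MSG_DONTWAIT) < 0) {
                close(fd);
                return -errno;
        }

        close(fd);
        return 0;
}
#endif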
1853
1854 static int synchronize_second_half(sd_event_source *event_source, void *userdata) {
1855 Varlink *link = userdata;
1856 Server *s;
1857 int r;
1858
1859 assert(link);
1860 assert_se(s = varlink_get_userdata(link));
1861
1862 /* This is the "second half" of the Synchronize() varlink method. This function is called as a
1863 * deferred event source at a low priority, to ensure the synchronization completes after all queued log
1864 * messages are processed. */
1865 server_full_sync(s);
1866
1867 /* Let's get rid of the event source now, by marking it as non-floating again. It then has no ref
1868 * anymore and is immediately destroyed after we return from this function, i.e. from this event
1869 * source handler at the end. */
1870 r = sd_event_source_set_floating(event_source, false);
1871 if (r < 0)
1872 return log_error_errno(r, "Failed to mark event source as non-floating: %m");
1873
1874 return varlink_reply(link, NULL);
1875 }
1876
1877 static void synchronize_destroy(void *userdata) {
1878 varlink_unref(userdata);
1879 }
1880
1881 static int vl_method_synchronize(Varlink *link, JsonVariant *parameters, VarlinkMethodFlags flags, void *userdata) {
1882 _cleanup_(sd_event_source_unrefp) sd_event_source *event_source = NULL;
1883 Server *s = userdata;
1884 int r;
1885
1886 assert(link);
1887 assert(s);
1888
1889 if (json_variant_elements(parameters) > 0)
1890 return varlink_error_invalid_parameter(link, parameters);
1891
1892 log_info("Received client request to sync journal.");
1893
1894 /* We don't do the main work now, but instead enqueue a deferred event loop job which will do
1895 * it. That job is scheduled at low priority, so that we return from this method call only after all
1896 * queued but not processed log messages are written to disk, so that this method call returning can
1897 * be used as a nice synchronization point. */
1898 r = sd_event_add_defer(s->event, &event_source, synchronize_second_half, link);
1899 if (r < 0)
1900 return log_error_errno(r, "Failed to allocate defer event source: %m");
1901
1902 r = sd_event_source_set_destroy_callback(event_source, synchronize_destroy);
1903 if (r < 0)
1904 return log_error_errno(r, "Failed to set event source destroy callback: %m");
1905
1906 varlink_ref(link); /* The varlink object is now left to the destroy callback to unref */
1907
1908 r = sd_event_source_set_priority(event_source, SD_EVENT_PRIORITY_NORMAL+15);
1909 if (r < 0)
1910 return log_error_errno(r, "Failed to set defer event source priority: %m");
1911
1912 /* Give up ownership of this event source. It will now be destroyed along with the event loop itself,
1913 * unless it destroys itself earlier. */
1914 r = sd_event_source_set_floating(event_source, true);
1915 if (r < 0)
1916 return log_error_errno(r, "Failed to mark event source as floating: %m");
1917
1918 (void) sd_event_source_set_description(event_source, "deferred-sync");
1919
1920 return 0;
1921 }
1922
1923 static int vl_method_rotate(Varlink *link, JsonVariant *parameters, VarlinkMethodFlags flags, void *userdata) {
1924 Server *s = userdata;
1925
1926 assert(link);
1927 assert(s);
1928
1929 if (json_variant_elements(parameters) > 0)
1930 return varlink_error_invalid_parameter(link, parameters);
1931
1932 log_info("Received client request to rotate journal.");
1933 server_full_rotate(s);
1934
1935 return varlink_reply(link, NULL);
1936 }
1937
1938 static int vl_method_flush_to_var(Varlink *link, JsonVariant *parameters, VarlinkMethodFlags flags, void *userdata) {
1939 Server *s = userdata;
1940
1941 assert(link);
1942 assert(s);
1943
1944 if (json_variant_elements(parameters) > 0)
1945 return varlink_error_invalid_parameter(link, parameters);
1946
1947 log_info("Received client request to flush runtime journal.");
1948 server_full_flush(s);
1949
1950 return varlink_reply(link, NULL);
1951 }
1952
1953 static int vl_method_relinquish_var(Varlink *link, JsonVariant *parameters, VarlinkMethodFlags flags, void *userdata) {
1954 Server *s = userdata;
1955
1956 assert(link);
1957 assert(s);
1958
1959 if (json_variant_elements(parameters) > 0)
1960 return varlink_error_invalid_parameter(link, parameters);
1961
1962 log_info("Received client request to relinquish /var access.");
1963 server_relinquish_var(s);
1964
1965 return varlink_reply(link, NULL);
1966 }
1967
1968 static int server_open_varlink(Server *s) {
1969 int r;
1970
1971 assert(s);
1972
1973 r = varlink_server_new(&s->varlink_server, VARLINK_SERVER_ROOT_ONLY);
1974 if (r < 0)
1975 return r;
1976
1977 varlink_server_set_userdata(s->varlink_server, s);
1978
1979 r = varlink_server_bind_method_many(
1980 s->varlink_server,
1981 "io.systemd.Journal.Synchronize", vl_method_synchronize,
1982 "io.systemd.Journal.Rotate", vl_method_rotate,
1983 "io.systemd.Journal.FlushToVar", vl_method_flush_to_var,
1984 "io.systemd.Journal.RelinquishVar", vl_method_relinquish_var);
1985 if (r < 0)
1986 return r;
1987
1988 r = varlink_server_listen_address(s->varlink_server, "/run/systemd/journal/io.systemd.journal", 0600);
1989 if (r < 0)
1990 return r;
1991
1992 r = varlink_server_attach_event(s->varlink_server, s->event, SD_EVENT_PRIORITY_NORMAL);
1993 if (r < 0)
1994 return r;
1995
1996 return 0;
1997 }
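/* Illustration only, never compiled: a sketch of how a privileged client (e.g. journalctl) could call
 * one of the methods registered above. It assumes the internal varlink client helpers declared in
 * varlink.h (varlink_connect_address(), varlink_call(), varlink_unrefp()); their exact signatures
 * should be checked there. On the wire this is simply a NUL-terminated JSON object such as
 * {"method":"io.systemd.Journal.Synchronize"} sent over the AF_UNIX stream socket bound above,
 * answered by a NUL-terminated JSON reply. */
#if 0
static int journal_synchronize_via_varlink(void) {
        _cleanup_(varlink_unrefp) Varlink *link = NULL;
        const char *error = NULL;
        int r;

        r = varlink_connect_address(&link, "/run/systemd/journal/io.systemd.journal");
        if (r < 0)
                return log_error_errno(r, "Failed to connect to journal varlink socket: %m");

        /* NULL parameters; we only care whether the call succeeds */
        r = varlink_call(link, "io.systemd.Journal.Synchronize", NULL, NULL, &error, NULL);
        if (r < 0)
                return log_error_errno(r, "Failed to issue Synchronize() call: %m");
        if (error)
                return log_error_errno(SYNTHETIC_ERRNO(EIO), "Synchronize() call failed: %s", error);

        return 0;
}
#endif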
1998
1999 int server_init(Server *s) {
2000 _cleanup_fdset_free_ FDSet *fds = NULL;
2001 int n, r, fd;
2002 bool no_sockets;
2003
2004 assert(s);
2005
2006 *s = (Server) {
2007 .syslog_fd = -1,
2008 .native_fd = -1,
2009 .stdout_fd = -1,
2010 .dev_kmsg_fd = -1,
2011 .audit_fd = -1,
2012 .hostname_fd = -1,
2013 .notify_fd = -1,
2014
2015 .compress.enabled = true,
2016 .compress.threshold_bytes = (uint64_t) -1,
2017 .seal = true,
2018 .read_kmsg = true,
2019
2020 .watchdog_usec = USEC_INFINITY,
2021
2022 .sync_interval_usec = DEFAULT_SYNC_INTERVAL_USEC,
2023 .sync_scheduled = false,
2024
2025 .rate_limit_interval = DEFAULT_RATE_LIMIT_INTERVAL,
2026 .rate_limit_burst = DEFAULT_RATE_LIMIT_BURST,
2027
2028 .forward_to_wall = true,
2029
2030 .max_file_usec = DEFAULT_MAX_FILE_USEC,
2031
2032 .max_level_store = LOG_DEBUG,
2033 .max_level_syslog = LOG_DEBUG,
2034 .max_level_kmsg = LOG_NOTICE,
2035 .max_level_console = LOG_INFO,
2036 .max_level_wall = LOG_EMERG,
2037
2038 .line_max = DEFAULT_LINE_MAX,
2039
2040 .runtime_storage.name = "Runtime Journal",
2041 .system_storage.name = "System Journal",
2042 };
2043
2044 journal_reset_metrics(&s->system_storage.metrics);
2045 journal_reset_metrics(&s->runtime_storage.metrics);
2046
2047 server_parse_config_file(s);
2048
2049 r = proc_cmdline_parse(parse_proc_cmdline_item, s, PROC_CMDLINE_STRIP_RD_PREFIX);
2050 if (r < 0)
2051 log_warning_errno(r, "Failed to parse kernel command line, ignoring: %m");
2052
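/* The rate limit interval and burst settings only make sense together: if exactly one of them is
 * zero, reset both to zero, i.e. disable rate limiting entirely. */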
2053 if (!!s->rate_limit_interval ^ !!s->rate_limit_burst) {
2054 log_debug("Setting both rate limit interval and burst from "USEC_FMT",%u to 0,0",
2055 s->rate_limit_interval, s->rate_limit_burst);
2056 s->rate_limit_interval = s->rate_limit_burst = 0;
2057 }
2058
2059 (void) mkdir_p("/run/systemd/journal", 0755);
2060
2061 s->user_journals = ordered_hashmap_new(NULL);
2062 if (!s->user_journals)
2063 return log_oom();
2064
2065 s->mmap = mmap_cache_new();
2066 if (!s->mmap)
2067 return log_oom();
2068
2069 s->deferred_closes = set_new(NULL);
2070 if (!s->deferred_closes)
2071 return log_oom();
2072
2073 r = sd_event_default(&s->event);
2074 if (r < 0)
2075 return log_error_errno(r, "Failed to create event loop: %m");
2076
2077 n = sd_listen_fds(true);
2078 if (n < 0)
2079 return log_error_errno(n, "Failed to read listening file descriptors from environment: %m");
2080
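/* Illustration only (an assumption about the shipped units, not part of this file): the file
 * descriptors matched below are normally handed over via socket activation. The socket units declare
 * them roughly as follows, with unrelated settings omitted:
 *
 *     # systemd-journald.socket
 *     [Socket]
 *     ListenStream=/run/systemd/journal/stdout
 *     ListenDatagram=/run/systemd/journal/socket
 *
 *     # systemd-journald-dev-log.socket
 *     [Socket]
 *     ListenDatagram=/run/systemd/journal/dev-log
 *     Symlinks=/dev/log
 *
 *     # systemd-journald-audit.socket
 *     [Socket]
 *     ListenNetlink=audit 1
 */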
2081 for (fd = SD_LISTEN_FDS_START; fd < SD_LISTEN_FDS_START + n; fd++) {
2082
2083 if (sd_is_socket_unix(fd, SOCK_DGRAM, -1, "/run/systemd/journal/socket", 0) > 0) {
2084
2085 if (s->native_fd >= 0)
2086 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
2087 "Too many native sockets passed.");
2088
2089 s->native_fd = fd;
2090
2091 } else if (sd_is_socket_unix(fd, SOCK_STREAM, 1, "/run/systemd/journal/stdout", 0) > 0) {
2092
2093 if (s->stdout_fd >= 0)
2094 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
2095 "Too many stdout sockets passed.");
2096
2097 s->stdout_fd = fd;
2098
2099 } else if (sd_is_socket_unix(fd, SOCK_DGRAM, -1, "/dev/log", 0) > 0 ||
2100 sd_is_socket_unix(fd, SOCK_DGRAM, -1, "/run/systemd/journal/dev-log", 0) > 0) {
2101
2102 if (s->syslog_fd >= 0)
2103 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
2104 "Too many /dev/log sockets passed.");
2105
2106 s->syslog_fd = fd;
2107
2108 } else if (sd_is_socket(fd, AF_NETLINK, SOCK_RAW, -1) > 0) {
2109
2110 if (s->audit_fd >= 0)
2111 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
2112 "Too many audit sockets passed.");
2113
2114 s->audit_fd = fd;
2115
2116 } else {
2117
2118 if (!fds) {
2119 fds = fdset_new();
2120 if (!fds)
2121 return log_oom();
2122 }
2123
2124 r = fdset_put(fds, fd);
2125 if (r < 0)
2126 return log_oom();
2127 }
2128 }
2129
2130 /* Try to restore streams, but don't bother if this fails */
2131 (void) server_restore_streams(s, fds);
2132
2133 if (fdset_size(fds) > 0) {
2134 log_warning("%u unknown file descriptors passed, closing.", fdset_size(fds));
2135 fds = fdset_free(fds);
2136 }
2137
2138 no_sockets = s->native_fd < 0 && s->stdout_fd < 0 && s->syslog_fd < 0 && s->audit_fd < 0;
2139
2140 /* always open stdout, syslog, native, and kmsg sockets */
2141
2142 /* systemd-journald.socket: /run/systemd/journal/stdout */
2143 r = server_open_stdout_socket(s);
2144 if (r < 0)
2145 return r;
2146
2147 /* systemd-journald-dev-log.socket: /run/systemd/journal/dev-log */
2148 r = server_open_syslog_socket(s);
2149 if (r < 0)
2150 return r;
2151
2152 /* systemd-journald.socket: /run/systemd/journal/socket */
2153 r = server_open_native_socket(s);
2154 if (r < 0)
2155 return r;
2156
2157 /* /dev/kmsg */
2158 r = server_open_dev_kmsg(s);
2159 if (r < 0)
2160 return r;
2161
2162 /* Open the audit socket, unless other sockets were passed in but audit was not among them (then it was left out on purpose) */
2163 if (s->audit_fd >= 0 || no_sockets) {
2164 r = server_open_audit(s);
2165 if (r < 0)
2166 return r;
2167 }
2168
2169 r = server_open_varlink(s);
2170 if (r < 0)
2171 return r;
2172
2173 r = server_open_kernel_seqnum(s);
2174 if (r < 0)
2175 return r;
2176
2177 r = server_open_hostname(s);
2178 if (r < 0)
2179 return r;
2180
2181 r = setup_signals(s);
2182 if (r < 0)
2183 return r;
2184
2185 s->rate_limit = journal_rate_limit_new();
2186 if (!s->rate_limit)
2187 return -ENOMEM;
2188
2189 r = cg_get_root_path(&s->cgroup_root);
2190 if (r < 0)
2191 return r;
2192
2193 server_cache_hostname(s);
2194 server_cache_boot_id(s);
2195 server_cache_machine_id(s);
2196
2197 s->runtime_storage.path = strjoin("/run/log/journal/", SERVER_MACHINE_ID(s));
2198 s->system_storage.path = strjoin("/var/log/journal/", SERVER_MACHINE_ID(s));
2199 if (!s->runtime_storage.path || !s->system_storage.path)
2200 return -ENOMEM;
2201
2202 (void) server_connect_notify(s);
2203
2204 (void) client_context_acquire_default(s);
2205
2206 return system_journal_open(s, false, false);
2207 }
2208
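/* When built with gcrypt, append Forward Secure Sealing (FSS) tags to the system journal and all open
 * per-user journals if their sealing interval has elapsed, so that offline tampering with already
 * sealed entries can later be detected with "journalctl --verify". */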
2209 void server_maybe_append_tags(Server *s) {
2210 #if HAVE_GCRYPT
2211 JournalFile *f;
2212 Iterator i;
2213 usec_t n;
2214
2215 n = now(CLOCK_REALTIME);
2216
2217 if (s->system_journal)
2218 journal_file_maybe_append_tag(s->system_journal, n);
2219
2220 ORDERED_HASHMAP_FOREACH(f, s->user_journals, i)
2221 journal_file_maybe_append_tag(f, n);
2222 #endif
2223 }
2224
2225 void server_done(Server *s) {
2226 assert(s);
2227
2228 set_free_with_destructor(s->deferred_closes, journal_file_close);
2229
2230 while (s->stdout_streams)
2231 stdout_stream_free(s->stdout_streams);
2232
2233 client_context_flush_all(s);
2234
2235 if (s->system_journal)
2236 (void) journal_file_close(s->system_journal);
2237
2238 if (s->runtime_journal)
2239 (void) journal_file_close(s->runtime_journal);
2240
2241 ordered_hashmap_free_with_destructor(s->user_journals, journal_file_close);
2242
2243 varlink_server_unref(s->varlink_server);
2244
2245 sd_event_source_unref(s->syslog_event_source);
2246 sd_event_source_unref(s->native_event_source);
2247 sd_event_source_unref(s->stdout_event_source);
2248 sd_event_source_unref(s->dev_kmsg_event_source);
2249 sd_event_source_unref(s->audit_event_source);
2250 sd_event_source_unref(s->sync_event_source);
2251 sd_event_source_unref(s->sigusr1_event_source);
2252 sd_event_source_unref(s->sigusr2_event_source);
2253 sd_event_source_unref(s->sigterm_event_source);
2254 sd_event_source_unref(s->sigint_event_source);
2255 sd_event_source_unref(s->sigrtmin1_event_source);
2256 sd_event_source_unref(s->hostname_event_source);
2257 sd_event_source_unref(s->notify_event_source);
2258 sd_event_source_unref(s->watchdog_event_source);
2259 sd_event_unref(s->event);
2260
2261 safe_close(s->syslog_fd);
2262 safe_close(s->native_fd);
2263 safe_close(s->stdout_fd);
2264 safe_close(s->dev_kmsg_fd);
2265 safe_close(s->audit_fd);
2266 safe_close(s->hostname_fd);
2267 safe_close(s->notify_fd);
2268
2269 if (s->rate_limit)
2270 journal_rate_limit_free(s->rate_limit);
2271
2272 if (s->kernel_seqnum)
2273 munmap(s->kernel_seqnum, sizeof(uint64_t));
2274
2275 free(s->buffer);
2276 free(s->tty_path);
2277 free(s->cgroup_root);
2278 free(s->hostname_field);
2279 free(s->runtime_storage.path);
2280 free(s->system_storage.path);
2281
2282 mmap_cache_unref(s->mmap);
2283 }
2284
2285 static const char* const storage_table[_STORAGE_MAX] = {
2286 [STORAGE_AUTO] = "auto",
2287 [STORAGE_VOLATILE] = "volatile",
2288 [STORAGE_PERSISTENT] = "persistent",
2289 [STORAGE_NONE] = "none"
2290 };
2291
2292 DEFINE_STRING_TABLE_LOOKUP(storage, Storage);
2293 DEFINE_CONFIG_PARSE_ENUM(config_parse_storage, storage, Storage, "Failed to parse storage setting");
2294
2295 static const char* const split_mode_table[_SPLIT_MAX] = {
2296 [SPLIT_LOGIN] = "login",
2297 [SPLIT_UID] = "uid",
2298 [SPLIT_NONE] = "none",
2299 };
2300
2301 DEFINE_STRING_TABLE_LOOKUP(split_mode, SplitMode);
2302 DEFINE_CONFIG_PARSE_ENUM(config_parse_split_mode, split_mode, SplitMode, "Failed to parse split mode setting");
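/* Illustration only: the string tables above back the journald.conf settings of the same names, e.g.:
 *
 *     [Journal]
 *     Storage=persistent        # one of: auto, volatile, persistent, none
 *     SplitMode=uid             # one of: login, uid, none
 */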
2303
2304 int config_parse_line_max(
2305 const char* unit,
2306 const char *filename,
2307 unsigned line,
2308 const char *section,
2309 unsigned section_line,
2310 const char *lvalue,
2311 int ltype,
2312 const char *rvalue,
2313 void *data,
2314 void *userdata) {
2315
2316 size_t *sz = data;
2317 int r;
2318
2319 assert(filename);
2320 assert(lvalue);
2321 assert(rvalue);
2322 assert(data);
2323
2324 if (isempty(rvalue))
2325 /* Empty assignment means default */
2326 *sz = DEFAULT_LINE_MAX;
2327 else {
2328 uint64_t v;
2329
2330 r = parse_size(rvalue, 1024, &v);
2331 if (r < 0) {
2332 log_syntax(unit, LOG_ERR, filename, line, r, "Failed to parse LineMax= value, ignoring: %s", rvalue);
2333 return 0;
2334 }
2335
2336 if (v < 79) {
2337 /* Why specify 79 as the minimum line length? Simply because the most common traditional
2338 * terminal width is 80 characters, and it can make sense to break a line one character before
2339 * that natural limit. */
2340 log_syntax(unit, LOG_WARNING, filename, line, 0, "LineMax= too small, clamping to 79: %s", rvalue);
2341 *sz = 79;
2342 } else if (v > (uint64_t) (SSIZE_MAX-1)) {
2343 /* So, why specify SSIZE_MAX-1 here? Because that's one below the largest size value read()
2344 * can return, and we need one extra byte for the trailing NUL byte. In practice such large
2345 * memory allocations would fail long before this limit is reached, hence the limit is mostly
2346 * theoretical. */
2347 log_syntax(unit, LOG_WARNING, filename, line, 0, "LineMax= too large, clamping to %" PRIu64 ": %s", (uint64_t) (SSIZE_MAX-1), rvalue);
2348 *sz = SSIZE_MAX-1;
2349 } else
2350 *sz = (size_t) v;
2351 }
2352
2353 return 0;
2354 }
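/* Illustration only: LineMax= is parsed with parse_size() in base 1024, so values may carry size
 * suffixes, and the result is clamped to the [79, SSIZE_MAX-1] range established above, e.g.:
 *
 *     [Journal]
 *     LineMax=48K       # the built-in default (DEFAULT_LINE_MAX)
 *     LineMax=          # empty assignment: reset to the default
 *     LineMax=16        # too small, clamped to 79 with a warning
 */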
2355
2356 int config_parse_compress(
2357 const char* unit,
2358 const char *filename,
2359 unsigned line,
2360 const char *section,
2361 unsigned section_line,
2362 const char *lvalue,
2363 int ltype,
2364 const char *rvalue,
2365 void *data,
2366 void *userdata) {
2367
2368 JournalCompressOptions* compress = data;
2369 int r;
2370
2371 if (isempty(rvalue)) {
2372 compress->enabled = true;
2373 compress->threshold_bytes = (uint64_t) -1;
2374 } else if (streq(rvalue, "1")) {
2375 log_syntax(unit, LOG_WARNING, filename, line, 0,
2376 "Compress= ambiguously specified as 1, enabling compression with default threshold");
2377 compress->enabled = true;
2378 } else if (streq(rvalue, "0")) {
2379 log_syntax(unit, LOG_WARNING, filename, line, 0,
2380 "Compress= ambiguously specified as 0, disabling compression");
2381 compress->enabled = false;
2382 } else {
2383 r = parse_boolean(rvalue);
2384 if (r < 0) {
2385 r = parse_size(rvalue, 1024, &compress->threshold_bytes);
2386 if (r < 0)
2387 log_syntax(unit, LOG_ERR, filename, line, r,
2388 "Failed to parse Compress= value, ignoring: %s", rvalue);
2389 else
2390 compress->enabled = true;
2391 } else
2392 compress->enabled = r;
2393 }
2394
2395 return 0;
2396 }
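/* Illustration only: Compress= accepts either a boolean or a size threshold, with the literal values
 * "0" and "1" special-cased above because they would be ambiguous between the two forms, e.g.:
 *
 *     [Journal]
 *     Compress=yes      # enable compression with the default threshold
 *     Compress=no       # disable compression
 *     Compress=8K       # enable compression for objects larger than roughly 8 KiB
 *     Compress=         # empty assignment: back to the default (enabled, default threshold)
 */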