]> git.ipfire.org Git - thirdparty/systemd.git/blame_incremental - src/coredump/coredump.c
Kill several SysV compat functionalities (v258) (#38178)
[thirdparty/systemd.git] / src / coredump / coredump.c
... / ...
CommitLineData
1/* SPDX-License-Identifier: LGPL-2.1-or-later */
2
3#include <elf.h>
4#include <stdio.h>
5#include <sys/mount.h>
6#include <sys/statvfs.h>
7#include <sys/xattr.h>
8#include <unistd.h>
9
10#include "sd-bus.h"
11#include "sd-daemon.h"
12#include "sd-journal.h"
13#include "sd-json.h"
14#include "sd-login.h"
15#include "sd-messages.h"
16
17#include "acl-util.h"
18#include "alloc-util.h"
19#include "bus-error.h"
20#include "capability-util.h"
21#include "cgroup-util.h"
22#include "compress.h"
23#include "conf-parser.h"
24#include "copy.h"
25#include "coredump-util.h"
26#include "coredump-vacuum.h"
27#include "dirent-util.h"
28#include "elf-util.h"
29#include "errno-util.h"
30#include "escape.h"
31#include "fd-util.h"
32#include "fileio.h"
33#include "fs-util.h"
34#include "io-util.h"
35#include "iovec-util.h"
36#include "journal-importer.h"
37#include "journal-send.h"
38#include "json-util.h"
39#include "log.h"
40#include "main-func.h"
41#include "memory-util.h"
42#include "memstream-util.h"
43#include "mkdir-label.h"
44#include "namespace-util.h"
45#include "parse-util.h"
46#include "path-util.h"
47#include "pidref.h"
48#include "process-util.h"
49#include "signal-util.h"
50#include "socket-util.h"
51#include "special.h"
52#include "stat-util.h"
53#include "string-table.h"
54#include "string-util.h"
55#include "tmpfile-util.h"
56#include "uid-classification.h"
57#include "user-util.h"
58
/* Upper bound on the size of coredumps we are willing to process: 1G when pointers are 32 bits wide,
 * 32G when they are 64 bits wide. */
#if __SIZEOF_POINTER__ == 8
#  define PROCESS_SIZE_MAX ((uint64_t) 32 << 30)
#elif __SIZEOF_POINTER__ == 4
#  define PROCESS_SIZE_MAX ((uint64_t) 1 << 30)
#else
#  error "Unexpected pointer size"
#endif

/* Upper bound on the size of coredumps we leave around on disk */
#define EXTERNAL_SIZE_MAX PROCESS_SIZE_MAX

/* Upper bound on the size of coredumps we store in the journal */
#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
/* oss-fuzz limits memory usage. */
#  define JOURNAL_SIZE_MAX ((size_t) 10 << 20)
#else
#  define JOURNAL_SIZE_MAX ((size_t) 767 << 20)
#endif

/* Floor used when limits are lowered because of available memory: never go below 4MB for writing
 * core files to storage. */
#define PROCESS_SIZE_MIN (4U << 20)
82
/* Make sure to not make this larger than the maximum journal entry
 * size. See DATA_SIZE_MAX in journal-importer.h. */
assert_cc(JOURNAL_SIZE_MAX <= DATA_SIZE_MAX);

/* Directory on which attach_mount_tree() attaches the mount tree referenced by Context.mount_tree_fd */
#define MOUNT_TREE_ROOT "/run/systemd/mount-rootfs"
88
/* Indexes into the per-process metadata cache (Context.meta[] and Context.meta_size[]). */
typedef enum {
        /* The leading entries store the same metadata, in the same order, as what the kernel passes us
         * via argv[], i.e. the specifiers of our pattern in /proc/sys/kernel/core_pattern, see
         * core(5). */

        /* %P: as seen in the initial pid namespace */
        META_ARGV_PID = 0,
        /* %u: as seen in the initial user namespace */
        META_ARGV_UID,
        /* %g: as seen in the initial user namespace */
        META_ARGV_GID,
        /* %s: number of signal causing dump */
        META_ARGV_SIGNAL,
        /* %t: time of dump, expressed as seconds since the Epoch (we expand this to μs granularity) */
        META_ARGV_TIMESTAMP,
        /* %c: core file size soft resource limit */
        META_ARGV_RLIMIT,

        /* Number of argv[] fields that always have to be present. */
        _META_ARGV_REQUIRED,

        /* The fields below were added to kernel/core_pattern at later points, so they might be
         * missing. */

        /* %h: hostname */
        META_ARGV_HOSTNAME = _META_ARGV_REQUIRED,
        /* %d: as set by the kernel */
        META_ARGV_DUMPABLE,
        /* %F: pidfd of the process, since v6.16 */
        META_ARGV_PIDFD,

        /* If new fields are added, they should be added here, to maintain compatibility with callers
         * which don't know about the new fields. */
        _META_ARGV_MAX,

        /* The following indexes are cached for a couple of special fields we use (and thereby need to
         * be retrieved quickly) for naming coredump files, and attaching xattrs. Unlike the previous
         * ones they are retrieved from the runtime environment. */
        META_COMM = _META_ARGV_MAX,

        /* The rest are similar to the previous ones except that we won't fail if one of them is
         * missing in a message sent over the socket. */
        META_EXE,
        META_UNIT,
        META_PROC_AUXV,

        _META_MAX
} meta_argv_t;
126
127static const char * const meta_field_names[_META_MAX] = {
128 [META_ARGV_PID] = "COREDUMP_PID=",
129 [META_ARGV_UID] = "COREDUMP_UID=",
130 [META_ARGV_GID] = "COREDUMP_GID=",
131 [META_ARGV_SIGNAL] = "COREDUMP_SIGNAL=",
132 [META_ARGV_TIMESTAMP] = "COREDUMP_TIMESTAMP=",
133 [META_ARGV_RLIMIT] = "COREDUMP_RLIMIT=",
134 [META_ARGV_HOSTNAME] = "COREDUMP_HOSTNAME=",
135 [META_ARGV_DUMPABLE] = "COREDUMP_DUMPABLE=",
136 [META_ARGV_PIDFD] = "COREDUMP_BY_PIDFD=",
137 [META_COMM] = "COREDUMP_COMM=",
138 [META_EXE] = "COREDUMP_EXE=",
139 [META_UNIT] = "COREDUMP_UNIT=",
140 [META_PROC_AUXV] = "COREDUMP_PROC_AUXV=",
141};
142
143typedef struct Context {
144 PidRef pidref;
145 uid_t uid;
146 gid_t gid;
147 unsigned dumpable;
148 int signo;
149 uint64_t rlimit;
150 bool is_pid1;
151 bool is_journald;
152 bool got_pidfd;
153 int mount_tree_fd;
154
155 /* These point into external memory, are not owned by this object */
156 const char *meta[_META_MAX];
157 size_t meta_size[_META_MAX];
158} Context;
159
160#define CONTEXT_NULL \
161 (Context) { \
162 .pidref = PIDREF_NULL, \
163 .uid = UID_INVALID, \
164 .gid = GID_INVALID, \
165 .mount_tree_fd = -EBADF, \
166 }
167
168typedef enum CoredumpStorage {
169 COREDUMP_STORAGE_NONE,
170 COREDUMP_STORAGE_EXTERNAL,
171 COREDUMP_STORAGE_JOURNAL,
172 _COREDUMP_STORAGE_MAX,
173 _COREDUMP_STORAGE_INVALID = -EINVAL,
174} CoredumpStorage;
175
176static const char* const coredump_storage_table[_COREDUMP_STORAGE_MAX] = {
177 [COREDUMP_STORAGE_NONE] = "none",
178 [COREDUMP_STORAGE_EXTERNAL] = "external",
179 [COREDUMP_STORAGE_JOURNAL] = "journal",
180};
181
182DEFINE_PRIVATE_STRING_TABLE_LOOKUP(coredump_storage, CoredumpStorage);
183static DEFINE_CONFIG_PARSE_ENUM(config_parse_coredump_storage, coredump_storage, CoredumpStorage);
184
185static CoredumpStorage arg_storage = COREDUMP_STORAGE_EXTERNAL;
186static bool arg_compress = true;
187static uint64_t arg_process_size_max = PROCESS_SIZE_MAX;
188static uint64_t arg_external_size_max = EXTERNAL_SIZE_MAX;
189static uint64_t arg_journal_size_max = JOURNAL_SIZE_MAX;
190static uint64_t arg_keep_free = UINT64_MAX;
191static uint64_t arg_max_use = UINT64_MAX;
192#if HAVE_DWFL_SET_SYSROOT
193static bool arg_enter_namespace = false;
194#endif
195
196static void context_done(Context *c) {
197 assert(c);
198
199 pidref_done(&c->pidref);
200 c->mount_tree_fd = safe_close(c->mount_tree_fd);
201}
202
203static int parse_config(void) {
204 static const ConfigTableItem items[] = {
205 { "Coredump", "Storage", config_parse_coredump_storage, 0, &arg_storage },
206 { "Coredump", "Compress", config_parse_bool, 0, &arg_compress },
207 { "Coredump", "ProcessSizeMax", config_parse_iec_uint64, 0, &arg_process_size_max },
208 { "Coredump", "ExternalSizeMax", config_parse_iec_uint64_infinity, 0, &arg_external_size_max },
209 { "Coredump", "JournalSizeMax", config_parse_iec_size, 0, &arg_journal_size_max },
210 { "Coredump", "KeepFree", config_parse_iec_uint64, 0, &arg_keep_free },
211 { "Coredump", "MaxUse", config_parse_iec_uint64, 0, &arg_max_use },
212#if HAVE_DWFL_SET_SYSROOT
213 { "Coredump", "EnterNamespace", config_parse_bool, 0, &arg_enter_namespace },
214#else
215 { "Coredump", "EnterNamespace", config_parse_warn_compat, DISABLED_CONFIGURATION, NULL },
216#endif
217 {}
218 };
219
220 int r;
221
222 r = config_parse_standard_file_with_dropins(
223 "systemd/coredump.conf",
224 "Coredump\0",
225 config_item_table_lookup,
226 items,
227 CONFIG_PARSE_WARN,
228 /* userdata= */ NULL);
229 if (r < 0)
230 return r;
231
232 /* Let's make sure we fix up the maximum size we send to the journal here on the client side, for
233 * efficiency reasons. journald wouldn't accept anything larger anyway. */
234 if (arg_journal_size_max > JOURNAL_SIZE_MAX) {
235 log_warning("JournalSizeMax= set to larger value (%s) than journald would accept (%s), lowering automatically.",
236 FORMAT_BYTES(arg_journal_size_max), FORMAT_BYTES(JOURNAL_SIZE_MAX));
237 arg_journal_size_max = JOURNAL_SIZE_MAX;
238 }
239
240 return 0;
241}
242
243static uint64_t storage_size_max(void) {
244 if (arg_storage == COREDUMP_STORAGE_EXTERNAL)
245 return arg_external_size_max;
246 if (arg_storage == COREDUMP_STORAGE_JOURNAL)
247 return arg_journal_size_max;
248 assert(arg_storage == COREDUMP_STORAGE_NONE);
249 return 0;
250}
251
/* Grants the owning user read access to the coredump referenced by fd through an ACL entry.
 *
 * Nothing is changed if allow_user is false, or if uid is a system, dynamic, greeter or the nobody
 * user. When built without ACL support this only validates its arguments.
 *
 * Returns 0 on success or if nothing had to be done, a negative error if the ACL could not be
 * adjusted (the failure is logged). */
static int fix_acl(int fd, uid_t uid, bool allow_user) {
        assert(fd >= 0);
        assert(uid_is_valid(uid));

#if HAVE_ACL
        /* We don't allow users to read coredumps if the uid or capabilities were changed, and only
         * normal users get access to their own coredumps. */
        bool eligible =
                allow_user &&
                !uid_is_system(uid) &&
                !uid_is_dynamic(uid) &&
                !uid_is_greeter(uid) &&
                uid != UID_NOBODY;

        if (eligible) {
                /* Read access only: users may not write or delete their coredumps */
                int r = fd_add_uid_acl_permission(fd, uid, ACL_READ);
                if (r < 0)
                        return log_error_errno(r, "Failed to adjust ACL of the coredump: %m");
        }
#endif

        return 0;
}
274
275static int fix_xattr(int fd, const Context *context) {
276 static const char * const xattrs[_META_MAX] = {
277 [META_ARGV_PID] = "user.coredump.pid",
278 [META_ARGV_UID] = "user.coredump.uid",
279 [META_ARGV_GID] = "user.coredump.gid",
280 [META_ARGV_SIGNAL] = "user.coredump.signal",
281 [META_ARGV_TIMESTAMP] = "user.coredump.timestamp",
282 [META_ARGV_RLIMIT] = "user.coredump.rlimit",
283 [META_ARGV_HOSTNAME] = "user.coredump.hostname",
284 [META_COMM] = "user.coredump.comm",
285 [META_EXE] = "user.coredump.exe",
286 };
287
288 int r = 0;
289
290 assert(fd >= 0);
291
292 /* Attach some metadata to coredumps via extended attributes. Just because we can. */
293
294 for (unsigned i = 0; i < _META_MAX; i++) {
295 int k;
296
297 if (isempty(context->meta[i]) || !xattrs[i])
298 continue;
299
300 k = RET_NERRNO(fsetxattr(fd, xattrs[i], context->meta[i], strlen(context->meta[i]), XATTR_CREATE));
301 RET_GATHER(r, k);
302 }
303
304 return r;
305}
306
/* Passes s to xescape() with "./ " as second argument (presumably the set of characters to escape).
 * Used on the file name components assembled by make_filename(). */
#define filename_escape(s) xescape((s), "./ ")

/* Returns s itself, or a placeholder string usable in log messages if s is NULL. */
static const char *coredump_tmpfile_name(const char *s) {
        if (s)
                return s;

        return "(unnamed temporary file)";
}
312
313static int fix_permissions_and_link(
314 int fd,
315 const char *filename,
316 const char *target,
317 const Context *context,
318 bool allow_user) {
319
320 int r;
321
322 assert(fd >= 0);
323 assert(target);
324 assert(context);
325
326 /* Ignore errors on these */
327 (void) fchmod(fd, 0640);
328 (void) fix_acl(fd, context->uid, allow_user);
329 (void) fix_xattr(fd, context);
330
331 r = link_tmpfile(fd, filename, target, LINK_TMPFILE_SYNC);
332 if (r < 0)
333 return log_error_errno(r, "Failed to move coredump %s into place: %m", target);
334
335 return 0;
336}
337
338static int maybe_remove_external_coredump(
339 const Context *c,
340 const char *filename,
341 uint64_t size) {
342
343 assert(c);
344
345 /* Returns true if might remove, false if will not remove, < 0 on error. */
346
347 if (arg_storage != COREDUMP_STORAGE_NONE &&
348 (c->is_pid1 || c->is_journald)) /* Always keep around in case of journald/pid1, since we cannot rely on the journal to accept them */
349 return false;
350
351 if (arg_storage == COREDUMP_STORAGE_EXTERNAL &&
352 size <= arg_external_size_max)
353 return false;
354
355 if (!filename)
356 return true;
357
358 if (unlink(filename) < 0 && errno != ENOENT)
359 return log_error_errno(errno, "Failed to unlink %s: %m", filename);
360
361 return true;
362}
363
364static int make_filename(const Context *context, char **ret) {
365 _cleanup_free_ char *c = NULL, *u = NULL, *p = NULL, *t = NULL;
366 sd_id128_t boot = {};
367 int r;
368
369 assert(context);
370
371 c = filename_escape(context->meta[META_COMM]);
372 if (!c)
373 return -ENOMEM;
374
375 u = filename_escape(context->meta[META_ARGV_UID]);
376 if (!u)
377 return -ENOMEM;
378
379 r = sd_id128_get_boot(&boot);
380 if (r < 0)
381 return r;
382
383 p = filename_escape(context->meta[META_ARGV_PID]);
384 if (!p)
385 return -ENOMEM;
386
387 t = filename_escape(context->meta[META_ARGV_TIMESTAMP]);
388 if (!t)
389 return -ENOMEM;
390
391 if (asprintf(ret,
392 "/var/lib/systemd/coredump/core.%s.%s." SD_ID128_FORMAT_STR ".%s.%s",
393 c,
394 u,
395 SD_ID128_FORMAT_VAL(boot),
396 p,
397 t) < 0)
398 return -ENOMEM;
399
400 return 0;
401}
402
403static int grant_user_access(int core_fd, const Context *context) {
404 int at_secure = -1;
405 uid_t uid = UID_INVALID, euid = UID_INVALID;
406 uid_t gid = GID_INVALID, egid = GID_INVALID;
407 int r;
408
409 assert(core_fd >= 0);
410 assert(context);
411
412 if (!context->meta[META_PROC_AUXV])
413 return log_warning_errno(SYNTHETIC_ERRNO(ENODATA), "No auxv data, not adjusting permissions.");
414
415 uint8_t elf[EI_NIDENT];
416 errno = 0;
417 if (pread(core_fd, &elf, sizeof(elf), 0) != sizeof(elf))
418 return log_warning_errno(errno_or_else(EIO),
419 "Failed to pread from coredump fd: %s", STRERROR_OR_EOF(errno));
420
421 if (elf[EI_MAG0] != ELFMAG0 ||
422 elf[EI_MAG1] != ELFMAG1 ||
423 elf[EI_MAG2] != ELFMAG2 ||
424 elf[EI_MAG3] != ELFMAG3 ||
425 elf[EI_VERSION] != EV_CURRENT)
426 return log_info_errno(SYNTHETIC_ERRNO(EUCLEAN),
427 "Core file does not have ELF header, not adjusting permissions.");
428 if (!IN_SET(elf[EI_CLASS], ELFCLASS32, ELFCLASS64) ||
429 !IN_SET(elf[EI_DATA], ELFDATA2LSB, ELFDATA2MSB))
430 return log_info_errno(SYNTHETIC_ERRNO(EUCLEAN),
431 "Core file has strange ELF class, not adjusting permissions.");
432
433 if ((elf[EI_DATA] == ELFDATA2LSB) != (__BYTE_ORDER == __LITTLE_ENDIAN))
434 return log_info_errno(SYNTHETIC_ERRNO(EUCLEAN),
435 "Core file has non-native endianness, not adjusting permissions.");
436
437 r = parse_auxv(LOG_WARNING,
438 /* elf_class= */ elf[EI_CLASS],
439 context->meta[META_PROC_AUXV],
440 context->meta_size[META_PROC_AUXV],
441 &at_secure, &uid, &euid, &gid, &egid);
442 if (r < 0)
443 return r;
444
445 /* We allow access if %d/dumpable on the command line was exactly 1, we got all the data,
446 * at_secure is not set, and the uid/gid match euid/egid. */
447 bool ret =
448 context->dumpable == SUID_DUMP_USER &&
449 at_secure == 0 &&
450 uid != UID_INVALID && euid != UID_INVALID && uid == euid &&
451 gid != GID_INVALID && egid != GID_INVALID && gid == egid;
452 log_debug("Will %s access (dumpable=%u uid="UID_FMT " euid="UID_FMT " gid="GID_FMT " egid="GID_FMT " at_secure=%s)",
453 ret ? "permit" : "restrict",
454 context->dumpable,
455 uid, euid, gid, egid, yes_no(at_secure));
456 return ret;
457}
458
459static int save_external_coredump(
460 const Context *context,
461 int input_fd,
462 char **ret_filename,
463 int *ret_node_fd,
464 int *ret_data_fd,
465 uint64_t *ret_size,
466 uint64_t *ret_compressed_size,
467 bool *ret_truncated) {
468
469 _cleanup_(unlink_and_freep) char *tmp = NULL;
470 _cleanup_free_ char *fn = NULL;
471 _cleanup_close_ int fd = -EBADF;
472 uint64_t process_limit, max_size;
473 bool truncated, storage_on_tmpfs;
474 struct stat st;
475 int r;
476
477 assert(context);
478 assert(ret_filename);
479 assert(ret_node_fd);
480 assert(ret_data_fd);
481 assert(ret_size);
482 assert(ret_compressed_size);
483 assert(ret_truncated);
484
485 if (context->rlimit < page_size())
486 /* Is coredumping disabled? Then don't bother saving/processing the
487 * coredump. Anything below PAGE_SIZE cannot give a readable coredump
488 * (the kernel uses ELF_EXEC_PAGESIZE which is not easily accessible, but
489 * is usually the same as PAGE_SIZE. */
490 return log_info_errno(SYNTHETIC_ERRNO(EBADSLT),
491 "Resource limits disable core dumping for process %s (%s).",
492 context->meta[META_ARGV_PID], context->meta[META_COMM]);
493
494 process_limit = MAX(arg_process_size_max, storage_size_max());
495 if (process_limit == 0)
496 return log_debug_errno(SYNTHETIC_ERRNO(EBADSLT),
497 "Limits for coredump processing and storage are both 0, not dumping core.");
498
499 /* Never store more than the process configured, or than we actually shall keep or process */
500 max_size = MIN(context->rlimit, process_limit);
501
502 r = make_filename(context, &fn);
503 if (r < 0)
504 return log_error_errno(r, "Failed to determine coredump file name: %m");
505
506 (void) mkdir_parents_label(fn, 0755);
507
508 fd = open_tmpfile_linkable(fn, O_RDWR|O_CLOEXEC, &tmp);
509 if (fd < 0)
510 return log_error_errno(fd, "Failed to create temporary file for coredump %s: %m", fn);
511
512 /* If storage is on tmpfs, the kernel oomd might kill us if there's MemoryMax set on
513 * the service or the slice it belongs to. This is common on low-resources systems,
514 * to avoid crashing processes to take away too many system resources.
515 * Check the cgroup settings, and set max_size to a bit less than half of the
516 * available memory left to the process.
517 * Then, attempt to write the core file uncompressed first - if the write gets
518 * interrupted, we know we won't be able to write it all, so instead compress what
519 * was written so far, delete the uncompressed truncated core, and then continue
520 * compressing from STDIN. Given the compressed core cannot be larger than the
521 * uncompressed one, and 1KB for metadata is accounted for in the calculation, we
522 * should be able to at least store the full compressed core file. */
523
524 storage_on_tmpfs = fd_is_temporary_fs(fd) > 0;
525 if (storage_on_tmpfs && arg_compress) {
526 _cleanup_(sd_bus_flush_close_unrefp) sd_bus *bus = NULL;
527 uint64_t cgroup_limit = UINT64_MAX;
528 struct statvfs sv;
529
530 /* If we can't get the cgroup limit, just ignore it, but don't fail,
531 * try anyway with the config settings. */
532 r = sd_bus_default_system(&bus);
533 if (r < 0)
534 log_info_errno(r, "Failed to connect to system bus, skipping MemoryAvailable check: %m");
535 else {
536 _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL;
537
538 r = sd_bus_get_property_trivial(
539 bus,
540 "org.freedesktop.systemd1",
541 "/org/freedesktop/systemd1/unit/self",
542 "org.freedesktop.systemd1.Service",
543 "MemoryAvailable",
544 &error,
545 't', &cgroup_limit);
546 if (r < 0)
547 log_warning_errno(r,
548 "Failed to query MemoryAvailable for current unit, "
549 "falling back to static config settings: %s",
550 bus_error_message(&error, r));
551 }
552
553 /* First, ensure we are not going to go over the cgroup limit */
554 max_size = MIN(cgroup_limit, max_size);
555 /* tmpfs might get full quickly, so check the available space too. But don't worry about
556 * errors here, failing to access the storage location will be better logged when writing to
557 * it. */
558 if (fstatvfs(fd, &sv) >= 0)
559 max_size = MIN((uint64_t)sv.f_frsize * (uint64_t)sv.f_bfree, max_size);
560 /* Impose a lower minimum, otherwise we will miss the basic headers. */
561 max_size = MAX(PROCESS_SIZE_MIN, max_size);
562 /* Ensure we can always switch to compressing on the fly in case we are running out of space
563 * by keeping half of the space/memory available, plus 1KB metadata overhead from the
564 * compression algorithm. */
565 max_size = LESS_BY(max_size, 1024U) / 2;
566
567 log_debug("Limiting core file size to %" PRIu64 " bytes due to cgroup and/or filesystem limits.", max_size);
568 }
569
570 r = copy_bytes(input_fd, fd, max_size, 0);
571 if (r < 0)
572 return log_error_errno(r, "Cannot store coredump of %s (%s): %m",
573 context->meta[META_ARGV_PID], context->meta[META_COMM]);
574 truncated = r == 1;
575
576 bool allow_user = grant_user_access(fd, context) > 0;
577
578#if HAVE_COMPRESSION
579 if (arg_compress) {
580 _cleanup_(unlink_and_freep) char *tmp_compressed = NULL;
581 _cleanup_free_ char *fn_compressed = NULL;
582 _cleanup_close_ int fd_compressed = -EBADF;
583 uint64_t uncompressed_size = 0;
584
585 if (lseek(fd, 0, SEEK_SET) < 0)
586 return log_error_errno(errno, "Failed to seek on coredump %s: %m", fn);
587
588 fn_compressed = strjoin(fn, default_compression_extension());
589 if (!fn_compressed)
590 return log_oom();
591
592 fd_compressed = open_tmpfile_linkable(fn_compressed, O_RDWR|O_CLOEXEC, &tmp_compressed);
593 if (fd_compressed < 0)
594 return log_error_errno(fd_compressed, "Failed to create temporary file for coredump %s: %m", fn_compressed);
595
596 r = compress_stream(fd, fd_compressed, max_size, &uncompressed_size);
597 if (r < 0)
598 return log_error_errno(r, "Failed to compress %s: %m", coredump_tmpfile_name(tmp_compressed));
599
600 if (truncated && storage_on_tmpfs) {
601 uint64_t partial_uncompressed_size = 0;
602
603 /* Uncompressed write was truncated and we are writing to tmpfs: delete
604 * the uncompressed core, and compress the remaining part from STDIN. */
605
606 tmp = unlink_and_free(tmp);
607 fd = safe_close(fd);
608
609 r = compress_stream(input_fd, fd_compressed, max_size, &partial_uncompressed_size);
610 if (r < 0)
611 return log_error_errno(r, "Failed to compress %s: %m", coredump_tmpfile_name(tmp_compressed));
612 uncompressed_size += partial_uncompressed_size;
613 }
614
615 r = fix_permissions_and_link(fd_compressed, tmp_compressed, fn_compressed, context, allow_user);
616 if (r < 0)
617 return r;
618
619 if (fstat(fd_compressed, &st) < 0)
620 return log_error_errno(errno,
621 "Failed to fstat core file %s: %m",
622 coredump_tmpfile_name(tmp_compressed));
623
624 *ret_filename = TAKE_PTR(fn_compressed); /* compressed */
625 *ret_node_fd = TAKE_FD(fd_compressed); /* compressed */
626 *ret_data_fd = TAKE_FD(fd);
627 *ret_size = uncompressed_size;
628 *ret_compressed_size = (uint64_t) st.st_size; /* compressed */
629 *ret_truncated = truncated;
630
631 return 0;
632 }
633#endif
634
635 if (truncated)
636 log_struct(LOG_INFO,
637 LOG_MESSAGE("Core file was truncated to %"PRIu64" bytes.", max_size),
638 LOG_ITEM("SIZE_LIMIT=%"PRIu64, max_size),
639 LOG_MESSAGE_ID(SD_MESSAGE_TRUNCATED_CORE_STR));
640
641 r = fix_permissions_and_link(fd, tmp, fn, context, allow_user);
642 if (r < 0)
643 return log_error_errno(r, "Failed to fix permissions and finalize coredump %s into %s: %m", coredump_tmpfile_name(tmp), fn);
644
645 if (fstat(fd, &st) < 0)
646 return log_error_errno(errno, "Failed to fstat core file %s: %m", coredump_tmpfile_name(tmp));
647
648 if (lseek(fd, 0, SEEK_SET) < 0)
649 return log_error_errno(errno, "Failed to seek on coredump %s: %m", fn);
650
651 *ret_filename = TAKE_PTR(fn);
652 *ret_node_fd = -EBADF;
653 *ret_data_fd = TAKE_FD(fd);
654 *ret_size = (uint64_t) st.st_size;
655 *ret_compressed_size = UINT64_MAX;
656 *ret_truncated = truncated;
657
658 return 0;
659}
660
661static int allocate_journal_field(int fd, size_t size, char **ret, size_t *ret_size) {
662 _cleanup_free_ char *field = NULL;
663 ssize_t n;
664
665 assert(fd >= 0);
666 assert(ret);
667 assert(ret_size);
668
669 if (lseek(fd, 0, SEEK_SET) < 0)
670 return log_warning_errno(errno, "Failed to seek: %m");
671
672 field = malloc(9 + size);
673 if (!field)
674 return log_warning_errno(SYNTHETIC_ERRNO(ENOMEM),
675 "Failed to allocate memory for coredump, coredump will not be stored.");
676
677 memcpy(field, "COREDUMP=", 9);
678
679 /* NB: simple read() would fail for overly large coredumps, since read() on Linux can only deal with
680 * 0x7ffff000 bytes max. Hence call things in a loop. */
681 n = loop_read(fd, field + 9, size, /* do_poll= */ false);
682 if (n < 0)
683 return log_error_errno((int) n, "Failed to read core data: %m");
684 if ((size_t) n < size)
685 return log_error_errno(SYNTHETIC_ERRNO(EIO), "Core data too short.");
686
687 *ret = TAKE_PTR(field);
688 *ret_size = size + 9;
689
690 return 0;
691}
692
693/* Joins /proc/[pid]/fd/ and /proc/[pid]/fdinfo/ into the following lines:
694 * 0:/dev/pts/23
695 * pos: 0
696 * flags: 0100002
697 *
698 * 1:/dev/pts/23
699 * pos: 0
700 * flags: 0100002
701 *
702 * 2:/dev/pts/23
703 * pos: 0
704 * flags: 0100002
705 * EOF
706 */
707static int compose_open_fds(pid_t pid, char **ret) {
708 _cleanup_(memstream_done) MemStream m = {};
709 _cleanup_closedir_ DIR *proc_fd_dir = NULL;
710 _cleanup_close_ int proc_fdinfo_fd = -EBADF;
711 const char *fddelim = "", *path;
712 FILE *stream;
713 int r;
714
715 assert(pid >= 0);
716 assert(ret);
717
718 path = procfs_file_alloca(pid, "fd");
719 proc_fd_dir = opendir(path);
720 if (!proc_fd_dir)
721 return -errno;
722
723 proc_fdinfo_fd = openat(dirfd(proc_fd_dir), "../fdinfo", O_DIRECTORY|O_NOFOLLOW|O_CLOEXEC|O_PATH);
724 if (proc_fdinfo_fd < 0)
725 return -errno;
726
727 stream = memstream_init(&m);
728 if (!stream)
729 return -ENOMEM;
730
731 FOREACH_DIRENT(de, proc_fd_dir, return -errno) {
732 _cleanup_fclose_ FILE *fdinfo = NULL;
733 _cleanup_free_ char *fdname = NULL;
734 _cleanup_close_ int fd = -EBADF;
735
736 r = readlinkat_malloc(dirfd(proc_fd_dir), de->d_name, &fdname);
737 if (r < 0)
738 return r;
739
740 fprintf(stream, "%s%s:%s\n", fddelim, de->d_name, fdname);
741 fddelim = "\n";
742
743 /* Use the directory entry from /proc/[pid]/fd with /proc/[pid]/fdinfo */
744 fd = openat(proc_fdinfo_fd, de->d_name, O_NOFOLLOW|O_CLOEXEC|O_RDONLY);
745 if (fd < 0)
746 continue;
747
748 fdinfo = take_fdopen(&fd, "r");
749 if (!fdinfo)
750 continue;
751
752 for (;;) {
753 _cleanup_free_ char *line = NULL;
754
755 r = read_line(fdinfo, LONG_LINE_MAX, &line);
756 if (r < 0)
757 return r;
758 if (r == 0)
759 break;
760
761 fputs(line, stream);
762 fputc('\n', stream);
763 }
764 }
765
766 return memstream_finalize(&m, ret, NULL);
767}
768
769/* Returns 1 if the parent was found.
770 * Returns 0 if there is not a process we can call the pid's
771 * container parent (the pid's process isn't 'containerized').
772 * Returns a negative number on errors.
773 */
774static int get_process_container_parent_cmdline(PidRef *pid, char** ret_cmdline) {
775 int r;
776
777 assert(pidref_is_set(pid));
778 assert(!pidref_is_remote(pid));
779
780 r = pidref_from_same_root_fs(pid, &PIDREF_MAKE_FROM_PID(1));
781 if (r < 0)
782 return r;
783 if (r > 0) {
784 /* The process uses system root. */
785 *ret_cmdline = NULL;
786 return 0;
787 }
788
789 _cleanup_(pidref_done) PidRef container_pid = PIDREF_NULL;
790 r = namespace_get_leader(pid, NAMESPACE_MOUNT, &container_pid);
791 if (r < 0)
792 return r;
793
794 r = pidref_get_cmdline(&container_pid, SIZE_MAX, PROCESS_CMDLINE_QUOTE_POSIX, ret_cmdline);
795 if (r < 0)
796 return r;
797
798 return 1;
799}
800
801static int change_uid_gid(const Context *context) {
802 int r;
803
804 assert(context);
805
806 uid_t uid = context->uid;
807 gid_t gid = context->gid;
808
809 if (uid_is_system(uid)) {
810 const char *user = "systemd-coredump";
811
812 r = get_user_creds(&user, &uid, &gid, NULL, NULL, 0);
813 if (r < 0) {
814 log_warning_errno(r, "Cannot resolve %s user. Proceeding to dump core as root: %m", user);
815 uid = gid = 0;
816 }
817 }
818
819 return drop_privileges(uid, gid, 0);
820}
821
822static int attach_mount_tree(int mount_tree_fd) {
823 int r;
824
825 assert(mount_tree_fd >= 0);
826
827 r = detach_mount_namespace();
828 if (r < 0)
829 return log_warning_errno(r, "Failed to detach mount namespace: %m");
830
831 r = mkdir_p_label(MOUNT_TREE_ROOT, 0555);
832 if (r < 0)
833 return log_warning_errno(r, "Failed to create directory: %m");
834
835 r = mount_setattr(mount_tree_fd, "", AT_EMPTY_PATH,
836 &(struct mount_attr) {
837 /* MOUNT_ATTR_NOSYMFOLLOW is left out on purpose to allow libdwfl to resolve symlinks.
838 * libdwfl will use openat2() with RESOLVE_IN_ROOT so there is no risk of symlink escape.
839 * https://sourceware.org/git/?p=elfutils.git;a=patch;h=06f0520f9a78b07c11c343181d552791dd630346 */
840 .attr_set = MOUNT_ATTR_RDONLY|MOUNT_ATTR_NOSUID|MOUNT_ATTR_NODEV|MOUNT_ATTR_NOEXEC,
841 .propagation = MS_SLAVE,
842 }, sizeof(struct mount_attr));
843 if (r < 0)
844 return log_warning_errno(errno, "Failed to change properties of mount tree: %m");
845
846 r = move_mount(mount_tree_fd, "", -EBADF, MOUNT_TREE_ROOT, MOVE_MOUNT_F_EMPTY_PATH);
847 if (r < 0)
848 return log_warning_errno(errno, "Failed to attach mount tree: %m");
849
850 return 0;
851}
852
853static int submit_coredump(
854 const Context *context,
855 struct iovec_wrapper *iovw,
856 int input_fd) {
857
858 _cleanup_(sd_json_variant_unrefp) sd_json_variant *json_metadata = NULL;
859 _cleanup_close_ int coredump_fd = -EBADF, coredump_node_fd = -EBADF;
860 _cleanup_free_ char *filename = NULL, *coredump_data = NULL, *stacktrace = NULL;
861 const char *module_name, *root = NULL;
862 uint64_t coredump_size = UINT64_MAX, coredump_compressed_size = UINT64_MAX;
863 bool truncated = false, written = false;
864 sd_json_variant *module_json;
865 int r;
866
867 assert(context);
868 assert(iovw);
869 assert(input_fd >= 0);
870
871 /* Vacuum before we write anything again */
872 (void) coredump_vacuum(-1, arg_keep_free, arg_max_use);
873
874 /* Always stream the coredump to disk, if that's possible */
875 written = save_external_coredump(
876 context, input_fd,
877 &filename, &coredump_node_fd, &coredump_fd,
878 &coredump_size, &coredump_compressed_size, &truncated) >= 0;
879 if (written) {
880 /* If we could write it to disk we can now process it. */
881 /* If we don't want to keep the coredump on disk, remove it now, as later on we
882 * will lack the privileges for it. However, we keep the fd to it, so that we can
883 * still process it and log it. */
884 r = maybe_remove_external_coredump(
885 context,
886 filename,
887 coredump_node_fd >= 0 ? coredump_compressed_size : coredump_size);
888 if (r < 0)
889 return r;
890 if (r == 0)
891 (void) iovw_put_string_field(iovw, "COREDUMP_FILENAME=", filename);
892 else if (arg_storage == COREDUMP_STORAGE_EXTERNAL)
893 log_info("The core will not be stored: size %"PRIu64" is greater than %"PRIu64" (the configured maximum)",
894 coredump_node_fd >= 0 ? coredump_compressed_size : coredump_size, arg_external_size_max);
895
896 /* Vacuum again, but exclude the coredump we just created */
897 (void) coredump_vacuum(coredump_node_fd >= 0 ? coredump_node_fd : coredump_fd, arg_keep_free, arg_max_use);
898 }
899
900 if (context->mount_tree_fd >= 0 && attach_mount_tree(context->mount_tree_fd) >= 0)
901 root = MOUNT_TREE_ROOT;
902
903 /* Now, let's drop privileges to become the user who owns the segfaulted process and allocate the
904 * coredump memory under the user's uid. This also ensures that the credentials journald will see are
905 * the ones of the coredumping user, thus making sure the user gets access to the core dump. Let's
906 * also get rid of all capabilities, if we run as root, we won't need them anymore. */
907 r = change_uid_gid(context);
908 if (r < 0)
909 return log_error_errno(r, "Failed to drop privileges: %m");
910
911 if (written) {
912 /* Try to get a stack trace if we can */
913 if (coredump_size > arg_process_size_max)
914 log_debug("Not generating stack trace: core size %"PRIu64" is greater "
915 "than %"PRIu64" (the configured maximum)",
916 coredump_size, arg_process_size_max);
917 else if (coredump_fd >= 0) {
918 bool skip = startswith(context->meta[META_COMM], "systemd-coredum"); /* COMM is 16 bytes usually */
919
920 (void) parse_elf_object(coredump_fd,
921 context->meta[META_EXE],
922 root,
923 /* fork_disable_dump= */ skip, /* avoid loops */
924 &stacktrace,
925 &json_metadata);
926 }
927 }
928
929 _cleanup_free_ char *core_message = NULL;
930 core_message = strjoin(
931 "Process ", context->meta[META_ARGV_PID],
932 " (", context->meta[META_COMM],
933 ") of user ", context->meta[META_ARGV_UID],
934 written ? " dumped core." : " terminated abnormally without generating a coredump.");
935 if (!core_message)
936 return log_oom();
937
938 if (context->is_journald && filename)
939 if (!strextend(&core_message, "\nCoredump diverted to ", filename))
940 return log_oom();
941
942 if (stacktrace)
943 if (!strextend(&core_message, "\n\n", stacktrace))
944 return log_oom();
945
946 if (context->is_journald)
947 /* We might not be able to log to the journal, so let's always print the message to another
948 * log target. The target was set previously to something safe. */
949 log_dispatch(LOG_ERR, 0, core_message);
950
951 (void) iovw_put_string_field(iovw, "MESSAGE=", core_message);
952
953 if (truncated)
954 (void) iovw_put_string_field(iovw, "COREDUMP_TRUNCATED=", "1");
955
956 /* If we managed to parse any ELF metadata (build-id, ELF package meta),
957 * attach it as journal metadata. */
958 if (json_metadata) {
959 _cleanup_free_ char *formatted_json = NULL;
960
961 r = sd_json_variant_format(json_metadata, 0, &formatted_json);
962 if (r < 0)
963 return log_error_errno(r, "Failed to format JSON package metadata: %m");
964
965 (void) iovw_put_string_field(iovw, "COREDUMP_PACKAGE_JSON=", formatted_json);
966 }
967
968 /* In the unlikely scenario that context->meta[META_EXE] is not available,
969 * let's avoid guessing the module name and skip the loop. */
970 if (context->meta[META_EXE])
971 JSON_VARIANT_OBJECT_FOREACH(module_name, module_json, json_metadata) {
972 sd_json_variant *t;
973
974 /* We only add structured fields for the 'main' ELF module, and only if we can identify it. */
975 if (!path_equal_filename(module_name, context->meta[META_EXE]))
976 continue;
977
978 t = sd_json_variant_by_key(module_json, "name");
979 if (t)
980 (void) iovw_put_string_field(iovw, "COREDUMP_PACKAGE_NAME=", sd_json_variant_string(t));
981
982 t = sd_json_variant_by_key(module_json, "version");
983 if (t)
984 (void) iovw_put_string_field(iovw, "COREDUMP_PACKAGE_VERSION=", sd_json_variant_string(t));
985 }
986
987 /* Optionally store the entire coredump in the journal */
988 if (arg_storage == COREDUMP_STORAGE_JOURNAL && coredump_fd >= 0) {
989 if (coredump_size <= arg_journal_size_max) {
990 size_t sz = 0;
991
992 /* Store the coredump itself in the journal */
993
994 r = allocate_journal_field(coredump_fd, (size_t) coredump_size, &coredump_data, &sz);
995 if (r >= 0) {
996 if (iovw_put(iovw, coredump_data, sz) >= 0)
997 TAKE_PTR(coredump_data);
998 } else
999 log_warning_errno(r, "Failed to attach the core to the journal entry: %m");
1000 } else
1001 log_info("The core will not be stored: size %"PRIu64" is greater than %"PRIu64" (the configured maximum)",
1002 coredump_size, arg_journal_size_max);
1003 }
1004
1005 /* If journald is coredumping, we have to be careful that we don't deadlock when trying to write the
1006 * coredump to the journal, so we put the journal socket in nonblocking mode before trying to write
1007 * the coredump to the socket. */
1008
1009 if (context->is_journald) {
1010 r = journal_fd_nonblock(true);
1011 if (r < 0)
1012 return log_error_errno(r, "Failed to make journal socket non-blocking: %m");
1013 }
1014
1015 r = sd_journal_sendv(iovw->iovec, iovw->count);
1016
1017 if (context->is_journald) {
1018 int k;
1019
1020 k = journal_fd_nonblock(false);
1021 if (k < 0)
1022 return log_error_errno(k, "Failed to make journal socket blocking: %m");
1023 }
1024
1025 if (r == -EAGAIN && context->is_journald)
1026 log_warning_errno(r, "Failed to log journal coredump, ignoring: %m");
1027 else if (r < 0)
1028 return log_error_errno(r, "Failed to log coredump: %m");
1029
1030 return 0;
1031}
1032
1033static int context_parse_iovw(Context *context, struct iovec_wrapper *iovw) {
1034 const char *unit;
1035 int r;
1036
1037 assert(context);
1038 assert(iovw);
1039
1040 /* Converts the data in the iovec array iovw into separate fields. Fills in context->meta[] (for
1041 * which no memory is allocated, it just contains direct pointers into the iovec array memory). */
1042
1043 bool have_signal_name = false;
1044 FOREACH_ARRAY(iovec, iovw->iovec, iovw->count) {
1045 for (size_t i = 0; i < ELEMENTSOF(meta_field_names); i++) {
1046 /* Note that these strings are NUL-terminated, because we made sure that a
1047 * trailing NUL byte is in the buffer, though not included in the iov_len
1048 * count (see process_socket() and gather_pid_metadata_*()). */
1049 assert(((char*) iovec->iov_base)[iovec->iov_len] == 0);
1050
1051 const char *p = memory_startswith(iovec->iov_base, iovec->iov_len, meta_field_names[i]);
1052 if (p) {
1053 context->meta[i] = p;
1054 context->meta_size[i] = iovec->iov_len - strlen(meta_field_names[i]);
1055 break;
1056 }
1057 }
1058
1059 have_signal_name = have_signal_name ||
1060 memory_startswith(iovec->iov_base, iovec->iov_len, "COREDUMP_SIGNAL_NAME=");
1061 }
1062
1063 /* The basic fields from argv[] should always be there, refuse early if not. */
1064 for (int i = 0; i < _META_ARGV_REQUIRED; i++)
1065 if (!context->meta[i])
1066 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1067 "A required (%s) has not been sent, aborting.", meta_field_names[i]);
1068
1069 pid_t parsed_pid;
1070 r = parse_pid(context->meta[META_ARGV_PID], &parsed_pid);
1071 if (r < 0)
1072 return log_error_errno(r, "Failed to parse PID \"%s\": %m", context->meta[META_ARGV_PID]);
1073 if (pidref_is_set(&context->pidref)) {
1074 if (context->pidref.pid != parsed_pid)
1075 return log_error_errno(r, "Passed PID " PID_FMT " does not match passed " PID_FMT ": %m",
1076 parsed_pid, context->pidref.pid);
1077 } else {
1078 r = pidref_set_pid(&context->pidref, parsed_pid);
1079 if (r < 0)
1080 return log_error_errno(r, "Failed to initialize pidref from pid " PID_FMT ": %m", parsed_pid);
1081 }
1082
1083 r = parse_uid(context->meta[META_ARGV_UID], &context->uid);
1084 if (r < 0)
1085 return log_error_errno(r, "Failed to parse UID \"%s\": %m", context->meta[META_ARGV_UID]);
1086
1087 r = parse_gid(context->meta[META_ARGV_GID], &context->gid);
1088 if (r < 0)
1089 return log_error_errno(r, "Failed to parse GID \"%s\": %m", context->meta[META_ARGV_GID]);
1090
1091 r = parse_signo(context->meta[META_ARGV_SIGNAL], &context->signo);
1092 if (r < 0)
1093 log_warning_errno(r, "Failed to parse signal number \"%s\", ignoring: %m", context->meta[META_ARGV_SIGNAL]);
1094
1095 r = safe_atou64(context->meta[META_ARGV_RLIMIT], &context->rlimit);
1096 if (r < 0)
1097 log_warning_errno(r, "Failed to parse resource limit \"%s\", ignoring: %m", context->meta[META_ARGV_RLIMIT]);
1098
1099 /* The value is set to contents of /proc/sys/fs/suid_dumpable, which we set to SUID_DUMP_SAFE (2),
1100 * if the process is marked as not dumpable, see PR_SET_DUMPABLE(2const). */
1101 if (context->meta[META_ARGV_DUMPABLE]) {
1102 r = safe_atou(context->meta[META_ARGV_DUMPABLE], &context->dumpable);
1103 if (r < 0)
1104 return log_error_errno(r, "Failed to parse dumpable field \"%s\": %m", context->meta[META_ARGV_DUMPABLE]);
1105 if (context->dumpable > SUID_DUMP_SAFE)
1106 log_notice("Got unexpected %%d/dumpable value %u.", context->dumpable);
1107 }
1108
1109 unit = context->meta[META_UNIT];
1110 context->is_pid1 = streq(context->meta[META_ARGV_PID], "1") || streq_ptr(unit, SPECIAL_INIT_SCOPE);
1111 context->is_journald = streq_ptr(unit, SPECIAL_JOURNALD_SERVICE);
1112
1113 /* After parsing everything, let's also synthesize a new iovw field for the textual signal name if it
1114 * isn't already set. */
1115 if (SIGNAL_VALID(context->signo) && !have_signal_name)
1116 (void) iovw_put_string_field(iovw, "COREDUMP_SIGNAL_NAME=SIG", signal_to_string(context->signo));
1117
1118 return 0;
1119}
1120
1121static int process_socket(int fd) {
1122 _cleanup_(iovw_done_free) struct iovec_wrapper iovw = {};
1123 _cleanup_(context_done) Context context = CONTEXT_NULL;
1124 _cleanup_close_ int input_fd = -EBADF;
1125 enum {
1126 STATE_PAYLOAD,
1127 STATE_INPUT_FD_DONE,
1128 STATE_PID_FD_DONE,
1129 } state = STATE_PAYLOAD;
1130 int r;
1131
1132 assert(fd >= 0);
1133
1134 log_setup();
1135
1136 log_debug("Processing coredump received via socket...");
1137
1138 for (;;) {
1139 CMSG_BUFFER_TYPE(CMSG_SPACE(sizeof(int))) control;
1140 struct msghdr mh = {
1141 .msg_control = &control,
1142 .msg_controllen = sizeof(control),
1143 .msg_iovlen = 1,
1144 };
1145 ssize_t n, l;
1146
1147 l = next_datagram_size_fd(fd);
1148 if (l < 0)
1149 return log_error_errno(l, "Failed to determine datagram size to read: %m");
1150
1151 _cleanup_(iovec_done) struct iovec iovec = {
1152 .iov_len = l,
1153 .iov_base = malloc(l + 1),
1154 };
1155 if (!iovec.iov_base)
1156 return log_oom();
1157
1158 mh.msg_iov = &iovec;
1159
1160 n = recvmsg_safe(fd, &mh, MSG_CMSG_CLOEXEC);
1161 if (n < 0)
1162 return log_error_errno(n, "Failed to receive datagram: %m");
1163
1164 /* The final zero-length datagrams ("sentinels") carry file descriptors and tell us that
1165 * we're done. There are three sentinels: one with just the coredump fd, followed by one with
1166 * the pidfd, and finally one with the mount tree fd. The latter two or the last one may be
1167 * omitted (which is supported for compatibility with older systemd version, in particular to
1168 * facilitate cross-container coredumping). */
1169 if (n == 0) {
1170 struct cmsghdr *found;
1171
1172 found = cmsg_find(&mh, SOL_SOCKET, SCM_RIGHTS, CMSG_LEN(sizeof(int)));
1173 if (!found) {
1174 /* This is zero length message but it either doesn't carry a single
1175 * descriptor, or it has more than one. This is a protocol violation so let's
1176 * bail out.
1177 *
1178 * Well, not quite! In practice there's one more complication: EOF on
1179 * SOCK_SEQPACKET is not distinguishable from a zero length datagram. Hence
1180 * if we get a zero length datagram without fds we consider it EOF, and
1181 * that's permissible for the final two fds. Hence let's be strict on the
1182 * first fd, but lenient on the other two. */
1183
1184 if (!cmsg_find(&mh, SOL_SOCKET, SCM_RIGHTS, (socklen_t) -1) && state != STATE_PAYLOAD)
1185 /* No fds, and already got the first fd → we are done. */
1186 break;
1187
1188 cmsg_close_all(&mh);
1189 return log_error_errno(SYNTHETIC_ERRNO(EBADMSG),
1190 "Received zero length message with zero or more than one file descriptor(s), expected one.");
1191 }
1192
1193 switch (state) {
1194
1195 case STATE_PAYLOAD:
1196 assert(input_fd < 0);
1197 input_fd = *CMSG_TYPED_DATA(found, int);
1198 state = STATE_INPUT_FD_DONE;
1199 continue;
1200
1201 case STATE_INPUT_FD_DONE:
1202 assert(!pidref_is_set(&context.pidref));
1203
1204 r = pidref_set_pidfd_consume(&context.pidref, *CMSG_TYPED_DATA(found, int));
1205 if (r < 0)
1206 return log_error_errno(r, "Failed to initialize pidref: %m");
1207
1208 state = STATE_PID_FD_DONE;
1209 continue;
1210
1211 case STATE_PID_FD_DONE:
1212 assert(context.mount_tree_fd < 0);
1213 context.mount_tree_fd = *CMSG_TYPED_DATA(found, int);
1214 /* We have all FDs we need so we are done. */
1215 break;
1216 }
1217
1218 break;
1219 }
1220
1221 cmsg_close_all(&mh);
1222
1223 /* Only zero length messages are allowed after the first message that carried a file descriptor. */
1224 if (state != STATE_PAYLOAD)
1225 return log_error_errno(SYNTHETIC_ERRNO(EBADMSG), "Received unexpected message with non-zero length.");
1226
1227 /* Payload messages should not carry fds */
1228 if (cmsg_find(&mh, SOL_SOCKET, SCM_RIGHTS, (socklen_t) -1))
1229 return log_error_errno(SYNTHETIC_ERRNO(EBADMSG),
1230 "Received payload message with file descriptor(s), expected none.");
1231
1232 /* Add trailing NUL byte, in case these are strings */
1233 ((char*) iovec.iov_base)[n] = 0;
1234 iovec.iov_len = (size_t) n;
1235
1236 if (iovw_put(&iovw, iovec.iov_base, iovec.iov_len) < 0)
1237 return log_oom();
1238
1239 TAKE_STRUCT(iovec);
1240 }
1241
1242 /* Make sure we got all data we really need */
1243 assert(input_fd >= 0);
1244
1245 r = context_parse_iovw(&context, &iovw);
1246 if (r < 0)
1247 return r;
1248
1249 /* Make sure we received all the expected fields. We support being called by an *older*
1250 * systemd-coredump from the outside, so we require only the basic set of fields that
1251 * was being sent when the support for sending to containers over a socket was added
1252 * in a108c43e36d3ceb6e34efe37c014fc2cda856000. */
1253 meta_argv_t i;
1254 FOREACH_ARGUMENT(i,
1255 META_ARGV_PID,
1256 META_ARGV_UID,
1257 META_ARGV_GID,
1258 META_ARGV_SIGNAL,
1259 META_ARGV_TIMESTAMP,
1260 META_ARGV_RLIMIT,
1261 META_ARGV_HOSTNAME,
1262 META_COMM)
1263 if (!context.meta[i])
1264 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1265 "Mandatory argument %s not received on socket, aborting.",
1266 meta_field_names[i]);
1267
1268 return submit_coredump(&context, &iovw, input_fd);
1269}
1270
1271static int send_iovec(const struct iovec_wrapper *iovw, int input_fd, PidRef *pidref, int mount_tree_fd) {
1272 _cleanup_close_ int fd = -EBADF;
1273 int r;
1274
1275 assert(iovw);
1276 assert(input_fd >= 0);
1277
1278 fd = socket(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0);
1279 if (fd < 0)
1280 return log_error_errno(errno, "Failed to create coredump socket: %m");
1281
1282 r = connect_unix_path(fd, AT_FDCWD, "/run/systemd/coredump");
1283 if (r < 0)
1284 return log_error_errno(r, "Failed to connect to coredump service: %m");
1285
1286 for (size_t i = 0; i < iovw->count; i++) {
1287 struct msghdr mh = {
1288 .msg_iov = iovw->iovec + i,
1289 .msg_iovlen = 1,
1290 };
1291 struct iovec copy[2];
1292
1293 for (;;) {
1294 if (sendmsg(fd, &mh, MSG_NOSIGNAL) >= 0)
1295 break;
1296
1297 if (errno == EMSGSIZE && mh.msg_iov[0].iov_len > 0) {
1298 /* This field didn't fit? That's a pity. Given that this is
1299 * just metadata, let's truncate the field at half, and try
1300 * again. We append three dots, in order to show that this is
1301 * truncated. */
1302
1303 if (mh.msg_iov != copy) {
1304 /* We don't want to modify the caller's iovec, hence
1305 * let's create our own array, consisting of two new
1306 * iovecs, where the first is a (truncated) copy of
1307 * what we want to send, and the second one contains
1308 * the trailing dots. */
1309 copy[0] = iovw->iovec[i];
1310 copy[1] = IOVEC_MAKE(((const char[]){'.', '.', '.'}), 3);
1311
1312 mh.msg_iov = copy;
1313 mh.msg_iovlen = 2;
1314 }
1315
1316 copy[0].iov_len /= 2; /* halve it, and try again */
1317 continue;
1318 }
1319
1320 return log_error_errno(errno, "Failed to send coredump datagram: %m");
1321 }
1322 }
1323
1324 /* First sentinel: the coredump fd */
1325 r = send_one_fd(fd, input_fd, 0);
1326 if (r < 0)
1327 return log_error_errno(r, "Failed to send coredump fd: %m");
1328
1329 /* The optional second sentinel: the pidfd */
1330 if (!pidref_is_set(pidref) || pidref->fd < 0) /* If we have no pidfd, stop now */
1331 return 0;
1332
1333 r = send_one_fd(fd, pidref->fd, 0);
1334 if (r < 0)
1335 return log_error_errno(r, "Failed to send pidfd: %m");
1336
1337 /* The optional third sentinel: the mount tree fd */
1338 if (mount_tree_fd < 0) /* If we have no mount tree, stop now */
1339 return 0;
1340
1341 r = send_one_fd(fd, mount_tree_fd, 0);
1342 if (r < 0)
1343 return log_error_errno(r, "Failed to send mount tree fd: %m");
1344
1345 return 0;
1346}
1347
1348static int gather_pid_metadata_from_argv(
1349 struct iovec_wrapper *iovw,
1350 Context *context,
1351 int argc, char **argv) {
1352
1353 _cleanup_(pidref_done) PidRef local_pidref = PIDREF_NULL;
1354 int r, kernel_fd = -EBADF;
1355
1356 assert(iovw);
1357 assert(context);
1358
1359 /* We gather all metadata that were passed via argv[] into an array of iovecs that
1360 * we'll forward to the socket unit.
1361 *
1362 * We require at least _META_ARGV_REQUIRED args, but will accept more.
1363 * We know how to parse _META_ARGV_MAX args. The rest will be ignored. */
1364
1365 if (argc < _META_ARGV_REQUIRED)
1366 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1367 "Not enough arguments passed by the kernel (%i, expected between %i and %i).",
1368 argc, _META_ARGV_REQUIRED, _META_ARGV_MAX);
1369
1370 for (int i = 0; i < MIN(argc, _META_ARGV_MAX); i++) {
1371 _cleanup_free_ char *buf = NULL;
1372 const char *t = argv[i];
1373
1374 if (i == META_ARGV_TIMESTAMP) {
1375 /* The journal fields contain the timestamp padded with six
1376 * zeroes, so that the kernel-supplied 1s granularity timestamps
1377 * becomes 1μs granularity, i.e. the granularity systemd usually
1378 * operates in. */
1379 buf = strjoin(argv[i], "000000");
1380 if (!buf)
1381 return log_oom();
1382
1383 t = buf;
1384 }
1385
1386 if (i == META_ARGV_PID) {
1387 /* Store this so that we can check whether the core will be forwarded to a container
1388 * even when the kernel doesn't provide a pidfd. Can be dropped once baseline is
1389 * >= v6.16. */
1390 r = pidref_set_pidstr(&local_pidref, t);
1391 if (r < 0)
1392 return log_error_errno(r, "Failed to initialize pidref from pid %s: %m", t);
1393 }
1394
1395 if (i == META_ARGV_PIDFD) {
1396 /* If the current kernel doesn't support the %F specifier (which resolves to a
1397 * pidfd), but we included it in the core_pattern expression, we'll receive an empty
1398 * string here. Deal with that gracefully. */
1399 if (isempty(t))
1400 continue;
1401
1402 assert(!pidref_is_set(&context->pidref));
1403 assert(kernel_fd < 0);
1404
1405 kernel_fd = parse_fd(t);
1406 if (kernel_fd < 0)
1407 return log_error_errno(kernel_fd, "Failed to parse pidfd \"%s\": %m", t);
1408
1409 r = pidref_set_pidfd(&context->pidref, kernel_fd);
1410 if (r < 0)
1411 return log_error_errno(r, "Failed to initialize pidref from pidfd %d: %m", kernel_fd);
1412
1413 context->got_pidfd = 1;
1414
1415 /* If there are containers involved with different versions of the code they might
1416 * not be using pidfds, so it would be wrong to set the metadata, skip it. */
1417 r = pidref_in_same_namespace(/* pid1 = */ NULL, &context->pidref, NAMESPACE_PID);
1418 if (r < 0)
1419 log_debug_errno(r, "Failed to check pidns of crashing process, ignoring: %m");
1420 if (r <= 0)
1421 continue;
1422
1423 /* We don't print the fd number in the journal as it's meaningless, but we still
1424 * record that the parsing was done with a kernel-provided fd as it means it's safe
1425 * from races, which is valuable information to provide in the journal record. */
1426 t = "1";
1427 }
1428
1429 r = iovw_put_string_field(iovw, meta_field_names[i], t);
1430 if (r < 0)
1431 return r;
1432 }
1433
1434 /* Cache some of the process metadata we collected so far and that we'll need to
1435 * access soon. */
1436 r = context_parse_iovw(context, iovw);
1437 if (r < 0)
1438 return r;
1439
1440 /* If the kernel didn't give us a PIDFD, then use the one derived from the
1441 * PID immediately, given we have it. */
1442 if (!pidref_is_set(&context->pidref))
1443 context->pidref = TAKE_PIDREF(local_pidref);
1444
1445 /* Close the kernel-provided FD as the last thing after everything else succeeded. */
1446 kernel_fd = safe_close(kernel_fd);
1447
1448 return 0;
1449}
1450
1451static int gather_pid_metadata_from_procfs(struct iovec_wrapper *iovw, Context *context) {
1452 uid_t owner_uid;
1453 pid_t pid;
1454 char *t;
1455 size_t size;
1456 const char *p;
1457 int r;
1458
1459 assert(iovw);
1460 assert(context);
1461
1462 /* Note that if we fail on oom later on, we do not roll-back changes to the iovec
1463 * structure. (It remains valid, with the first iovec fields initialized.) */
1464
1465 pid = context->pidref.pid;
1466
1467 /* The following is mandatory */
1468 r = pidref_get_comm(&context->pidref, &t);
1469 if (r < 0)
1470 return log_error_errno(r, "Failed to get COMM: %m");
1471
1472 r = iovw_put_string_field_free(iovw, "COREDUMP_COMM=", t);
1473 if (r < 0)
1474 return r;
1475
1476 /* The following are optional, but we use them if present. */
1477 r = get_process_exe(pid, &t);
1478 if (r >= 0)
1479 r = iovw_put_string_field_free(iovw, "COREDUMP_EXE=", t);
1480 if (r < 0)
1481 log_warning_errno(r, "Failed to get EXE, ignoring: %m");
1482
1483 if (cg_pidref_get_unit(&context->pidref, &t) >= 0)
1484 (void) iovw_put_string_field_free(iovw, "COREDUMP_UNIT=", t);
1485
1486 if (cg_pid_get_user_unit(pid, &t) >= 0)
1487 (void) iovw_put_string_field_free(iovw, "COREDUMP_USER_UNIT=", t);
1488
1489 if (cg_pidref_get_session(&context->pidref, &t) >= 0)
1490 (void) iovw_put_string_field_free(iovw, "COREDUMP_SESSION=", t);
1491
1492 if (cg_pidref_get_owner_uid(&context->pidref, &owner_uid) >= 0) {
1493 r = asprintf(&t, UID_FMT, owner_uid);
1494 if (r > 0)
1495 (void) iovw_put_string_field_free(iovw, "COREDUMP_OWNER_UID=", t);
1496 }
1497
1498 if (sd_pid_get_slice(pid, &t) >= 0)
1499 (void) iovw_put_string_field_free(iovw, "COREDUMP_SLICE=", t);
1500
1501 if (pidref_get_cmdline(&context->pidref, SIZE_MAX, PROCESS_CMDLINE_QUOTE_POSIX, &t) >= 0)
1502 (void) iovw_put_string_field_free(iovw, "COREDUMP_CMDLINE=", t);
1503
1504 if (cg_pid_get_path_shifted(pid, NULL, &t) >= 0)
1505 (void) iovw_put_string_field_free(iovw, "COREDUMP_CGROUP=", t);
1506
1507 if (compose_open_fds(pid, &t) >= 0)
1508 (void) iovw_put_string_field_free(iovw, "COREDUMP_OPEN_FDS=", t);
1509
1510 p = procfs_file_alloca(pid, "status");
1511 if (read_full_file(p, &t, /* ret_size= */ NULL) >= 0)
1512 (void) iovw_put_string_field_free(iovw, "COREDUMP_PROC_STATUS=", t);
1513
1514 p = procfs_file_alloca(pid, "maps");
1515 if (read_full_file(p, &t, /* ret_size= */ NULL) >= 0)
1516 (void) iovw_put_string_field_free(iovw, "COREDUMP_PROC_MAPS=", t);
1517
1518 p = procfs_file_alloca(pid, "limits"); /* this uses 'seq_file' in kernel, use read_full_file_at() */
1519 if (read_full_file(p, &t, /* ret_size= */ NULL) >= 0)
1520 (void) iovw_put_string_field_free(iovw, "COREDUMP_PROC_LIMITS=", t);
1521
1522 p = procfs_file_alloca(pid, "cgroup");
1523 if (read_full_file(p, &t, /* ret_size= */ NULL) >= 0)
1524 (void) iovw_put_string_field_free(iovw, "COREDUMP_PROC_CGROUP=", t);
1525
1526 p = procfs_file_alloca(pid, "mountinfo");
1527 if (read_full_file(p, &t, /* ret_size= */ NULL) >= 0)
1528 (void) iovw_put_string_field_free(iovw, "COREDUMP_PROC_MOUNTINFO=", t);
1529
1530 /* We attach /proc/auxv here. ELF coredumps also contain a note for this (NT_AUXV), see elf(5). */
1531 p = procfs_file_alloca(pid, "auxv");
1532 if (read_full_file(p, &t, &size) >= 0) {
1533 char *buf = malloc(strlen("COREDUMP_PROC_AUXV=") + size + 1);
1534 if (buf) {
1535 /* Add a dummy terminator to make context_parse_iovw() happy. */
1536 *mempcpy_typesafe(stpcpy(buf, "COREDUMP_PROC_AUXV="), t, size) = '\0';
1537 (void) iovw_consume(iovw, buf, size + strlen("COREDUMP_PROC_AUXV="));
1538 }
1539
1540 free(t);
1541 }
1542
1543 if (get_process_cwd(pid, &t) >= 0)
1544 (void) iovw_put_string_field_free(iovw, "COREDUMP_CWD=", t);
1545
1546 if (get_process_root(pid, &t) >= 0) {
1547 bool proc_self_root_is_slash;
1548
1549 proc_self_root_is_slash = strcmp(t, "/") == 0;
1550
1551 (void) iovw_put_string_field_free(iovw, "COREDUMP_ROOT=", t);
1552
1553 /* If the process' root is "/", then there is a chance it has
1554 * mounted own root and hence being containerized. */
1555 if (proc_self_root_is_slash && get_process_container_parent_cmdline(&context->pidref, &t) > 0)
1556 (void) iovw_put_string_field_free(iovw, "COREDUMP_CONTAINER_CMDLINE=", t);
1557 }
1558
1559 if (get_process_environ(pid, &t) >= 0)
1560 (void) iovw_put_string_field_free(iovw, "COREDUMP_ENVIRON=", t);
1561
1562 /* Now that we have parsed info from /proc/ ensure the pidfd is still valid before continuing. */
1563 r = pidref_verify(&context->pidref);
1564 if (r < 0)
1565 return log_error_errno(r, "PIDFD validation failed: %m");
1566
1567 /* We successfully acquired all metadata. */
1568 return context_parse_iovw(context, iovw);
1569}
1570
1571static int send_ucred(int transport_fd, const struct ucred *ucred) {
1572 CMSG_BUFFER_TYPE(CMSG_SPACE(sizeof(struct ucred))) control = {};
1573 struct msghdr mh = {
1574 .msg_control = &control,
1575 .msg_controllen = sizeof(control),
1576 };
1577 struct cmsghdr *cmsg;
1578
1579 assert(transport_fd >= 0);
1580 assert(ucred);
1581
1582 cmsg = CMSG_FIRSTHDR(&mh);
1583 *cmsg = (struct cmsghdr) {
1584 .cmsg_level = SOL_SOCKET,
1585 .cmsg_type = SCM_CREDENTIALS,
1586 .cmsg_len = CMSG_LEN(sizeof(struct ucred)),
1587 };
1588 memcpy(CMSG_DATA(cmsg), ucred, sizeof(struct ucred));
1589
1590 return RET_NERRNO(sendmsg(transport_fd, &mh, MSG_NOSIGNAL));
1591}
1592
1593static int receive_ucred(int transport_fd, struct ucred *ret_ucred) {
1594 CMSG_BUFFER_TYPE(CMSG_SPACE(sizeof(struct ucred))) control = {};
1595 struct msghdr mh = {
1596 .msg_control = &control,
1597 .msg_controllen = sizeof(control),
1598 };
1599 struct cmsghdr *cmsg = NULL;
1600 struct ucred *ucred = NULL;
1601 ssize_t n;
1602
1603 assert(transport_fd >= 0);
1604 assert(ret_ucred);
1605
1606 n = recvmsg_safe(transport_fd, &mh, 0);
1607 if (n < 0)
1608 return n;
1609
1610 CMSG_FOREACH(cmsg, &mh)
1611 if (cmsg->cmsg_level == SOL_SOCKET &&
1612 cmsg->cmsg_type == SCM_CREDENTIALS &&
1613 cmsg->cmsg_len == CMSG_LEN(sizeof(struct ucred))) {
1614
1615 assert(!ucred);
1616 ucred = CMSG_TYPED_DATA(cmsg, struct ucred);
1617 }
1618
1619 if (!ucred)
1620 return -EIO;
1621
1622 *ret_ucred = *ucred;
1623
1624 return 0;
1625}
1626
1627static int can_forward_coredump(Context *context, const PidRef *pid) {
1628 _cleanup_free_ char *cgroup = NULL, *path = NULL, *unit = NULL;
1629 int r;
1630
1631 assert(context);
1632 assert(pidref_is_set(pid));
1633 assert(!pidref_is_remote(pid));
1634
1635 /* We need to avoid a situation where the attacker crashes a SUID process or a root daemon and
1636 * quickly replaces it with a namespaced process and we forward the coredump to the attacker, into
1637 * the namespace. With %F/pidfd we can reliably check the namespace of the original process, hence we
1638 * can allow forwarding. */
1639 if (!context->got_pidfd && context->dumpable != SUID_DUMP_USER)
1640 return false;
1641
1642 r = cg_pidref_get_path(SYSTEMD_CGROUP_CONTROLLER, pid, &cgroup);
1643 if (r < 0)
1644 return r;
1645
1646 r = path_extract_directory(cgroup, &path);
1647 if (r < 0)
1648 return r;
1649
1650 r = cg_path_get_unit_path(path, &unit);
1651 if (r == -ENOMEM)
1652 return log_oom();
1653 if (r == -ENXIO)
1654 /* No valid units in this path. */
1655 return false;
1656 if (r < 0)
1657 return r;
1658
1659 /* We require that this process belongs to a delegated cgroup
1660 * (i.e. Delegate=yes), with CoredumpReceive=yes also. */
1661 r = cg_is_delegated(unit);
1662 if (r <= 0)
1663 return r;
1664
1665 return cg_has_coredump_receive(unit);
1666}
1667
1668static int forward_coredump_to_container(Context *context) {
1669 _cleanup_close_ int pidnsfd = -EBADF, mntnsfd = -EBADF, netnsfd = -EBADF, usernsfd = -EBADF, rootfd = -EBADF;
1670 _cleanup_close_pair_ int pair[2] = EBADF_PAIR;
1671 pid_t child;
1672 struct ucred ucred = {
1673 .pid = context->pidref.pid,
1674 .uid = context->uid,
1675 .gid = context->gid,
1676 };
1677 int r;
1678
1679 assert(context);
1680
1681 _cleanup_(pidref_done) PidRef leader_pid = PIDREF_NULL;
1682 r = namespace_get_leader(&context->pidref, NAMESPACE_PID, &leader_pid);
1683 if (r < 0)
1684 return log_debug_errno(r, "Failed to get namespace leader: %m");
1685
1686 r = can_forward_coredump(context, &leader_pid);
1687 if (r < 0)
1688 return log_debug_errno(r, "Failed to check if coredump can be forwarded: %m");
1689 if (r == 0)
1690 return log_debug_errno(SYNTHETIC_ERRNO(ENOENT),
1691 "Coredump will not be forwarded because no target cgroup was found.");
1692
1693 r = RET_NERRNO(socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, pair));
1694 if (r < 0)
1695 return log_debug_errno(r, "Failed to create socket pair: %m");
1696
1697 r = setsockopt_int(pair[1], SOL_SOCKET, SO_PASSCRED, true);
1698 if (r < 0)
1699 return log_debug_errno(r, "Failed to set SO_PASSCRED: %m");
1700
1701 r = pidref_namespace_open(&leader_pid, &pidnsfd, &mntnsfd, &netnsfd, &usernsfd, &rootfd);
1702 if (r < 0)
1703 return log_debug_errno(r, "Failed to open namespaces of PID " PID_FMT ": %m", leader_pid.pid);
1704
1705 r = namespace_fork("(sd-coredumpns)", "(sd-coredump)", NULL, 0,
1706 FORK_RESET_SIGNALS|FORK_DEATHSIG_SIGTERM,
1707 pidnsfd, mntnsfd, netnsfd, usernsfd, rootfd, &child);
1708 if (r < 0)
1709 return log_debug_errno(r, "Failed to fork into namespaces of PID " PID_FMT ": %m", leader_pid.pid);
1710 if (r == 0) {
1711 pair[0] = safe_close(pair[0]);
1712
1713 r = access_nofollow("/run/systemd/coredump", W_OK);
1714 if (r < 0) {
1715 log_debug_errno(r, "Cannot find coredump socket, exiting: %m");
1716 _exit(EXIT_FAILURE);
1717 }
1718
1719 r = receive_ucred(pair[1], &ucred);
1720 if (r < 0) {
1721 log_debug_errno(r, "Failed to receive ucred and fd: %m");
1722 _exit(EXIT_FAILURE);
1723 }
1724
1725 _cleanup_(iovw_free_freep) struct iovec_wrapper *iovw = iovw_new();
1726 if (!iovw) {
1727 log_oom();
1728 _exit(EXIT_FAILURE);
1729 }
1730
1731 (void) iovw_put_string_field(iovw, "MESSAGE_ID=", SD_MESSAGE_COREDUMP_STR);
1732 (void) iovw_put_string_field(iovw, "PRIORITY=", STRINGIFY(LOG_CRIT));
1733 (void) iovw_put_string_field(iovw, "COREDUMP_FORWARDED=", "1");
1734
1735 for (int i = 0; i < _META_ARGV_MAX; i++) {
1736 char buf[DECIMAL_STR_MAX(pid_t)];
1737 const char *t = context->meta[i];
1738
1739 /* Patch some of the fields with the translated ucred data */
1740 switch (i) {
1741
1742 case META_ARGV_PID:
1743 xsprintf(buf, PID_FMT, ucred.pid);
1744 t = buf;
1745 break;
1746
1747 case META_ARGV_UID:
1748 xsprintf(buf, UID_FMT, ucred.uid);
1749 t = buf;
1750 break;
1751
1752 case META_ARGV_GID:
1753 xsprintf(buf, GID_FMT, ucred.gid);
1754 t = buf;
1755 break;
1756
1757 default:
1758 ;
1759 }
1760
1761 r = iovw_put_string_field(iovw, meta_field_names[i], t);
1762 if (r < 0) {
1763 log_debug_errno(r, "Failed to construct iovec: %m");
1764 _exit(EXIT_FAILURE);
1765 }
1766 }
1767
1768 _cleanup_(context_done) Context child_context = CONTEXT_NULL;
1769 r = context_parse_iovw(&child_context, iovw);
1770 if (r < 0) {
1771 log_debug_errno(r, "Failed to save context: %m");
1772 _exit(EXIT_FAILURE);
1773 }
1774
1775 r = gather_pid_metadata_from_procfs(iovw, &child_context);
1776 if (r < 0) {
1777 log_debug_errno(r, "Failed to gather metadata from procfs: %m");
1778 _exit(EXIT_FAILURE);
1779 }
1780
1781 r = send_iovec(iovw, STDIN_FILENO, &context->pidref, /* mount_tree_fd= */ -EBADF);
1782 if (r < 0) {
1783 log_debug_errno(r, "Failed to send iovec to coredump socket: %m");
1784 _exit(EXIT_FAILURE);
1785 }
1786
1787 _exit(EXIT_SUCCESS);
1788 }
1789
1790 pair[1] = safe_close(pair[1]);
1791
1792 /* We need to translate the PID, UID, and GID of the crashing process
1793 * to the container's namespaces. Do this by sending an SCM_CREDENTIALS
1794 * message on a socket pair, and read the result when we join the
1795 * container. The kernel will perform the translation for us. */
1796 r = send_ucred(pair[0], &ucred);
1797 if (r < 0)
1798 return log_debug_errno(r, "Failed to send metadata to container: %m");
1799
1800 r = wait_for_terminate_and_check("(sd-coredumpns)", child, 0);
1801 if (r < 0)
1802 return log_debug_errno(r, "Failed to wait for child to terminate: %m");
1803 if (r != EXIT_SUCCESS)
1804 return log_debug_errno(SYNTHETIC_ERRNO(EPROTO), "Failed to process coredump in container.");
1805
1806 return 0;
1807}
1808
1809static int acquire_pid_mount_tree_fd(const Context *context, int *ret_fd) {
1810 /* Don't bother preparing environment if we can't pass it to libdwfl. */
1811#if !HAVE_DWFL_SET_SYSROOT
1812 *ret_fd = -EOPNOTSUPP;
1813 log_debug("dwfl_set_sysroot() is not supported.");
1814#else
1815 _cleanup_close_ int mntns_fd = -EBADF, root_fd = -EBADF, fd = -EBADF;
1816 _cleanup_close_pair_ int pair[2] = EBADF_PAIR;
1817 int r;
1818
1819 assert(context);
1820 assert(ret_fd);
1821
1822 if (!arg_enter_namespace) {
1823 *ret_fd = -EHOSTDOWN;
1824 log_debug("EnterNamespace=no so we won't use mount tree of the crashed process for generating backtrace.");
1825 return 0;
1826 }
1827
1828 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, pair) < 0)
1829 return log_error_errno(errno, "Failed to create socket pair: %m");
1830
1831 r = pidref_namespace_open(
1832 &context->pidref,
1833 /* ret_pidns_fd= */ NULL,
1834 &mntns_fd,
1835 /* ret_netns_fd= */ NULL,
1836 /* ret_userns_fd= */ NULL,
1837 &root_fd);
1838 if (r < 0)
1839 return log_error_errno(r, "Failed to open mount namespace of crashing process: %m");
1840
1841 r = namespace_fork("(sd-mount-tree-ns)",
1842 "(sd-mount-tree)",
1843 /* except_fds= */ NULL,
1844 /* n_except_fds= */ 0,
1845 FORK_RESET_SIGNALS|FORK_DEATHSIG_SIGKILL|FORK_LOG|FORK_WAIT,
1846 /* pidns_fd= */ -EBADF,
1847 mntns_fd,
1848 /* netns_fd= */ -EBADF,
1849 /* userns_fd= */ -EBADF,
1850 root_fd,
1851 NULL);
1852 if (r < 0)
1853 return r;
1854 if (r == 0) {
1855 pair[0] = safe_close(pair[0]);
1856
1857 fd = open_tree(-EBADF, "/", AT_NO_AUTOMOUNT | AT_RECURSIVE | AT_SYMLINK_NOFOLLOW | OPEN_TREE_CLOEXEC | OPEN_TREE_CLONE);
1858 if (fd < 0) {
1859 log_error_errno(errno, "Failed to clone mount tree: %m");
1860 _exit(EXIT_FAILURE);
1861 }
1862
1863 r = send_one_fd(pair[1], fd, 0);
1864 if (r < 0) {
1865 log_error_errno(r, "Failed to send mount tree to parent: %m");
1866 _exit(EXIT_FAILURE);
1867 }
1868
1869 _exit(EXIT_SUCCESS);
1870 }
1871
1872 pair[1] = safe_close(pair[1]);
1873
1874 fd = receive_one_fd(pair[0], MSG_DONTWAIT);
1875 if (fd < 0)
1876 return log_error_errno(fd, "Failed to receive mount tree: %m");
1877
1878 *ret_fd = TAKE_FD(fd);
1879#endif
1880 return 0;
1881}
1882
1883static int process_kernel(int argc, char *argv[]) {
1884 _cleanup_(iovw_free_freep) struct iovec_wrapper *iovw = NULL;
1885 _cleanup_(context_done) Context context = CONTEXT_NULL;
1886 int r;
1887
1888 /* When we're invoked by the kernel, stdout/stderr are closed which is dangerous because the fds
1889 * could get reallocated. To avoid hard to debug issues, let's instead bind stdout/stderr to
1890 * /dev/null. */
1891 r = rearrange_stdio(STDIN_FILENO, -EBADF, -EBADF);
1892 if (r < 0)
1893 return log_error_errno(r, "Failed to connect stdout/stderr to /dev/null: %m");
1894
1895 log_debug("Processing coredump received from the kernel...");
1896
1897 iovw = iovw_new();
1898 if (!iovw)
1899 return log_oom();
1900
1901 /* Collect all process metadata passed by the kernel through argv[] */
1902 r = gather_pid_metadata_from_argv(iovw, &context, argc - 1, argv + 1);
1903 if (r < 0)
1904 return r;
1905
1906 /* Collect the rest of the process metadata retrieved from the runtime */
1907 r = gather_pid_metadata_from_procfs(iovw, &context);
1908 if (r < 0)
1909 return r;
1910
1911 if (!context.is_journald)
1912 /* OK, now we know it's not the journal, hence we can make use of it now. */
1913 log_set_target_and_open(LOG_TARGET_JOURNAL_OR_KMSG);
1914
1915 /* Log minimal metadata now, so it is not lost if the system is about to shut down. */
1916 log_info("Process %s (%s) of user %s terminated abnormally with signal %s/%s, processing...",
1917 context.meta[META_ARGV_PID], context.meta[META_COMM],
1918 context.meta[META_ARGV_UID], context.meta[META_ARGV_SIGNAL],
1919 signal_to_string(context.signo));
1920
1921 r = pidref_in_same_namespace(/* pid1 = */ NULL, &context.pidref, NAMESPACE_PID);
1922 if (r < 0)
1923 log_debug_errno(r, "Failed to check pidns of crashing process, ignoring: %m");
1924 if (r == 0) {
1925 /* If this fails, fallback to the old behavior so that
1926 * there is still some record of the crash. */
1927 r = forward_coredump_to_container(&context);
1928 if (r >= 0)
1929 return 0;
1930
1931 r = acquire_pid_mount_tree_fd(&context, &context.mount_tree_fd);
1932 if (r < 0)
1933 log_warning_errno(r, "Failed to access the mount tree of a container, ignoring: %m");
1934 }
1935
1936 /* If this is PID 1, disable coredump collection, we'll unlikely be able to process
1937 * it later on.
1938 *
1939 * FIXME: maybe we should disable coredumps generation from the beginning and
1940 * re-enable it only when we know it's either safe (i.e. we're not running OOM) or
1941 * it's not PID 1 ? */
1942 if (context.is_pid1) {
1943 log_notice("Due to PID 1 having crashed coredump collection will now be turned off.");
1944 disable_coredumps();
1945 }
1946
1947 (void) iovw_put_string_field(iovw, "MESSAGE_ID=", SD_MESSAGE_COREDUMP_STR);
1948 (void) iovw_put_string_field(iovw, "PRIORITY=", STRINGIFY(LOG_CRIT));
1949
1950 if (context.is_journald || context.is_pid1)
1951 return submit_coredump(&context, iovw, STDIN_FILENO);
1952
1953 return send_iovec(iovw, STDIN_FILENO, &context.pidref, context.mount_tree_fd);
1954}
1955
1956static int process_backtrace(int argc, char *argv[]) {
1957 _cleanup_(journal_importer_cleanup) JournalImporter importer = JOURNAL_IMPORTER_INIT(STDIN_FILENO);
1958 _cleanup_(iovw_free_freep) struct iovec_wrapper *iovw = NULL;
1959 _cleanup_(context_done) Context context = CONTEXT_NULL;
1960 char *message;
1961 int r;
1962
1963 assert(argc >= 2);
1964
1965 log_debug("Processing backtrace on stdin...");
1966
1967 iovw = iovw_new();
1968 if (!iovw)
1969 return log_oom();
1970
1971 (void) iovw_put_string_field(iovw, "MESSAGE_ID=", SD_MESSAGE_BACKTRACE_STR);
1972 (void) iovw_put_string_field(iovw, "PRIORITY=", STRINGIFY(LOG_CRIT));
1973
1974 /* Collect all process metadata from argv[] by making sure to skip the
1975 * '--backtrace' option */
1976 r = gather_pid_metadata_from_argv(iovw, &context, argc - 2, argv + 2);
1977 if (r < 0)
1978 return r;
1979
1980 /* Collect the rest of the process metadata retrieved from the runtime */
1981 r = gather_pid_metadata_from_procfs(iovw, &context);
1982 if (r < 0)
1983 return r;
1984
1985 for (;;) {
1986 r = journal_importer_process_data(&importer);
1987 if (r < 0)
1988 return log_error_errno(r, "Failed to parse journal entry on stdin: %m");
1989 if (r == 1 || /* complete entry */
1990 journal_importer_eof(&importer)) /* end of data */
1991 break;
1992 }
1993
1994 if (journal_importer_eof(&importer)) {
1995 log_warning("Did not receive a full journal entry on stdin, ignoring message sent by reporter");
1996
1997 message = strjoina("Process ", context.meta[META_ARGV_PID],
1998 " (", context.meta[META_COMM], ")"
1999 " of user ", context.meta[META_ARGV_UID],
2000 " failed with ", context.meta[META_ARGV_SIGNAL]);
2001
2002 r = iovw_put_string_field(iovw, "MESSAGE=", message);
2003 if (r < 0)
2004 return r;
2005 } else {
2006 /* The imported iovecs are not supposed to be freed by us so let's copy and merge them at the
2007 * end of the array. */
2008 r = iovw_append(iovw, &importer.iovw);
2009 if (r < 0)
2010 return r;
2011 }
2012
2013 r = sd_journal_sendv(iovw->iovec, iovw->count);
2014 if (r < 0)
2015 return log_error_errno(r, "Failed to log backtrace: %m");
2016
2017 return 0;
2018}
2019
2020static int run(int argc, char *argv[]) {
2021 int r;
2022
2023 /* First, log to a safe place, since we don't know what crashed and it might
2024 * be journald which we'd rather not log to then. */
2025
2026 log_set_target_and_open(LOG_TARGET_KMSG);
2027
2028 /* Make sure we never enter a loop */
2029 (void) set_dumpable(SUID_DUMP_DISABLE);
2030
2031 /* Ignore all parse errors */
2032 (void) parse_config();
2033
2034 log_debug("Selected storage '%s'.", coredump_storage_to_string(arg_storage));
2035 log_debug("Selected compression %s.", yes_no(arg_compress));
2036
2037 r = sd_listen_fds(false);
2038 if (r < 0)
2039 return log_error_errno(r, "Failed to determine the number of file descriptors: %m");
2040
2041 /* If we got an fd passed, we are running in coredumpd mode. Otherwise we
2042 * are invoked from the kernel as coredump handler. */
2043 if (r == 0) {
2044 if (streq_ptr(argv[1], "--backtrace"))
2045 return process_backtrace(argc, argv);
2046 else
2047 return process_kernel(argc, argv);
2048 } else if (r == 1)
2049 return process_socket(SD_LISTEN_FDS_START);
2050
2051 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
2052 "Received unexpected number of file descriptors.");
2053}
2054
2055DEFINE_MAIN_FUNCTION(run);