]> git.ipfire.org Git - thirdparty/systemd.git/blame - src/coredump/coredump.c
vmspawn: Run auxiliary daemons inside scope instead of separate service (#38047)
[thirdparty/systemd.git] / src / coredump / coredump.c
CommitLineData
db9ecf05 1/* SPDX-License-Identifier: LGPL-2.1-or-later */
f5e04665 2
86b9a3e3 3#include <elf.h>
803a3464 4#include <stdio.h>
b8fe1b1d 5#include <sys/mount.h>
587f2a5e 6#include <sys/statvfs.h>
cacd6403 7#include <sys/xattr.h>
4f5dd394 8#include <unistd.h>
f5e04665 9
b06adfb0 10#include "sd-bus.h"
73a99163 11#include "sd-daemon.h"
f11943c5 12#include "sd-journal.h"
309a747f 13#include "sd-json.h"
f11943c5 14#include "sd-login.h"
73a99163 15#include "sd-messages.h"
4f5dd394
LP
16
17#include "acl-util.h"
b5efdb8a 18#include "alloc-util.h"
587f2a5e 19#include "bus-error.h"
430f0182 20#include "capability-util.h"
ba1261bc 21#include "cgroup-util.h"
4f5dd394 22#include "compress.h"
34c10968
LP
23#include "conf-parser.h"
24#include "copy.h"
c8715007 25#include "coredump-util.h"
f11943c5 26#include "coredump-vacuum.h"
a0956174 27#include "dirent-util.h"
ea680f05 28#include "elf-util.h"
b06adfb0 29#include "errno-util.h"
4f5dd394 30#include "escape.h"
3ffd4af2 31#include "fd-util.h"
4f5dd394 32#include "fileio.h"
f4f15635 33#include "fs-util.h"
b06adfb0 34#include "io-util.h"
bd1ae178 35#include "iovec-util.h"
b18453ed 36#include "journal-importer.h"
5edf875b 37#include "journal-send.h"
309a747f 38#include "json-util.h"
4f5dd394 39#include "log.h"
5e332028 40#include "main-func.h"
0a970718 41#include "memory-util.h"
2485b7e2 42#include "memstream-util.h"
35cd0ba5 43#include "mkdir-label.h"
a108c43e 44#include "namespace-util.h"
6bedfcbb 45#include "parse-util.h"
a108c43e 46#include "path-util.h"
b06adfb0 47#include "pidref.h"
0b452006 48#include "process-util.h"
d14bcb4e 49#include "signal-util.h"
3c171f0b 50#include "socket-util.h"
4f5dd394 51#include "special.h"
587f2a5e 52#include "stat-util.h"
8b43440b 53#include "string-table.h"
07630cea 54#include "string-util.h"
e4de7287 55#include "tmpfile-util.h"
8e1ac16b 56#include "uid-classification.h"
b1d4f8e1 57#include "user-util.h"
34727273 58
/* The maximum size up to which we process coredumps. We use 1G on 32-bit systems, and 32G on 64-bit
 * systems. The distinction is made at compile time via the pointer size. */
#if __SIZEOF_POINTER__ == 4
#define PROCESS_SIZE_MAX ((uint64_t) (1LLU*1024LLU*1024LLU*1024LLU))
#elif __SIZEOF_POINTER__ == 8
#define PROCESS_SIZE_MAX ((uint64_t) (32LLU*1024LLU*1024LLU*1024LLU))
#else
#error "Unexpected pointer size"
#endif

/* The maximum size up to which we leave the coredump around on disk. By default this is the same as
 * the processing limit. */
#define EXTERNAL_SIZE_MAX PROCESS_SIZE_MAX

/* The maximum size up to which we store the coredump in the journal (767M normally, 10M in fuzzing
 * builds). */
#ifndef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
#define JOURNAL_SIZE_MAX ((size_t) (767LU*1024LU*1024LU))
#else
/* oss-fuzz limits memory usage. */
#define JOURNAL_SIZE_MAX ((size_t) (10LU*1024LU*1024LU))
#endif

/* When checking for available memory and setting lower limits, don't
 * go below 4MB for writing core files to storage. */
#define PROCESS_SIZE_MIN (4U*1024U*1024U)

/* Make sure to not make this larger than the maximum journal entry
 * size. See DATA_SIZE_MAX in journal-importer.h. */
assert_cc(JOURNAL_SIZE_MAX <= DATA_SIZE_MAX);

/* Directory onto which attach_mount_tree() moves the mount tree it is handed. */
#define MOUNT_TREE_ROOT "/run/systemd/mount-rootfs"
88
/* Indexes into the per-process metadata arrays (Context.meta[], Context.meta_size[]) and into the
 * parallel name tables (meta_field_names[], and the xattr table in fix_xattr()). */
typedef enum {
        /* We use these as array indexes for our process metadata cache.
         *
         * The first indices of the cache store the same metadata as the ones passed by the kernel via
         * argv[], i.e. the strings specified in our pattern defined in /proc/sys/kernel/core_pattern,
         * see core(5). */

        META_ARGV_PID,          /* %P: as seen in the initial pid namespace */
        META_ARGV_UID,          /* %u: as seen in the initial user namespace */
        META_ARGV_GID,          /* %g: as seen in the initial user namespace */
        META_ARGV_SIGNAL,       /* %s: number of signal causing dump */
        META_ARGV_TIMESTAMP,    /* %t: time of dump, expressed as seconds since the Epoch (we expand this to μs granularity) */
        META_ARGV_RLIMIT,       /* %c: core file size soft resource limit */
        _META_ARGV_REQUIRED,    /* Number of argv[] fields that must always be present */
        /* The fields below were added to kernel/core_pattern at later points, so they might be missing. */
        META_ARGV_HOSTNAME = _META_ARGV_REQUIRED,  /* %h: hostname */
        META_ARGV_DUMPABLE,     /* %d: as set by the kernel */
        META_ARGV_PIDFD,        /* %F: pidfd of the process, since v6.16 */
        /* If new fields are added, they should be added here, to maintain compatibility
         * with callers which don't know about the new fields. */
        _META_ARGV_MAX,         /* Number of fields that can be passed via argv[] */

        /* The following indexes are cached for a couple of special fields we use (and
         * thereby need to be retrieved quickly) for naming coredump files, and attaching
         * xattrs. Unlike the previous ones they are retrieved from the runtime
         * environment. */

        META_COMM = _META_ARGV_MAX,

        /* The rest are similar to the previous ones except that we won't fail if one of
         * them is missing in a message sent over the socket. */

        META_EXE,
        META_UNIT,
        META_PROC_AUXV,
        _META_MAX               /* Total number of cached metadata fields; used as array size */
} meta_argv_t;
f5e04665 126
/* Field name prefix (including the trailing '=') associated with each metadata index. Indexed by
 * meta_argv_t; every index below _META_MAX has an entry. */
static const char * const meta_field_names[_META_MAX] = {
        [META_ARGV_PID]       = "COREDUMP_PID=",
        [META_ARGV_UID]       = "COREDUMP_UID=",
        [META_ARGV_GID]       = "COREDUMP_GID=",
        [META_ARGV_SIGNAL]    = "COREDUMP_SIGNAL=",
        [META_ARGV_TIMESTAMP] = "COREDUMP_TIMESTAMP=",
        [META_ARGV_RLIMIT]    = "COREDUMP_RLIMIT=",
        [META_ARGV_HOSTNAME]  = "COREDUMP_HOSTNAME=",
        [META_ARGV_DUMPABLE]  = "COREDUMP_DUMPABLE=",
        [META_ARGV_PIDFD]     = "COREDUMP_BY_PIDFD=",
        [META_COMM]           = "COREDUMP_COMM=",
        [META_EXE]            = "COREDUMP_EXE=",
        [META_UNIT]           = "COREDUMP_UNIT=",
        [META_PROC_AUXV]      = "COREDUMP_PROC_AUXV=",
};
142
/* Everything we know about the crashed process while handling one coredump. */
typedef struct Context {
        PidRef pidref;          /* reference to the crashed process; released by context_done() */
        uid_t uid;
        gid_t gid;
        unsigned dumpable;      /* compared against SUID_DUMP_USER in grant_user_access() */
        int signo;
        uint64_t rlimit;        /* core size limit; save_external_coredump() refuses to dump below page_size() */
        bool is_pid1;           /* NOTE(review): presumably set when the crashed process is PID 1 — set outside this chunk */
        bool is_journald;       /* NOTE(review): presumably set when the crashed process is journald — set outside this chunk */
        bool got_pidfd;
        int mount_tree_fd;      /* closed by context_done() */

        /* These point into external memory, are not owned by this object */
        const char *meta[_META_MAX];    /* indexed by meta_argv_t */
        size_t meta_size[_META_MAX];    /* size of the matching meta[] entry */
} Context;

/* Initializer for an empty Context: no process reference, invalid uid/gid, no mount tree fd. All other
 * members are zero-initialized by the compound literal. */
#define CONTEXT_NULL                            \
        (Context) {                             \
                .pidref = PIDREF_NULL,          \
                .uid = UID_INVALID,             \
                .gid = GID_INVALID,             \
                .mount_tree_fd = -EBADF,        \
        }
167
34c10968
LP
/* Where the core file itself is kept, as selected by the Storage= setting (see parse_config()). */
typedef enum CoredumpStorage {
        COREDUMP_STORAGE_NONE,          /* storage_size_max() reports a limit of 0 for this mode */
        COREDUMP_STORAGE_EXTERNAL,      /* limited by arg_external_size_max */
        COREDUMP_STORAGE_JOURNAL,       /* limited by arg_journal_size_max */
        _COREDUMP_STORAGE_MAX,
        _COREDUMP_STORAGE_INVALID = -EINVAL,
} CoredumpStorage;

/* String names of the storage modes, as used in the configuration file. */
static const char* const coredump_storage_table[_COREDUMP_STORAGE_MAX] = {
        [COREDUMP_STORAGE_NONE]     = "none",
        [COREDUMP_STORAGE_EXTERNAL] = "external",
        [COREDUMP_STORAGE_JOURNAL]  = "journal",
};

/* Generate the string <-> enum lookup helpers and the config parser callback for CoredumpStorage. */
DEFINE_PRIVATE_STRING_TABLE_LOOKUP(coredump_storage, CoredumpStorage);
static DEFINE_CONFIG_PARSE_ENUM(config_parse_coredump_storage, coredump_storage, CoredumpStorage);
34727273
ZJS
184
/* Runtime configuration, initialized to the built-in defaults and overridden by parse_config(). */
static CoredumpStorage arg_storage = COREDUMP_STORAGE_EXTERNAL;
static bool arg_compress = true;
static uint64_t arg_process_size_max = PROCESS_SIZE_MAX;
static uint64_t arg_external_size_max = EXTERNAL_SIZE_MAX;
static uint64_t arg_journal_size_max = JOURNAL_SIZE_MAX;
/* UINT64_MAX here means the setting was not configured; both are handed to coredump_vacuum(). */
static uint64_t arg_keep_free = UINT64_MAX;
static uint64_t arg_max_use = UINT64_MAX;
#if HAVE_DWFL_SET_SYSROOT
/* Only available when the build has dwfl_set_sysroot() support; see the EnterNamespace= setting. */
static bool arg_enter_namespace = false;
#endif
34c10968 195
313537da
LP
196static void context_done(Context *c) {
197 assert(c);
198
199 pidref_done(&c->pidref);
200 c->mount_tree_fd = safe_close(c->mount_tree_fd);
201}
202
34c10968 203static int parse_config(void) {
34c10968 204 static const ConfigTableItem items[] = {
68511ceb
MS
205 { "Coredump", "Storage", config_parse_coredump_storage, 0, &arg_storage },
206 { "Coredump", "Compress", config_parse_bool, 0, &arg_compress },
207 { "Coredump", "ProcessSizeMax", config_parse_iec_uint64, 0, &arg_process_size_max },
208 { "Coredump", "ExternalSizeMax", config_parse_iec_uint64_infinity, 0, &arg_external_size_max },
209 { "Coredump", "JournalSizeMax", config_parse_iec_size, 0, &arg_journal_size_max },
210 { "Coredump", "KeepFree", config_parse_iec_uint64, 0, &arg_keep_free },
211 { "Coredump", "MaxUse", config_parse_iec_uint64, 0, &arg_max_use },
212#if HAVE_DWFL_SET_SYSROOT
e26a7e08 213 { "Coredump", "EnterNamespace", config_parse_bool, 0, &arg_enter_namespace },
68511ceb 214#else
8ec2e177 215 { "Coredump", "EnterNamespace", config_parse_warn_compat, DISABLED_CONFIGURATION, NULL },
68511ceb 216#endif
34c10968
LP
217 {}
218 };
219
4a78074f
LP
220 int r;
221
6378f257 222 r = config_parse_standard_file_with_dropins(
e5abff37 223 "systemd/coredump.conf",
4a78074f
LP
224 "Coredump\0",
225 config_item_table_lookup,
226 items,
227 CONFIG_PARSE_WARN,
228 /* userdata= */ NULL);
229 if (r < 0)
230 return r;
231
232 /* Let's make sure we fix up the maximum size we send to the journal here on the client side, for
233 * efficiency reasons. journald wouldn't accept anything larger anyway. */
234 if (arg_journal_size_max > JOURNAL_SIZE_MAX) {
235 log_warning("JournalSizeMax= set to larger value (%s) than journald would accept (%s), lowering automatically.",
236 FORMAT_BYTES(arg_journal_size_max), FORMAT_BYTES(JOURNAL_SIZE_MAX));
237 arg_journal_size_max = JOURNAL_SIZE_MAX;
238 }
239
240 return 0;
34c10968
LP
241}
242
a1e92eee 243static uint64_t storage_size_max(void) {
ee0449fd
ZJS
244 if (arg_storage == COREDUMP_STORAGE_EXTERNAL)
245 return arg_external_size_max;
246 if (arg_storage == COREDUMP_STORAGE_JOURNAL)
247 return arg_journal_size_max;
248 assert(arg_storage == COREDUMP_STORAGE_NONE);
249 return 0;
73a99163
ZJS
250}
251
3e4d0f6c
ZJS
/* Grants the crashing user read access to the coredump referenced by fd via an ACL entry.
 *
 * Nothing is done if allow_user is false, or if the uid is a system, dynamic, greeter or the nobody
 * user. Without ACL support compiled in this is a no-op. Returns 0 on success or if there was nothing
 * to do, a negative error if the ACL could not be adjusted (already logged). */
static int fix_acl(int fd, uid_t uid, bool allow_user) {
        assert(fd >= 0);
        assert(uid_is_valid(uid));

#if HAVE_ACL
        int r;

        /* We don't allow users to read coredumps if the uid or capabilities were changed. Also, only
         * regular users get an ACL entry. */
        if (!allow_user ||
            uid_is_system(uid) ||
            uid_is_dynamic(uid) ||
            uid_is_greeter(uid) ||
            uid == UID_NOBODY)
                return 0;

        /* Make sure normal users can read (but not write or delete) their own coredumps */
        r = fd_add_uid_acl_permission(fd, uid, ACL_READ);
        if (r < 0)
                return log_error_errno(r, "Failed to adjust ACL of the coredump: %m");
#endif

        return 0;
}
274
f46c706b 275static int fix_xattr(int fd, const Context *context) {
f46c706b 276 static const char * const xattrs[_META_MAX] = {
510a1466
ZJS
277 [META_ARGV_PID] = "user.coredump.pid",
278 [META_ARGV_UID] = "user.coredump.uid",
279 [META_ARGV_GID] = "user.coredump.gid",
280 [META_ARGV_SIGNAL] = "user.coredump.signal",
281 [META_ARGV_TIMESTAMP] = "user.coredump.timestamp",
282 [META_ARGV_RLIMIT] = "user.coredump.rlimit",
283 [META_ARGV_HOSTNAME] = "user.coredump.hostname",
284 [META_COMM] = "user.coredump.comm",
285 [META_EXE] = "user.coredump.exe",
0cd77f97
LP
286 };
287
34c10968
LP
288 int r = 0;
289
b59233e6
LP
290 assert(fd >= 0);
291
60ecc386 292 /* Attach some metadata to coredumps via extended attributes. Just because we can. */
34c10968 293
fe96c0f8 294 for (unsigned i = 0; i < _META_MAX; i++) {
1eef15b1
ZJS
295 int k;
296
f46c706b 297 if (isempty(context->meta[i]) || !xattrs[i])
0cd77f97 298 continue;
34c10968 299
60ecc386
ZJS
300 k = RET_NERRNO(fsetxattr(fd, xattrs[i], context->meta[i], strlen(context->meta[i]), XATTR_CREATE));
301 RET_GATHER(r, k);
0cd77f97 302 }
34c10968
LP
303
304 return r;
305}
306
/* Escapes a string via xescape() for use as one component of the coredump file name built in
 * make_filename(); ".", "/" and " " are passed as the character set. NOTE(review): presumably these are
 * the characters that get escaped — confirm against xescape(). Returns what xescape() returns; callers
 * treat NULL as out-of-memory. */
#define filename_escape(s) xescape((s), "./ ")
34c10968 308
/* Returns a printable name for a temporary file: the path itself if there is one, or a fixed
 * placeholder string if s is NULL. */
static const char *coredump_tmpfile_name(const char *s) {
        if (s)
                return s;

        return "(unnamed temporary file)";
}
312
2d0bcf1e 313static int fix_permissions_and_link(
b59233e6
LP
314 int fd,
315 const char *filename,
316 const char *target,
f46c706b 317 const Context *context,
3e4d0f6c 318 bool allow_user) {
b59233e6 319
03532f0a
LP
320 int r;
321
b59233e6 322 assert(fd >= 0);
b59233e6 323 assert(target);
3c171f0b 324 assert(context);
cfd652ed
ZJS
325
326 /* Ignore errors on these */
3c171f0b 327 (void) fchmod(fd, 0640);
9764bca9 328 (void) fix_acl(fd, context->uid, allow_user);
3c171f0b 329 (void) fix_xattr(fd, context);
cfd652ed 330
74402bf0 331 r = link_tmpfile(fd, filename, target, LINK_TMPFILE_SYNC);
03532f0a
LP
332 if (r < 0)
333 return log_error_errno(r, "Failed to move coredump %s into place: %m", target);
cfd652ed
ZJS
334
335 return 0;
336}
337
5125a0b8
LP
338static int maybe_remove_external_coredump(
339 const Context *c,
340 const char *filename,
341 uint64_t size) {
342
343 assert(c);
cfd652ed 344
5125a0b8
LP
345 /* Returns true if might remove, false if will not remove, < 0 on error. */
346
347 if (arg_storage != COREDUMP_STORAGE_NONE &&
348 (c->is_pid1 || c->is_journald)) /* Always keep around in case of journald/pid1, since we cannot rely on the journal to accept them */
349 return false;
cfd652ed 350
fc6cec86 351 if (arg_storage == COREDUMP_STORAGE_EXTERNAL &&
cfd652ed 352 size <= arg_external_size_max)
5125a0b8 353 return false;
cfd652ed
ZJS
354
355 if (!filename)
5125a0b8 356 return true;
cfd652ed 357
4a62c710
MS
358 if (unlink(filename) < 0 && errno != ENOENT)
359 return log_error_errno(errno, "Failed to unlink %s: %m", filename);
cfd652ed 360
5125a0b8 361 return true;
cfd652ed
ZJS
362}
363
f46c706b 364static int make_filename(const Context *context, char **ret) {
b59233e6 365 _cleanup_free_ char *c = NULL, *u = NULL, *p = NULL, *t = NULL;
a7f7d1bd 366 sd_id128_t boot = {};
34c10968
LP
367 int r;
368
3c171f0b 369 assert(context);
34c10968 370
f46c706b 371 c = filename_escape(context->meta[META_COMM]);
34c10968 372 if (!c)
b59233e6 373 return -ENOMEM;
34c10968 374
f46c706b 375 u = filename_escape(context->meta[META_ARGV_UID]);
0dc5d23c 376 if (!u)
b59233e6 377 return -ENOMEM;
34c10968
LP
378
379 r = sd_id128_get_boot(&boot);
b59233e6 380 if (r < 0)
34c10968 381 return r;
34c10968 382
f46c706b 383 p = filename_escape(context->meta[META_ARGV_PID]);
b59233e6
LP
384 if (!p)
385 return -ENOMEM;
386
f46c706b 387 t = filename_escape(context->meta[META_ARGV_TIMESTAMP]);
b59233e6
LP
388 if (!t)
389 return -ENOMEM;
390
391 if (asprintf(ret,
64a5384f 392 "/var/lib/systemd/coredump/core.%s.%s." SD_ID128_FORMAT_STR ".%s.%s",
34c10968 393 c,
0dc5d23c 394 u,
34c10968
LP
395 SD_ID128_FORMAT_VAL(boot),
396 p,
b59233e6
LP
397 t) < 0)
398 return -ENOMEM;
399
400 return 0;
401}
402
3e4d0f6c
ZJS
403static int grant_user_access(int core_fd, const Context *context) {
404 int at_secure = -1;
405 uid_t uid = UID_INVALID, euid = UID_INVALID;
406 uid_t gid = GID_INVALID, egid = GID_INVALID;
407 int r;
408
409 assert(core_fd >= 0);
410 assert(context);
411
412 if (!context->meta[META_PROC_AUXV])
413 return log_warning_errno(SYNTHETIC_ERRNO(ENODATA), "No auxv data, not adjusting permissions.");
414
415 uint8_t elf[EI_NIDENT];
416 errno = 0;
417 if (pread(core_fd, &elf, sizeof(elf), 0) != sizeof(elf))
418 return log_warning_errno(errno_or_else(EIO),
419 "Failed to pread from coredump fd: %s", STRERROR_OR_EOF(errno));
420
421 if (elf[EI_MAG0] != ELFMAG0 ||
422 elf[EI_MAG1] != ELFMAG1 ||
423 elf[EI_MAG2] != ELFMAG2 ||
424 elf[EI_MAG3] != ELFMAG3 ||
425 elf[EI_VERSION] != EV_CURRENT)
426 return log_info_errno(SYNTHETIC_ERRNO(EUCLEAN),
427 "Core file does not have ELF header, not adjusting permissions.");
428 if (!IN_SET(elf[EI_CLASS], ELFCLASS32, ELFCLASS64) ||
429 !IN_SET(elf[EI_DATA], ELFDATA2LSB, ELFDATA2MSB))
430 return log_info_errno(SYNTHETIC_ERRNO(EUCLEAN),
431 "Core file has strange ELF class, not adjusting permissions.");
432
433 if ((elf[EI_DATA] == ELFDATA2LSB) != (__BYTE_ORDER == __LITTLE_ENDIAN))
434 return log_info_errno(SYNTHETIC_ERRNO(EUCLEAN),
435 "Core file has non-native endianness, not adjusting permissions.");
436
cb38fdbe
ZJS
437 r = parse_auxv(LOG_WARNING,
438 /* elf_class= */ elf[EI_CLASS],
439 context->meta[META_PROC_AUXV],
440 context->meta_size[META_PROC_AUXV],
441 &at_secure, &uid, &euid, &gid, &egid);
3e4d0f6c
ZJS
442 if (r < 0)
443 return r;
444
0c49e004
ZJS
445 /* We allow access if %d/dumpable on the command line was exactly 1, we got all the data,
446 * at_secure is not set, and the uid/gid match euid/egid. */
3e4d0f6c 447 bool ret =
76e0ab49 448 context->dumpable == SUID_DUMP_USER &&
3e4d0f6c
ZJS
449 at_secure == 0 &&
450 uid != UID_INVALID && euid != UID_INVALID && uid == euid &&
451 gid != GID_INVALID && egid != GID_INVALID && gid == egid;
0c49e004 452 log_debug("Will %s access (dumpable=%u uid="UID_FMT " euid="UID_FMT " gid="GID_FMT " egid="GID_FMT " at_secure=%s)",
3e4d0f6c 453 ret ? "permit" : "restrict",
0c49e004 454 context->dumpable,
3e4d0f6c
ZJS
455 uid, euid, gid, egid, yes_no(at_secure));
456 return ret;
457}
458
/* Streams the coredump from input_fd into a file below /var/lib/systemd/coredump/ (see make_filename()),
 * compressing it if compression is compiled in and enabled.
 *
 * On success 0 is returned and the output parameters are set as follows:
 *
 *   ret_filename         path the coredump was linked to (the compressed file, if compressed)
 *   ret_node_fd          fd of the compressed file, or -EBADF if the coredump was not compressed
 *   ret_data_fd          fd of the uncompressed data; -EBADF if the uncompressed temporary file was
 *                        dropped because the write was truncated on tmpfs
 *   ret_size             uncompressed size
 *   ret_compressed_size  compressed size, or UINT64_MAX if the coredump was not compressed
 *   ret_truncated        whether the initial copy was cut off at the size limit
 *
 * On failure a negative error is returned (and logged); temporary files are removed by the cleanup
 * handlers. */
static int save_external_coredump(
                const Context *context,
                int input_fd,
                char **ret_filename,
                int *ret_node_fd,
                int *ret_data_fd,
                uint64_t *ret_size,
                uint64_t *ret_compressed_size,
                bool *ret_truncated) {

        _cleanup_(unlink_and_freep) char *tmp = NULL;
        _cleanup_free_ char *fn = NULL;
        _cleanup_close_ int fd = -EBADF;
        uint64_t process_limit, max_size;
        bool truncated, storage_on_tmpfs;
        struct stat st;
        int r;

        assert(context);
        assert(ret_filename);
        assert(ret_node_fd);
        assert(ret_data_fd);
        assert(ret_size);
        assert(ret_compressed_size);
        assert(ret_truncated);

        if (context->rlimit < page_size())
                /* Is coredumping disabled? Then don't bother saving/processing the
                 * coredump. Anything below PAGE_SIZE cannot give a readable coredump
                 * (the kernel uses ELF_EXEC_PAGESIZE which is not easily accessible, but
                 * is usually the same as PAGE_SIZE). */
                return log_info_errno(SYNTHETIC_ERRNO(EBADSLT),
                                      "Resource limits disable core dumping for process %s (%s).",
                                      context->meta[META_ARGV_PID], context->meta[META_COMM]);

        /* We need the data if we either want to process it or want to store it, hence use the larger of
         * the two limits. */
        process_limit = MAX(arg_process_size_max, storage_size_max());
        if (process_limit == 0)
                return log_debug_errno(SYNTHETIC_ERRNO(EBADSLT),
                                       "Limits for coredump processing and storage are both 0, not dumping core.");

        /* Never store more than the process configured, or than we actually shall keep or process */
        max_size = MIN(context->rlimit, process_limit);

        r = make_filename(context, &fn);
        if (r < 0)
                return log_error_errno(r, "Failed to determine coredump file name: %m");

        /* Best effort; if the directory cannot be created, creating the temporary file below fails */
        (void) mkdir_parents_label(fn, 0755);

        fd = open_tmpfile_linkable(fn, O_RDWR|O_CLOEXEC, &tmp);
        if (fd < 0)
                return log_error_errno(fd, "Failed to create temporary file for coredump %s: %m", fn);

        /* If storage is on tmpfs, the kernel oomd might kill us if there's MemoryMax set on
         * the service or the slice it belongs to. This is common on low-resources systems,
         * to avoid crashing processes to take away too many system resources.
         * Check the cgroup settings, and set max_size to a bit less than half of the
         * available memory left to the process.
         * Then, attempt to write the core file uncompressed first - if the write gets
         * interrupted, we know we won't be able to write it all, so instead compress what
         * was written so far, delete the uncompressed truncated core, and then continue
         * compressing from STDIN. Given the compressed core cannot be larger than the
         * uncompressed one, and 1KB for metadata is accounted for in the calculation, we
         * should be able to at least store the full compressed core file. */

        storage_on_tmpfs = fd_is_temporary_fs(fd) > 0;
        if (storage_on_tmpfs && arg_compress) {
                _cleanup_(sd_bus_flush_close_unrefp) sd_bus *bus = NULL;
                uint64_t cgroup_limit = UINT64_MAX;
                struct statvfs sv;

                /* If we can't get the cgroup limit, just ignore it, but don't fail,
                 * try anyway with the config settings. */
                r = sd_bus_default_system(&bus);
                if (r < 0)
                        log_info_errno(r, "Failed to connect to system bus, skipping MemoryAvailable check: %m");
                else {
                        _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL;

                        r = sd_bus_get_property_trivial(
                                        bus,
                                        "org.freedesktop.systemd1",
                                        "/org/freedesktop/systemd1/unit/self",
                                        "org.freedesktop.systemd1.Service",
                                        "MemoryAvailable",
                                        &error,
                                        't', &cgroup_limit);
                        if (r < 0)
                                log_warning_errno(r,
                                                  "Failed to query MemoryAvailable for current unit, "
                                                  "falling back to static config settings: %s",
                                                  bus_error_message(&error, r));
                }

                /* First, ensure we are not going to go over the cgroup limit */
                max_size = MIN(cgroup_limit, max_size);
                /* tmpfs might get full quickly, so check the available space too. But don't worry about
                 * errors here, failing to access the storage location will be better logged when writing to
                 * it. */
                if (fstatvfs(fd, &sv) >= 0)
                        max_size = MIN((uint64_t)sv.f_frsize * (uint64_t)sv.f_bfree, max_size);
                /* Impose a lower minimum, otherwise we will miss the basic headers. */
                max_size = MAX(PROCESS_SIZE_MIN, max_size);
                /* Ensure we can always switch to compressing on the fly in case we are running out of space
                 * by keeping half of the space/memory available, plus 1KB metadata overhead from the
                 * compression algorithm. */
                max_size = LESS_BY(max_size, 1024U) / 2;

                log_debug("Limiting core file size to %" PRIu64 " bytes due to cgroup and/or filesystem limits.", max_size);
        }

        r = copy_bytes(input_fd, fd, max_size, 0);
        if (r < 0)
                return log_error_errno(r, "Cannot store coredump of %s (%s): %m",
                                       context->meta[META_ARGV_PID], context->meta[META_COMM]);
        /* A return value of 1 is taken to mean the copy stopped at max_size */
        truncated = r == 1;

        /* Errors from the access check are treated like "restrict" */
        bool allow_user = grant_user_access(fd, context) > 0;

#if HAVE_COMPRESSION
        if (arg_compress) {
                _cleanup_(unlink_and_freep) char *tmp_compressed = NULL;
                _cleanup_free_ char *fn_compressed = NULL;
                _cleanup_close_ int fd_compressed = -EBADF;
                uint64_t uncompressed_size = 0;

                /* Rewind, so that we compress what we just wrote from the beginning */
                if (lseek(fd, 0, SEEK_SET) < 0)
                        return log_error_errno(errno, "Failed to seek on coredump %s: %m", fn);

                fn_compressed = strjoin(fn, default_compression_extension());
                if (!fn_compressed)
                        return log_oom();

                fd_compressed = open_tmpfile_linkable(fn_compressed, O_RDWR|O_CLOEXEC, &tmp_compressed);
                if (fd_compressed < 0)
                        return log_error_errno(fd_compressed, "Failed to create temporary file for coredump %s: %m", fn_compressed);

                r = compress_stream(fd, fd_compressed, max_size, &uncompressed_size);
                if (r < 0)
                        return log_error_errno(r, "Failed to compress %s: %m", coredump_tmpfile_name(tmp_compressed));

                if (truncated && storage_on_tmpfs) {
                        uint64_t partial_uncompressed_size = 0;

                        /* Uncompressed write was truncated and we are writing to tmpfs: delete
                         * the uncompressed core, and compress the remaining part from STDIN. */

                        tmp = unlink_and_free(tmp);
                        fd = safe_close(fd);

                        r = compress_stream(input_fd, fd_compressed, max_size, &partial_uncompressed_size);
                        if (r < 0)
                                return log_error_errno(r, "Failed to compress %s: %m", coredump_tmpfile_name(tmp_compressed));
                        uncompressed_size += partial_uncompressed_size;
                }

                /* Logs on its own on failure */
                r = fix_permissions_and_link(fd_compressed, tmp_compressed, fn_compressed, context, allow_user);
                if (r < 0)
                        return r;

                if (fstat(fd_compressed, &st) < 0)
                        return log_error_errno(errno,
                                               "Failed to fstat core file %s: %m",
                                               coredump_tmpfile_name(tmp_compressed));

                /* Only the compressed file is handed out by name; 'tmp' is not taken, hence the
                 * uncompressed temporary file (if still around) is unlinked when we return. Note that fd
                 * is -EBADF here if it was closed in the truncated tmpfs case above. */
                *ret_filename = TAKE_PTR(fn_compressed);       /* compressed */
                *ret_node_fd = TAKE_FD(fd_compressed);         /* compressed */
                *ret_data_fd = TAKE_FD(fd);
                *ret_size = uncompressed_size;
                *ret_compressed_size = (uint64_t) st.st_size;  /* compressed */
                *ret_truncated = truncated;

                return 0;
        }
#endif

        if (truncated)
                log_struct(LOG_INFO,
                           LOG_MESSAGE("Core file was truncated to %"PRIu64" bytes.", max_size),
                           LOG_ITEM("SIZE_LIMIT=%"PRIu64, max_size),
                           LOG_MESSAGE_ID(SD_MESSAGE_TRUNCATED_CORE_STR));

        r = fix_permissions_and_link(fd, tmp, fn, context, allow_user);
        if (r < 0)
                return log_error_errno(r, "Failed to fix permissions and finalize coredump %s into %s: %m", coredump_tmpfile_name(tmp), fn);

        if (fstat(fd, &st) < 0)
                return log_error_errno(errno, "Failed to fstat core file %s: %m", coredump_tmpfile_name(tmp));

        /* Rewind, so that the caller can read the data from the start */
        if (lseek(fd, 0, SEEK_SET) < 0)
                return log_error_errno(errno, "Failed to seek on coredump %s: %m", fn);

        *ret_filename = TAKE_PTR(fn);
        *ret_node_fd = -EBADF;
        *ret_data_fd = TAKE_FD(fd);
        *ret_size = (uint64_t) st.st_size;
        *ret_compressed_size = UINT64_MAX;
        *ret_truncated = truncated;

        return 0;
}
660
661static int allocate_journal_field(int fd, size_t size, char **ret, size_t *ret_size) {
662 _cleanup_free_ char *field = NULL;
663 ssize_t n;
664
8d4e028f 665 assert(fd >= 0);
34c10968
LP
666 assert(ret);
667 assert(ret_size);
668
86cbbc6d 669 if (lseek(fd, 0, SEEK_SET) < 0)
4a62c710 670 return log_warning_errno(errno, "Failed to seek: %m");
803a3464 671
34c10968 672 field = malloc(9 + size);
a73c74db
LP
673 if (!field)
674 return log_warning_errno(SYNTHETIC_ERRNO(ENOMEM),
675 "Failed to allocate memory for coredump, coredump will not be stored.");
34c10968
LP
676
677 memcpy(field, "COREDUMP=", 9);
678
a73c74db
LP
679 /* NB: simple read() would fail for overly large coredumps, since read() on Linux can only deal with
680 * 0x7ffff000 bytes max. Hence call things in a loop. */
681 n = loop_read(fd, field + 9, size, /* do_poll= */ false);
23bbb0de
MS
682 if (n < 0)
683 return log_error_errno((int) n, "Failed to read core data: %m");
baaa35ad 684 if ((size_t) n < size)
4e494e6a 685 return log_error_errno(SYNTHETIC_ERRNO(EIO), "Core data too short.");
34c10968 686
1cc6c93a 687 *ret = TAKE_PTR(field);
34c10968
LP
688 *ret_size = size + 9;
689
34c10968
LP
690 return 0;
691}
803a3464 692
3f132692
JF
/* Joins /proc/[pid]/fd/ and /proc/[pid]/fdinfo/ into the following lines:
 * 0:/dev/pts/23
 * pos:    0
 * flags:  0100002
 *
 * 1:/dev/pts/23
 * pos:    0
 * flags:  0100002
 *
 * 2:/dev/pts/23
 * pos:    0
 * flags:  0100002
 * EOF
 *
 * On success the composed text is returned in *ret and the result of memstream_finalize() is returned.
 * On failure a negative error is returned. A file descriptor whose fdinfo entry cannot be opened is
 * still listed, just without the fdinfo lines.
 */
static int compose_open_fds(pid_t pid, char **ret) {
        _cleanup_(memstream_done) MemStream m = {};
        _cleanup_closedir_ DIR *proc_fd_dir = NULL;
        _cleanup_close_ int proc_fdinfo_fd = -EBADF;
        const char *fddelim = "", *path;
        FILE *stream;
        int r;

        assert(pid >= 0);
        assert(ret);

        path = procfs_file_alloca(pid, "fd");
        proc_fd_dir = opendir(path);
        if (!proc_fd_dir)
                return -errno;

        /* Open the sibling fdinfo/ directory relative to the fd/ directory we just opened */
        proc_fdinfo_fd = openat(dirfd(proc_fd_dir), "../fdinfo", O_DIRECTORY|O_NOFOLLOW|O_CLOEXEC|O_PATH);
        if (proc_fdinfo_fd < 0)
                return -errno;

        stream = memstream_init(&m);
        if (!stream)
                return -ENOMEM;

        FOREACH_DIRENT(de, proc_fd_dir, return -errno) {
                _cleanup_fclose_ FILE *fdinfo = NULL;
                _cleanup_free_ char *fdname = NULL;
                _cleanup_close_ int fd = -EBADF;

                /* The entries in fd/ are symlinks, read what they point to */
                r = readlinkat_malloc(dirfd(proc_fd_dir), de->d_name, &fdname);
                if (r < 0)
                        return r;

                /* The delimiter is empty for the first entry, and an empty line for all later ones */
                fprintf(stream, "%s%s:%s\n", fddelim, de->d_name, fdname);
                fddelim = "\n";

                /* Use the directory entry from /proc/[pid]/fd with /proc/[pid]/fdinfo */
                fd = openat(proc_fdinfo_fd, de->d_name, O_NOFOLLOW|O_CLOEXEC|O_RDONLY);
                if (fd < 0)
                        continue;

                fdinfo = take_fdopen(&fd, "r");
                if (!fdinfo)
                        continue;

                /* Copy the fdinfo file line by line, until read_line() returns 0 */
                for (;;) {
                        _cleanup_free_ char *line = NULL;

                        r = read_line(fdinfo, LONG_LINE_MAX, &line);
                        if (r < 0)
                                return r;
                        if (r == 0)
                                break;

                        fputs(line, stream);
                        fputc('\n', stream);
                }
        }

        return memstream_finalize(&m, ret, NULL);
}
768
7ed03ce6
JF
769/* Returns 1 if the parent was found.
770 * Returns 0 if there is not a process we can call the pid's
771 * container parent (the pid's process isn't 'containerized').
772 * Returns a negative number on errors.
773 */
0b8b1332 774static int get_process_container_parent_cmdline(PidRef *pid, char** ret_cmdline) {
83844031 775 int r;
7ed03ce6 776
0b8b1332
LP
777 assert(pidref_is_set(pid));
778 assert(!pidref_is_remote(pid));
779
d6267b9b
LP
780 r = pidref_from_same_root_fs(pid, &PIDREF_MAKE_FROM_PID(1));
781 if (r < 0)
782 return r;
783 if (r > 0) {
784 /* The process uses system root. */
0b8b1332 785 *ret_cmdline = NULL;
7ed03ce6
JF
786 return 0;
787 }
788
0b8b1332 789 _cleanup_(pidref_done) PidRef container_pid = PIDREF_NULL;
ade39d9a 790 r = namespace_get_leader(pid, NAMESPACE_MOUNT, &container_pid);
7ed03ce6
JF
791 if (r < 0)
792 return r;
793
0b8b1332 794 r = pidref_get_cmdline(&container_pid, SIZE_MAX, PROCESS_CMDLINE_QUOTE_POSIX, ret_cmdline);
d3cba4ea
EV
795 if (r < 0)
796 return r;
797
798 return 1;
7ed03ce6
JF
799}
800
f46c706b 801static int change_uid_gid(const Context *context) {
ea8eb370
LP
802 int r;
803
804 assert(context);
805
9764bca9
NR
806 uid_t uid = context->uid;
807 gid_t gid = context->gid;
34c10968 808
28add648 809 if (uid_is_system(uid)) {
888e378d
LP
810 const char *user = "systemd-coredump";
811
fafff8f1 812 r = get_user_creds(&user, &uid, &gid, NULL, NULL, 0);
888e378d
LP
813 if (r < 0) {
814 log_warning_errno(r, "Cannot resolve %s user. Proceeding to dump core as root: %m", user);
815 uid = gid = 0;
816 }
888e378d 817 }
3c171f0b
LP
818
819 return drop_privileges(uid, gid, 0);
820}
8c8549db 821
0aea6872 822static int attach_mount_tree(int mount_tree_fd) {
68511ceb
MS
823 int r;
824
825 assert(mount_tree_fd >= 0);
68511ceb 826
0aea6872 827 r = detach_mount_namespace();
68511ceb 828 if (r < 0)
0aea6872 829 return log_warning_errno(r, "Failed to detach mount namespace: %m");
68511ceb 830
0aea6872 831 r = mkdir_p_label(MOUNT_TREE_ROOT, 0555);
68511ceb 832 if (r < 0)
0aea6872 833 return log_warning_errno(r, "Failed to create directory: %m");
68511ceb 834
0aea6872 835 r = mount_setattr(mount_tree_fd, "", AT_EMPTY_PATH,
8f8148cb
MS
836 &(struct mount_attr) {
837 /* MOUNT_ATTR_NOSYMFOLLOW is left out on purpose to allow libdwfl to resolve symlinks.
838 * libdwfl will use openat2() with RESOLVE_IN_ROOT so there is no risk of symlink escape.
839 * https://sourceware.org/git/?p=elfutils.git;a=patch;h=06f0520f9a78b07c11c343181d552791dd630346 */
840 .attr_set = MOUNT_ATTR_RDONLY|MOUNT_ATTR_NOSUID|MOUNT_ATTR_NODEV|MOUNT_ATTR_NOEXEC,
841 .propagation = MS_SLAVE,
842 }, sizeof(struct mount_attr));
68511ceb 843 if (r < 0)
13cd1db0 844 return log_warning_errno(errno, "Failed to change properties of mount tree: %m");
68511ceb 845
0aea6872 846 r = move_mount(mount_tree_fd, "", -EBADF, MOUNT_TREE_ROOT, MOVE_MOUNT_F_EMPTY_PATH);
68511ceb 847 if (r < 0)
0aea6872 848 return log_warning_errno(errno, "Failed to attach mount tree: %m");
68511ceb 849
68511ceb
MS
850 return 0;
851}
852
3c171f0b 853static int submit_coredump(
3e4d0f6c 854 const Context *context,
9a435388 855 struct iovec_wrapper *iovw,
313537da 856 int input_fd) {
34c10968 857
309a747f 858 _cleanup_(sd_json_variant_unrefp) sd_json_variant *json_metadata = NULL;
254d1313 859 _cleanup_close_ int coredump_fd = -EBADF, coredump_node_fd = -EBADF;
d8a567df 860 _cleanup_free_ char *filename = NULL, *coredump_data = NULL, *stacktrace = NULL;
0aea6872 861 const char *module_name, *root = NULL;
587f2a5e 862 uint64_t coredump_size = UINT64_MAX, coredump_compressed_size = UINT64_MAX;
6fea39ba 863 bool truncated = false, written = false;
309a747f 864 sd_json_variant *module_json;
3c171f0b 865 int r;
83844031 866
3c171f0b 867 assert(context);
9a435388 868 assert(iovw);
3c171f0b 869 assert(input_fd >= 0);
f5e04665 870
3c171f0b
LP
871 /* Vacuum before we write anything again */
872 (void) coredump_vacuum(-1, arg_keep_free, arg_max_use);
803a3464 873
3c171f0b 874 /* Always stream the coredump to disk, if that's possible */
c8e94763
LP
875 written = save_external_coredump(
876 context, input_fd,
877 &filename, &coredump_node_fd, &coredump_fd,
878 &coredump_size, &coredump_compressed_size, &truncated) >= 0;
879 if (written) {
880 /* If we could write it to disk we can now process it. */
881 /* If we don't want to keep the coredump on disk, remove it now, as later on we
882 * will lack the privileges for it. However, we keep the fd to it, so that we can
883 * still process it and log it. */
5125a0b8
LP
884 r = maybe_remove_external_coredump(
885 context,
886 filename,
887 coredump_node_fd >= 0 ? coredump_compressed_size : coredump_size);
c8e94763
LP
888 if (r < 0)
889 return r;
890 if (r == 0)
891 (void) iovw_put_string_field(iovw, "COREDUMP_FILENAME=", filename);
892 else if (arg_storage == COREDUMP_STORAGE_EXTERNAL)
893 log_info("The core will not be stored: size %"PRIu64" is greater than %"PRIu64" (the configured maximum)",
894 coredump_node_fd >= 0 ? coredump_compressed_size : coredump_size, arg_external_size_max);
895
896 /* Vacuum again, but exclude the coredump we just created */
897 (void) coredump_vacuum(coredump_node_fd >= 0 ? coredump_node_fd : coredump_fd, arg_keep_free, arg_max_use);
898 }
6fea39ba 899
313537da 900 if (context->mount_tree_fd >= 0 && attach_mount_tree(context->mount_tree_fd) >= 0)
0aea6872 901 root = MOUNT_TREE_ROOT;
68511ceb 902
c8e94763
LP
903 /* Now, let's drop privileges to become the user who owns the segfaulted process and allocate the
904 * coredump memory under the user's uid. This also ensures that the credentials journald will see are
905 * the ones of the coredumping user, thus making sure the user gets access to the core dump. Let's
906 * also get rid of all capabilities, if we run as root, we won't need them anymore. */
3c171f0b
LP
907 r = change_uid_gid(context);
908 if (r < 0)
909 return log_error_errno(r, "Failed to drop privileges: %m");
7bfce976 910
c8e94763
LP
911 if (written) {
912 /* Try to get a stack trace if we can */
913 if (coredump_size > arg_process_size_max)
914 log_debug("Not generating stack trace: core size %"PRIu64" is greater "
915 "than %"PRIu64" (the configured maximum)",
916 coredump_size, arg_process_size_max);
917 else if (coredump_fd >= 0) {
918 bool skip = startswith(context->meta[META_COMM], "systemd-coredum"); /* COMM is 16 bytes usually */
919
920 (void) parse_elf_object(coredump_fd,
921 context->meta[META_EXE],
68511ceb 922 root,
c8e94763
LP
923 /* fork_disable_dump= */ skip, /* avoid loops */
924 &stacktrace,
925 &json_metadata);
926 }
c790632c 927 }
51d3783d 928
6fea39ba 929 _cleanup_free_ char *core_message = NULL;
6fea39ba
LP
930 core_message = strjoin(
931 "Process ", context->meta[META_ARGV_PID],
932 " (", context->meta[META_COMM],
933 ") of user ", context->meta[META_ARGV_UID],
934 written ? " dumped core." : " terminated abnormally without generating a coredump.");
935 if (!core_message)
936 return log_oom();
937
938 if (context->is_journald && filename)
939 if (!strextend(&core_message, "\nCoredump diverted to ", filename))
940 return log_oom();
51d3783d 941
6fea39ba
LP
942 if (stacktrace)
943 if (!strextend(&core_message, "\n\n", stacktrace))
944 return log_oom();
92e92d71 945
5edf875b
DDM
946 if (context->is_journald)
947 /* We might not be able to log to the journal, so let's always print the message to another
948 * log target. The target was set previously to something safe. */
9a435388 949 log_dispatch(LOG_ERR, 0, core_message);
92e92d71 950
2a3bebd0 951 (void) iovw_put_string_field(iovw, "MESSAGE=", core_message);
3c171f0b 952
0cd4e913 953 if (truncated)
2a3bebd0 954 (void) iovw_put_string_field(iovw, "COREDUMP_TRUNCATED=", "1");
0cd4e913 955
c546154a
LB
956 /* If we managed to parse any ELF metadata (build-id, ELF package meta),
957 * attach it as journal metadata. */
958 if (json_metadata) {
959 _cleanup_free_ char *formatted_json = NULL;
960
309a747f 961 r = sd_json_variant_format(json_metadata, 0, &formatted_json);
c546154a
LB
962 if (r < 0)
963 return log_error_errno(r, "Failed to format JSON package metadata: %m");
964
671769c9 965 (void) iovw_put_string_field(iovw, "COREDUMP_PACKAGE_JSON=", formatted_json);
c546154a
LB
966 }
967
c790632c
ZJS
968 /* In the unlikely scenario that context->meta[META_EXE] is not available,
969 * let's avoid guessing the module name and skip the loop. */
970 if (context->meta[META_EXE])
971 JSON_VARIANT_OBJECT_FOREACH(module_name, module_json, json_metadata) {
309a747f 972 sd_json_variant *t;
c546154a 973
c790632c
ZJS
974 /* We only add structured fields for the 'main' ELF module, and only if we can identify it. */
975 if (!path_equal_filename(module_name, context->meta[META_EXE]))
976 continue;
c546154a 977
309a747f 978 t = sd_json_variant_by_key(module_json, "name");
c790632c 979 if (t)
309a747f 980 (void) iovw_put_string_field(iovw, "COREDUMP_PACKAGE_NAME=", sd_json_variant_string(t));
1f2abb79 981
309a747f 982 t = sd_json_variant_by_key(module_json, "version");
c790632c 983 if (t)
309a747f 984 (void) iovw_put_string_field(iovw, "COREDUMP_PACKAGE_VERSION=", sd_json_variant_string(t));
c790632c 985 }
c546154a 986
3c171f0b 987 /* Optionally store the entire coredump in the journal */
587f2a5e 988 if (arg_storage == COREDUMP_STORAGE_JOURNAL && coredump_fd >= 0) {
6e9ef603
ZJS
989 if (coredump_size <= arg_journal_size_max) {
990 size_t sz = 0;
991
992 /* Store the coredump itself in the journal */
993
994 r = allocate_journal_field(coredump_fd, (size_t) coredump_size, &coredump_data, &sz);
9a435388
FB
995 if (r >= 0) {
996 if (iovw_put(iovw, coredump_data, sz) >= 0)
997 TAKE_PTR(coredump_data);
998 } else
6e9ef603
ZJS
999 log_warning_errno(r, "Failed to attach the core to the journal entry: %m");
1000 } else
5206a724 1001 log_info("The core will not be stored: size %"PRIu64" is greater than %"PRIu64" (the configured maximum)",
6e9ef603 1002 coredump_size, arg_journal_size_max);
f5e04665
LP
1003 }
1004
5edf875b
DDM
1005 /* If journald is coredumping, we have to be careful that we don't deadlock when trying to write the
1006 * coredump to the journal, so we put the journal socket in nonblocking mode before trying to write
1007 * the coredump to the socket. */
1008
1009 if (context->is_journald) {
1010 r = journal_fd_nonblock(true);
1011 if (r < 0)
1012 return log_error_errno(r, "Failed to make journal socket non-blocking: %m");
1013 }
1014
9a435388 1015 r = sd_journal_sendv(iovw->iovec, iovw->count);
5edf875b
DDM
1016
1017 if (context->is_journald) {
1018 int k;
1019
1020 k = journal_fd_nonblock(false);
1021 if (k < 0)
1022 return log_error_errno(k, "Failed to make journal socket blocking: %m");
1023 }
1024
1025 if (r == -EAGAIN && context->is_journald)
1026 log_warning_errno(r, "Failed to log journal coredump, ignoring: %m");
1027 else if (r < 0)
3c171f0b
LP
1028 return log_error_errno(r, "Failed to log coredump: %m");
1029
1030 return 0;
1031}
1032
960b0458 1033static int context_parse_iovw(Context *context, struct iovec_wrapper *iovw) {
f46c706b
FB
1034 const char *unit;
1035 int r;
3c171f0b 1036
3c171f0b 1037 assert(context);
f46c706b 1038 assert(iovw);
3c171f0b 1039
313537da
LP
1040 /* Converts the data in the iovec array iovw into separate fields. Fills in context->meta[] (for
1041 * which no memory is allocated, it just contains direct pointers into the iovec array memory). */
3c171f0b 1042
960b0458
LP
1043 bool have_signal_name = false;
1044 FOREACH_ARRAY(iovec, iovw->iovec, iovw->count) {
fe96c0f8 1045 for (size_t i = 0; i < ELEMENTSOF(meta_field_names); i++) {
c673f1f6 1046 /* Note that these strings are NUL-terminated, because we made sure that a
f46c706b 1047 * trailing NUL byte is in the buffer, though not included in the iov_len
c673f1f6 1048 * count (see process_socket() and gather_pid_metadata_*()). */
f46c706b 1049 assert(((char*) iovec->iov_base)[iovec->iov_len] == 0);
3c171f0b 1050
b1694040 1051 const char *p = memory_startswith(iovec->iov_base, iovec->iov_len, meta_field_names[i]);
f46c706b
FB
1052 if (p) {
1053 context->meta[i] = p;
3e4d0f6c 1054 context->meta_size[i] = iovec->iov_len - strlen(meta_field_names[i]);
f46c706b
FB
1055 break;
1056 }
1057 }
960b0458
LP
1058
1059 have_signal_name = have_signal_name ||
1060 memory_startswith(iovec->iov_base, iovec->iov_len, "COREDUMP_SIGNAL_NAME=");
3c171f0b 1061 }
f46c706b 1062
c673f1f6 1063 /* The basic fields from argv[] should always be there, refuse early if not. */
ded0aac3 1064 for (int i = 0; i < _META_ARGV_REQUIRED; i++)
098c3975 1065 if (!context->meta[i])
c673f1f6
ZJS
1066 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1067 "A required (%s) has not been sent, aborting.", meta_field_names[i]);
f46c706b 1068
313537da
LP
1069 pid_t parsed_pid;
1070 r = parse_pid(context->meta[META_ARGV_PID], &parsed_pid);
f46c706b
FB
1071 if (r < 0)
1072 return log_error_errno(r, "Failed to parse PID \"%s\": %m", context->meta[META_ARGV_PID]);
313537da
LP
1073 if (pidref_is_set(&context->pidref)) {
1074 if (context->pidref.pid != parsed_pid)
c673f1f6
ZJS
1075 return log_error_errno(r, "Passed PID " PID_FMT " does not match passed " PID_FMT ": %m",
1076 parsed_pid, context->pidref.pid);
313537da
LP
1077 } else {
1078 r = pidref_set_pid(&context->pidref, parsed_pid);
1079 if (r < 0)
1080 return log_error_errno(r, "Failed to initialize pidref from pid " PID_FMT ": %m", parsed_pid);
1081 }
f46c706b 1082
9764bca9
NR
1083 r = parse_uid(context->meta[META_ARGV_UID], &context->uid);
1084 if (r < 0)
1085 return log_error_errno(r, "Failed to parse UID \"%s\": %m", context->meta[META_ARGV_UID]);
1086
1087 r = parse_gid(context->meta[META_ARGV_GID], &context->gid);
1088 if (r < 0)
1089 return log_error_errno(r, "Failed to parse GID \"%s\": %m", context->meta[META_ARGV_GID]);
1090
960b0458
LP
1091 r = parse_signo(context->meta[META_ARGV_SIGNAL], &context->signo);
1092 if (r < 0)
1093 log_warning_errno(r, "Failed to parse signal number \"%s\", ignoring: %m", context->meta[META_ARGV_SIGNAL]);
1094
19455dd6
LP
1095 r = safe_atou64(context->meta[META_ARGV_RLIMIT], &context->rlimit);
1096 if (r < 0)
1097 log_warning_errno(r, "Failed to parse resource limit \"%s\", ignoring: %m", context->meta[META_ARGV_RLIMIT]);
1098
76e0ab49 1099 /* The value is set to contents of /proc/sys/fs/suid_dumpable, which we set to SUID_DUMP_SAFE (2),
0c49e004
ZJS
1100 * if the process is marked as not dumpable, see PR_SET_DUMPABLE(2const). */
1101 if (context->meta[META_ARGV_DUMPABLE]) {
1102 r = safe_atou(context->meta[META_ARGV_DUMPABLE], &context->dumpable);
1103 if (r < 0)
1104 return log_error_errno(r, "Failed to parse dumpable field \"%s\": %m", context->meta[META_ARGV_DUMPABLE]);
76e0ab49 1105 if (context->dumpable > SUID_DUMP_SAFE)
0c49e004
ZJS
1106 log_notice("Got unexpected %%d/dumpable value %u.", context->dumpable);
1107 }
1108
f46c706b
FB
1109 unit = context->meta[META_UNIT];
1110 context->is_pid1 = streq(context->meta[META_ARGV_PID], "1") || streq_ptr(unit, SPECIAL_INIT_SCOPE);
1111 context->is_journald = streq_ptr(unit, SPECIAL_JOURNALD_SERVICE);
1112
960b0458
LP
1113 /* After parsing everything, let's also synthesize a new iovw field for the textual signal name if it
1114 * isn't already set. */
1115 if (SIGNAL_VALID(context->signo) && !have_signal_name)
1116 (void) iovw_put_string_field(iovw, "COREDUMP_SIGNAL_NAME=SIG", signal_to_string(context->signo));
1117
f46c706b 1118 return 0;
3c171f0b
LP
1119}
1120
1121static int process_socket(int fd) {
2865561e 1122 _cleanup_(iovw_done_free) struct iovec_wrapper iovw = {};
313537da
LP
1123 _cleanup_(context_done) Context context = CONTEXT_NULL;
1124 _cleanup_close_ int input_fd = -EBADF;
313537da
LP
1125 enum {
1126 STATE_PAYLOAD,
1127 STATE_INPUT_FD_DONE,
1128 STATE_PID_FD_DONE,
1129 } state = STATE_PAYLOAD;
fe96c0f8 1130 int r;
3c171f0b
LP
1131
1132 assert(fd >= 0);
1133
d2acb93d 1134 log_setup();
3c171f0b 1135
ecfb4bb0 1136 log_debug("Processing coredump received via socket...");
988e89ee 1137
3c171f0b 1138 for (;;) {
fb29cdbe 1139 CMSG_BUFFER_TYPE(CMSG_SPACE(sizeof(int))) control;
3c171f0b
LP
1140 struct msghdr mh = {
1141 .msg_control = &control,
1142 .msg_controllen = sizeof(control),
1143 .msg_iovlen = 1,
1144 };
369b1237 1145 ssize_t n, l;
3c171f0b 1146
fe1ef0f8 1147 l = next_datagram_size_fd(fd);
2865561e
LP
1148 if (l < 0)
1149 return log_error_errno(l, "Failed to determine datagram size to read: %m");
3c171f0b 1150
369b1237
LP
1151 _cleanup_(iovec_done) struct iovec iovec = {
1152 .iov_len = l,
1153 .iov_base = malloc(l + 1),
1154 };
2865561e
LP
1155 if (!iovec.iov_base)
1156 return log_oom();
3c171f0b 1157
9a435388 1158 mh.msg_iov = &iovec;
3c171f0b 1159
3691bcf3 1160 n = recvmsg_safe(fd, &mh, MSG_CMSG_CLOEXEC);
2865561e
LP
1161 if (n < 0)
1162 return log_error_errno(n, "Failed to receive datagram: %m");
3c171f0b 1163
313537da
LP
1164 /* The final zero-length datagrams ("sentinels") carry file descriptors and tell us that
1165 * we're done. There are three sentinels: one with just the coredump fd, followed by one with
1166 * the pidfd, and finally one with the mount tree fd. The latter two or the last one may be
1167 * omitted (which is supported for compatibility with older systemd version, in particular to
1168 * facilitate cross-container coredumping). */
3c171f0b 1169 if (n == 0) {
dac556fa 1170 struct cmsghdr *found;
3c171f0b 1171
313537da
LP
1172 found = cmsg_find(&mh, SOL_SOCKET, SCM_RIGHTS, CMSG_LEN(sizeof(int)));
1173 if (!found) {
1174 /* This is zero length message but it either doesn't carry a single
1175 * descriptor, or it has more than one. This is a protocol violation so let's
1176 * bail out.
1177 *
1178 * Well, not quite! In practice there's one more complication: EOF on
1179 * SOCK_SEQPACKET is not distinguishable from a zero length datagram. Hence
1180 * if we get a zero length datagram without fds we consider it EOF, and
1181 * that's permissible for the final two fds. Hence let's be strict on the
1182 * first fd, but lenient on the other two. */
1183
c673f1f6
ZJS
1184 if (!cmsg_find(&mh, SOL_SOCKET, SCM_RIGHTS, (socklen_t) -1) && state != STATE_PAYLOAD)
1185 /* No fds, and already got the first fd → we are done. */
313537da 1186 break;
a65ad191 1187
a65ad191 1188 cmsg_close_all(&mh);
2865561e
LP
1189 return log_error_errno(SYNTHETIC_ERRNO(EBADMSG),
1190 "Received zero length message with zero or more than one file descriptor(s), expected one.");
a65ad191
MS
1191 }
1192
313537da 1193 switch (state) {
68511ceb 1194
313537da
LP
1195 case STATE_PAYLOAD:
1196 assert(input_fd < 0);
1197 input_fd = *CMSG_TYPED_DATA(found, int);
1198 state = STATE_INPUT_FD_DONE;
1199 continue;
68511ceb 1200
313537da
LP
1201 case STATE_INPUT_FD_DONE:
1202 assert(!pidref_is_set(&context.pidref));
68511ceb 1203
313537da 1204 r = pidref_set_pidfd_consume(&context.pidref, *CMSG_TYPED_DATA(found, int));
2865561e
LP
1205 if (r < 0)
1206 return log_error_errno(r, "Failed to initialize pidref: %m");
68511ceb 1207
313537da
LP
1208 state = STATE_PID_FD_DONE;
1209 continue;
68511ceb 1210
313537da
LP
1211 case STATE_PID_FD_DONE:
1212 assert(context.mount_tree_fd < 0);
1213 context.mount_tree_fd = *CMSG_TYPED_DATA(found, int);
1214 /* We have all FDs we need so we are done. */
68511ceb 1215 break;
68511ceb
MS
1216 }
1217
313537da
LP
1218 break;
1219 }
a65ad191 1220
313537da 1221 cmsg_close_all(&mh);
a65ad191 1222
313537da 1223 /* Only zero length messages are allowed after the first message that carried a file descriptor. */
2865561e
LP
1224 if (state != STATE_PAYLOAD)
1225 return log_error_errno(SYNTHETIC_ERRNO(EBADMSG), "Received unexpected message with non-zero length.");
d9fd1d37 1226
313537da 1227 /* Payload messages should not carry fds */
2865561e
LP
1228 if (cmsg_find(&mh, SOL_SOCKET, SCM_RIGHTS, (socklen_t) -1))
1229 return log_error_errno(SYNTHETIC_ERRNO(EBADMSG),
313537da 1230 "Received payload message with file descriptor(s), expected none.");
3c171f0b
LP
1231
1232 /* Add trailing NUL byte, in case these are strings */
9a435388
FB
1233 ((char*) iovec.iov_base)[n] = 0;
1234 iovec.iov_len = (size_t) n;
3c171f0b 1235
2865561e
LP
1236 if (iovw_put(&iovw, iovec.iov_base, iovec.iov_len) < 0)
1237 return log_oom();
369b1237
LP
1238
1239 TAKE_STRUCT(iovec);
34c10968
LP
1240 }
1241
61233823 1242 /* Make sure we got all data we really need */
f8540bde 1243 assert(input_fd >= 0);
3c171f0b 1244
32756e57 1245 r = context_parse_iovw(&context, &iovw);
f46c706b 1246 if (r < 0)
2865561e 1247 return r;
f46c706b 1248
49f1f2d4
ZJS
1249 /* Make sure we received all the expected fields. We support being called by an *older*
1250 * systemd-coredump from the outside, so we require only the basic set of fields that
1251 * was being sent when the support for sending to containers over a socket was added
1252 * in a108c43e36d3ceb6e34efe37c014fc2cda856000. */
1253 meta_argv_t i;
1254 FOREACH_ARGUMENT(i,
1255 META_ARGV_PID,
1256 META_ARGV_UID,
1257 META_ARGV_GID,
1258 META_ARGV_SIGNAL,
1259 META_ARGV_TIMESTAMP,
1260 META_ARGV_RLIMIT,
1261 META_ARGV_HOSTNAME,
1262 META_COMM)
2865561e 1263 if (!context.meta[i])
49f1f2d4
ZJS
1264 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1265 "Mandatory argument %s not received on socket, aborting.",
1266 meta_field_names[i]);
3c171f0b 1267
2865561e 1268 return submit_coredump(&context, &iovw, input_fd);
3c171f0b
LP
1269}
1270
313537da 1271static int send_iovec(const struct iovec_wrapper *iovw, int input_fd, PidRef *pidref, int mount_tree_fd) {
254d1313 1272 _cleanup_close_ int fd = -EBADF;
3c171f0b
LP
1273 int r;
1274
9a435388 1275 assert(iovw);
3c171f0b
LP
1276 assert(input_fd >= 0);
1277
1278 fd = socket(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0);
1279 if (fd < 0)
1280 return log_error_errno(errno, "Failed to create coredump socket: %m");
1281
1861986a
LP
1282 r = connect_unix_path(fd, AT_FDCWD, "/run/systemd/coredump");
1283 if (r < 0)
1284 return log_error_errno(r, "Failed to connect to coredump service: %m");
3c171f0b 1285
fe96c0f8 1286 for (size_t i = 0; i < iovw->count; i++) {
fec603eb 1287 struct msghdr mh = {
9a435388 1288 .msg_iov = iovw->iovec + i,
fec603eb
LP
1289 .msg_iovlen = 1,
1290 };
1291 struct iovec copy[2];
1292
1293 for (;;) {
1294 if (sendmsg(fd, &mh, MSG_NOSIGNAL) >= 0)
1295 break;
1296
1297 if (errno == EMSGSIZE && mh.msg_iov[0].iov_len > 0) {
f46c706b
FB
1298 /* This field didn't fit? That's a pity. Given that this is
1299 * just metadata, let's truncate the field at half, and try
1300 * again. We append three dots, in order to show that this is
1301 * truncated. */
fec603eb
LP
1302
1303 if (mh.msg_iov != copy) {
f46c706b
FB
1304 /* We don't want to modify the caller's iovec, hence
1305 * let's create our own array, consisting of two new
1306 * iovecs, where the first is a (truncated) copy of
1307 * what we want to send, and the second one contains
1308 * the trailing dots. */
9a435388 1309 copy[0] = iovw->iovec[i];
ea8eb370 1310 copy[1] = IOVEC_MAKE(((const char[]){'.', '.', '.'}), 3);
fec603eb
LP
1311
1312 mh.msg_iov = copy;
1313 mh.msg_iovlen = 2;
1314 }
1315
1316 copy[0].iov_len /= 2; /* halve it, and try again */
1317 continue;
1318 }
3c171f0b 1319
3c171f0b 1320 return log_error_errno(errno, "Failed to send coredump datagram: %m");
fec603eb 1321 }
1eef15b1
ZJS
1322 }
1323
313537da 1324 /* First sentinel: the coredump fd */
3c171f0b
LP
1325 r = send_one_fd(fd, input_fd, 0);
1326 if (r < 0)
1327 return log_error_errno(r, "Failed to send coredump fd: %m");
1eef15b1 1328
313537da
LP
1329 /* The optional second sentinel: the pidfd */
1330 if (!pidref_is_set(pidref) || pidref->fd < 0) /* If we have no pidfd, stop now */
1331 return 0;
1332
1333 r = send_one_fd(fd, pidref->fd, 0);
1334 if (r < 0)
1335 return log_error_errno(r, "Failed to send pidfd: %m");
1336
1337 /* The optional third sentinel: the mount tree fd */
1338 if (mount_tree_fd < 0) /* If we have no mount tree, stop now */
1339 return 0;
1340
1341 r = send_one_fd(fd, mount_tree_fd, 0);
1342 if (r < 0)
1343 return log_error_errno(r, "Failed to send mount tree fd: %m");
68511ceb 1344
3c171f0b
LP
1345 return 0;
1346}
1eef15b1 1347
64a5384f
LP
1348static int gather_pid_metadata_from_argv(
1349 struct iovec_wrapper *iovw,
1350 Context *context,
1351 int argc, char **argv) {
1352
868d9557
LB
1353 _cleanup_(pidref_done) PidRef local_pidref = PIDREF_NULL;
1354 int r, kernel_fd = -EBADF;
3c171f0b 1355
e6aa443f
LP
1356 assert(iovw);
1357 assert(context);
1358
f46c706b 1359 /* We gather all metadata that were passed via argv[] into an array of iovecs that
ded0aac3
ZJS
1360 * we'll forward to the socket unit.
1361 *
1362 * We require at least _META_ARGV_REQUIRED args, but will accept more.
1363 * We know how to parse _META_ARGV_MAX args. The rest will be ignored. */
3c171f0b 1364
ded0aac3 1365 if (argc < _META_ARGV_REQUIRED)
f46c706b 1366 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
ded0aac3
ZJS
1367 "Not enough arguments passed by the kernel (%i, expected between %i and %i).",
1368 argc, _META_ARGV_REQUIRED, _META_ARGV_MAX);
3c171f0b 1369
ded0aac3 1370 for (int i = 0; i < MIN(argc, _META_ARGV_MAX); i++) {
ea8eb370
LP
1371 _cleanup_free_ char *buf = NULL;
1372 const char *t = argv[i];
3c171f0b 1373
960b0458 1374 if (i == META_ARGV_TIMESTAMP) {
f46c706b
FB
1375 /* The journal fields contain the timestamp padded with six
1376 * zeroes, so that the kernel-supplied 1s granularity timestamps
e503019b 1377 * becomes 1μs granularity, i.e. the granularity systemd usually
f46c706b 1378 * operates in. */
ea8eb370
LP
1379 buf = strjoin(argv[i], "000000");
1380 if (!buf)
f46c706b 1381 return log_oom();
ea8eb370
LP
1382
1383 t = buf;
c8091d92
LP
1384 }
1385
868d9557
LB
1386 if (i == META_ARGV_PID) {
1387 /* Store this so that we can check whether the core will be forwarded to a container
1388 * even when the kernel doesn't provide a pidfd. Can be dropped once baseline is
1389 * >= v6.16. */
1390 r = pidref_set_pidstr(&local_pidref, t);
1391 if (r < 0)
1392 return log_error_errno(r, "Failed to initialize pidref from pid %s: %m", t);
1393 }
1394
1395 if (i == META_ARGV_PIDFD) {
1396 /* If the current kernel doesn't support the %F specifier (which resolves to a
1397 * pidfd), but we included it in the core_pattern expression, we'll receive an empty
1398 * string here. Deal with that gracefully. */
1399 if (isempty(t))
1400 continue;
1401
1402 assert(!pidref_is_set(&context->pidref));
1403 assert(kernel_fd < 0);
1404
1405 kernel_fd = parse_fd(t);
1406 if (kernel_fd < 0)
1407 return log_error_errno(kernel_fd, "Failed to parse pidfd \"%s\": %m", t);
1408
1409 r = pidref_set_pidfd(&context->pidref, kernel_fd);
1410 if (r < 0)
1411 return log_error_errno(r, "Failed to initialize pidref from pidfd %d: %m", kernel_fd);
1412
e6a8687b
ZJS
1413 context->got_pidfd = 1;
1414
868d9557
LB
1415 /* If there are containers involved with different versions of the code they might
1416 * not be using pidfds, so it would be wrong to set the metadata, skip it. */
1417 r = pidref_in_same_namespace(/* pid1 = */ NULL, &context->pidref, NAMESPACE_PID);
1418 if (r < 0)
1419 log_debug_errno(r, "Failed to check pidns of crashing process, ignoring: %m");
1420 if (r <= 0)
1421 continue;
1422
1423 /* We don't print the fd number in the journal as it's meaningless, but we still
1424 * record that the parsing was done with a kernel-provided fd as it means it's safe
1425 * from races, which is valuable information to provide in the journal record. */
1426 t = "1";
1427 }
1428
f46c706b
FB
1429 r = iovw_put_string_field(iovw, meta_field_names[i], t);
1430 if (r < 0)
1431 return r;
8c8549db 1432 }
803a3464 1433
f46c706b 1434 /* Cache some of the process metadata we collected so far and that we'll need to
c673f1f6 1435 * access soon. */
868d9557
LB
1436 r = context_parse_iovw(context, iovw);
1437 if (r < 0)
1438 return r;
1439
1440 /* If the kernel didn't give us a PIDFD, then use the one derived from the
1441 * PID immediately, given we have it. */
1442 if (!pidref_is_set(&context->pidref))
1443 context->pidref = TAKE_PIDREF(local_pidref);
1444
1445 /* Close the kernel-provided FD as the last thing after everything else succeeded. */
1446 kernel_fd = safe_close(kernel_fd);
1447
1448 return 0;
f46c706b 1449}
3c171f0b 1450
db9ac801 1451static int gather_pid_metadata_from_procfs(struct iovec_wrapper *iovw, Context *context) {
f46c706b
FB
1452 uid_t owner_uid;
1453 pid_t pid;
1454 char *t;
3e4d0f6c 1455 size_t size;
f46c706b
FB
1456 const char *p;
1457 int r;
f5e04665 1458
e6aa443f
LP
1459 assert(iovw);
1460 assert(context);
1461
f46c706b
FB
1462 /* Note that if we fail on oom later on, we do not roll-back changes to the iovec
1463 * structure. (It remains valid, with the first iovec fields initialized.) */
f5e04665 1464
313537da 1465 pid = context->pidref.pid;
f5e04665 1466
f46c706b 1467 /* The following is mandatory */
1f485bc7 1468 r = pidref_get_comm(&context->pidref, &t);
9a435388 1469 if (r < 0)
f46c706b 1470 return log_error_errno(r, "Failed to get COMM: %m");
f5e04665 1471
f46c706b 1472 r = iovw_put_string_field_free(iovw, "COREDUMP_COMM=", t);
9a435388
FB
1473 if (r < 0)
1474 return r;
f45b8015 1475
c790632c 1476 /* The following are optional, but we use them if present. */
2a3bebd0
FB
1477 r = get_process_exe(pid, &t);
1478 if (r >= 0)
1479 r = iovw_put_string_field_free(iovw, "COREDUMP_EXE=", t);
1480 if (r < 0)
f46c706b 1481 log_warning_errno(r, "Failed to get EXE, ignoring: %m");
bdfd7b2c 1482
1f485bc7 1483 if (cg_pidref_get_unit(&context->pidref, &t) >= 0)
2a3bebd0 1484 (void) iovw_put_string_field_free(iovw, "COREDUMP_UNIT=", t);
f5e04665 1485
f46c706b 1486 if (cg_pid_get_user_unit(pid, &t) >= 0)
2a3bebd0 1487 (void) iovw_put_string_field_free(iovw, "COREDUMP_USER_UNIT=", t);
f46c706b 1488
8703a508 1489 if (cg_pidref_get_session(&context->pidref, &t) >= 0)
9a435388 1490 (void) iovw_put_string_field_free(iovw, "COREDUMP_SESSION=", t);
f5e04665 1491
8703a508 1492 if (cg_pidref_get_owner_uid(&context->pidref, &owner_uid) >= 0) {
9a435388 1493 r = asprintf(&t, UID_FMT, owner_uid);
7de80bfe 1494 if (r > 0)
9a435388 1495 (void) iovw_put_string_field_free(iovw, "COREDUMP_OWNER_UID=", t);
f5e04665
LP
1496 }
1497
9aa82023 1498 if (sd_pid_get_slice(pid, &t) >= 0)
2a3bebd0 1499 (void) iovw_put_string_field_free(iovw, "COREDUMP_SLICE=", t);
f5e04665 1500
1f485bc7 1501 if (pidref_get_cmdline(&context->pidref, SIZE_MAX, PROCESS_CMDLINE_QUOTE_POSIX, &t) >= 0)
2a3bebd0 1502 (void) iovw_put_string_field_free(iovw, "COREDUMP_CMDLINE=", t);
a035f819 1503
9aa82023 1504 if (cg_pid_get_path_shifted(pid, NULL, &t) >= 0)
2a3bebd0 1505 (void) iovw_put_string_field_free(iovw, "COREDUMP_CGROUP=", t);
a035f819 1506
9aa82023 1507 if (compose_open_fds(pid, &t) >= 0)
2a3bebd0 1508 (void) iovw_put_string_field_free(iovw, "COREDUMP_OPEN_FDS=", t);
3f132692
JF
1509
1510 p = procfs_file_alloca(pid, "status");
da65941c 1511 if (read_full_file(p, &t, /* ret_size= */ NULL) >= 0)
2a3bebd0 1512 (void) iovw_put_string_field_free(iovw, "COREDUMP_PROC_STATUS=", t);
3f132692
JF
1513
1514 p = procfs_file_alloca(pid, "maps");
da65941c 1515 if (read_full_file(p, &t, /* ret_size= */ NULL) >= 0)
2a3bebd0 1516 (void) iovw_put_string_field_free(iovw, "COREDUMP_PROC_MAPS=", t);
3f132692 1517
da65941c
LP
1518 p = procfs_file_alloca(pid, "limits"); /* this uses 'seq_file' in kernel, use read_full_file_at() */
1519 if (read_full_file(p, &t, /* ret_size= */ NULL) >= 0)
2a3bebd0 1520 (void) iovw_put_string_field_free(iovw, "COREDUMP_PROC_LIMITS=", t);
3f132692
JF
1521
1522 p = procfs_file_alloca(pid, "cgroup");
da65941c 1523 if (read_full_file(p, &t, /* ret_size= */ NULL) >= 0)
2a3bebd0 1524 (void) iovw_put_string_field_free(iovw, "COREDUMP_PROC_CGROUP=", t);
3f132692 1525
d7032b1f 1526 p = procfs_file_alloca(pid, "mountinfo");
da65941c 1527 if (read_full_file(p, &t, /* ret_size= */ NULL) >= 0)
2a3bebd0 1528 (void) iovw_put_string_field_free(iovw, "COREDUMP_PROC_MOUNTINFO=", t);
d7032b1f 1529
3e4d0f6c
ZJS
1530 /* We attach /proc/auxv here. ELF coredumps also contain a note for this (NT_AUXV), see elf(5). */
1531 p = procfs_file_alloca(pid, "auxv");
da65941c 1532 if (read_full_file(p, &t, &size) >= 0) {
3e4d0f6c
ZJS
1533 char *buf = malloc(strlen("COREDUMP_PROC_AUXV=") + size + 1);
1534 if (buf) {
32756e57 1535 /* Add a dummy terminator to make context_parse_iovw() happy. */
eda62239 1536 *mempcpy_typesafe(stpcpy(buf, "COREDUMP_PROC_AUXV="), t, size) = '\0';
3e4d0f6c
ZJS
1537 (void) iovw_consume(iovw, buf, size + strlen("COREDUMP_PROC_AUXV="));
1538 }
1539
1540 free(t);
1541 }
1542
9aa82023 1543 if (get_process_cwd(pid, &t) >= 0)
2a3bebd0 1544 (void) iovw_put_string_field_free(iovw, "COREDUMP_CWD=", t);
3f132692
JF
1545
1546 if (get_process_root(pid, &t) >= 0) {
9aa82023
ZJS
1547 bool proc_self_root_is_slash;
1548
1549 proc_self_root_is_slash = strcmp(t, "/") == 0;
3f132692 1550
2a3bebd0 1551 (void) iovw_put_string_field_free(iovw, "COREDUMP_ROOT=", t);
7ed03ce6
JF
1552
1553 /* If the process' root is "/", then there is a chance it has
1554 * mounted own root and hence being containerized. */
0b8b1332 1555 if (proc_self_root_is_slash && get_process_container_parent_cmdline(&context->pidref, &t) > 0)
2a3bebd0 1556 (void) iovw_put_string_field_free(iovw, "COREDUMP_CONTAINER_CMDLINE=", t);
3f132692
JF
1557 }
1558
9aa82023 1559 if (get_process_environ(pid, &t) >= 0)
2a3bebd0 1560 (void) iovw_put_string_field_free(iovw, "COREDUMP_ENVIRON=", t);
9aa82023 1561
c673f1f6 1562 /* Now that we have parsed info from /proc/ ensure the pidfd is still valid before continuing. */
ba6c955f
LB
1563 r = pidref_verify(&context->pidref);
1564 if (r < 0)
1565 return log_error_errno(r, "PIDFD validation failed: %m");
1566
c673f1f6 1567 /* We successfully acquired all metadata. */
32756e57 1568 return context_parse_iovw(context, iovw);
9aa82023 1569}
3f132692 1570
ea8eb370 1571static int send_ucred(int transport_fd, const struct ucred *ucred) {
a108c43e
NR
1572 CMSG_BUFFER_TYPE(CMSG_SPACE(sizeof(struct ucred))) control = {};
1573 struct msghdr mh = {
1574 .msg_control = &control,
1575 .msg_controllen = sizeof(control),
1576 };
1577 struct cmsghdr *cmsg;
1578
1579 assert(transport_fd >= 0);
ea8eb370 1580 assert(ucred);
a108c43e
NR
1581
1582 cmsg = CMSG_FIRSTHDR(&mh);
1583 *cmsg = (struct cmsghdr) {
1584 .cmsg_level = SOL_SOCKET,
1585 .cmsg_type = SCM_CREDENTIALS,
1586 .cmsg_len = CMSG_LEN(sizeof(struct ucred)),
1587 };
1588 memcpy(CMSG_DATA(cmsg), ucred, sizeof(struct ucred));
1589
1590 return RET_NERRNO(sendmsg(transport_fd, &mh, MSG_NOSIGNAL));
1591}
1592
1593static int receive_ucred(int transport_fd, struct ucred *ret_ucred) {
1594 CMSG_BUFFER_TYPE(CMSG_SPACE(sizeof(struct ucred))) control = {};
1595 struct msghdr mh = {
1596 .msg_control = &control,
1597 .msg_controllen = sizeof(control),
1598 };
1599 struct cmsghdr *cmsg = NULL;
1600 struct ucred *ucred = NULL;
1601 ssize_t n;
1602
ea8eb370 1603 assert(transport_fd >= 0);
a108c43e
NR
1604 assert(ret_ucred);
1605
1606 n = recvmsg_safe(transport_fd, &mh, 0);
1607 if (n < 0)
1608 return n;
1609
1610 CMSG_FOREACH(cmsg, &mh)
1611 if (cmsg->cmsg_level == SOL_SOCKET &&
1612 cmsg->cmsg_type == SCM_CREDENTIALS &&
1613 cmsg->cmsg_len == CMSG_LEN(sizeof(struct ucred))) {
1614
1615 assert(!ucred);
1616 ucred = CMSG_TYPED_DATA(cmsg, struct ucred);
1617 }
1618
1619 if (!ucred)
1620 return -EIO;
1621
1622 *ret_ucred = *ucred;
1623
1624 return 0;
1625}
1626
8fc7b2a2 1627static int can_forward_coredump(Context *context, const PidRef *pid) {
a108c43e
NR
1628 _cleanup_free_ char *cgroup = NULL, *path = NULL, *unit = NULL;
1629 int r;
1630
8fc7b2a2 1631 assert(context);
0b8b1332
LP
1632 assert(pidref_is_set(pid));
1633 assert(!pidref_is_remote(pid));
1634
e6a8687b
ZJS
1635 /* We need to avoid a situation where the attacker crashes a SUID process or a root daemon and
1636 * quickly replaces it with a namespaced process and we forward the coredump to the attacker, into
1637 * the namespace. With %F/pidfd we can reliably check the namespace of the original process, hence we
1638 * can allow forwarding. */
76e0ab49 1639 if (!context->got_pidfd && context->dumpable != SUID_DUMP_USER)
8fc7b2a2
ZJS
1640 return false;
1641
0b8b1332 1642 r = cg_pidref_get_path(SYSTEMD_CGROUP_CONTROLLER, pid, &cgroup);
a108c43e
NR
1643 if (r < 0)
1644 return r;
1645
1646 r = path_extract_directory(cgroup, &path);
1647 if (r < 0)
1648 return r;
1649
1650 r = cg_path_get_unit_path(path, &unit);
1651 if (r == -ENOMEM)
1652 return log_oom();
1653 if (r == -ENXIO)
1654 /* No valid units in this path. */
1655 return false;
1656 if (r < 0)
1657 return r;
1658
1659 /* We require that this process belongs to a delegated cgroup
1660 * (i.e. Delegate=yes), with CoredumpReceive=yes also. */
1661 r = cg_is_delegated(unit);
1662 if (r <= 0)
1663 return r;
1664
1665 return cg_has_coredump_receive(unit);
1666}
1667
1668static int forward_coredump_to_container(Context *context) {
1669 _cleanup_close_ int pidnsfd = -EBADF, mntnsfd = -EBADF, netnsfd = -EBADF, usernsfd = -EBADF, rootfd = -EBADF;
71136404 1670 _cleanup_close_pair_ int pair[2] = EBADF_PAIR;
0b8b1332 1671 pid_t child;
a108c43e 1672 struct ucred ucred = {
313537da 1673 .pid = context->pidref.pid,
a108c43e
NR
1674 .uid = context->uid,
1675 .gid = context->gid,
1676 };
1677 int r;
1678
313537da
LP
1679 assert(context);
1680
0b8b1332
LP
1681 _cleanup_(pidref_done) PidRef leader_pid = PIDREF_NULL;
1682 r = namespace_get_leader(&context->pidref, NAMESPACE_PID, &leader_pid);
a108c43e
NR
1683 if (r < 0)
1684 return log_debug_errno(r, "Failed to get namespace leader: %m");
1685
8fc7b2a2 1686 r = can_forward_coredump(context, &leader_pid);
a108c43e
NR
1687 if (r < 0)
1688 return log_debug_errno(r, "Failed to check if coredump can be forwarded: %m");
1689 if (r == 0)
1690 return log_debug_errno(SYNTHETIC_ERRNO(ENOENT),
1691 "Coredump will not be forwarded because no target cgroup was found.");
1692
1693 r = RET_NERRNO(socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, pair));
1694 if (r < 0)
1695 return log_debug_errno(r, "Failed to create socket pair: %m");
1696
1697 r = setsockopt_int(pair[1], SOL_SOCKET, SO_PASSCRED, true);
1698 if (r < 0)
1699 return log_debug_errno(r, "Failed to set SO_PASSCRED: %m");
1700
0b8b1332 1701 r = pidref_namespace_open(&leader_pid, &pidnsfd, &mntnsfd, &netnsfd, &usernsfd, &rootfd);
a108c43e 1702 if (r < 0)
0b8b1332 1703 return log_debug_errno(r, "Failed to open namespaces of PID " PID_FMT ": %m", leader_pid.pid);
a108c43e
NR
1704
1705 r = namespace_fork("(sd-coredumpns)", "(sd-coredump)", NULL, 0,
e9ccae31 1706 FORK_RESET_SIGNALS|FORK_DEATHSIG_SIGTERM,
a108c43e
NR
1707 pidnsfd, mntnsfd, netnsfd, usernsfd, rootfd, &child);
1708 if (r < 0)
0b8b1332 1709 return log_debug_errno(r, "Failed to fork into namespaces of PID " PID_FMT ": %m", leader_pid.pid);
a108c43e 1710 if (r == 0) {
a108c43e
NR
1711 pair[0] = safe_close(pair[0]);
1712
3f8999a7 1713 r = access_nofollow("/run/systemd/coredump", W_OK);
7c1dd9e2
MY
1714 if (r < 0) {
1715 log_debug_errno(r, "Cannot find coredump socket, exiting: %m");
a108c43e
NR
1716 _exit(EXIT_FAILURE);
1717 }
1718
1719 r = receive_ucred(pair[1], &ucred);
1720 if (r < 0) {
1721 log_debug_errno(r, "Failed to receive ucred and fd: %m");
1722 _exit(EXIT_FAILURE);
1723 }
1724
313537da 1725 _cleanup_(iovw_free_freep) struct iovec_wrapper *iovw = iovw_new();
a108c43e
NR
1726 if (!iovw) {
1727 log_oom();
1728 _exit(EXIT_FAILURE);
1729 }
1730
1731 (void) iovw_put_string_field(iovw, "MESSAGE_ID=", SD_MESSAGE_COREDUMP_STR);
1732 (void) iovw_put_string_field(iovw, "PRIORITY=", STRINGIFY(LOG_CRIT));
1733 (void) iovw_put_string_field(iovw, "COREDUMP_FORWARDED=", "1");
1734
1735 for (int i = 0; i < _META_ARGV_MAX; i++) {
a108c43e
NR
1736 char buf[DECIMAL_STR_MAX(pid_t)];
1737 const char *t = context->meta[i];
1738
ea8eb370 1739 /* Patch some of the fields with the translated ucred data */
1d03d970 1740 switch (i) {
a108c43e
NR
1741
1742 case META_ARGV_PID:
1743 xsprintf(buf, PID_FMT, ucred.pid);
1744 t = buf;
a108c43e
NR
1745 break;
1746
1747 case META_ARGV_UID:
1748 xsprintf(buf, UID_FMT, ucred.uid);
1749 t = buf;
1750 break;
1751
1752 case META_ARGV_GID:
1753 xsprintf(buf, GID_FMT, ucred.gid);
1754 t = buf;
1755 break;
1756
a108c43e 1757 default:
5c9feb2d 1758 ;
a108c43e
NR
1759 }
1760
1761 r = iovw_put_string_field(iovw, meta_field_names[i], t);
1762 if (r < 0) {
1763 log_debug_errno(r, "Failed to construct iovec: %m");
1764 _exit(EXIT_FAILURE);
1765 }
1766 }
1767
313537da 1768 _cleanup_(context_done) Context child_context = CONTEXT_NULL;
32756e57 1769 r = context_parse_iovw(&child_context, iovw);
a108c43e
NR
1770 if (r < 0) {
1771 log_debug_errno(r, "Failed to save context: %m");
1772 _exit(EXIT_FAILURE);
1773 }
1774
1775 r = gather_pid_metadata_from_procfs(iovw, &child_context);
1776 if (r < 0) {
1777 log_debug_errno(r, "Failed to gather metadata from procfs: %m");
1778 _exit(EXIT_FAILURE);
1779 }
1780
313537da 1781 r = send_iovec(iovw, STDIN_FILENO, &context->pidref, /* mount_tree_fd= */ -EBADF);
a108c43e
NR
1782 if (r < 0) {
1783 log_debug_errno(r, "Failed to send iovec to coredump socket: %m");
1784 _exit(EXIT_FAILURE);
1785 }
1786
1787 _exit(EXIT_SUCCESS);
1788 }
1789
1790 pair[1] = safe_close(pair[1]);
1791
1792 /* We need to translate the PID, UID, and GID of the crashing process
1793 * to the container's namespaces. Do this by sending an SCM_CREDENTIALS
1794 * message on a socket pair, and read the result when we join the
1795 * container. The kernel will perform the translation for us. */
1796 r = send_ucred(pair[0], &ucred);
1797 if (r < 0)
1798 return log_debug_errno(r, "Failed to send metadata to container: %m");
1799
1800 r = wait_for_terminate_and_check("(sd-coredumpns)", child, 0);
1801 if (r < 0)
1802 return log_debug_errno(r, "Failed to wait for child to terminate: %m");
1803 if (r != EXIT_SUCCESS)
4e494e6a 1804 return log_debug_errno(SYNTHETIC_ERRNO(EPROTO), "Failed to process coredump in container.");
a108c43e
NR
1805
1806 return 0;
1807}
1808
00f73980 1809static int acquire_pid_mount_tree_fd(const Context *context, int *ret_fd) {
b8fe1b1d
MS
1810 /* Don't bother preparing environment if we can't pass it to libdwfl. */
1811#if !HAVE_DWFL_SET_SYSROOT
1812 *ret_fd = -EOPNOTSUPP;
1813 log_debug("dwfl_set_sysroot() is not supported.");
1814#else
1815 _cleanup_close_ int mntns_fd = -EBADF, root_fd = -EBADF, fd = -EBADF;
68511ceb 1816 _cleanup_close_pair_ int pair[2] = EBADF_PAIR;
b8fe1b1d 1817 int r;
68511ceb
MS
1818
1819 assert(context);
b8fe1b1d 1820 assert(ret_fd);
68511ceb 1821
e26a7e08 1822 if (!arg_enter_namespace) {
b8fe1b1d
MS
1823 *ret_fd = -EHOSTDOWN;
1824 log_debug("EnterNamespace=no so we won't use mount tree of the crashed process for generating backtrace.");
1825 return 0;
1826 }
68511ceb
MS
1827
1828 if (socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, pair) < 0)
1829 return log_error_errno(errno, "Failed to create socket pair: %m");
1830
36812cb6
LP
1831 r = pidref_namespace_open(
1832 &context->pidref,
1833 /* ret_pidns_fd= */ NULL,
1834 &mntns_fd,
1835 /* ret_netns_fd= */ NULL,
1836 /* ret_userns_fd= */ NULL,
1837 &root_fd);
68511ceb
MS
1838 if (r < 0)
1839 return log_error_errno(r, "Failed to open mount namespace of crashing process: %m");
1840
a88e72be
MS
1841 r = namespace_fork("(sd-mount-tree-ns)",
1842 "(sd-mount-tree)",
1843 /* except_fds= */ NULL,
1844 /* n_except_fds= */ 0,
c287f0f7 1845 FORK_RESET_SIGNALS|FORK_DEATHSIG_SIGKILL|FORK_LOG|FORK_WAIT,
a88e72be
MS
1846 /* pidns_fd= */ -EBADF,
1847 mntns_fd,
1848 /* netns_fd= */ -EBADF,
1849 /* userns_fd= */ -EBADF,
1850 root_fd,
c287f0f7 1851 NULL);
68511ceb 1852 if (r < 0)
e5bad3a7 1853 return r;
68511ceb
MS
1854 if (r == 0) {
1855 pair[0] = safe_close(pair[0]);
1856
84289ab9
MS
1857 fd = open_tree(-EBADF, "/", AT_NO_AUTOMOUNT | AT_RECURSIVE | AT_SYMLINK_NOFOLLOW | OPEN_TREE_CLOEXEC | OPEN_TREE_CLONE);
1858 if (fd < 0) {
68511ceb
MS
1859 log_error_errno(errno, "Failed to clone mount tree: %m");
1860 _exit(EXIT_FAILURE);
1861 }
1862
84289ab9 1863 r = send_one_fd(pair[1], fd, 0);
68511ceb
MS
1864 if (r < 0) {
1865 log_error_errno(r, "Failed to send mount tree to parent: %m");
1866 _exit(EXIT_FAILURE);
1867 }
1868
1869 _exit(EXIT_SUCCESS);
1870 }
1871
1872 pair[1] = safe_close(pair[1]);
1873
68511ceb
MS
1874 fd = receive_one_fd(pair[0], MSG_DONTWAIT);
1875 if (fd < 0)
1876 return log_error_errno(fd, "Failed to receive mount tree: %m");
1877
b8fe1b1d
MS
1878 *ret_fd = TAKE_FD(fd);
1879#endif
1880 return 0;
68511ceb
MS
1881}
1882
92b8e5e7 1883static int process_kernel(int argc, char *argv[]) {
6257e2fb 1884 _cleanup_(iovw_free_freep) struct iovec_wrapper *iovw = NULL;
313537da 1885 _cleanup_(context_done) Context context = CONTEXT_NULL;
960b0458 1886 int r;
9aa82023 1887
1f9d2a81
DDM
1888 /* When we're invoked by the kernel, stdout/stderr are closed which is dangerous because the fds
1889 * could get reallocated. To avoid hard to debug issues, let's instead bind stdout/stderr to
1890 * /dev/null. */
5bb1d7fb 1891 r = rearrange_stdio(STDIN_FILENO, -EBADF, -EBADF);
1f9d2a81
DDM
1892 if (r < 0)
1893 return log_error_errno(r, "Failed to connect stdout/stderr to /dev/null: %m");
1894
988e89ee
ZJS
1895 log_debug("Processing coredump received from the kernel...");
1896
9a435388
FB
1897 iovw = iovw_new();
1898 if (!iovw)
1899 return log_oom();
1900
f46c706b
FB
1901 /* Collect all process metadata passed by the kernel through argv[] */
1902 r = gather_pid_metadata_from_argv(iovw, &context, argc - 1, argv + 1);
92e92d71 1903 if (r < 0)
6257e2fb 1904 return r;
86562420 1905
f46c706b 1906 /* Collect the rest of the process metadata retrieved from the runtime */
db9ac801 1907 r = gather_pid_metadata_from_procfs(iovw, &context);
f46c706b 1908 if (r < 0)
6257e2fb 1909 return r;
f46c706b 1910
1e344c1d 1911 if (!context.is_journald)
f46c706b 1912 /* OK, now we know it's not the journal, hence we can make use of it now. */
1e344c1d 1913 log_set_target_and_open(LOG_TARGET_JOURNAL_OR_KMSG);
f46c706b 1914
2a9b1a76
HB
1915 /* Log minimal metadata now, so it is not lost if the system is about to shut down. */
1916 log_info("Process %s (%s) of user %s terminated abnormally with signal %s/%s, processing...",
960b0458
LP
1917 context.meta[META_ARGV_PID], context.meta[META_COMM],
1918 context.meta[META_ARGV_UID], context.meta[META_ARGV_SIGNAL],
1919 signal_to_string(context.signo));
2a9b1a76 1920
92b8e5e7 1921 r = pidref_in_same_namespace(/* pid1 = */ NULL, &context.pidref, NAMESPACE_PID);
a108c43e
NR
1922 if (r < 0)
1923 log_debug_errno(r, "Failed to check pidns of crashing process, ignoring: %m");
1924 if (r == 0) {
1925 /* If this fails, fallback to the old behavior so that
1926 * there is still some record of the crash. */
1927 r = forward_coredump_to_container(&context);
1928 if (r >= 0)
1929 return 0;
68511ceb 1930
00f73980 1931 r = acquire_pid_mount_tree_fd(&context, &context.mount_tree_fd);
b8fe1b1d 1932 if (r < 0)
68511ceb 1933 log_warning_errno(r, "Failed to access the mount tree of a container, ignoring: %m");
a108c43e
NR
1934 }
1935
c673f1f6 1936 /* If this is PID 1, disable coredump collection, we'll unlikely be able to process
f46c706b
FB
1937 * it later on.
1938 *
1939 * FIXME: maybe we should disable coredumps generation from the beginning and
c673f1f6
ZJS
1940 * re-enable it only when we know it's either safe (i.e. we're not running OOM) or
1941 * it's not PID 1 ? */
f46c706b
FB
1942 if (context.is_pid1) {
1943 log_notice("Due to PID 1 having crashed coredump collection will now be turned off.");
1944 disable_coredumps();
1945 }
34c10968 1946
a108c43e
NR
1947 (void) iovw_put_string_field(iovw, "MESSAGE_ID=", SD_MESSAGE_COREDUMP_STR);
1948 (void) iovw_put_string_field(iovw, "PRIORITY=", STRINGIFY(LOG_CRIT));
1949
f46c706b 1950 if (context.is_journald || context.is_pid1)
313537da 1951 return submit_coredump(&context, iovw, STDIN_FILENO);
9aa82023 1952
313537da 1953 return send_iovec(iovw, STDIN_FILENO, &context.pidref, context.mount_tree_fd);
3c171f0b 1954}
34c10968 1955
988e89ee 1956static int process_backtrace(int argc, char *argv[]) {
3a19fe46
YW
1957 _cleanup_(journal_importer_cleanup) JournalImporter importer = JOURNAL_IMPORTER_INIT(STDIN_FILENO);
1958 _cleanup_(iovw_free_freep) struct iovec_wrapper *iovw = NULL;
313537da 1959 _cleanup_(context_done) Context context = CONTEXT_NULL;
9a435388 1960 char *message;
988e89ee
ZJS
1961 int r;
1962
ea8eb370
LP
1963 assert(argc >= 2);
1964
988e89ee
ZJS
1965 log_debug("Processing backtrace on stdin...");
1966
9a435388
FB
1967 iovw = iovw_new();
1968 if (!iovw)
5b45a160
ZJS
1969 return log_oom();
1970
2a3bebd0
FB
1971 (void) iovw_put_string_field(iovw, "MESSAGE_ID=", SD_MESSAGE_BACKTRACE_STR);
1972 (void) iovw_put_string_field(iovw, "PRIORITY=", STRINGIFY(LOG_CRIT));
f46c706b
FB
1973
1974 /* Collect all process metadata from argv[] by making sure to skip the
1975 * '--backtrace' option */
1976 r = gather_pid_metadata_from_argv(iovw, &context, argc - 2, argv + 2);
988e89ee 1977 if (r < 0)
3a19fe46 1978 return r;
aaeb2522 1979
f46c706b 1980 /* Collect the rest of the process metadata retrieved from the runtime */
db9ac801 1981 r = gather_pid_metadata_from_procfs(iovw, &context);
f46c706b 1982 if (r < 0)
3a19fe46 1983 return r;
988e89ee 1984
86562420 1985 for (;;) {
5b45a160 1986 r = journal_importer_process_data(&importer);
3a19fe46
YW
1987 if (r < 0)
1988 return log_error_errno(r, "Failed to parse journal entry on stdin: %m");
d74dc4f2
ZJS
1989 if (r == 1 || /* complete entry */
1990 journal_importer_eof(&importer)) /* end of data */
5b45a160 1991 break;
988e89ee 1992 }
988e89ee 1993
5b45a160
ZJS
1994 if (journal_importer_eof(&importer)) {
1995 log_warning("Did not receive a full journal entry on stdin, ignoring message sent by reporter");
988e89ee 1996
f46c706b
FB
1997 message = strjoina("Process ", context.meta[META_ARGV_PID],
1998 " (", context.meta[META_COMM], ")"
1999 " of user ", context.meta[META_ARGV_UID],
2000 " failed with ", context.meta[META_ARGV_SIGNAL]);
9a435388
FB
2001
2002 r = iovw_put_string_field(iovw, "MESSAGE=", message);
2003 if (r < 0)
3a19fe46 2004 return r;
5b45a160 2005 } else {
3a19fe46
YW
2006 /* The imported iovecs are not supposed to be freed by us so let's copy and merge them at the
2007 * end of the array. */
2008 r = iovw_append(iovw, &importer.iovw);
2009 if (r < 0)
2010 return r;
9a435388 2011 }
988e89ee 2012
9a435388 2013 r = sd_journal_sendv(iovw->iovec, iovw->count);
988e89ee 2014 if (r < 0)
3a19fe46 2015 return log_error_errno(r, "Failed to log backtrace: %m");
988e89ee 2016
3a19fe46 2017 return 0;
988e89ee
ZJS
2018}
2019
4515a95e 2020static int run(int argc, char *argv[]) {
3c171f0b 2021 int r;
fee80f69 2022
9aa82023
ZJS
2023 /* First, log to a safe place, since we don't know what crashed and it might
2024 * be journald which we'd rather not log to then. */
8d4e028f 2025
1e344c1d 2026 log_set_target_and_open(LOG_TARGET_KMSG);
8d4e028f 2027
3c171f0b 2028 /* Make sure we never enter a loop */
9ce8e3e4 2029 (void) set_dumpable(SUID_DUMP_DISABLE);
8d4e028f 2030
3c171f0b
LP
2031 /* Ignore all parse errors */
2032 (void) parse_config();
fee80f69 2033
3c171f0b
LP
2034 log_debug("Selected storage '%s'.", coredump_storage_to_string(arg_storage));
2035 log_debug("Selected compression %s.", yes_no(arg_compress));
fee80f69 2036
3c171f0b 2037 r = sd_listen_fds(false);
4515a95e
ZJS
2038 if (r < 0)
2039 return log_error_errno(r, "Failed to determine the number of file descriptors: %m");
fee80f69 2040
9aa82023
ZJS
2041 /* If we got an fd passed, we are running in coredumpd mode. Otherwise we
2042 * are invoked from the kernel as coredump handler. */
988e89ee
ZJS
2043 if (r == 0) {
2044 if (streq_ptr(argv[1], "--backtrace"))
4515a95e 2045 return process_backtrace(argc, argv);
988e89ee 2046 else
4515a95e 2047 return process_kernel(argc, argv);
988e89ee 2048 } else if (r == 1)
4515a95e 2049 return process_socket(SD_LISTEN_FDS_START);
f5e04665 2050
baaa35ad
ZJS
2051 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
2052 "Received unexpected number of file descriptors.");
f5e04665 2053}
4515a95e
ZJS
2054
2055DEFINE_MAIN_FUNCTION(run);