]> git.ipfire.org Git - thirdparty/systemd.git/blame - src/coredump/coredump.c
process-util: introduce namespace_get_leader helper
[thirdparty/systemd.git] / src / coredump / coredump.c
CommitLineData
db9ecf05 1/* SPDX-License-Identifier: LGPL-2.1-or-later */
f5e04665
LP
2
3#include <errno.h>
803a3464
LP
4#include <stdio.h>
5#include <sys/prctl.h>
587f2a5e 6#include <sys/statvfs.h>
3e4d0f6c 7#include <sys/auxv.h>
cacd6403 8#include <sys/xattr.h>
4f5dd394 9#include <unistd.h>
f5e04665 10
73a99163 11#include "sd-daemon.h"
f11943c5
LP
12#include "sd-journal.h"
13#include "sd-login.h"
73a99163 14#include "sd-messages.h"
4f5dd394
LP
15
16#include "acl-util.h"
b5efdb8a 17#include "alloc-util.h"
587f2a5e 18#include "bus-error.h"
430f0182 19#include "capability-util.h"
ba1261bc 20#include "cgroup-util.h"
4f5dd394 21#include "compress.h"
34c10968
LP
22#include "conf-parser.h"
23#include "copy.h"
c8715007 24#include "coredump-util.h"
f11943c5 25#include "coredump-vacuum.h"
a0956174 26#include "dirent-util.h"
ea680f05 27#include "elf-util.h"
4f5dd394 28#include "escape.h"
3ffd4af2 29#include "fd-util.h"
4f5dd394 30#include "fileio.h"
f4f15635 31#include "fs-util.h"
afc5dbf3 32#include "io-util.h"
b18453ed 33#include "journal-importer.h"
5edf875b 34#include "journal-send.h"
4f5dd394
LP
35#include "log.h"
36#include "macro.h"
5e332028 37#include "main-func.h"
0a970718 38#include "memory-util.h"
2485b7e2 39#include "memstream-util.h"
35cd0ba5 40#include "mkdir-label.h"
6bedfcbb 41#include "parse-util.h"
0b452006 42#include "process-util.h"
d14bcb4e 43#include "signal-util.h"
3c171f0b 44#include "socket-util.h"
4f5dd394 45#include "special.h"
587f2a5e 46#include "stat-util.h"
8b43440b 47#include "string-table.h"
07630cea 48#include "string-util.h"
4f5dd394 49#include "strv.h"
bf819d3a 50#include "sync-util.h"
e4de7287 51#include "tmpfile-util.h"
b085d224 52#include "uid-alloc-range.h"
b1d4f8e1 53#include "user-util.h"
34727273 54
/* The maximum size up to which we process coredumps. We use 1G on systems with 4-byte pointers, and 32G
 * on systems with 8-byte pointers. Any other pointer size is rejected at compile time. */
#if __SIZEOF_POINTER__ == 4
#define PROCESS_SIZE_MAX ((uint64_t) (1LLU*1024LLU*1024LLU*1024LLU))
#elif __SIZEOF_POINTER__ == 8
#define PROCESS_SIZE_MAX ((uint64_t) (32LLU*1024LLU*1024LLU*1024LLU))
#else
#error "Unexpected pointer size"
#endif

/* The maximum size up to which we leave the coredump around on disk */
#define EXTERNAL_SIZE_MAX PROCESS_SIZE_MAX

/* The maximum size up to which we store the coredump in the journal: 767M in regular builds, 10M in
 * fuzzing builds. */
#ifndef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
#define JOURNAL_SIZE_MAX ((size_t) (767LU*1024LU*1024LU))
#else
/* oss-fuzz limits memory usage. */
#define JOURNAL_SIZE_MAX ((size_t) (10LU*1024LU*1024LU))
#endif

/* When checking for available memory and setting lower limits, don't
 * go below 4MB for writing core files to storage. */
#define PROCESS_SIZE_MIN (4U*1024U*1024U)

/* Make sure to not make this larger than the maximum journal entry
 * size. See DATA_SIZE_MAX in journal-importer.h. */
assert_cc(JOURNAL_SIZE_MAX <= DATA_SIZE_MAX);
f5e04665
LP
82
enum {
        /* We use these as array indexes for our process metadata cache.
         *
         * The first indexes of the cache store the same metadata as is passed by the kernel via
         * argv[], i.e. the string array the kernel builds according to the pattern we define in
         * /proc/sys/kernel/core_pattern (see man:core(5)). */

        META_ARGV_PID,          /* %P: as seen in the initial pid namespace */
        META_ARGV_UID,          /* %u: as seen in the initial user namespace */
        META_ARGV_GID,          /* %g: as seen in the initial user namespace */
        META_ARGV_SIGNAL,       /* %s: number of signal causing dump */
        META_ARGV_TIMESTAMP,    /* %t: time of dump, expressed as seconds since the Epoch (we expand this to μs granularity) */
        META_ARGV_RLIMIT,       /* %c: core file size soft resource limit */
        META_ARGV_HOSTNAME,     /* %h: hostname */
        _META_ARGV_MAX,         /* number of argv[]-provided fields; not a field itself */

        /* The following indexes are cached for a couple of special fields we use (and
         * thereby need to be retrieved quickly) for naming coredump files, and attaching
         * xattrs. Unlike the previous ones they are retrieved from the runtime
         * environment. */

        META_COMM = _META_ARGV_MAX,
        _META_MANDATORY_MAX,    /* every index below this one is mandatory */

        /* The rest are similar to the previous ones except that we won't fail if one of
         * them is missing. */

        META_EXE = _META_MANDATORY_MAX,
        META_UNIT,
        META_PROC_AUXV,
        _META_MAX               /* total number of cached fields; used as array size */
};
115
/* Journal field prefix (including the trailing '=') for each metadata index of the enum above. */
static const char * const meta_field_names[_META_MAX] = {
        [META_ARGV_PID]       = "COREDUMP_PID=",
        [META_ARGV_UID]       = "COREDUMP_UID=",
        [META_ARGV_GID]       = "COREDUMP_GID=",
        [META_ARGV_SIGNAL]    = "COREDUMP_SIGNAL=",
        [META_ARGV_TIMESTAMP] = "COREDUMP_TIMESTAMP=",
        [META_ARGV_RLIMIT]    = "COREDUMP_RLIMIT=",
        [META_ARGV_HOSTNAME]  = "COREDUMP_HOSTNAME=",
        [META_COMM]           = "COREDUMP_COMM=",
        [META_EXE]            = "COREDUMP_EXE=",
        [META_UNIT]           = "COREDUMP_UNIT=",
        [META_PROC_AUXV]      = "COREDUMP_PROC_AUXV=",
};
129
/* Cached metadata about the crashed process, indexed by the META_* enum. */
typedef struct Context {
        const char *meta[_META_MAX];    /* metadata values; optional entries may be NULL */
        size_t meta_size[_META_MAX];    /* size of the matching meta[] entry (passed along with the
                                         * auxv blob when it is parsed) */
        pid_t pid;
        uid_t uid;
        gid_t gid;
        bool is_pid1;                   /* NOTE(review): presumably set when the crashed process is PID 1 — confirm at the place this is filled in */
        bool is_journald;               /* NOTE(review): presumably set when the crashed process is journald — confirm at the place this is filled in */
} Context;
139
34c10968
LP
/* Where coredumps are stored, as selected by the Storage= setting. */
typedef enum CoredumpStorage {
        COREDUMP_STORAGE_NONE,          /* storage_size_max() is 0 for this mode */
        COREDUMP_STORAGE_EXTERNAL,      /* limited by arg_external_size_max */
        COREDUMP_STORAGE_JOURNAL,       /* limited by arg_journal_size_max */
        _COREDUMP_STORAGE_MAX,
        _COREDUMP_STORAGE_INVALID = -EINVAL,
} CoredumpStorage;
147
/* Configuration file spelling of each CoredumpStorage value. */
static const char* const coredump_storage_table[_COREDUMP_STORAGE_MAX] = {
        [COREDUMP_STORAGE_NONE]     = "none",
        [COREDUMP_STORAGE_EXTERNAL] = "external",
        [COREDUMP_STORAGE_JOURNAL]  = "journal",
};

/* Generate the string lookup helpers for the table above, and the config parser used for Storage=. */
DEFINE_PRIVATE_STRING_TABLE_LOOKUP(coredump_storage, CoredumpStorage);
static DEFINE_CONFIG_PARSE_ENUM(config_parse_coredump_storage, coredump_storage, CoredumpStorage, "Failed to parse storage setting");
34727273
ZJS
156
/* Settings of the [Coredump] section of coredump.conf, initialized to their built-in defaults and
 * overridden by parse_config(). */
static CoredumpStorage arg_storage = COREDUMP_STORAGE_EXTERNAL;
static bool arg_compress = true;
static uint64_t arg_process_size_max = PROCESS_SIZE_MAX;
static uint64_t arg_external_size_max = EXTERNAL_SIZE_MAX;
static uint64_t arg_journal_size_max = JOURNAL_SIZE_MAX;
static uint64_t arg_keep_free = UINT64_MAX;
static uint64_t arg_max_use = UINT64_MAX;
34c10968
LP
164
165static int parse_config(void) {
34c10968 166 static const ConfigTableItem items[] = {
510a1466
ZJS
167 { "Coredump", "Storage", config_parse_coredump_storage, 0, &arg_storage },
168 { "Coredump", "Compress", config_parse_bool, 0, &arg_compress },
169 { "Coredump", "ProcessSizeMax", config_parse_iec_uint64, 0, &arg_process_size_max },
170 { "Coredump", "ExternalSizeMax", config_parse_iec_uint64_infinity, 0, &arg_external_size_max },
171 { "Coredump", "JournalSizeMax", config_parse_iec_size, 0, &arg_journal_size_max },
172 { "Coredump", "KeepFree", config_parse_iec_uint64, 0, &arg_keep_free },
173 { "Coredump", "MaxUse", config_parse_iec_uint64, 0, &arg_max_use },
34c10968
LP
174 {}
175 };
176
4a78074f
LP
177 int r;
178
179 r = config_parse_config_file(
180 "coredump.conf",
181 "Coredump\0",
182 config_item_table_lookup,
183 items,
184 CONFIG_PARSE_WARN,
185 /* userdata= */ NULL);
186 if (r < 0)
187 return r;
188
189 /* Let's make sure we fix up the maximum size we send to the journal here on the client side, for
190 * efficiency reasons. journald wouldn't accept anything larger anyway. */
191 if (arg_journal_size_max > JOURNAL_SIZE_MAX) {
192 log_warning("JournalSizeMax= set to larger value (%s) than journald would accept (%s), lowering automatically.",
193 FORMAT_BYTES(arg_journal_size_max), FORMAT_BYTES(JOURNAL_SIZE_MAX));
194 arg_journal_size_max = JOURNAL_SIZE_MAX;
195 }
196
197 return 0;
34c10968
LP
198}
199
a1e92eee 200static uint64_t storage_size_max(void) {
ee0449fd
ZJS
201 if (arg_storage == COREDUMP_STORAGE_EXTERNAL)
202 return arg_external_size_max;
203 if (arg_storage == COREDUMP_STORAGE_JOURNAL)
204 return arg_journal_size_max;
205 assert(arg_storage == COREDUMP_STORAGE_NONE);
206 return 0;
73a99163
ZJS
207}
208
3e4d0f6c
ZJS
/* Grants the user identified by 'uid' read access to the coredump open on 'fd' via an ACL entry.
 *
 * Nothing is done if 'allow_user' is false, if the UID is a system or dynamic UID or the "nobody"
 * user, or if ACL support is compiled out. Returns 0 in all those cases and on success, and a negative
 * error (already logged) if the ACL could not be adjusted. */
static int fix_acl(int fd, uid_t uid, bool allow_user) {
        assert(fd >= 0);
        assert(uid_is_valid(uid));

#if HAVE_ACL
        /* We don't allow users to read coredumps if the uid or capabilities were changed, and there is
         * no point in adding an ACL entry for system, dynamic or "nobody" users either. */
        if (!allow_user ||
            uid_is_system(uid) ||
            uid_is_dynamic(uid) ||
            uid == UID_NOBODY)
                return 0;

        /* Make sure normal users can read (but not write or delete) their own coredumps */
        int r = fd_add_uid_acl_permission(fd, uid, ACL_READ);
        if (r < 0)
                return log_error_errno(r, "Failed to adjust ACL of the coredump: %m");
#endif

        return 0;
}
231
f46c706b
FB
232static int fix_xattr(int fd, const Context *context) {
233
234 static const char * const xattrs[_META_MAX] = {
510a1466
ZJS
235 [META_ARGV_PID] = "user.coredump.pid",
236 [META_ARGV_UID] = "user.coredump.uid",
237 [META_ARGV_GID] = "user.coredump.gid",
238 [META_ARGV_SIGNAL] = "user.coredump.signal",
239 [META_ARGV_TIMESTAMP] = "user.coredump.timestamp",
240 [META_ARGV_RLIMIT] = "user.coredump.rlimit",
241 [META_ARGV_HOSTNAME] = "user.coredump.hostname",
242 [META_COMM] = "user.coredump.comm",
243 [META_EXE] = "user.coredump.exe",
0cd77f97
LP
244 };
245
34c10968
LP
246 int r = 0;
247
b59233e6
LP
248 assert(fd >= 0);
249
60ecc386 250 /* Attach some metadata to coredumps via extended attributes. Just because we can. */
34c10968 251
fe96c0f8 252 for (unsigned i = 0; i < _META_MAX; i++) {
1eef15b1
ZJS
253 int k;
254
f46c706b 255 if (isempty(context->meta[i]) || !xattrs[i])
0cd77f97 256 continue;
34c10968 257
60ecc386
ZJS
258 k = RET_NERRNO(fsetxattr(fd, xattrs[i], context->meta[i], strlen(context->meta[i]), XATTR_CREATE));
259 RET_GATHER(r, k);
0cd77f97 260 }
34c10968
LP
261
262 return r;
263}
264
/* Escapes a metadata string via xescape() before it is used as a component of a coredump file name;
 * '.', '/' and ' ' are passed as the set of characters to escape. */
#define filename_escape(s) xescape((s), "./ ")
34c10968 266
/* Returns 's' for use in log messages, or a fixed placeholder if 's' is NULL (i.e. the temporary file
 * has no name). */
static const char *coredump_tmpfile_name(const char *s) {
        if (s)
                return s;

        return "(unnamed temporary file)";
}
270
b59233e6
LP
271static int fix_permissions(
272 int fd,
273 const char *filename,
274 const char *target,
f46c706b 275 const Context *context,
3e4d0f6c 276 bool allow_user) {
b59233e6 277
03532f0a
LP
278 int r;
279
b59233e6 280 assert(fd >= 0);
b59233e6 281 assert(target);
3c171f0b 282 assert(context);
cfd652ed
ZJS
283
284 /* Ignore errors on these */
3c171f0b 285 (void) fchmod(fd, 0640);
9764bca9 286 (void) fix_acl(fd, context->uid, allow_user);
3c171f0b 287 (void) fix_xattr(fd, context);
cfd652ed 288
74402bf0 289 r = link_tmpfile(fd, filename, target, LINK_TMPFILE_SYNC);
03532f0a
LP
290 if (r < 0)
291 return log_error_errno(r, "Failed to move coredump %s into place: %m", target);
cfd652ed
ZJS
292
293 return 0;
294}
295
59f448cf 296static int maybe_remove_external_coredump(const char *filename, uint64_t size) {
cfd652ed 297
b59233e6 298 /* Returns 1 if might remove, 0 if will not remove, < 0 on error. */
cfd652ed 299
fc6cec86 300 if (arg_storage == COREDUMP_STORAGE_EXTERNAL &&
cfd652ed
ZJS
301 size <= arg_external_size_max)
302 return 0;
303
304 if (!filename)
305 return 1;
306
4a62c710
MS
307 if (unlink(filename) < 0 && errno != ENOENT)
308 return log_error_errno(errno, "Failed to unlink %s: %m", filename);
cfd652ed
ZJS
309
310 return 1;
311}
312
f46c706b 313static int make_filename(const Context *context, char **ret) {
b59233e6 314 _cleanup_free_ char *c = NULL, *u = NULL, *p = NULL, *t = NULL;
a7f7d1bd 315 sd_id128_t boot = {};
34c10968
LP
316 int r;
317
3c171f0b 318 assert(context);
34c10968 319
f46c706b 320 c = filename_escape(context->meta[META_COMM]);
34c10968 321 if (!c)
b59233e6 322 return -ENOMEM;
34c10968 323
f46c706b 324 u = filename_escape(context->meta[META_ARGV_UID]);
0dc5d23c 325 if (!u)
b59233e6 326 return -ENOMEM;
34c10968
LP
327
328 r = sd_id128_get_boot(&boot);
b59233e6 329 if (r < 0)
34c10968 330 return r;
34c10968 331
f46c706b 332 p = filename_escape(context->meta[META_ARGV_PID]);
b59233e6
LP
333 if (!p)
334 return -ENOMEM;
335
f46c706b 336 t = filename_escape(context->meta[META_ARGV_TIMESTAMP]);
b59233e6
LP
337 if (!t)
338 return -ENOMEM;
339
340 if (asprintf(ret,
64a5384f 341 "/var/lib/systemd/coredump/core.%s.%s." SD_ID128_FORMAT_STR ".%s.%s",
34c10968 342 c,
0dc5d23c 343 u,
34c10968
LP
344 SD_ID128_FORMAT_VAL(boot),
345 p,
b59233e6
LP
346 t) < 0)
347 return -ENOMEM;
348
349 return 0;
350}
351
3e4d0f6c
ZJS
352static int grant_user_access(int core_fd, const Context *context) {
353 int at_secure = -1;
354 uid_t uid = UID_INVALID, euid = UID_INVALID;
355 uid_t gid = GID_INVALID, egid = GID_INVALID;
356 int r;
357
358 assert(core_fd >= 0);
359 assert(context);
360
361 if (!context->meta[META_PROC_AUXV])
362 return log_warning_errno(SYNTHETIC_ERRNO(ENODATA), "No auxv data, not adjusting permissions.");
363
364 uint8_t elf[EI_NIDENT];
365 errno = 0;
366 if (pread(core_fd, &elf, sizeof(elf), 0) != sizeof(elf))
367 return log_warning_errno(errno_or_else(EIO),
368 "Failed to pread from coredump fd: %s", STRERROR_OR_EOF(errno));
369
370 if (elf[EI_MAG0] != ELFMAG0 ||
371 elf[EI_MAG1] != ELFMAG1 ||
372 elf[EI_MAG2] != ELFMAG2 ||
373 elf[EI_MAG3] != ELFMAG3 ||
374 elf[EI_VERSION] != EV_CURRENT)
375 return log_info_errno(SYNTHETIC_ERRNO(EUCLEAN),
376 "Core file does not have ELF header, not adjusting permissions.");
377 if (!IN_SET(elf[EI_CLASS], ELFCLASS32, ELFCLASS64) ||
378 !IN_SET(elf[EI_DATA], ELFDATA2LSB, ELFDATA2MSB))
379 return log_info_errno(SYNTHETIC_ERRNO(EUCLEAN),
380 "Core file has strange ELF class, not adjusting permissions.");
381
382 if ((elf[EI_DATA] == ELFDATA2LSB) != (__BYTE_ORDER == __LITTLE_ENDIAN))
383 return log_info_errno(SYNTHETIC_ERRNO(EUCLEAN),
384 "Core file has non-native endianness, not adjusting permissions.");
385
cb38fdbe
ZJS
386 r = parse_auxv(LOG_WARNING,
387 /* elf_class= */ elf[EI_CLASS],
388 context->meta[META_PROC_AUXV],
389 context->meta_size[META_PROC_AUXV],
390 &at_secure, &uid, &euid, &gid, &egid);
3e4d0f6c
ZJS
391 if (r < 0)
392 return r;
393
394 /* We allow access if we got all the data and at_secure is not set and
395 * the uid/gid matches euid/egid. */
396 bool ret =
397 at_secure == 0 &&
398 uid != UID_INVALID && euid != UID_INVALID && uid == euid &&
399 gid != GID_INVALID && egid != GID_INVALID && gid == egid;
400 log_debug("Will %s access (uid="UID_FMT " euid="UID_FMT " gid="GID_FMT " egid="GID_FMT " at_secure=%s)",
401 ret ? "permit" : "restrict",
402 uid, euid, gid, egid, yes_no(at_secure));
403 return ret;
404}
405
/* Streams the coredump from 'input_fd' into a temporary file next to its final location below
 * /var/lib/systemd/coredump/, optionally compresses it, fixes up its permissions and links it into
 * place.
 *
 * On success returns 0 and fills in the return parameters:
 *
 *   compressed (HAVE_COMPRESSION and Compress=yes):
 *     *ret_filename         name of the compressed file
 *     *ret_node_fd          fd of the compressed file
 *     *ret_compressed_size  size of the compressed file
 *     *ret_data_fd          fd of the uncompressed temporary file (already closed again if the dump
 *                           was truncated on tmpfs, see below)
 *     *ret_size             number of uncompressed bytes that were compressed
 *     *ret_truncated        whether the initial uncompressed copy hit the size limit
 *
 *   uncompressed:
 *     *ret_filename         name of the file
 *     *ret_data_fd          fd of the file, positioned at its start
 *     *ret_size             size of the file
 *     *ret_truncated        whether the copy hit the size limit
 *     (*ret_node_fd and *ret_compressed_size are left untouched)
 *
 * Returns a negative error on failure (already logged); -EBADSLT specifically if the resource limit
 * of the process or our own configuration disables core dumping. */
static int save_external_coredump(
                const Context *context,
                int input_fd,
                char **ret_filename,
                int *ret_node_fd,
                int *ret_data_fd,
                uint64_t *ret_size,
                uint64_t *ret_compressed_size,
                bool *ret_truncated) {

        _cleanup_(unlink_and_freep) char *tmp = NULL;
        _cleanup_free_ char *fn = NULL;
        _cleanup_close_ int fd = -EBADF;
        uint64_t rlimit, process_limit, max_size;
        bool truncated, storage_on_tmpfs;
        struct stat st;
        int r;

        assert(context);
        assert(ret_filename);
        assert(ret_node_fd);
        assert(ret_data_fd);
        assert(ret_size);
        assert(ret_compressed_size);
        assert(ret_truncated);

        r = safe_atou64(context->meta[META_ARGV_RLIMIT], &rlimit);
        if (r < 0)
                return log_error_errno(r, "Failed to parse resource limit '%s': %m",
                                       context->meta[META_ARGV_RLIMIT]);
        if (rlimit < page_size())
                /* Is coredumping disabled? Then don't bother saving/processing the
                 * coredump. Anything below PAGE_SIZE cannot give a readable coredump
                 * (the kernel uses ELF_EXEC_PAGESIZE which is not easily accessible, but
                 * is usually the same as PAGE_SIZE). */
                return log_info_errno(SYNTHETIC_ERRNO(EBADSLT),
                                      "Resource limits disable core dumping for process %s (%s).",
                                      context->meta[META_ARGV_PID], context->meta[META_COMM]);

        /* We need the data if either processing or storing it is enabled, hence use the larger limit. */
        process_limit = MAX(arg_process_size_max, storage_size_max());
        if (process_limit == 0)
                return log_debug_errno(SYNTHETIC_ERRNO(EBADSLT),
                                       "Limits for coredump processing and storage are both 0, not dumping core.");

        /* Never store more than the process configured, or than we actually shall keep or process */
        max_size = MIN(rlimit, process_limit);

        r = make_filename(context, &fn);
        if (r < 0)
                return log_error_errno(r, "Failed to determine coredump file name: %m");

        (void) mkdir_parents_label(fn, 0755);

        fd = open_tmpfile_linkable(fn, O_RDWR|O_CLOEXEC, &tmp);
        if (fd < 0)
                return log_error_errno(fd, "Failed to create temporary file for coredump %s: %m", fn);

        /* If storage is on tmpfs, the kernel oomd might kill us if there's MemoryMax set on
         * the service or the slice it belongs to. This is common on low-resources systems,
         * to avoid crashing processes to take away too many system resources.
         * Check the cgroup settings, and set max_size to a bit less than half of the
         * available memory left to the process.
         * Then, attempt to write the core file uncompressed first - if the write gets
         * interrupted, we know we won't be able to write it all, so instead compress what
         * was written so far, delete the uncompressed truncated core, and then continue
         * compressing from STDIN. Given the compressed core cannot be larger than the
         * uncompressed one, and 1KB for metadata is accounted for in the calculation, we
         * should be able to at least store the full compressed core file. */

        storage_on_tmpfs = fd_is_temporary_fs(fd) > 0;
        if (storage_on_tmpfs && arg_compress) {
                _cleanup_(sd_bus_flush_close_unrefp) sd_bus *bus = NULL;
                uint64_t cgroup_limit = UINT64_MAX;
                struct statvfs sv;

                /* If we can't get the cgroup limit, just ignore it, but don't fail,
                 * try anyway with the config settings. */
                r = sd_bus_default_system(&bus);
                if (r < 0)
                        log_info_errno(r, "Failed to connect to system bus, skipping MemoryAvailable check: %m");
                else {
                        _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL;

                        r = sd_bus_get_property_trivial(
                                        bus,
                                        "org.freedesktop.systemd1",
                                        "/org/freedesktop/systemd1/unit/self",
                                        "org.freedesktop.systemd1.Service",
                                        "MemoryAvailable",
                                        &error,
                                        't', &cgroup_limit);
                        if (r < 0)
                                log_warning_errno(r,
                                                  "Failed to query MemoryAvailable for current unit, "
                                                  "falling back to static config settings: %s",
                                                  bus_error_message(&error, r));
                }

                /* If the query failed cgroup_limit is still UINT64_MAX and has no effect here. */
                max_size = MIN(cgroup_limit, max_size);
                max_size = LESS_BY(max_size, 1024U) / 2; /* Account for 1KB metadata overhead for compressing */
                max_size = MAX(PROCESS_SIZE_MIN, max_size); /* Impose a lower minimum */

                /* tmpfs might get full quickly, so check the available space too.
                 * But don't worry about errors here, failing to access the storage
                 * location will be better logged when writing to it. */
                if (fstatvfs(fd, &sv) >= 0)
                        max_size = MIN((uint64_t)sv.f_frsize * (uint64_t)sv.f_bfree, max_size);

                log_debug("Limiting core file size to %" PRIu64 " bytes due to cgroup memory limits.", max_size);
        }

        r = copy_bytes(input_fd, fd, max_size, 0);
        if (r < 0)
                return log_error_errno(r, "Cannot store coredump of %s (%s): %m",
                                       context->meta[META_ARGV_PID], context->meta[META_COMM]);
        /* NOTE(review): presumably copy_bytes() returns 1 if it stopped because max_size was reached — confirm */
        truncated = r == 1;

        /* Any failure to determine this (negative return) is treated like "do not grant access". */
        bool allow_user = grant_user_access(fd, context) > 0;

#if HAVE_COMPRESSION
        if (arg_compress) {
                _cleanup_(unlink_and_freep) char *tmp_compressed = NULL;
                _cleanup_free_ char *fn_compressed = NULL;
                _cleanup_close_ int fd_compressed = -EBADF;
                uint64_t uncompressed_size = 0;

                /* Rewind, so that the compressor reads what we just wrote from the beginning. */
                if (lseek(fd, 0, SEEK_SET) == (off_t) -1)
                        return log_error_errno(errno, "Failed to seek on coredump %s: %m", fn);

                fn_compressed = strjoin(fn, default_compression_extension());
                if (!fn_compressed)
                        return log_oom();

                fd_compressed = open_tmpfile_linkable(fn_compressed, O_RDWR|O_CLOEXEC, &tmp_compressed);
                if (fd_compressed < 0)
                        return log_error_errno(fd_compressed, "Failed to create temporary file for coredump %s: %m", fn_compressed);

                r = compress_stream(fd, fd_compressed, max_size, &uncompressed_size);
                if (r < 0)
                        return log_error_errno(r, "Failed to compress %s: %m", coredump_tmpfile_name(tmp_compressed));

                if (truncated && storage_on_tmpfs) {
                        uint64_t partial_uncompressed_size = 0;

                        /* Uncompressed write was truncated and we are writing to tmpfs: delete
                         * the uncompressed core, and compress the remaining part from STDIN. */

                        tmp = unlink_and_free(tmp);
                        fd = safe_close(fd);

                        r = compress_stream(input_fd, fd_compressed, max_size, &partial_uncompressed_size);
                        if (r < 0)
                                return log_error_errno(r, "Failed to compress %s: %m", coredump_tmpfile_name(tmp_compressed));
                        uncompressed_size += partial_uncompressed_size;
                }

                r = fix_permissions(fd_compressed, tmp_compressed, fn_compressed, context, allow_user);
                if (r < 0)
                        return r;

                if (fstat(fd_compressed, &st) < 0)
                        return log_error_errno(errno,
                                               "Failed to fstat core file %s: %m",
                                               coredump_tmpfile_name(tmp_compressed));

                *ret_filename = TAKE_PTR(fn_compressed);       /* compressed */
                *ret_node_fd = TAKE_FD(fd_compressed);         /* compressed */
                *ret_compressed_size = (uint64_t) st.st_size;  /* compressed */
                *ret_data_fd = TAKE_FD(fd);
                *ret_size = uncompressed_size;
                *ret_truncated = truncated;
                /* The file was linked into place above, hence only free the temporary name here,
                 * instead of letting the cleanup handler unlink it. */
                tmp_compressed = mfree(tmp_compressed);

                return 0;
        }
#endif

        if (truncated)
                log_struct(LOG_INFO,
                           LOG_MESSAGE("Core file was truncated to %"PRIu64" bytes.", max_size),
                           "SIZE_LIMIT=%"PRIu64, max_size,
                           "MESSAGE_ID=" SD_MESSAGE_TRUNCATED_CORE_STR);

        r = fix_permissions(fd, tmp, fn, context, allow_user);
        if (r < 0)
                return log_error_errno(r, "Failed to fix permissions and finalize coredump %s into %s: %m", coredump_tmpfile_name(tmp), fn);

        if (fstat(fd, &st) < 0)
                return log_error_errno(errno, "Failed to fstat core file %s: %m", coredump_tmpfile_name(tmp));

        /* Rewind, so that the caller can read the core from its start. */
        if (lseek(fd, 0, SEEK_SET) == (off_t) -1)
                return log_error_errno(errno, "Failed to seek on coredump %s: %m", fn);

        *ret_filename = TAKE_PTR(fn);
        *ret_data_fd = TAKE_FD(fd);
        *ret_size = (uint64_t) st.st_size;
        *ret_truncated = truncated;

        return 0;
}
606
607static int allocate_journal_field(int fd, size_t size, char **ret, size_t *ret_size) {
608 _cleanup_free_ char *field = NULL;
609 ssize_t n;
610
8d4e028f 611 assert(fd >= 0);
34c10968
LP
612 assert(ret);
613 assert(ret_size);
614
4a62c710
MS
615 if (lseek(fd, 0, SEEK_SET) == (off_t) -1)
616 return log_warning_errno(errno, "Failed to seek: %m");
803a3464 617
34c10968 618 field = malloc(9 + size);
a73c74db
LP
619 if (!field)
620 return log_warning_errno(SYNTHETIC_ERRNO(ENOMEM),
621 "Failed to allocate memory for coredump, coredump will not be stored.");
34c10968
LP
622
623 memcpy(field, "COREDUMP=", 9);
624
a73c74db
LP
625 /* NB: simple read() would fail for overly large coredumps, since read() on Linux can only deal with
626 * 0x7ffff000 bytes max. Hence call things in a loop. */
627 n = loop_read(fd, field + 9, size, /* do_poll= */ false);
23bbb0de
MS
628 if (n < 0)
629 return log_error_errno((int) n, "Failed to read core data: %m");
baaa35ad
ZJS
630 if ((size_t) n < size)
631 return log_error_errno(SYNTHETIC_ERRNO(EIO),
632 "Core data too short.");
34c10968 633
1cc6c93a 634 *ret = TAKE_PTR(field);
34c10968
LP
635 *ret_size = size + 9;
636
34c10968
LP
637 return 0;
638}
803a3464 639
3f132692
JF
640/* Joins /proc/[pid]/fd/ and /proc/[pid]/fdinfo/ into the following lines:
641 * 0:/dev/pts/23
642 * pos: 0
643 * flags: 0100002
644 *
645 * 1:/dev/pts/23
646 * pos: 0
647 * flags: 0100002
648 *
649 * 2:/dev/pts/23
650 * pos: 0
651 * flags: 0100002
652 * EOF
653 */
2485b7e2
YW
654static int compose_open_fds(pid_t pid, char **ret) {
655 _cleanup_(memstream_done) MemStream m = {};
4d84bc2f 656 _cleanup_closedir_ DIR *proc_fd_dir = NULL;
254d1313 657 _cleanup_close_ int proc_fdinfo_fd = -EBADF;
59059b4a 658 const char *fddelim = "", *path;
2485b7e2 659 FILE *stream;
7b26ea6f 660 int r;
3f132692
JF
661
662 assert(pid >= 0);
2485b7e2 663 assert(ret);
3f132692 664
59059b4a 665 path = procfs_file_alloca(pid, "fd");
3f132692 666 proc_fd_dir = opendir(path);
59059b4a
ZJS
667 if (!proc_fd_dir)
668 return -errno;
3f132692 669
4d84bc2f 670 proc_fdinfo_fd = openat(dirfd(proc_fd_dir), "../fdinfo", O_DIRECTORY|O_NOFOLLOW|O_CLOEXEC|O_PATH);
59059b4a
ZJS
671 if (proc_fdinfo_fd < 0)
672 return -errno;
3f132692 673
2485b7e2 674 stream = memstream_init(&m);
3f132692
JF
675 if (!stream)
676 return -ENOMEM;
677
af3b864d 678 FOREACH_DIRENT(de, proc_fd_dir, return -errno) {
3f132692 679 _cleanup_fclose_ FILE *fdinfo = NULL;
4d84bc2f 680 _cleanup_free_ char *fdname = NULL;
254d1313 681 _cleanup_close_ int fd = -EBADF;
3f132692 682
af3b864d 683 r = readlinkat_malloc(dirfd(proc_fd_dir), de->d_name, &fdname);
3f132692
JF
684 if (r < 0)
685 return r;
686
af3b864d 687 fprintf(stream, "%s%s:%s\n", fddelim, de->d_name, fdname);
3f132692
JF
688 fddelim = "\n";
689
690 /* Use the directory entry from /proc/[pid]/fd with /proc/[pid]/fdinfo */
af3b864d 691 fd = openat(proc_fdinfo_fd, de->d_name, O_NOFOLLOW|O_CLOEXEC|O_RDONLY);
59059b4a 692 if (fd < 0)
3f132692
JF
693 continue;
694
b46c3e49
VC
695 fdinfo = take_fdopen(&fd, "r");
696 if (!fdinfo)
3f132692
JF
697 continue;
698
7b26ea6f
LP
699 for (;;) {
700 _cleanup_free_ char *line = NULL;
701
702 r = read_line(fdinfo, LONG_LINE_MAX, &line);
703 if (r < 0)
704 return r;
705 if (r == 0)
706 break;
707
0d536673 708 fputs(line, stream);
7b26ea6f 709 fputc('\n', stream);
4d84bc2f 710 }
3f132692
JF
711 }
712
2485b7e2 713 return memstream_finalize(&m, ret, NULL);
3f132692
JF
714}
715
7ed03ce6
JF
716/* Returns 1 if the parent was found.
717 * Returns 0 if there is not a process we can call the pid's
718 * container parent (the pid's process isn't 'containerized').
719 * Returns a negative number on errors.
720 */
721static int get_process_container_parent_cmdline(pid_t pid, char** cmdline) {
7ed03ce6
JF
722 pid_t container_pid;
723 const char *proc_root_path;
724 struct stat root_stat, proc_root_stat;
83844031 725 int r;
7ed03ce6
JF
726
727 /* To compare inodes of / and /proc/[pid]/root */
728 if (stat("/", &root_stat) < 0)
729 return -errno;
730
731 proc_root_path = procfs_file_alloca(pid, "root");
732 if (stat(proc_root_path, &proc_root_stat) < 0)
733 return -errno;
734
735 /* The process uses system root. */
c20c77ef 736 if (stat_inode_same(&proc_root_stat, &root_stat)) {
7ed03ce6
JF
737 *cmdline = NULL;
738 return 0;
739 }
740
ade39d9a 741 r = namespace_get_leader(pid, NAMESPACE_MOUNT, &container_pid);
7ed03ce6
JF
742 if (r < 0)
743 return r;
744
5dd55303 745 r = get_process_cmdline(container_pid, SIZE_MAX, PROCESS_CMDLINE_QUOTE_POSIX, cmdline);
d3cba4ea
EV
746 if (r < 0)
747 return r;
748
749 return 1;
7ed03ce6
JF
750}
751
f46c706b 752static int change_uid_gid(const Context *context) {
9764bca9
NR
753 uid_t uid = context->uid;
754 gid_t gid = context->gid;
3c171f0b 755 int r;
34c10968 756
28add648 757 if (uid_is_system(uid)) {
888e378d
LP
758 const char *user = "systemd-coredump";
759
fafff8f1 760 r = get_user_creds(&user, &uid, &gid, NULL, NULL, 0);
888e378d
LP
761 if (r < 0) {
762 log_warning_errno(r, "Cannot resolve %s user. Proceeding to dump core as root: %m", user);
763 uid = gid = 0;
764 }
888e378d 765 }
3c171f0b
LP
766
767 return drop_privileges(uid, gid, 0);
768}
8c8549db 769
3c171f0b 770static int submit_coredump(
3e4d0f6c 771 const Context *context,
9a435388 772 struct iovec_wrapper *iovw,
3c171f0b 773 int input_fd) {
34c10968 774
c546154a 775 _cleanup_(json_variant_unrefp) JsonVariant *json_metadata = NULL;
254d1313 776 _cleanup_close_ int coredump_fd = -EBADF, coredump_node_fd = -EBADF;
9a435388 777 _cleanup_free_ char *filename = NULL, *coredump_data = NULL;
51d3783d 778 _cleanup_free_ char *stacktrace = NULL;
9a435388 779 char *core_message;
c546154a 780 const char *module_name;
587f2a5e 781 uint64_t coredump_size = UINT64_MAX, coredump_compressed_size = UINT64_MAX;
f46c706b 782 bool truncated = false;
c546154a 783 JsonVariant *module_json;
3c171f0b 784 int r;
83844031 785
3c171f0b 786 assert(context);
9a435388 787 assert(iovw);
3c171f0b 788 assert(input_fd >= 0);
f5e04665 789
3c171f0b
LP
790 /* Vacuum before we write anything again */
791 (void) coredump_vacuum(-1, arg_keep_free, arg_max_use);
803a3464 792
3c171f0b 793 /* Always stream the coredump to disk, if that's possible */
0cd4e913 794 r = save_external_coredump(context, input_fd,
587f2a5e
LB
795 &filename, &coredump_node_fd, &coredump_fd,
796 &coredump_size, &coredump_compressed_size, &truncated);
3c171f0b
LP
797 if (r < 0)
798 /* Skip whole core dumping part */
799 goto log;
800
51d3783d
FB
801 /* If we don't want to keep the coredump on disk, remove it now, as later on we
802 * will lack the privileges for it. However, we keep the fd to it, so that we can
803 * still process it and log it. */
587f2a5e 804 r = maybe_remove_external_coredump(filename, coredump_node_fd >= 0 ? coredump_compressed_size : coredump_size);
3c171f0b
LP
805 if (r < 0)
806 return r;
633c3e8a 807 if (r == 0)
2a3bebd0 808 (void) iovw_put_string_field(iovw, "COREDUMP_FILENAME=", filename);
633c3e8a 809 else if (arg_storage == COREDUMP_STORAGE_EXTERNAL)
5206a724 810 log_info("The core will not be stored: size %"PRIu64" is greater than %"PRIu64" (the configured maximum)",
587f2a5e 811 coredump_node_fd >= 0 ? coredump_compressed_size : coredump_size, arg_external_size_max);
f5e04665 812
3c171f0b
LP
813 /* Vacuum again, but exclude the coredump we just created */
814 (void) coredump_vacuum(coredump_node_fd >= 0 ? coredump_node_fd : coredump_fd, arg_keep_free, arg_max_use);
8c9571d0 815
51d3783d
FB
816 /* Now, let's drop privileges to become the user who owns the segfaulted process
817 * and allocate the coredump memory under the user's uid. This also ensures that
818 * the credentials journald will see are the ones of the coredumping user, thus
819 * making sure the user gets access to the core dump. Let's also get rid of all
3c171f0b
LP
820 * capabilities, if we run as root, we won't need them anymore. */
821 r = change_uid_gid(context);
822 if (r < 0)
823 return log_error_errno(r, "Failed to drop privileges: %m");
34c10968 824
5238e957 825 /* Try to get a stack trace if we can */
c790632c 826 if (coredump_size > arg_process_size_max)
51d3783d
FB
827 log_debug("Not generating stack trace: core size %"PRIu64" is greater "
828 "than %"PRIu64" (the configured maximum)",
6e9ef603 829 coredump_size, arg_process_size_max);
c790632c
ZJS
830 else if (coredump_fd >= 0) {
831 bool skip = startswith(context->meta[META_COMM], "systemd-coredum"); /* COMM is 16 bytes usually */
832
61aea456
LB
833 (void) parse_elf_object(coredump_fd,
834 context->meta[META_EXE],
c790632c 835 /* fork_disable_dump= */ skip, /* avoid loops */
61aea456
LB
836 &stacktrace,
837 &json_metadata);
c790632c 838 }
51d3783d 839
3c171f0b 840log:
f46c706b
FB
841 core_message = strjoina("Process ", context->meta[META_ARGV_PID],
842 " (", context->meta[META_COMM], ") of user ",
843 context->meta[META_ARGV_UID], " dumped core.",
844 context->is_journald && filename ? "\nCoredump diverted to " : NULL,
845 context->is_journald && filename ? filename : NULL);
51d3783d 846
9a435388 847 core_message = strjoina(core_message, stacktrace ? "\n\n" : NULL, stacktrace);
92e92d71 848
5edf875b
DDM
849 if (context->is_journald)
850 /* We might not be able to log to the journal, so let's always print the message to another
851 * log target. The target was set previously to something safe. */
9a435388 852 log_dispatch(LOG_ERR, 0, core_message);
92e92d71 853
2a3bebd0 854 (void) iovw_put_string_field(iovw, "MESSAGE=", core_message);
3c171f0b 855
0cd4e913 856 if (truncated)
2a3bebd0 857 (void) iovw_put_string_field(iovw, "COREDUMP_TRUNCATED=", "1");
0cd4e913 858
c546154a
LB
859 /* If we managed to parse any ELF metadata (build-id, ELF package meta),
860 * attach it as journal metadata. */
861 if (json_metadata) {
862 _cleanup_free_ char *formatted_json = NULL;
863
864 r = json_variant_format(json_metadata, 0, &formatted_json);
865 if (r < 0)
866 return log_error_errno(r, "Failed to format JSON package metadata: %m");
867
671769c9 868 (void) iovw_put_string_field(iovw, "COREDUMP_PACKAGE_JSON=", formatted_json);
c546154a
LB
869 }
870
c790632c
ZJS
871 /* In the unlikely scenario that context->meta[META_EXE] is not available,
872 * let's avoid guessing the module name and skip the loop. */
873 if (context->meta[META_EXE])
874 JSON_VARIANT_OBJECT_FOREACH(module_name, module_json, json_metadata) {
875 JsonVariant *t;
c546154a 876
c790632c
ZJS
877 /* We only add structured fields for the 'main' ELF module, and only if we can identify it. */
878 if (!path_equal_filename(module_name, context->meta[META_EXE]))
879 continue;
c546154a 880
c790632c
ZJS
881 t = json_variant_by_key(module_json, "name");
882 if (t)
883 (void) iovw_put_string_field(iovw, "COREDUMP_PACKAGE_NAME=", json_variant_string(t));
1f2abb79 884
c790632c
ZJS
885 t = json_variant_by_key(module_json, "version");
886 if (t)
887 (void) iovw_put_string_field(iovw, "COREDUMP_PACKAGE_VERSION=", json_variant_string(t));
888 }
c546154a 889
3c171f0b 890 /* Optionally store the entire coredump in the journal */
587f2a5e 891 if (arg_storage == COREDUMP_STORAGE_JOURNAL && coredump_fd >= 0) {
6e9ef603
ZJS
892 if (coredump_size <= arg_journal_size_max) {
893 size_t sz = 0;
894
895 /* Store the coredump itself in the journal */
896
897 r = allocate_journal_field(coredump_fd, (size_t) coredump_size, &coredump_data, &sz);
9a435388
FB
898 if (r >= 0) {
899 if (iovw_put(iovw, coredump_data, sz) >= 0)
900 TAKE_PTR(coredump_data);
901 } else
6e9ef603
ZJS
902 log_warning_errno(r, "Failed to attach the core to the journal entry: %m");
903 } else
5206a724 904 log_info("The core will not be stored: size %"PRIu64" is greater than %"PRIu64" (the configured maximum)",
6e9ef603 905 coredump_size, arg_journal_size_max);
f5e04665
LP
906 }
907
5edf875b
DDM
908 /* If journald is coredumping, we have to be careful that we don't deadlock when trying to write the
909 * coredump to the journal, so we put the journal socket in nonblocking mode before trying to write
910 * the coredump to the socket. */
911
912 if (context->is_journald) {
913 r = journal_fd_nonblock(true);
914 if (r < 0)
915 return log_error_errno(r, "Failed to make journal socket non-blocking: %m");
916 }
917
9a435388 918 r = sd_journal_sendv(iovw->iovec, iovw->count);
5edf875b
DDM
919
920 if (context->is_journald) {
921 int k;
922
923 k = journal_fd_nonblock(false);
924 if (k < 0)
925 return log_error_errno(k, "Failed to make journal socket blocking: %m");
926 }
927
928 if (r == -EAGAIN && context->is_journald)
929 log_warning_errno(r, "Failed to log journal coredump, ignoring: %m");
930 else if (r < 0)
3c171f0b
LP
931 return log_error_errno(r, "Failed to log coredump: %m");
932
933 return 0;
934}
935
f46c706b 936static int save_context(Context *context, const struct iovec_wrapper *iovw) {
f46c706b
FB
937 const char *unit;
938 int r;
3c171f0b 939
3c171f0b 940 assert(context);
f46c706b
FB
941 assert(iovw);
942 assert(iovw->count >= _META_ARGV_MAX);
3c171f0b 943
f46c706b 944 /* The context does not allocate any memory on its own */
3c171f0b 945
fe96c0f8 946 for (size_t n = 0; n < iovw->count; n++) {
f46c706b 947 struct iovec *iovec = iovw->iovec + n;
92e92d71 948
fe96c0f8 949 for (size_t i = 0; i < ELEMENTSOF(meta_field_names); i++) {
f46c706b
FB
950 /* Note that these strings are NUL terminated, because we made sure that a
951 * trailing NUL byte is in the buffer, though not included in the iov_len
952 * count (see process_socket() and gather_pid_metadata_*()) */
953 assert(((char*) iovec->iov_base)[iovec->iov_len] == 0);
3c171f0b 954
3e4d0f6c 955 const char *p = startswith(iovec->iov_base, meta_field_names[i]);
f46c706b
FB
956 if (p) {
957 context->meta[i] = p;
3e4d0f6c 958 context->meta_size[i] = iovec->iov_len - strlen(meta_field_names[i]);
f46c706b
FB
959 break;
960 }
961 }
3c171f0b 962 }
f46c706b
FB
963
964 if (!context->meta[META_ARGV_PID])
965 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
966 "Failed to find the PID of crashing process");
967
968 r = parse_pid(context->meta[META_ARGV_PID], &context->pid);
969 if (r < 0)
970 return log_error_errno(r, "Failed to parse PID \"%s\": %m", context->meta[META_ARGV_PID]);
971
9764bca9
NR
972 r = parse_uid(context->meta[META_ARGV_UID], &context->uid);
973 if (r < 0)
974 return log_error_errno(r, "Failed to parse UID \"%s\": %m", context->meta[META_ARGV_UID]);
975
976 r = parse_gid(context->meta[META_ARGV_GID], &context->gid);
977 if (r < 0)
978 return log_error_errno(r, "Failed to parse GID \"%s\": %m", context->meta[META_ARGV_GID]);
979
f46c706b
FB
980 unit = context->meta[META_UNIT];
981 context->is_pid1 = streq(context->meta[META_ARGV_PID], "1") || streq_ptr(unit, SPECIAL_INIT_SCOPE);
982 context->is_journald = streq_ptr(unit, SPECIAL_JOURNALD_SERVICE);
983
984 return 0;
3c171f0b
LP
985}
986
987static int process_socket(int fd) {
254d1313 988 _cleanup_close_ int input_fd = -EBADF;
f46c706b 989 Context context = {};
9a435388
FB
990 struct iovec_wrapper iovw = {};
991 struct iovec iovec;
fe96c0f8 992 int r;
3c171f0b
LP
993
994 assert(fd >= 0);
995
d2acb93d 996 log_setup();
3c171f0b 997
988e89ee
ZJS
998 log_debug("Processing coredump received on stdin...");
999
3c171f0b 1000 for (;;) {
fb29cdbe 1001 CMSG_BUFFER_TYPE(CMSG_SPACE(sizeof(int))) control;
3c171f0b
LP
1002 struct msghdr mh = {
1003 .msg_control = &control,
1004 .msg_controllen = sizeof(control),
1005 .msg_iovlen = 1,
1006 };
1007 ssize_t n;
fe1ef0f8 1008 ssize_t l;
3c171f0b 1009
fe1ef0f8
EV
1010 l = next_datagram_size_fd(fd);
1011 if (l < 0) {
1012 r = log_error_errno(l, "Failed to determine datagram size to read: %m");
3c171f0b
LP
1013 goto finish;
1014 }
1015
9a435388
FB
1016 iovec.iov_len = l;
1017 iovec.iov_base = malloc(l + 1);
1018 if (!iovec.iov_base) {
3c171f0b
LP
1019 r = log_oom();
1020 goto finish;
1021 }
1022
9a435388 1023 mh.msg_iov = &iovec;
3c171f0b 1024
3691bcf3 1025 n = recvmsg_safe(fd, &mh, MSG_CMSG_CLOEXEC);
3c171f0b 1026 if (n < 0) {
9a435388 1027 free(iovec.iov_base);
3691bcf3 1028 r = log_error_errno(n, "Failed to receive datagram: %m");
3c171f0b
LP
1029 goto finish;
1030 }
1031
9a435388
FB
1032 /* The final zero-length datagram carries the file descriptor and tells us
1033 * that we're done. */
3c171f0b 1034 if (n == 0) {
dac556fa 1035 struct cmsghdr *found;
3c171f0b 1036
9a435388 1037 free(iovec.iov_base);
3c171f0b 1038
dac556fa 1039 found = cmsg_find(&mh, SOL_SOCKET, SCM_RIGHTS, CMSG_LEN(sizeof(int)));
3c171f0b 1040 if (!found) {
3691bcf3
LP
1041 cmsg_close_all(&mh);
1042 r = log_error_errno(SYNTHETIC_ERRNO(EBADMSG),
1043 "Coredump file descriptor missing.");
3c171f0b
LP
1044 goto finish;
1045 }
1046
f8540bde 1047 assert(input_fd < 0);
b1d02191 1048 input_fd = *CMSG_TYPED_DATA(found, int);
3c171f0b 1049 break;
3691bcf3
LP
1050 } else
1051 cmsg_close_all(&mh);
3c171f0b
LP
1052
1053 /* Add trailing NUL byte, in case these are strings */
9a435388
FB
1054 ((char*) iovec.iov_base)[n] = 0;
1055 iovec.iov_len = (size_t) n;
3c171f0b 1056
9a435388
FB
1057 r = iovw_put(&iovw, iovec.iov_base, iovec.iov_len);
1058 if (r < 0)
1059 goto finish;
34c10968
LP
1060 }
1061
61233823 1062 /* Make sure we got all data we really need */
f8540bde 1063 assert(input_fd >= 0);
3c171f0b 1064
f46c706b
FB
1065 r = save_context(&context, &iovw);
1066 if (r < 0)
1067 goto finish;
1068
1069 /* Make sure we received at least all fields we need. */
fe96c0f8 1070 for (int i = 0; i < _META_MANDATORY_MAX; i++)
f46c706b
FB
1071 if (!context.meta[i]) {
1072 r = log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1073 "A mandatory argument (%i) has not been sent, aborting.",
1074 i);
1075 goto finish;
1076 }
80002f66 1077
f46c706b 1078 r = submit_coredump(&context, &iovw, input_fd);
3c171f0b
LP
1079
1080finish:
9a435388 1081 iovw_free_contents(&iovw, true);
3c171f0b
LP
1082 return r;
1083}
1084
9a435388 1085static int send_iovec(const struct iovec_wrapper *iovw, int input_fd) {
254d1313 1086 _cleanup_close_ int fd = -EBADF;
3c171f0b
LP
1087 int r;
1088
9a435388 1089 assert(iovw);
3c171f0b
LP
1090 assert(input_fd >= 0);
1091
1092 fd = socket(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0);
1093 if (fd < 0)
1094 return log_error_errno(errno, "Failed to create coredump socket: %m");
1095
1861986a
LP
1096 r = connect_unix_path(fd, AT_FDCWD, "/run/systemd/coredump");
1097 if (r < 0)
1098 return log_error_errno(r, "Failed to connect to coredump service: %m");
3c171f0b 1099
fe96c0f8 1100 for (size_t i = 0; i < iovw->count; i++) {
fec603eb 1101 struct msghdr mh = {
9a435388 1102 .msg_iov = iovw->iovec + i,
fec603eb
LP
1103 .msg_iovlen = 1,
1104 };
1105 struct iovec copy[2];
1106
1107 for (;;) {
1108 if (sendmsg(fd, &mh, MSG_NOSIGNAL) >= 0)
1109 break;
1110
1111 if (errno == EMSGSIZE && mh.msg_iov[0].iov_len > 0) {
f46c706b
FB
1112 /* This field didn't fit? That's a pity. Given that this is
1113 * just metadata, let's truncate the field at half, and try
1114 * again. We append three dots, in order to show that this is
1115 * truncated. */
fec603eb
LP
1116
1117 if (mh.msg_iov != copy) {
f46c706b
FB
1118 /* We don't want to modify the caller's iovec, hence
1119 * let's create our own array, consisting of two new
1120 * iovecs, where the first is a (truncated) copy of
1121 * what we want to send, and the second one contains
1122 * the trailing dots. */
9a435388 1123 copy[0] = iovw->iovec[i];
ed0cb346 1124 copy[1] = IOVEC_MAKE(((char[]){'.', '.', '.'}), 3);
fec603eb
LP
1125
1126 mh.msg_iov = copy;
1127 mh.msg_iovlen = 2;
1128 }
1129
1130 copy[0].iov_len /= 2; /* halve it, and try again */
1131 continue;
1132 }
3c171f0b 1133
3c171f0b 1134 return log_error_errno(errno, "Failed to send coredump datagram: %m");
fec603eb 1135 }
1eef15b1
ZJS
1136 }
1137
3c171f0b
LP
1138 r = send_one_fd(fd, input_fd, 0);
1139 if (r < 0)
1140 return log_error_errno(r, "Failed to send coredump fd: %m");
1eef15b1 1141
3c171f0b
LP
1142 return 0;
1143}
1eef15b1 1144
64a5384f
LP
1145static int gather_pid_metadata_from_argv(
1146 struct iovec_wrapper *iovw,
1147 Context *context,
1148 int argc, char **argv) {
1149
f46c706b 1150 _cleanup_free_ char *free_timestamp = NULL;
fe96c0f8 1151 int r, signo;
3c171f0b 1152 char *t;
3c171f0b 1153
e6aa443f
LP
1154 assert(iovw);
1155 assert(context);
1156
f46c706b
FB
1157 /* We gather all metadata that were passed via argv[] into an array of iovecs that
1158 * we'll forward to the socket unit */
3c171f0b 1159
f46c706b
FB
1160 if (argc < _META_ARGV_MAX)
1161 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1162 "Not enough arguments passed by the kernel (%i, expected %i).",
1163 argc, _META_ARGV_MAX);
3c171f0b 1164
fe96c0f8 1165 for (int i = 0; i < _META_ARGV_MAX; i++) {
3c171f0b 1166
f46c706b 1167 t = argv[i];
3c171f0b 1168
f46c706b 1169 switch (i) {
64a5384f 1170
f46c706b
FB
1171 case META_ARGV_TIMESTAMP:
1172 /* The journal fields contain the timestamp padded with six
1173 * zeroes, so that the kernel-supplied 1s granularity timestamps
e503019b 1174 * becomes 1μs granularity, i.e. the granularity systemd usually
f46c706b
FB
1175 * operates in. */
1176 t = free_timestamp = strjoin(argv[i], "000000");
1177 if (!t)
1178 return log_oom();
1179 break;
64a5384f 1180
f46c706b
FB
1181 case META_ARGV_SIGNAL:
1182 /* For signal, record its pretty name too */
1183 if (safe_atoi(argv[i], &signo) >= 0 && SIGNAL_VALID(signo))
2a3bebd0
FB
1184 (void) iovw_put_string_field(iovw, "COREDUMP_SIGNAL_NAME=SIG",
1185 signal_to_string(signo));
f46c706b 1186 break;
64a5384f 1187
f46c706b
FB
1188 default:
1189 break;
c8091d92
LP
1190 }
1191
f46c706b
FB
1192 r = iovw_put_string_field(iovw, meta_field_names[i], t);
1193 if (r < 0)
1194 return r;
8c8549db 1195 }
803a3464 1196
f46c706b
FB
1197 /* Cache some of the process metadata we collected so far and that we'll need to
1198 * access soon */
1199 return save_context(context, iovw);
1200}
3c171f0b 1201
db9ac801 1202static int gather_pid_metadata_from_procfs(struct iovec_wrapper *iovw, Context *context) {
f46c706b
FB
1203 uid_t owner_uid;
1204 pid_t pid;
1205 char *t;
3e4d0f6c 1206 size_t size;
f46c706b
FB
1207 const char *p;
1208 int r;
f5e04665 1209
e6aa443f
LP
1210 assert(iovw);
1211 assert(context);
1212
f46c706b
FB
1213 /* Note that if we fail on oom later on, we do not roll-back changes to the iovec
1214 * structure. (It remains valid, with the first iovec fields initialized.) */
f5e04665 1215
f46c706b 1216 pid = context->pid;
f5e04665 1217
f46c706b
FB
1218 /* The following is mandatory */
1219 r = get_process_comm(pid, &t);
9a435388 1220 if (r < 0)
f46c706b 1221 return log_error_errno(r, "Failed to get COMM: %m");
f5e04665 1222
f46c706b 1223 r = iovw_put_string_field_free(iovw, "COREDUMP_COMM=", t);
9a435388
FB
1224 if (r < 0)
1225 return r;
f45b8015 1226
c790632c 1227 /* The following are optional, but we use them if present. */
2a3bebd0
FB
1228 r = get_process_exe(pid, &t);
1229 if (r >= 0)
1230 r = iovw_put_string_field_free(iovw, "COREDUMP_EXE=", t);
1231 if (r < 0)
f46c706b 1232 log_warning_errno(r, "Failed to get EXE, ignoring: %m");
bdfd7b2c 1233
f46c706b 1234 if (cg_pid_get_unit(pid, &t) >= 0)
2a3bebd0 1235 (void) iovw_put_string_field_free(iovw, "COREDUMP_UNIT=", t);
f5e04665 1236
f46c706b 1237 if (cg_pid_get_user_unit(pid, &t) >= 0)
2a3bebd0 1238 (void) iovw_put_string_field_free(iovw, "COREDUMP_USER_UNIT=", t);
f46c706b 1239
9aa82023 1240 if (sd_pid_get_session(pid, &t) >= 0)
9a435388 1241 (void) iovw_put_string_field_free(iovw, "COREDUMP_SESSION=", t);
f5e04665 1242
a035f819 1243 if (sd_pid_get_owner_uid(pid, &owner_uid) >= 0) {
9a435388 1244 r = asprintf(&t, UID_FMT, owner_uid);
7de80bfe 1245 if (r > 0)
9a435388 1246 (void) iovw_put_string_field_free(iovw, "COREDUMP_OWNER_UID=", t);
f5e04665
LP
1247 }
1248
9aa82023 1249 if (sd_pid_get_slice(pid, &t) >= 0)
2a3bebd0 1250 (void) iovw_put_string_field_free(iovw, "COREDUMP_SLICE=", t);
f5e04665 1251
5dd55303 1252 if (get_process_cmdline(pid, SIZE_MAX, PROCESS_CMDLINE_QUOTE_POSIX, &t) >= 0)
2a3bebd0 1253 (void) iovw_put_string_field_free(iovw, "COREDUMP_CMDLINE=", t);
a035f819 1254
9aa82023 1255 if (cg_pid_get_path_shifted(pid, NULL, &t) >= 0)
2a3bebd0 1256 (void) iovw_put_string_field_free(iovw, "COREDUMP_CGROUP=", t);
a035f819 1257
9aa82023 1258 if (compose_open_fds(pid, &t) >= 0)
2a3bebd0 1259 (void) iovw_put_string_field_free(iovw, "COREDUMP_OPEN_FDS=", t);
3f132692
JF
1260
1261 p = procfs_file_alloca(pid, "status");
627055ce 1262 if (read_full_virtual_file(p, &t, NULL) >= 0)
2a3bebd0 1263 (void) iovw_put_string_field_free(iovw, "COREDUMP_PROC_STATUS=", t);
3f132692
JF
1264
1265 p = procfs_file_alloca(pid, "maps");
627055ce 1266 if (read_full_virtual_file(p, &t, NULL) >= 0)
2a3bebd0 1267 (void) iovw_put_string_field_free(iovw, "COREDUMP_PROC_MAPS=", t);
3f132692
JF
1268
1269 p = procfs_file_alloca(pid, "limits");
627055ce 1270 if (read_full_virtual_file(p, &t, NULL) >= 0)
2a3bebd0 1271 (void) iovw_put_string_field_free(iovw, "COREDUMP_PROC_LIMITS=", t);
3f132692
JF
1272
1273 p = procfs_file_alloca(pid, "cgroup");
3e4d0f6c 1274 if (read_full_virtual_file(p, &t, NULL) >= 0)
2a3bebd0 1275 (void) iovw_put_string_field_free(iovw, "COREDUMP_PROC_CGROUP=", t);
3f132692 1276
d7032b1f 1277 p = procfs_file_alloca(pid, "mountinfo");
3e4d0f6c 1278 if (read_full_virtual_file(p, &t, NULL) >= 0)
2a3bebd0 1279 (void) iovw_put_string_field_free(iovw, "COREDUMP_PROC_MOUNTINFO=", t);
d7032b1f 1280
3e4d0f6c
ZJS
1281 /* We attach /proc/auxv here. ELF coredumps also contain a note for this (NT_AUXV), see elf(5). */
1282 p = procfs_file_alloca(pid, "auxv");
1283 if (read_full_virtual_file(p, &t, &size) >= 0) {
1284 char *buf = malloc(strlen("COREDUMP_PROC_AUXV=") + size + 1);
1285 if (buf) {
1286 /* Add a dummy terminator to make save_context() happy. */
1287 *((uint8_t*) mempcpy(stpcpy(buf, "COREDUMP_PROC_AUXV="), t, size)) = '\0';
1288 (void) iovw_consume(iovw, buf, size + strlen("COREDUMP_PROC_AUXV="));
1289 }
1290
1291 free(t);
1292 }
1293
9aa82023 1294 if (get_process_cwd(pid, &t) >= 0)
2a3bebd0 1295 (void) iovw_put_string_field_free(iovw, "COREDUMP_CWD=", t);
3f132692
JF
1296
1297 if (get_process_root(pid, &t) >= 0) {
9aa82023
ZJS
1298 bool proc_self_root_is_slash;
1299
1300 proc_self_root_is_slash = strcmp(t, "/") == 0;
3f132692 1301
2a3bebd0 1302 (void) iovw_put_string_field_free(iovw, "COREDUMP_ROOT=", t);
7ed03ce6
JF
1303
1304 /* If the process' root is "/", then there is a chance it has
1305 * mounted own root and hence being containerized. */
9aa82023 1306 if (proc_self_root_is_slash && get_process_container_parent_cmdline(pid, &t) > 0)
2a3bebd0 1307 (void) iovw_put_string_field_free(iovw, "COREDUMP_CONTAINER_CMDLINE=", t);
3f132692
JF
1308 }
1309
9aa82023 1310 if (get_process_environ(pid, &t) >= 0)
2a3bebd0 1311 (void) iovw_put_string_field_free(iovw, "COREDUMP_ENVIRON=", t);
9aa82023 1312
f46c706b
FB
1313 /* we successfully acquired all metadata */
1314 return save_context(context, iovw);
9aa82023 1315}
3f132692 1316
9aa82023 1317static int process_kernel(int argc, char* argv[]) {
6257e2fb 1318 _cleanup_(iovw_free_freep) struct iovec_wrapper *iovw = NULL;
f46c706b 1319 Context context = {};
9aa82023
ZJS
1320 int r;
1321
1f9d2a81
DDM
1322 /* When we're invoked by the kernel, stdout/stderr are closed which is dangerous because the fds
1323 * could get reallocated. To avoid hard to debug issues, let's instead bind stdout/stderr to
1324 * /dev/null. */
5bb1d7fb 1325 r = rearrange_stdio(STDIN_FILENO, -EBADF, -EBADF);
1f9d2a81
DDM
1326 if (r < 0)
1327 return log_error_errno(r, "Failed to connect stdout/stderr to /dev/null: %m");
1328
988e89ee
ZJS
1329 log_debug("Processing coredump received from the kernel...");
1330
9a435388
FB
1331 iovw = iovw_new();
1332 if (!iovw)
1333 return log_oom();
1334
2a3bebd0
FB
1335 (void) iovw_put_string_field(iovw, "MESSAGE_ID=", SD_MESSAGE_COREDUMP_STR);
1336 (void) iovw_put_string_field(iovw, "PRIORITY=", STRINGIFY(LOG_CRIT));
f46c706b
FB
1337
1338 /* Collect all process metadata passed by the kernel through argv[] */
1339 r = gather_pid_metadata_from_argv(iovw, &context, argc - 1, argv + 1);
92e92d71 1340 if (r < 0)
6257e2fb 1341 return r;
86562420 1342
f46c706b 1343 /* Collect the rest of the process metadata retrieved from the runtime */
db9ac801 1344 r = gather_pid_metadata_from_procfs(iovw, &context);
f46c706b 1345 if (r < 0)
6257e2fb 1346 return r;
f46c706b 1347
1e344c1d 1348 if (!context.is_journald)
f46c706b 1349 /* OK, now we know it's not the journal, hence we can make use of it now. */
1e344c1d 1350 log_set_target_and_open(LOG_TARGET_JOURNAL_OR_KMSG);
f46c706b
FB
1351
1352 /* If this is PID 1 disable coredump collection, we'll unlikely be able to process
1353 * it later on.
1354 *
1355 * FIXME: maybe we should disable coredumps generation from the beginning and
1356 * re-enable it only when we know it's either safe (ie we're not running OOM) or
1357 * it's not pid1 ? */
1358 if (context.is_pid1) {
1359 log_notice("Due to PID 1 having crashed coredump collection will now be turned off.");
1360 disable_coredumps();
1361 }
34c10968 1362
f46c706b 1363 if (context.is_journald || context.is_pid1)
6257e2fb 1364 return submit_coredump(&context, iovw, STDIN_FILENO);
9aa82023 1365
6257e2fb 1366 return send_iovec(iovw, STDIN_FILENO);
3c171f0b 1367}
34c10968 1368
988e89ee 1369static int process_backtrace(int argc, char *argv[]) {
3a19fe46
YW
1370 _cleanup_(journal_importer_cleanup) JournalImporter importer = JOURNAL_IMPORTER_INIT(STDIN_FILENO);
1371 _cleanup_(iovw_free_freep) struct iovec_wrapper *iovw = NULL;
f46c706b 1372 Context context = {};
9a435388 1373 char *message;
988e89ee
ZJS
1374 int r;
1375
1376 log_debug("Processing backtrace on stdin...");
1377
9a435388
FB
1378 iovw = iovw_new();
1379 if (!iovw)
5b45a160
ZJS
1380 return log_oom();
1381
2a3bebd0
FB
1382 (void) iovw_put_string_field(iovw, "MESSAGE_ID=", SD_MESSAGE_BACKTRACE_STR);
1383 (void) iovw_put_string_field(iovw, "PRIORITY=", STRINGIFY(LOG_CRIT));
f46c706b
FB
1384
1385 /* Collect all process metadata from argv[] by making sure to skip the
1386 * '--backtrace' option */
1387 r = gather_pid_metadata_from_argv(iovw, &context, argc - 2, argv + 2);
988e89ee 1388 if (r < 0)
3a19fe46 1389 return r;
aaeb2522 1390
f46c706b 1391 /* Collect the rest of the process metadata retrieved from the runtime */
db9ac801 1392 r = gather_pid_metadata_from_procfs(iovw, &context);
f46c706b 1393 if (r < 0)
3a19fe46 1394 return r;
988e89ee 1395
86562420 1396 for (;;) {
5b45a160 1397 r = journal_importer_process_data(&importer);
3a19fe46
YW
1398 if (r < 0)
1399 return log_error_errno(r, "Failed to parse journal entry on stdin: %m");
d74dc4f2
ZJS
1400 if (r == 1 || /* complete entry */
1401 journal_importer_eof(&importer)) /* end of data */
5b45a160 1402 break;
988e89ee 1403 }
988e89ee 1404
5b45a160
ZJS
1405 if (journal_importer_eof(&importer)) {
1406 log_warning("Did not receive a full journal entry on stdin, ignoring message sent by reporter");
988e89ee 1407
f46c706b
FB
1408 message = strjoina("Process ", context.meta[META_ARGV_PID],
1409 " (", context.meta[META_COMM], ")"
1410 " of user ", context.meta[META_ARGV_UID],
1411 " failed with ", context.meta[META_ARGV_SIGNAL]);
9a435388
FB
1412
1413 r = iovw_put_string_field(iovw, "MESSAGE=", message);
1414 if (r < 0)
3a19fe46 1415 return r;
5b45a160 1416 } else {
3a19fe46
YW
1417 /* The imported iovecs are not supposed to be freed by us so let's copy and merge them at the
1418 * end of the array. */
1419 r = iovw_append(iovw, &importer.iovw);
1420 if (r < 0)
1421 return r;
9a435388 1422 }
988e89ee 1423
9a435388 1424 r = sd_journal_sendv(iovw->iovec, iovw->count);
988e89ee 1425 if (r < 0)
3a19fe46 1426 return log_error_errno(r, "Failed to log backtrace: %m");
988e89ee 1427
3a19fe46 1428 return 0;
988e89ee
ZJS
1429}
1430
4515a95e 1431static int run(int argc, char *argv[]) {
3c171f0b 1432 int r;
fee80f69 1433
9aa82023
ZJS
1434 /* First, log to a safe place, since we don't know what crashed and it might
1435 * be journald which we'd rather not log to then. */
8d4e028f 1436
1e344c1d 1437 log_set_target_and_open(LOG_TARGET_KMSG);
8d4e028f 1438
3c171f0b
LP
1439 /* Make sure we never enter a loop */
1440 (void) prctl(PR_SET_DUMPABLE, 0);
8d4e028f 1441
3c171f0b
LP
1442 /* Ignore all parse errors */
1443 (void) parse_config();
fee80f69 1444
3c171f0b
LP
1445 log_debug("Selected storage '%s'.", coredump_storage_to_string(arg_storage));
1446 log_debug("Selected compression %s.", yes_no(arg_compress));
fee80f69 1447
3c171f0b 1448 r = sd_listen_fds(false);
4515a95e
ZJS
1449 if (r < 0)
1450 return log_error_errno(r, "Failed to determine the number of file descriptors: %m");
fee80f69 1451
9aa82023
ZJS
1452 /* If we got an fd passed, we are running in coredumpd mode. Otherwise we
1453 * are invoked from the kernel as coredump handler. */
988e89ee
ZJS
1454 if (r == 0) {
1455 if (streq_ptr(argv[1], "--backtrace"))
4515a95e 1456 return process_backtrace(argc, argv);
988e89ee 1457 else
4515a95e 1458 return process_kernel(argc, argv);
988e89ee 1459 } else if (r == 1)
4515a95e 1460 return process_socket(SD_LISTEN_FDS_START);
f5e04665 1461
baaa35ad
ZJS
1462 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1463 "Received unexpected number of file descriptors.");
f5e04665 1464}
4515a95e
ZJS
1465
1466DEFINE_MAIN_FUNCTION(run);