]> git.ipfire.org Git - thirdparty/systemd.git/blame - src/coredump/coredump.c
NEWS: add warnings about read-only fs and libkmod being dlopen'ed
[thirdparty/systemd.git] / src / coredump / coredump.c
CommitLineData
db9ecf05 1/* SPDX-License-Identifier: LGPL-2.1-or-later */
f5e04665
LP
2
3#include <errno.h>
803a3464
LP
4#include <stdio.h>
5#include <sys/prctl.h>
587f2a5e 6#include <sys/statvfs.h>
3e4d0f6c 7#include <sys/auxv.h>
cacd6403 8#include <sys/xattr.h>
4f5dd394 9#include <unistd.h>
f5e04665 10
73a99163 11#include "sd-daemon.h"
f11943c5
LP
12#include "sd-journal.h"
13#include "sd-login.h"
73a99163 14#include "sd-messages.h"
4f5dd394
LP
15
16#include "acl-util.h"
b5efdb8a 17#include "alloc-util.h"
587f2a5e 18#include "bus-error.h"
430f0182 19#include "capability-util.h"
ba1261bc 20#include "cgroup-util.h"
4f5dd394 21#include "compress.h"
34c10968
LP
22#include "conf-parser.h"
23#include "copy.h"
c8715007 24#include "coredump-util.h"
f11943c5 25#include "coredump-vacuum.h"
a0956174 26#include "dirent-util.h"
ea680f05 27#include "elf-util.h"
4f5dd394 28#include "escape.h"
3ffd4af2 29#include "fd-util.h"
4f5dd394 30#include "fileio.h"
f4f15635 31#include "fs-util.h"
bd1ae178 32#include "iovec-util.h"
b18453ed 33#include "journal-importer.h"
5edf875b 34#include "journal-send.h"
4f5dd394
LP
35#include "log.h"
36#include "macro.h"
5e332028 37#include "main-func.h"
0a970718 38#include "memory-util.h"
2485b7e2 39#include "memstream-util.h"
35cd0ba5 40#include "mkdir-label.h"
a108c43e 41#include "namespace-util.h"
6bedfcbb 42#include "parse-util.h"
a108c43e 43#include "path-util.h"
0b452006 44#include "process-util.h"
d14bcb4e 45#include "signal-util.h"
3c171f0b 46#include "socket-util.h"
4f5dd394 47#include "special.h"
587f2a5e 48#include "stat-util.h"
8b43440b 49#include "string-table.h"
07630cea 50#include "string-util.h"
4f5dd394 51#include "strv.h"
bf819d3a 52#include "sync-util.h"
e4de7287 53#include "tmpfile-util.h"
8e1ac16b 54#include "uid-classification.h"
b1d4f8e1 55#include "user-util.h"
34727273 56
/* Upper bound on the size of coredumps we are willing to process: 1G on 32-bit systems and 32G on
 * 64-bit systems. */
#if __SIZEOF_POINTER__ == 4
#define PROCESS_SIZE_MAX ((uint64_t) (1LLU*1024LLU*1024LLU*1024LLU))
#elif __SIZEOF_POINTER__ == 8
#define PROCESS_SIZE_MAX ((uint64_t) (32LLU*1024LLU*1024LLU*1024LLU))
#else
#error "Unexpected pointer size"
#endif

/* Upper bound on the size of coredumps we leave around on disk */
#define EXTERNAL_SIZE_MAX PROCESS_SIZE_MAX

/* Upper bound on the size of coredumps we store in the journal */
#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
/* oss-fuzz limits memory usage. */
#define JOURNAL_SIZE_MAX ((size_t) (10LU*1024LU*1024LU))
#else
#define JOURNAL_SIZE_MAX ((size_t) (767LU*1024LU*1024LU))
#endif

/* When checking for available memory and setting lower limits, never go below 4MB for writing core
 * files to storage. */
#define PROCESS_SIZE_MIN (4U*1024U*1024U)
80
c4aa09b0 81/* Make sure to not make this larger than the maximum journal entry
27f931d1 82 * size. See DATA_SIZE_MAX in journal-importer.h. */
874bc134 83assert_cc(JOURNAL_SIZE_MAX <= DATA_SIZE_MAX);
f5e04665
LP
84
enum {
        /* Indexes into the process metadata cache.
         *
         * The leading entries mirror, in order, the strings the kernel hands us via argv[] according to
         * the pattern we install in /proc/sys/kernel/core_pattern (see man:core(5)). */

        META_ARGV_PID = 0,      /* %P: as seen in the initial pid namespace */
        META_ARGV_UID,          /* %u: as seen in the initial user namespace */
        META_ARGV_GID,          /* %g: as seen in the initial user namespace */
        META_ARGV_SIGNAL,       /* %s: number of signal causing dump */
        META_ARGV_TIMESTAMP,    /* %t: time of dump, expressed as seconds since the Epoch (we expand this to μs granularity) */
        META_ARGV_RLIMIT,       /* %c: core file size soft resource limit */
        META_ARGV_HOSTNAME,     /* %h: hostname */
        _META_ARGV_MAX,

        /* A couple of special fields that are cached because they are needed quickly for naming
         * coredump files and attaching xattrs. In contrast to the argv[] entries these are retrieved
         * from the runtime environment. */

        META_COMM = _META_ARGV_MAX,
        _META_MANDATORY_MAX,

        /* Same idea as the group before, but a missing entry here is not treated as a failure. */

        META_EXE = _META_MANDATORY_MAX,
        META_UNIT,
        META_PROC_AUXV,
        _META_MAX
};
117
f46c706b 118static const char * const meta_field_names[_META_MAX] = {
510a1466
ZJS
119 [META_ARGV_PID] = "COREDUMP_PID=",
120 [META_ARGV_UID] = "COREDUMP_UID=",
121 [META_ARGV_GID] = "COREDUMP_GID=",
122 [META_ARGV_SIGNAL] = "COREDUMP_SIGNAL=",
123 [META_ARGV_TIMESTAMP] = "COREDUMP_TIMESTAMP=",
124 [META_ARGV_RLIMIT] = "COREDUMP_RLIMIT=",
125 [META_ARGV_HOSTNAME] = "COREDUMP_HOSTNAME=",
126 [META_COMM] = "COREDUMP_COMM=",
127 [META_EXE] = "COREDUMP_EXE=",
128 [META_UNIT] = "COREDUMP_UNIT=",
3e4d0f6c 129 [META_PROC_AUXV] = "COREDUMP_PROC_AUXV=",
f46c706b
FB
130};
131
132typedef struct Context {
133 const char *meta[_META_MAX];
3e4d0f6c 134 size_t meta_size[_META_MAX];
f46c706b 135 pid_t pid;
9764bca9
NR
136 uid_t uid;
137 gid_t gid;
f46c706b
FB
138 bool is_pid1;
139 bool is_journald;
140} Context;
141
34c10968
LP
/* Where coredumps get stored, as selected by the Storage= setting. */
typedef enum CoredumpStorage {
        COREDUMP_STORAGE_NONE = 0,
        COREDUMP_STORAGE_EXTERNAL,
        COREDUMP_STORAGE_JOURNAL,
        _COREDUMP_STORAGE_MAX,
        _COREDUMP_STORAGE_INVALID = -EINVAL,
} CoredumpStorage;
149
34c10968 150static const char* const coredump_storage_table[_COREDUMP_STORAGE_MAX] = {
510a1466 151 [COREDUMP_STORAGE_NONE] = "none",
34c10968 152 [COREDUMP_STORAGE_EXTERNAL] = "external",
510a1466 153 [COREDUMP_STORAGE_JOURNAL] = "journal",
34c10968
LP
154};
155
156DEFINE_PRIVATE_STRING_TABLE_LOOKUP(coredump_storage, CoredumpStorage);
8c9571d0 157static DEFINE_CONFIG_PARSE_ENUM(config_parse_coredump_storage, coredump_storage, CoredumpStorage, "Failed to parse storage setting");
34727273
ZJS
158
159static CoredumpStorage arg_storage = COREDUMP_STORAGE_EXTERNAL;
8c9571d0 160static bool arg_compress = true;
59f448cf
LP
161static uint64_t arg_process_size_max = PROCESS_SIZE_MAX;
162static uint64_t arg_external_size_max = EXTERNAL_SIZE_MAX;
6e2b4a69 163static uint64_t arg_journal_size_max = JOURNAL_SIZE_MAX;
f5fbe71d
YW
164static uint64_t arg_keep_free = UINT64_MAX;
165static uint64_t arg_max_use = UINT64_MAX;
34c10968
LP
166
167static int parse_config(void) {
34c10968 168 static const ConfigTableItem items[] = {
510a1466
ZJS
169 { "Coredump", "Storage", config_parse_coredump_storage, 0, &arg_storage },
170 { "Coredump", "Compress", config_parse_bool, 0, &arg_compress },
171 { "Coredump", "ProcessSizeMax", config_parse_iec_uint64, 0, &arg_process_size_max },
172 { "Coredump", "ExternalSizeMax", config_parse_iec_uint64_infinity, 0, &arg_external_size_max },
173 { "Coredump", "JournalSizeMax", config_parse_iec_size, 0, &arg_journal_size_max },
174 { "Coredump", "KeepFree", config_parse_iec_uint64, 0, &arg_keep_free },
175 { "Coredump", "MaxUse", config_parse_iec_uint64, 0, &arg_max_use },
34c10968
LP
176 {}
177 };
178
4a78074f
LP
179 int r;
180
6378f257 181 r = config_parse_standard_file_with_dropins(
e5abff37 182 "systemd/coredump.conf",
4a78074f
LP
183 "Coredump\0",
184 config_item_table_lookup,
185 items,
186 CONFIG_PARSE_WARN,
187 /* userdata= */ NULL);
188 if (r < 0)
189 return r;
190
191 /* Let's make sure we fix up the maximum size we send to the journal here on the client side, for
192 * efficiency reasons. journald wouldn't accept anything larger anyway. */
193 if (arg_journal_size_max > JOURNAL_SIZE_MAX) {
194 log_warning("JournalSizeMax= set to larger value (%s) than journald would accept (%s), lowering automatically.",
195 FORMAT_BYTES(arg_journal_size_max), FORMAT_BYTES(JOURNAL_SIZE_MAX));
196 arg_journal_size_max = JOURNAL_SIZE_MAX;
197 }
198
199 return 0;
34c10968
LP
200}
201
a1e92eee 202static uint64_t storage_size_max(void) {
ee0449fd
ZJS
203 if (arg_storage == COREDUMP_STORAGE_EXTERNAL)
204 return arg_external_size_max;
205 if (arg_storage == COREDUMP_STORAGE_JOURNAL)
206 return arg_journal_size_max;
207 assert(arg_storage == COREDUMP_STORAGE_NONE);
208 return 0;
73a99163
ZJS
209}
210
3e4d0f6c
ZJS
/* Grants the given user read access to the coredump file via an ACL entry, if ACL support is compiled
 * in and access is permitted. Returns 0 if nothing had to be done or on success, negative on error. */
static int fix_acl(int fd, uid_t uid, bool allow_user) {
        assert(fd >= 0);
        assert(uid_is_valid(uid));

#if HAVE_ACL
        /* Two reasons to leave the file alone: users may not read coredumps if the uid or capabilities
         * were changed (!allow_user), and system, dynamic and nobody users never get an entry. */
        if (!allow_user || uid_is_system(uid) || uid_is_dynamic(uid) || uid == UID_NOBODY)
                return 0;

        /* Make sure normal users can read (but not write or delete) their own coredumps */
        int r = fd_add_uid_acl_permission(fd, uid, ACL_READ);
        if (r < 0)
                return log_error_errno(r, "Failed to adjust ACL of the coredump: %m");
#endif

        return 0;
}
233
f46c706b
FB
234static int fix_xattr(int fd, const Context *context) {
235
236 static const char * const xattrs[_META_MAX] = {
510a1466
ZJS
237 [META_ARGV_PID] = "user.coredump.pid",
238 [META_ARGV_UID] = "user.coredump.uid",
239 [META_ARGV_GID] = "user.coredump.gid",
240 [META_ARGV_SIGNAL] = "user.coredump.signal",
241 [META_ARGV_TIMESTAMP] = "user.coredump.timestamp",
242 [META_ARGV_RLIMIT] = "user.coredump.rlimit",
243 [META_ARGV_HOSTNAME] = "user.coredump.hostname",
244 [META_COMM] = "user.coredump.comm",
245 [META_EXE] = "user.coredump.exe",
0cd77f97
LP
246 };
247
34c10968
LP
248 int r = 0;
249
b59233e6
LP
250 assert(fd >= 0);
251
60ecc386 252 /* Attach some metadata to coredumps via extended attributes. Just because we can. */
34c10968 253
fe96c0f8 254 for (unsigned i = 0; i < _META_MAX; i++) {
1eef15b1
ZJS
255 int k;
256
f46c706b 257 if (isempty(context->meta[i]) || !xattrs[i])
0cd77f97 258 continue;
34c10968 259
60ecc386
ZJS
260 k = RET_NERRNO(fsetxattr(fd, xattrs[i], context->meta[i], strlen(context->meta[i]), XATTR_CREATE));
261 RET_GATHER(r, k);
0cd77f97 262 }
34c10968
LP
263
264 return r;
265}
266
/* Escapes the characters ".", "/" and " " so that the string can be used as a file name component */
#define filename_escape(s) xescape((s), "./ ")

/* Returns a printable name for a temporary file whose path may be NULL. */
static const char *coredump_tmpfile_name(const char *s) {
        if (s)
                return s;

        return "(unnamed temporary file)";
}
272
b59233e6
LP
273static int fix_permissions(
274 int fd,
275 const char *filename,
276 const char *target,
f46c706b 277 const Context *context,
3e4d0f6c 278 bool allow_user) {
b59233e6 279
03532f0a
LP
280 int r;
281
b59233e6 282 assert(fd >= 0);
b59233e6 283 assert(target);
3c171f0b 284 assert(context);
cfd652ed
ZJS
285
286 /* Ignore errors on these */
3c171f0b 287 (void) fchmod(fd, 0640);
9764bca9 288 (void) fix_acl(fd, context->uid, allow_user);
3c171f0b 289 (void) fix_xattr(fd, context);
cfd652ed 290
74402bf0 291 r = link_tmpfile(fd, filename, target, LINK_TMPFILE_SYNC);
03532f0a
LP
292 if (r < 0)
293 return log_error_errno(r, "Failed to move coredump %s into place: %m", target);
cfd652ed
ZJS
294
295 return 0;
296}
297
59f448cf 298static int maybe_remove_external_coredump(const char *filename, uint64_t size) {
cfd652ed 299
b59233e6 300 /* Returns 1 if might remove, 0 if will not remove, < 0 on error. */
cfd652ed 301
fc6cec86 302 if (arg_storage == COREDUMP_STORAGE_EXTERNAL &&
cfd652ed
ZJS
303 size <= arg_external_size_max)
304 return 0;
305
306 if (!filename)
307 return 1;
308
4a62c710
MS
309 if (unlink(filename) < 0 && errno != ENOENT)
310 return log_error_errno(errno, "Failed to unlink %s: %m", filename);
cfd652ed
ZJS
311
312 return 1;
313}
314
f46c706b 315static int make_filename(const Context *context, char **ret) {
b59233e6 316 _cleanup_free_ char *c = NULL, *u = NULL, *p = NULL, *t = NULL;
a7f7d1bd 317 sd_id128_t boot = {};
34c10968
LP
318 int r;
319
3c171f0b 320 assert(context);
34c10968 321
f46c706b 322 c = filename_escape(context->meta[META_COMM]);
34c10968 323 if (!c)
b59233e6 324 return -ENOMEM;
34c10968 325
f46c706b 326 u = filename_escape(context->meta[META_ARGV_UID]);
0dc5d23c 327 if (!u)
b59233e6 328 return -ENOMEM;
34c10968
LP
329
330 r = sd_id128_get_boot(&boot);
b59233e6 331 if (r < 0)
34c10968 332 return r;
34c10968 333
f46c706b 334 p = filename_escape(context->meta[META_ARGV_PID]);
b59233e6
LP
335 if (!p)
336 return -ENOMEM;
337
f46c706b 338 t = filename_escape(context->meta[META_ARGV_TIMESTAMP]);
b59233e6
LP
339 if (!t)
340 return -ENOMEM;
341
342 if (asprintf(ret,
64a5384f 343 "/var/lib/systemd/coredump/core.%s.%s." SD_ID128_FORMAT_STR ".%s.%s",
34c10968 344 c,
0dc5d23c 345 u,
34c10968
LP
346 SD_ID128_FORMAT_VAL(boot),
347 p,
b59233e6
LP
348 t) < 0)
349 return -ENOMEM;
350
351 return 0;
352}
353
3e4d0f6c
ZJS
354static int grant_user_access(int core_fd, const Context *context) {
355 int at_secure = -1;
356 uid_t uid = UID_INVALID, euid = UID_INVALID;
357 uid_t gid = GID_INVALID, egid = GID_INVALID;
358 int r;
359
360 assert(core_fd >= 0);
361 assert(context);
362
363 if (!context->meta[META_PROC_AUXV])
364 return log_warning_errno(SYNTHETIC_ERRNO(ENODATA), "No auxv data, not adjusting permissions.");
365
366 uint8_t elf[EI_NIDENT];
367 errno = 0;
368 if (pread(core_fd, &elf, sizeof(elf), 0) != sizeof(elf))
369 return log_warning_errno(errno_or_else(EIO),
370 "Failed to pread from coredump fd: %s", STRERROR_OR_EOF(errno));
371
372 if (elf[EI_MAG0] != ELFMAG0 ||
373 elf[EI_MAG1] != ELFMAG1 ||
374 elf[EI_MAG2] != ELFMAG2 ||
375 elf[EI_MAG3] != ELFMAG3 ||
376 elf[EI_VERSION] != EV_CURRENT)
377 return log_info_errno(SYNTHETIC_ERRNO(EUCLEAN),
378 "Core file does not have ELF header, not adjusting permissions.");
379 if (!IN_SET(elf[EI_CLASS], ELFCLASS32, ELFCLASS64) ||
380 !IN_SET(elf[EI_DATA], ELFDATA2LSB, ELFDATA2MSB))
381 return log_info_errno(SYNTHETIC_ERRNO(EUCLEAN),
382 "Core file has strange ELF class, not adjusting permissions.");
383
384 if ((elf[EI_DATA] == ELFDATA2LSB) != (__BYTE_ORDER == __LITTLE_ENDIAN))
385 return log_info_errno(SYNTHETIC_ERRNO(EUCLEAN),
386 "Core file has non-native endianness, not adjusting permissions.");
387
cb38fdbe
ZJS
388 r = parse_auxv(LOG_WARNING,
389 /* elf_class= */ elf[EI_CLASS],
390 context->meta[META_PROC_AUXV],
391 context->meta_size[META_PROC_AUXV],
392 &at_secure, &uid, &euid, &gid, &egid);
3e4d0f6c
ZJS
393 if (r < 0)
394 return r;
395
396 /* We allow access if we got all the data and at_secure is not set and
397 * the uid/gid matches euid/egid. */
398 bool ret =
399 at_secure == 0 &&
400 uid != UID_INVALID && euid != UID_INVALID && uid == euid &&
401 gid != GID_INVALID && egid != GID_INVALID && gid == egid;
402 log_debug("Will %s access (uid="UID_FMT " euid="UID_FMT " gid="GID_FMT " egid="GID_FMT " at_secure=%s)",
403 ret ? "permit" : "restrict",
404 uid, euid, gid, egid, yes_no(at_secure));
405 return ret;
406}
407
b59233e6 408static int save_external_coredump(
f46c706b 409 const Context *context,
3c171f0b 410 int input_fd,
b59233e6 411 char **ret_filename,
5f3e0a74
HW
412 int *ret_node_fd,
413 int *ret_data_fd,
0cd4e913 414 uint64_t *ret_size,
587f2a5e 415 uint64_t *ret_compressed_size,
cc4419ed 416 bool *ret_truncated) {
b59233e6 417
587f2a5e
LB
418 _cleanup_(unlink_and_freep) char *tmp = NULL;
419 _cleanup_free_ char *fn = NULL;
254d1313 420 _cleanup_close_ int fd = -EBADF;
ee0449fd 421 uint64_t rlimit, process_limit, max_size;
587f2a5e 422 bool truncated, storage_on_tmpfs;
b59233e6
LP
423 struct stat st;
424 int r;
425
3c171f0b 426 assert(context);
b59233e6 427 assert(ret_filename);
5f3e0a74
HW
428 assert(ret_node_fd);
429 assert(ret_data_fd);
b59233e6 430 assert(ret_size);
587f2a5e
LB
431 assert(ret_compressed_size);
432 assert(ret_truncated);
b59233e6 433
f46c706b 434 r = safe_atou64(context->meta[META_ARGV_RLIMIT], &rlimit);
bdfd7b2c 435 if (r < 0)
f46c706b
FB
436 return log_error_errno(r, "Failed to parse resource limit '%s': %m",
437 context->meta[META_ARGV_RLIMIT]);
d7a0f1f4 438 if (rlimit < page_size())
f46c706b 439 /* Is coredumping disabled? Then don't bother saving/processing the
3a559f22 440 * coredump. Anything below PAGE_SIZE cannot give a readable coredump
f46c706b
FB
441 * (the kernel uses ELF_EXEC_PAGESIZE which is not easily accessible, but
442 * is usually the same as PAGE_SIZE. */
baaa35ad
ZJS
443 return log_info_errno(SYNTHETIC_ERRNO(EBADSLT),
444 "Resource limits disable core dumping for process %s (%s).",
f46c706b 445 context->meta[META_ARGV_PID], context->meta[META_COMM]);
bdfd7b2c 446
ee0449fd 447 process_limit = MAX(arg_process_size_max, storage_size_max());
baaa35ad
ZJS
448 if (process_limit == 0)
449 return log_debug_errno(SYNTHETIC_ERRNO(EBADSLT),
450 "Limits for coredump processing and storage are both 0, not dumping core.");
ee0449fd 451
bdfd7b2c 452 /* Never store more than the process configured, or than we actually shall keep or process */
ee0449fd 453 max_size = MIN(rlimit, process_limit);
bdfd7b2c 454
3c171f0b 455 r = make_filename(context, &fn);
23bbb0de
MS
456 if (r < 0)
457 return log_error_errno(r, "Failed to determine coredump file name: %m");
34c10968 458
1fbe8d0c 459 (void) mkdir_parents_label(fn, 0755);
803a3464 460
03532f0a 461 fd = open_tmpfile_linkable(fn, O_RDWR|O_CLOEXEC, &tmp);
4a62c710 462 if (fd < 0)
03532f0a 463 return log_error_errno(fd, "Failed to create temporary file for coredump %s: %m", fn);
803a3464 464
587f2a5e
LB
465 /* If storage is on tmpfs, the kernel oomd might kill us if there's MemoryMax set on
466 * the service or the slice it belongs to. This is common on low-resources systems,
467 * to avoid crashing processes to take away too many system resources.
468 * Check the cgroup settings, and set max_size to a bit less than half of the
469 * available memory left to the process.
470 * Then, attempt to write the core file uncompressed first - if the write gets
471 * interrupted, we know we won't be able to write it all, so instead compress what
472 * was written so far, delete the uncompressed truncated core, and then continue
473 * compressing from STDIN. Given the compressed core cannot be larger than the
474 * uncompressed one, and 1KB for metadata is accounted for in the calculation, we
475 * should be able to at least store the full compressed core file. */
476
477 storage_on_tmpfs = fd_is_temporary_fs(fd) > 0;
478 if (storage_on_tmpfs && arg_compress) {
479 _cleanup_(sd_bus_flush_close_unrefp) sd_bus *bus = NULL;
480 uint64_t cgroup_limit = UINT64_MAX;
481 struct statvfs sv;
482
483 /* If we can't get the cgroup limit, just ignore it, but don't fail,
484 * try anyway with the config settings. */
485 r = sd_bus_default_system(&bus);
486 if (r < 0)
487 log_info_errno(r, "Failed to connect to system bus, skipping MemoryAvailable check: %m");
488 else {
489 _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL;
490
491 r = sd_bus_get_property_trivial(
492 bus,
493 "org.freedesktop.systemd1",
494 "/org/freedesktop/systemd1/unit/self",
495 "org.freedesktop.systemd1.Service",
496 "MemoryAvailable",
497 &error,
498 't', &cgroup_limit);
499 if (r < 0)
500 log_warning_errno(r,
501 "Failed to query MemoryAvailable for current unit, "
502 "falling back to static config settings: %s",
503 bus_error_message(&error, r));
504 }
803a3464 505
587f2a5e
LB
506 max_size = MIN(cgroup_limit, max_size);
507 max_size = LESS_BY(max_size, 1024U) / 2; /* Account for 1KB metadata overhead for compressing */
508 max_size = MAX(PROCESS_SIZE_MIN, max_size); /* Impose a lower minimum */
509
510 /* tmpfs might get full quickly, so check the available space too.
511 * But don't worry about errors here, failing to access the storage
512 * location will be better logged when writing to it. */
8facac5f 513 if (fstatvfs(fd, &sv) >= 0)
587f2a5e 514 max_size = MIN((uint64_t)sv.f_frsize * (uint64_t)sv.f_bfree, max_size);
34c10968 515
587f2a5e 516 log_debug("Limiting core file size to %" PRIu64 " bytes due to cgroup memory limits.", max_size);
7849c2ac
TA
517 }
518
587f2a5e
LB
519 r = copy_bytes(input_fd, fd, max_size, 0);
520 if (r < 0)
521 return log_error_errno(r, "Cannot store coredump of %s (%s): %m",
522 context->meta[META_ARGV_PID], context->meta[META_COMM]);
523 truncated = r == 1;
cfd652ed 524
3e4d0f6c
ZJS
525 bool allow_user = grant_user_access(fd, context) > 0;
526
587f2a5e
LB
527#if HAVE_COMPRESSION
528 if (arg_compress) {
529 _cleanup_(unlink_and_freep) char *tmp_compressed = NULL;
530 _cleanup_free_ char *fn_compressed = NULL;
254d1313 531 _cleanup_close_ int fd_compressed = -EBADF;
587f2a5e
LB
532 uint64_t uncompressed_size = 0;
533
86cbbc6d 534 if (lseek(fd, 0, SEEK_SET) < 0)
587f2a5e 535 return log_error_errno(errno, "Failed to seek on coredump %s: %m", fn);
cfd652ed 536
ee00684c 537 fn_compressed = strjoin(fn, default_compression_extension());
587f2a5e
LB
538 if (!fn_compressed)
539 return log_oom();
cfd652ed 540
03532f0a 541 fd_compressed = open_tmpfile_linkable(fn_compressed, O_RDWR|O_CLOEXEC, &tmp_compressed);
587f2a5e
LB
542 if (fd_compressed < 0)
543 return log_error_errno(fd_compressed, "Failed to create temporary file for coredump %s: %m", fn_compressed);
cfd652ed 544
587f2a5e
LB
545 r = compress_stream(fd, fd_compressed, max_size, &uncompressed_size);
546 if (r < 0)
547 return log_error_errno(r, "Failed to compress %s: %m", coredump_tmpfile_name(tmp_compressed));
548
549 if (truncated && storage_on_tmpfs) {
550 uint64_t partial_uncompressed_size = 0;
551
552 /* Uncompressed write was truncated and we are writing to tmpfs: delete
553 * the uncompressed core, and compress the remaining part from STDIN. */
554
555 tmp = unlink_and_free(tmp);
556 fd = safe_close(fd);
557
558 r = compress_stream(input_fd, fd_compressed, max_size, &partial_uncompressed_size);
559 if (r < 0)
560 return log_error_errno(r, "Failed to compress %s: %m", coredump_tmpfile_name(tmp_compressed));
561 uncompressed_size += partial_uncompressed_size;
b59233e6
LP
562 }
563
9764bca9 564 r = fix_permissions(fd_compressed, tmp_compressed, fn_compressed, context, allow_user);
cfd652ed 565 if (r < 0)
587f2a5e 566 return r;
b59233e6 567
587f2a5e
LB
568 if (fstat(fd_compressed, &st) < 0)
569 return log_error_errno(errno,
570 "Failed to fstat core file %s: %m",
571 coredump_tmpfile_name(tmp_compressed));
cfd652ed 572
587f2a5e
LB
573 *ret_filename = TAKE_PTR(fn_compressed); /* compressed */
574 *ret_node_fd = TAKE_FD(fd_compressed); /* compressed */
575 *ret_compressed_size = (uint64_t) st.st_size; /* compressed */
576 *ret_data_fd = TAKE_FD(fd);
577 *ret_size = uncompressed_size;
578 *ret_truncated = truncated;
579 tmp_compressed = mfree(tmp_compressed);
cfd652ed 580
cfd652ed 581 return 0;
34c10968 582 }
3b1a55e1 583#endif
5f3e0a74 584
587f2a5e
LB
585 if (truncated)
586 log_struct(LOG_INFO,
08e86b15
DDM
587 LOG_MESSAGE("Core file was truncated to %"PRIu64" bytes.", max_size),
588 "SIZE_LIMIT=%"PRIu64, max_size,
587f2a5e
LB
589 "MESSAGE_ID=" SD_MESSAGE_TRUNCATED_CORE_STR);
590
9764bca9 591 r = fix_permissions(fd, tmp, fn, context, allow_user);
cfd652ed 592 if (r < 0)
587f2a5e
LB
593 return log_error_errno(r, "Failed to fix permissions and finalize coredump %s into %s: %m", coredump_tmpfile_name(tmp), fn);
594
595 if (fstat(fd, &st) < 0)
596 return log_error_errno(errno, "Failed to fstat core file %s: %m", coredump_tmpfile_name(tmp));
597
86cbbc6d 598 if (lseek(fd, 0, SEEK_SET) < 0)
587f2a5e 599 return log_error_errno(errno, "Failed to seek on coredump %s: %m", fn);
34c10968 600
0cfb0971 601 *ret_filename = TAKE_PTR(fn);
1cc6c93a 602 *ret_data_fd = TAKE_FD(fd);
59f448cf 603 *ret_size = (uint64_t) st.st_size;
587f2a5e 604 *ret_truncated = truncated;
34c10968 605
34c10968 606 return 0;
34c10968
LP
607}
608
609static int allocate_journal_field(int fd, size_t size, char **ret, size_t *ret_size) {
610 _cleanup_free_ char *field = NULL;
611 ssize_t n;
612
8d4e028f 613 assert(fd >= 0);
34c10968
LP
614 assert(ret);
615 assert(ret_size);
616
86cbbc6d 617 if (lseek(fd, 0, SEEK_SET) < 0)
4a62c710 618 return log_warning_errno(errno, "Failed to seek: %m");
803a3464 619
34c10968 620 field = malloc(9 + size);
a73c74db
LP
621 if (!field)
622 return log_warning_errno(SYNTHETIC_ERRNO(ENOMEM),
623 "Failed to allocate memory for coredump, coredump will not be stored.");
34c10968
LP
624
625 memcpy(field, "COREDUMP=", 9);
626
a73c74db
LP
627 /* NB: simple read() would fail for overly large coredumps, since read() on Linux can only deal with
628 * 0x7ffff000 bytes max. Hence call things in a loop. */
629 n = loop_read(fd, field + 9, size, /* do_poll= */ false);
23bbb0de
MS
630 if (n < 0)
631 return log_error_errno((int) n, "Failed to read core data: %m");
baaa35ad
ZJS
632 if ((size_t) n < size)
633 return log_error_errno(SYNTHETIC_ERRNO(EIO),
634 "Core data too short.");
34c10968 635
1cc6c93a 636 *ret = TAKE_PTR(field);
34c10968
LP
637 *ret_size = size + 9;
638
34c10968
LP
639 return 0;
640}
803a3464 641
3f132692
JF
642/* Joins /proc/[pid]/fd/ and /proc/[pid]/fdinfo/ into the following lines:
643 * 0:/dev/pts/23
644 * pos: 0
645 * flags: 0100002
646 *
647 * 1:/dev/pts/23
648 * pos: 0
649 * flags: 0100002
650 *
651 * 2:/dev/pts/23
652 * pos: 0
653 * flags: 0100002
654 * EOF
655 */
2485b7e2
YW
656static int compose_open_fds(pid_t pid, char **ret) {
657 _cleanup_(memstream_done) MemStream m = {};
4d84bc2f 658 _cleanup_closedir_ DIR *proc_fd_dir = NULL;
254d1313 659 _cleanup_close_ int proc_fdinfo_fd = -EBADF;
59059b4a 660 const char *fddelim = "", *path;
2485b7e2 661 FILE *stream;
7b26ea6f 662 int r;
3f132692
JF
663
664 assert(pid >= 0);
2485b7e2 665 assert(ret);
3f132692 666
59059b4a 667 path = procfs_file_alloca(pid, "fd");
3f132692 668 proc_fd_dir = opendir(path);
59059b4a
ZJS
669 if (!proc_fd_dir)
670 return -errno;
3f132692 671
4d84bc2f 672 proc_fdinfo_fd = openat(dirfd(proc_fd_dir), "../fdinfo", O_DIRECTORY|O_NOFOLLOW|O_CLOEXEC|O_PATH);
59059b4a
ZJS
673 if (proc_fdinfo_fd < 0)
674 return -errno;
3f132692 675
2485b7e2 676 stream = memstream_init(&m);
3f132692
JF
677 if (!stream)
678 return -ENOMEM;
679
af3b864d 680 FOREACH_DIRENT(de, proc_fd_dir, return -errno) {
3f132692 681 _cleanup_fclose_ FILE *fdinfo = NULL;
4d84bc2f 682 _cleanup_free_ char *fdname = NULL;
254d1313 683 _cleanup_close_ int fd = -EBADF;
3f132692 684
af3b864d 685 r = readlinkat_malloc(dirfd(proc_fd_dir), de->d_name, &fdname);
3f132692
JF
686 if (r < 0)
687 return r;
688
af3b864d 689 fprintf(stream, "%s%s:%s\n", fddelim, de->d_name, fdname);
3f132692
JF
690 fddelim = "\n";
691
692 /* Use the directory entry from /proc/[pid]/fd with /proc/[pid]/fdinfo */
af3b864d 693 fd = openat(proc_fdinfo_fd, de->d_name, O_NOFOLLOW|O_CLOEXEC|O_RDONLY);
59059b4a 694 if (fd < 0)
3f132692
JF
695 continue;
696
b46c3e49
VC
697 fdinfo = take_fdopen(&fd, "r");
698 if (!fdinfo)
3f132692
JF
699 continue;
700
7b26ea6f
LP
701 for (;;) {
702 _cleanup_free_ char *line = NULL;
703
704 r = read_line(fdinfo, LONG_LINE_MAX, &line);
705 if (r < 0)
706 return r;
707 if (r == 0)
708 break;
709
0d536673 710 fputs(line, stream);
7b26ea6f 711 fputc('\n', stream);
4d84bc2f 712 }
3f132692
JF
713 }
714
2485b7e2 715 return memstream_finalize(&m, ret, NULL);
3f132692
JF
716}
717
7ed03ce6
JF
718/* Returns 1 if the parent was found.
719 * Returns 0 if there is not a process we can call the pid's
720 * container parent (the pid's process isn't 'containerized').
721 * Returns a negative number on errors.
722 */
723static int get_process_container_parent_cmdline(pid_t pid, char** cmdline) {
7ed03ce6
JF
724 pid_t container_pid;
725 const char *proc_root_path;
726 struct stat root_stat, proc_root_stat;
83844031 727 int r;
7ed03ce6
JF
728
729 /* To compare inodes of / and /proc/[pid]/root */
730 if (stat("/", &root_stat) < 0)
731 return -errno;
732
733 proc_root_path = procfs_file_alloca(pid, "root");
734 if (stat(proc_root_path, &proc_root_stat) < 0)
735 return -errno;
736
737 /* The process uses system root. */
c20c77ef 738 if (stat_inode_same(&proc_root_stat, &root_stat)) {
7ed03ce6
JF
739 *cmdline = NULL;
740 return 0;
741 }
742
ade39d9a 743 r = namespace_get_leader(pid, NAMESPACE_MOUNT, &container_pid);
7ed03ce6
JF
744 if (r < 0)
745 return r;
746
a034620f 747 r = pid_get_cmdline(container_pid, SIZE_MAX, PROCESS_CMDLINE_QUOTE_POSIX, cmdline);
d3cba4ea
EV
748 if (r < 0)
749 return r;
750
751 return 1;
7ed03ce6
JF
752}
753
f46c706b 754static int change_uid_gid(const Context *context) {
9764bca9
NR
755 uid_t uid = context->uid;
756 gid_t gid = context->gid;
3c171f0b 757 int r;
34c10968 758
28add648 759 if (uid_is_system(uid)) {
888e378d
LP
760 const char *user = "systemd-coredump";
761
fafff8f1 762 r = get_user_creds(&user, &uid, &gid, NULL, NULL, 0);
888e378d
LP
763 if (r < 0) {
764 log_warning_errno(r, "Cannot resolve %s user. Proceeding to dump core as root: %m", user);
765 uid = gid = 0;
766 }
888e378d 767 }
3c171f0b
LP
768
769 return drop_privileges(uid, gid, 0);
770}
8c8549db 771
3c171f0b 772static int submit_coredump(
3e4d0f6c 773 const Context *context,
9a435388 774 struct iovec_wrapper *iovw,
3c171f0b 775 int input_fd) {
34c10968 776
c546154a 777 _cleanup_(json_variant_unrefp) JsonVariant *json_metadata = NULL;
254d1313 778 _cleanup_close_ int coredump_fd = -EBADF, coredump_node_fd = -EBADF;
9a435388 779 _cleanup_free_ char *filename = NULL, *coredump_data = NULL;
51d3783d 780 _cleanup_free_ char *stacktrace = NULL;
c546154a 781 const char *module_name;
587f2a5e 782 uint64_t coredump_size = UINT64_MAX, coredump_compressed_size = UINT64_MAX;
6fea39ba 783 bool truncated = false, written = false;
c546154a 784 JsonVariant *module_json;
3c171f0b 785 int r;
83844031 786
3c171f0b 787 assert(context);
9a435388 788 assert(iovw);
3c171f0b 789 assert(input_fd >= 0);
f5e04665 790
3c171f0b
LP
791 /* Vacuum before we write anything again */
792 (void) coredump_vacuum(-1, arg_keep_free, arg_max_use);
803a3464 793
3c171f0b 794 /* Always stream the coredump to disk, if that's possible */
c8e94763
LP
795 written = save_external_coredump(
796 context, input_fd,
797 &filename, &coredump_node_fd, &coredump_fd,
798 &coredump_size, &coredump_compressed_size, &truncated) >= 0;
799 if (written) {
800 /* If we could write it to disk we can now process it. */
801 /* If we don't want to keep the coredump on disk, remove it now, as later on we
802 * will lack the privileges for it. However, we keep the fd to it, so that we can
803 * still process it and log it. */
804 r = maybe_remove_external_coredump(filename, coredump_node_fd >= 0 ? coredump_compressed_size : coredump_size);
805 if (r < 0)
806 return r;
807 if (r == 0)
808 (void) iovw_put_string_field(iovw, "COREDUMP_FILENAME=", filename);
809 else if (arg_storage == COREDUMP_STORAGE_EXTERNAL)
810 log_info("The core will not be stored: size %"PRIu64" is greater than %"PRIu64" (the configured maximum)",
811 coredump_node_fd >= 0 ? coredump_compressed_size : coredump_size, arg_external_size_max);
812
813 /* Vacuum again, but exclude the coredump we just created */
814 (void) coredump_vacuum(coredump_node_fd >= 0 ? coredump_node_fd : coredump_fd, arg_keep_free, arg_max_use);
815 }
6fea39ba 816
c8e94763
LP
817 /* Now, let's drop privileges to become the user who owns the segfaulted process and allocate the
818 * coredump memory under the user's uid. This also ensures that the credentials journald will see are
819 * the ones of the coredumping user, thus making sure the user gets access to the core dump. Let's
820 * also get rid of all capabilities, if we run as root, we won't need them anymore. */
3c171f0b
LP
821 r = change_uid_gid(context);
822 if (r < 0)
823 return log_error_errno(r, "Failed to drop privileges: %m");
34c10968 824
c8e94763
LP
825 if (written) {
826 /* Try to get a stack trace if we can */
827 if (coredump_size > arg_process_size_max)
828 log_debug("Not generating stack trace: core size %"PRIu64" is greater "
829 "than %"PRIu64" (the configured maximum)",
830 coredump_size, arg_process_size_max);
831 else if (coredump_fd >= 0) {
832 bool skip = startswith(context->meta[META_COMM], "systemd-coredum"); /* COMM is 16 bytes usually */
833
834 (void) parse_elf_object(coredump_fd,
835 context->meta[META_EXE],
836 /* fork_disable_dump= */ skip, /* avoid loops */
837 &stacktrace,
838 &json_metadata);
839 }
c790632c 840 }
51d3783d 841
6fea39ba 842 _cleanup_free_ char *core_message = NULL;
6fea39ba
LP
843 core_message = strjoin(
844 "Process ", context->meta[META_ARGV_PID],
845 " (", context->meta[META_COMM],
846 ") of user ", context->meta[META_ARGV_UID],
847 written ? " dumped core." : " terminated abnormally without generating a coredump.");
848 if (!core_message)
849 return log_oom();
850
851 if (context->is_journald && filename)
852 if (!strextend(&core_message, "\nCoredump diverted to ", filename))
853 return log_oom();
51d3783d 854
6fea39ba
LP
855 if (stacktrace)
856 if (!strextend(&core_message, "\n\n", stacktrace))
857 return log_oom();
92e92d71 858
5edf875b
DDM
859 if (context->is_journald)
860 /* We might not be able to log to the journal, so let's always print the message to another
861 * log target. The target was set previously to something safe. */
9a435388 862 log_dispatch(LOG_ERR, 0, core_message);
92e92d71 863
2a3bebd0 864 (void) iovw_put_string_field(iovw, "MESSAGE=", core_message);
3c171f0b 865
0cd4e913 866 if (truncated)
2a3bebd0 867 (void) iovw_put_string_field(iovw, "COREDUMP_TRUNCATED=", "1");
0cd4e913 868
c546154a
LB
869 /* If we managed to parse any ELF metadata (build-id, ELF package meta),
870 * attach it as journal metadata. */
871 if (json_metadata) {
872 _cleanup_free_ char *formatted_json = NULL;
873
874 r = json_variant_format(json_metadata, 0, &formatted_json);
875 if (r < 0)
876 return log_error_errno(r, "Failed to format JSON package metadata: %m");
877
671769c9 878 (void) iovw_put_string_field(iovw, "COREDUMP_PACKAGE_JSON=", formatted_json);
c546154a
LB
879 }
880
c790632c
ZJS
881 /* In the unlikely scenario that context->meta[META_EXE] is not available,
882 * let's avoid guessing the module name and skip the loop. */
883 if (context->meta[META_EXE])
884 JSON_VARIANT_OBJECT_FOREACH(module_name, module_json, json_metadata) {
885 JsonVariant *t;
c546154a 886
c790632c
ZJS
887 /* We only add structured fields for the 'main' ELF module, and only if we can identify it. */
888 if (!path_equal_filename(module_name, context->meta[META_EXE]))
889 continue;
c546154a 890
c790632c
ZJS
891 t = json_variant_by_key(module_json, "name");
892 if (t)
893 (void) iovw_put_string_field(iovw, "COREDUMP_PACKAGE_NAME=", json_variant_string(t));
1f2abb79 894
c790632c
ZJS
895 t = json_variant_by_key(module_json, "version");
896 if (t)
897 (void) iovw_put_string_field(iovw, "COREDUMP_PACKAGE_VERSION=", json_variant_string(t));
898 }
c546154a 899
3c171f0b 900 /* Optionally store the entire coredump in the journal */
587f2a5e 901 if (arg_storage == COREDUMP_STORAGE_JOURNAL && coredump_fd >= 0) {
6e9ef603
ZJS
902 if (coredump_size <= arg_journal_size_max) {
903 size_t sz = 0;
904
905 /* Store the coredump itself in the journal */
906
907 r = allocate_journal_field(coredump_fd, (size_t) coredump_size, &coredump_data, &sz);
9a435388
FB
908 if (r >= 0) {
909 if (iovw_put(iovw, coredump_data, sz) >= 0)
910 TAKE_PTR(coredump_data);
911 } else
6e9ef603
ZJS
912 log_warning_errno(r, "Failed to attach the core to the journal entry: %m");
913 } else
5206a724 914 log_info("The core will not be stored: size %"PRIu64" is greater than %"PRIu64" (the configured maximum)",
6e9ef603 915 coredump_size, arg_journal_size_max);
f5e04665
LP
916 }
917
5edf875b
DDM
918 /* If journald is coredumping, we have to be careful that we don't deadlock when trying to write the
919 * coredump to the journal, so we put the journal socket in nonblocking mode before trying to write
920 * the coredump to the socket. */
921
922 if (context->is_journald) {
923 r = journal_fd_nonblock(true);
924 if (r < 0)
925 return log_error_errno(r, "Failed to make journal socket non-blocking: %m");
926 }
927
9a435388 928 r = sd_journal_sendv(iovw->iovec, iovw->count);
5edf875b
DDM
929
930 if (context->is_journald) {
931 int k;
932
933 k = journal_fd_nonblock(false);
934 if (k < 0)
935 return log_error_errno(k, "Failed to make journal socket blocking: %m");
936 }
937
938 if (r == -EAGAIN && context->is_journald)
939 log_warning_errno(r, "Failed to log journal coredump, ignoring: %m");
940 else if (r < 0)
3c171f0b
LP
941 return log_error_errno(r, "Failed to log coredump: %m");
942
943 return 0;
944}
945
f46c706b 946static int save_context(Context *context, const struct iovec_wrapper *iovw) {
f46c706b
FB
947 const char *unit;
948 int r;
3c171f0b 949
3c171f0b 950 assert(context);
f46c706b
FB
951 assert(iovw);
952 assert(iovw->count >= _META_ARGV_MAX);
3c171f0b 953
f46c706b 954 /* The context does not allocate any memory on its own */
3c171f0b 955
fe96c0f8 956 for (size_t n = 0; n < iovw->count; n++) {
f46c706b 957 struct iovec *iovec = iovw->iovec + n;
92e92d71 958
fe96c0f8 959 for (size_t i = 0; i < ELEMENTSOF(meta_field_names); i++) {
f46c706b
FB
960 /* Note that these strings are NUL terminated, because we made sure that a
961 * trailing NUL byte is in the buffer, though not included in the iov_len
962 * count (see process_socket() and gather_pid_metadata_*()) */
963 assert(((char*) iovec->iov_base)[iovec->iov_len] == 0);
3c171f0b 964
3e4d0f6c 965 const char *p = startswith(iovec->iov_base, meta_field_names[i]);
f46c706b
FB
966 if (p) {
967 context->meta[i] = p;
3e4d0f6c 968 context->meta_size[i] = iovec->iov_len - strlen(meta_field_names[i]);
f46c706b
FB
969 break;
970 }
971 }
3c171f0b 972 }
f46c706b
FB
973
974 if (!context->meta[META_ARGV_PID])
975 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
976 "Failed to find the PID of crashing process");
977
978 r = parse_pid(context->meta[META_ARGV_PID], &context->pid);
979 if (r < 0)
980 return log_error_errno(r, "Failed to parse PID \"%s\": %m", context->meta[META_ARGV_PID]);
981
9764bca9
NR
982 r = parse_uid(context->meta[META_ARGV_UID], &context->uid);
983 if (r < 0)
984 return log_error_errno(r, "Failed to parse UID \"%s\": %m", context->meta[META_ARGV_UID]);
985
986 r = parse_gid(context->meta[META_ARGV_GID], &context->gid);
987 if (r < 0)
988 return log_error_errno(r, "Failed to parse GID \"%s\": %m", context->meta[META_ARGV_GID]);
989
f46c706b
FB
990 unit = context->meta[META_UNIT];
991 context->is_pid1 = streq(context->meta[META_ARGV_PID], "1") || streq_ptr(unit, SPECIAL_INIT_SCOPE);
992 context->is_journald = streq_ptr(unit, SPECIAL_JOURNALD_SERVICE);
993
994 return 0;
3c171f0b
LP
995}
996
997static int process_socket(int fd) {
254d1313 998 _cleanup_close_ int input_fd = -EBADF;
f46c706b 999 Context context = {};
9a435388
FB
1000 struct iovec_wrapper iovw = {};
1001 struct iovec iovec;
fe96c0f8 1002 int r;
3c171f0b
LP
1003
1004 assert(fd >= 0);
1005
d2acb93d 1006 log_setup();
3c171f0b 1007
988e89ee
ZJS
1008 log_debug("Processing coredump received on stdin...");
1009
3c171f0b 1010 for (;;) {
fb29cdbe 1011 CMSG_BUFFER_TYPE(CMSG_SPACE(sizeof(int))) control;
3c171f0b
LP
1012 struct msghdr mh = {
1013 .msg_control = &control,
1014 .msg_controllen = sizeof(control),
1015 .msg_iovlen = 1,
1016 };
1017 ssize_t n;
fe1ef0f8 1018 ssize_t l;
3c171f0b 1019
fe1ef0f8
EV
1020 l = next_datagram_size_fd(fd);
1021 if (l < 0) {
1022 r = log_error_errno(l, "Failed to determine datagram size to read: %m");
3c171f0b
LP
1023 goto finish;
1024 }
1025
9a435388
FB
1026 iovec.iov_len = l;
1027 iovec.iov_base = malloc(l + 1);
1028 if (!iovec.iov_base) {
3c171f0b
LP
1029 r = log_oom();
1030 goto finish;
1031 }
1032
9a435388 1033 mh.msg_iov = &iovec;
3c171f0b 1034
3691bcf3 1035 n = recvmsg_safe(fd, &mh, MSG_CMSG_CLOEXEC);
3c171f0b 1036 if (n < 0) {
9a435388 1037 free(iovec.iov_base);
3691bcf3 1038 r = log_error_errno(n, "Failed to receive datagram: %m");
3c171f0b
LP
1039 goto finish;
1040 }
1041
9a435388
FB
1042 /* The final zero-length datagram carries the file descriptor and tells us
1043 * that we're done. */
3c171f0b 1044 if (n == 0) {
dac556fa 1045 struct cmsghdr *found;
3c171f0b 1046
9a435388 1047 free(iovec.iov_base);
3c171f0b 1048
dac556fa 1049 found = cmsg_find(&mh, SOL_SOCKET, SCM_RIGHTS, CMSG_LEN(sizeof(int)));
3c171f0b 1050 if (!found) {
3691bcf3
LP
1051 cmsg_close_all(&mh);
1052 r = log_error_errno(SYNTHETIC_ERRNO(EBADMSG),
1053 "Coredump file descriptor missing.");
3c171f0b
LP
1054 goto finish;
1055 }
1056
f8540bde 1057 assert(input_fd < 0);
b1d02191 1058 input_fd = *CMSG_TYPED_DATA(found, int);
3c171f0b 1059 break;
3691bcf3
LP
1060 } else
1061 cmsg_close_all(&mh);
3c171f0b
LP
1062
1063 /* Add trailing NUL byte, in case these are strings */
9a435388
FB
1064 ((char*) iovec.iov_base)[n] = 0;
1065 iovec.iov_len = (size_t) n;
3c171f0b 1066
9a435388
FB
1067 r = iovw_put(&iovw, iovec.iov_base, iovec.iov_len);
1068 if (r < 0)
1069 goto finish;
34c10968
LP
1070 }
1071
61233823 1072 /* Make sure we got all data we really need */
f8540bde 1073 assert(input_fd >= 0);
3c171f0b 1074
f46c706b
FB
1075 r = save_context(&context, &iovw);
1076 if (r < 0)
1077 goto finish;
1078
1079 /* Make sure we received at least all fields we need. */
fe96c0f8 1080 for (int i = 0; i < _META_MANDATORY_MAX; i++)
f46c706b
FB
1081 if (!context.meta[i]) {
1082 r = log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1083 "A mandatory argument (%i) has not been sent, aborting.",
1084 i);
1085 goto finish;
1086 }
80002f66 1087
f46c706b 1088 r = submit_coredump(&context, &iovw, input_fd);
3c171f0b
LP
1089
1090finish:
9a435388 1091 iovw_free_contents(&iovw, true);
3c171f0b
LP
1092 return r;
1093}
1094
9a435388 1095static int send_iovec(const struct iovec_wrapper *iovw, int input_fd) {
254d1313 1096 _cleanup_close_ int fd = -EBADF;
3c171f0b
LP
1097 int r;
1098
9a435388 1099 assert(iovw);
3c171f0b
LP
1100 assert(input_fd >= 0);
1101
1102 fd = socket(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0);
1103 if (fd < 0)
1104 return log_error_errno(errno, "Failed to create coredump socket: %m");
1105
1861986a
LP
1106 r = connect_unix_path(fd, AT_FDCWD, "/run/systemd/coredump");
1107 if (r < 0)
1108 return log_error_errno(r, "Failed to connect to coredump service: %m");
3c171f0b 1109
fe96c0f8 1110 for (size_t i = 0; i < iovw->count; i++) {
fec603eb 1111 struct msghdr mh = {
9a435388 1112 .msg_iov = iovw->iovec + i,
fec603eb
LP
1113 .msg_iovlen = 1,
1114 };
1115 struct iovec copy[2];
1116
1117 for (;;) {
1118 if (sendmsg(fd, &mh, MSG_NOSIGNAL) >= 0)
1119 break;
1120
1121 if (errno == EMSGSIZE && mh.msg_iov[0].iov_len > 0) {
f46c706b
FB
1122 /* This field didn't fit? That's a pity. Given that this is
1123 * just metadata, let's truncate the field at half, and try
1124 * again. We append three dots, in order to show that this is
1125 * truncated. */
fec603eb
LP
1126
1127 if (mh.msg_iov != copy) {
f46c706b
FB
1128 /* We don't want to modify the caller's iovec, hence
1129 * let's create our own array, consisting of two new
1130 * iovecs, where the first is a (truncated) copy of
1131 * what we want to send, and the second one contains
1132 * the trailing dots. */
9a435388 1133 copy[0] = iovw->iovec[i];
ed0cb346 1134 copy[1] = IOVEC_MAKE(((char[]){'.', '.', '.'}), 3);
fec603eb
LP
1135
1136 mh.msg_iov = copy;
1137 mh.msg_iovlen = 2;
1138 }
1139
1140 copy[0].iov_len /= 2; /* halve it, and try again */
1141 continue;
1142 }
3c171f0b 1143
3c171f0b 1144 return log_error_errno(errno, "Failed to send coredump datagram: %m");
fec603eb 1145 }
1eef15b1
ZJS
1146 }
1147
3c171f0b
LP
1148 r = send_one_fd(fd, input_fd, 0);
1149 if (r < 0)
1150 return log_error_errno(r, "Failed to send coredump fd: %m");
1eef15b1 1151
3c171f0b
LP
1152 return 0;
1153}
1eef15b1 1154
64a5384f
LP
1155static int gather_pid_metadata_from_argv(
1156 struct iovec_wrapper *iovw,
1157 Context *context,
1158 int argc, char **argv) {
1159
f46c706b 1160 _cleanup_free_ char *free_timestamp = NULL;
fe96c0f8 1161 int r, signo;
3c171f0b 1162 char *t;
3c171f0b 1163
e6aa443f
LP
1164 assert(iovw);
1165 assert(context);
1166
f46c706b
FB
1167 /* We gather all metadata that were passed via argv[] into an array of iovecs that
1168 * we'll forward to the socket unit */
3c171f0b 1169
f46c706b
FB
1170 if (argc < _META_ARGV_MAX)
1171 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1172 "Not enough arguments passed by the kernel (%i, expected %i).",
1173 argc, _META_ARGV_MAX);
3c171f0b 1174
fe96c0f8 1175 for (int i = 0; i < _META_ARGV_MAX; i++) {
3c171f0b 1176
f46c706b 1177 t = argv[i];
3c171f0b 1178
f46c706b 1179 switch (i) {
64a5384f 1180
f46c706b
FB
1181 case META_ARGV_TIMESTAMP:
1182 /* The journal fields contain the timestamp padded with six
1183 * zeroes, so that the kernel-supplied 1s granularity timestamps
e503019b 1184 * becomes 1μs granularity, i.e. the granularity systemd usually
f46c706b
FB
1185 * operates in. */
1186 t = free_timestamp = strjoin(argv[i], "000000");
1187 if (!t)
1188 return log_oom();
1189 break;
64a5384f 1190
f46c706b
FB
1191 case META_ARGV_SIGNAL:
1192 /* For signal, record its pretty name too */
1193 if (safe_atoi(argv[i], &signo) >= 0 && SIGNAL_VALID(signo))
2a3bebd0
FB
1194 (void) iovw_put_string_field(iovw, "COREDUMP_SIGNAL_NAME=SIG",
1195 signal_to_string(signo));
f46c706b 1196 break;
64a5384f 1197
f46c706b
FB
1198 default:
1199 break;
c8091d92
LP
1200 }
1201
f46c706b
FB
1202 r = iovw_put_string_field(iovw, meta_field_names[i], t);
1203 if (r < 0)
1204 return r;
8c8549db 1205 }
803a3464 1206
f46c706b
FB
1207 /* Cache some of the process metadata we collected so far and that we'll need to
1208 * access soon */
1209 return save_context(context, iovw);
1210}
3c171f0b 1211
db9ac801 1212static int gather_pid_metadata_from_procfs(struct iovec_wrapper *iovw, Context *context) {
f46c706b
FB
1213 uid_t owner_uid;
1214 pid_t pid;
1215 char *t;
3e4d0f6c 1216 size_t size;
f46c706b
FB
1217 const char *p;
1218 int r;
f5e04665 1219
e6aa443f
LP
1220 assert(iovw);
1221 assert(context);
1222
f46c706b
FB
1223 /* Note that if we fail on oom later on, we do not roll-back changes to the iovec
1224 * structure. (It remains valid, with the first iovec fields initialized.) */
f5e04665 1225
f46c706b 1226 pid = context->pid;
f5e04665 1227
f46c706b 1228 /* The following is mandatory */
d7d74854 1229 r = pid_get_comm(pid, &t);
9a435388 1230 if (r < 0)
f46c706b 1231 return log_error_errno(r, "Failed to get COMM: %m");
f5e04665 1232
f46c706b 1233 r = iovw_put_string_field_free(iovw, "COREDUMP_COMM=", t);
9a435388
FB
1234 if (r < 0)
1235 return r;
f45b8015 1236
c790632c 1237 /* The following are optional, but we use them if present. */
2a3bebd0
FB
1238 r = get_process_exe(pid, &t);
1239 if (r >= 0)
1240 r = iovw_put_string_field_free(iovw, "COREDUMP_EXE=", t);
1241 if (r < 0)
f46c706b 1242 log_warning_errno(r, "Failed to get EXE, ignoring: %m");
bdfd7b2c 1243
f46c706b 1244 if (cg_pid_get_unit(pid, &t) >= 0)
2a3bebd0 1245 (void) iovw_put_string_field_free(iovw, "COREDUMP_UNIT=", t);
f5e04665 1246
f46c706b 1247 if (cg_pid_get_user_unit(pid, &t) >= 0)
2a3bebd0 1248 (void) iovw_put_string_field_free(iovw, "COREDUMP_USER_UNIT=", t);
f46c706b 1249
9aa82023 1250 if (sd_pid_get_session(pid, &t) >= 0)
9a435388 1251 (void) iovw_put_string_field_free(iovw, "COREDUMP_SESSION=", t);
f5e04665 1252
a035f819 1253 if (sd_pid_get_owner_uid(pid, &owner_uid) >= 0) {
9a435388 1254 r = asprintf(&t, UID_FMT, owner_uid);
7de80bfe 1255 if (r > 0)
9a435388 1256 (void) iovw_put_string_field_free(iovw, "COREDUMP_OWNER_UID=", t);
f5e04665
LP
1257 }
1258
9aa82023 1259 if (sd_pid_get_slice(pid, &t) >= 0)
2a3bebd0 1260 (void) iovw_put_string_field_free(iovw, "COREDUMP_SLICE=", t);
f5e04665 1261
a034620f 1262 if (pid_get_cmdline(pid, SIZE_MAX, PROCESS_CMDLINE_QUOTE_POSIX, &t) >= 0)
2a3bebd0 1263 (void) iovw_put_string_field_free(iovw, "COREDUMP_CMDLINE=", t);
a035f819 1264
9aa82023 1265 if (cg_pid_get_path_shifted(pid, NULL, &t) >= 0)
2a3bebd0 1266 (void) iovw_put_string_field_free(iovw, "COREDUMP_CGROUP=", t);
a035f819 1267
9aa82023 1268 if (compose_open_fds(pid, &t) >= 0)
2a3bebd0 1269 (void) iovw_put_string_field_free(iovw, "COREDUMP_OPEN_FDS=", t);
3f132692
JF
1270
1271 p = procfs_file_alloca(pid, "status");
627055ce 1272 if (read_full_virtual_file(p, &t, NULL) >= 0)
2a3bebd0 1273 (void) iovw_put_string_field_free(iovw, "COREDUMP_PROC_STATUS=", t);
3f132692
JF
1274
1275 p = procfs_file_alloca(pid, "maps");
627055ce 1276 if (read_full_virtual_file(p, &t, NULL) >= 0)
2a3bebd0 1277 (void) iovw_put_string_field_free(iovw, "COREDUMP_PROC_MAPS=", t);
3f132692
JF
1278
1279 p = procfs_file_alloca(pid, "limits");
627055ce 1280 if (read_full_virtual_file(p, &t, NULL) >= 0)
2a3bebd0 1281 (void) iovw_put_string_field_free(iovw, "COREDUMP_PROC_LIMITS=", t);
3f132692
JF
1282
1283 p = procfs_file_alloca(pid, "cgroup");
3e4d0f6c 1284 if (read_full_virtual_file(p, &t, NULL) >= 0)
2a3bebd0 1285 (void) iovw_put_string_field_free(iovw, "COREDUMP_PROC_CGROUP=", t);
3f132692 1286
d7032b1f 1287 p = procfs_file_alloca(pid, "mountinfo");
3e4d0f6c 1288 if (read_full_virtual_file(p, &t, NULL) >= 0)
2a3bebd0 1289 (void) iovw_put_string_field_free(iovw, "COREDUMP_PROC_MOUNTINFO=", t);
d7032b1f 1290
3e4d0f6c
ZJS
1291 /* We attach /proc/auxv here. ELF coredumps also contain a note for this (NT_AUXV), see elf(5). */
1292 p = procfs_file_alloca(pid, "auxv");
1293 if (read_full_virtual_file(p, &t, &size) >= 0) {
1294 char *buf = malloc(strlen("COREDUMP_PROC_AUXV=") + size + 1);
1295 if (buf) {
1296 /* Add a dummy terminator to make save_context() happy. */
1297 *((uint8_t*) mempcpy(stpcpy(buf, "COREDUMP_PROC_AUXV="), t, size)) = '\0';
1298 (void) iovw_consume(iovw, buf, size + strlen("COREDUMP_PROC_AUXV="));
1299 }
1300
1301 free(t);
1302 }
1303
9aa82023 1304 if (get_process_cwd(pid, &t) >= 0)
2a3bebd0 1305 (void) iovw_put_string_field_free(iovw, "COREDUMP_CWD=", t);
3f132692
JF
1306
1307 if (get_process_root(pid, &t) >= 0) {
9aa82023
ZJS
1308 bool proc_self_root_is_slash;
1309
1310 proc_self_root_is_slash = strcmp(t, "/") == 0;
3f132692 1311
2a3bebd0 1312 (void) iovw_put_string_field_free(iovw, "COREDUMP_ROOT=", t);
7ed03ce6
JF
1313
1314 /* If the process' root is "/", then there is a chance it has
1315 * mounted own root and hence being containerized. */
9aa82023 1316 if (proc_self_root_is_slash && get_process_container_parent_cmdline(pid, &t) > 0)
2a3bebd0 1317 (void) iovw_put_string_field_free(iovw, "COREDUMP_CONTAINER_CMDLINE=", t);
3f132692
JF
1318 }
1319
9aa82023 1320 if (get_process_environ(pid, &t) >= 0)
2a3bebd0 1321 (void) iovw_put_string_field_free(iovw, "COREDUMP_ENVIRON=", t);
9aa82023 1322
f46c706b
FB
1323 /* we successfully acquired all metadata */
1324 return save_context(context, iovw);
9aa82023 1325}
3f132692 1326
a108c43e
NR
1327static int send_ucred(int transport_fd, struct ucred *ucred) {
1328 CMSG_BUFFER_TYPE(CMSG_SPACE(sizeof(struct ucred))) control = {};
1329 struct msghdr mh = {
1330 .msg_control = &control,
1331 .msg_controllen = sizeof(control),
1332 };
1333 struct cmsghdr *cmsg;
1334
1335 assert(transport_fd >= 0);
1336
1337 cmsg = CMSG_FIRSTHDR(&mh);
1338 *cmsg = (struct cmsghdr) {
1339 .cmsg_level = SOL_SOCKET,
1340 .cmsg_type = SCM_CREDENTIALS,
1341 .cmsg_len = CMSG_LEN(sizeof(struct ucred)),
1342 };
1343 memcpy(CMSG_DATA(cmsg), ucred, sizeof(struct ucred));
1344
1345 return RET_NERRNO(sendmsg(transport_fd, &mh, MSG_NOSIGNAL));
1346}
1347
1348static int receive_ucred(int transport_fd, struct ucred *ret_ucred) {
1349 CMSG_BUFFER_TYPE(CMSG_SPACE(sizeof(struct ucred))) control = {};
1350 struct msghdr mh = {
1351 .msg_control = &control,
1352 .msg_controllen = sizeof(control),
1353 };
1354 struct cmsghdr *cmsg = NULL;
1355 struct ucred *ucred = NULL;
1356 ssize_t n;
1357
1358 assert(ret_ucred);
1359
1360 n = recvmsg_safe(transport_fd, &mh, 0);
1361 if (n < 0)
1362 return n;
1363
1364 CMSG_FOREACH(cmsg, &mh)
1365 if (cmsg->cmsg_level == SOL_SOCKET &&
1366 cmsg->cmsg_type == SCM_CREDENTIALS &&
1367 cmsg->cmsg_len == CMSG_LEN(sizeof(struct ucred))) {
1368
1369 assert(!ucred);
1370 ucred = CMSG_TYPED_DATA(cmsg, struct ucred);
1371 }
1372
1373 if (!ucred)
1374 return -EIO;
1375
1376 *ret_ucred = *ucred;
1377
1378 return 0;
1379}
1380
1381static int can_forward_coredump(pid_t pid) {
1382 _cleanup_free_ char *cgroup = NULL, *path = NULL, *unit = NULL;
1383 int r;
1384
1385 r = cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER, pid, &cgroup);
1386 if (r < 0)
1387 return r;
1388
1389 r = path_extract_directory(cgroup, &path);
1390 if (r < 0)
1391 return r;
1392
1393 r = cg_path_get_unit_path(path, &unit);
1394 if (r == -ENOMEM)
1395 return log_oom();
1396 if (r == -ENXIO)
1397 /* No valid units in this path. */
1398 return false;
1399 if (r < 0)
1400 return r;
1401
1402 /* We require that this process belongs to a delegated cgroup
1403 * (i.e. Delegate=yes), with CoredumpReceive=yes also. */
1404 r = cg_is_delegated(unit);
1405 if (r <= 0)
1406 return r;
1407
1408 return cg_has_coredump_receive(unit);
1409}
1410
1411static int forward_coredump_to_container(Context *context) {
1412 _cleanup_close_ int pidnsfd = -EBADF, mntnsfd = -EBADF, netnsfd = -EBADF, usernsfd = -EBADF, rootfd = -EBADF;
71136404 1413 _cleanup_close_pair_ int pair[2] = EBADF_PAIR;
a108c43e
NR
1414 pid_t pid, child;
1415 struct ucred ucred = {
1416 .pid = context->pid,
1417 .uid = context->uid,
1418 .gid = context->gid,
1419 };
1420 int r;
1421
1422 r = namespace_get_leader(context->pid, NAMESPACE_PID, &pid);
1423 if (r < 0)
1424 return log_debug_errno(r, "Failed to get namespace leader: %m");
1425
1426 r = can_forward_coredump(pid);
1427 if (r < 0)
1428 return log_debug_errno(r, "Failed to check if coredump can be forwarded: %m");
1429 if (r == 0)
1430 return log_debug_errno(SYNTHETIC_ERRNO(ENOENT),
1431 "Coredump will not be forwarded because no target cgroup was found.");
1432
1433 r = RET_NERRNO(socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, pair));
1434 if (r < 0)
1435 return log_debug_errno(r, "Failed to create socket pair: %m");
1436
1437 r = setsockopt_int(pair[1], SOL_SOCKET, SO_PASSCRED, true);
1438 if (r < 0)
1439 return log_debug_errno(r, "Failed to set SO_PASSCRED: %m");
1440
1441 r = namespace_open(pid, &pidnsfd, &mntnsfd, &netnsfd, &usernsfd, &rootfd);
1442 if (r < 0)
1443 return log_debug_errno(r, "Failed to join namespaces of PID " PID_FMT ": %m", pid);
1444
1445 r = namespace_fork("(sd-coredumpns)", "(sd-coredump)", NULL, 0,
e9ccae31 1446 FORK_RESET_SIGNALS|FORK_DEATHSIG_SIGTERM,
a108c43e
NR
1447 pidnsfd, mntnsfd, netnsfd, usernsfd, rootfd, &child);
1448 if (r < 0)
1449 return log_debug_errno(r, "Failed to fork into namespaces of PID " PID_FMT ": %m", pid);
1450 if (r == 0) {
1451 _cleanup_(iovw_free_freep) struct iovec_wrapper *iovw = NULL;
1452 Context child_context = {};
1453
1454 pair[0] = safe_close(pair[0]);
1455
1456 if (laccess("/run/systemd/coredump", W_OK) < 0) {
1457 log_debug_errno(errno, "Cannot find coredump socket, exiting: %m");
1458 _exit(EXIT_FAILURE);
1459 }
1460
1461 r = receive_ucred(pair[1], &ucred);
1462 if (r < 0) {
1463 log_debug_errno(r, "Failed to receive ucred and fd: %m");
1464 _exit(EXIT_FAILURE);
1465 }
1466
1467 iovw = iovw_new();
1468 if (!iovw) {
1469 log_oom();
1470 _exit(EXIT_FAILURE);
1471 }
1472
1473 (void) iovw_put_string_field(iovw, "MESSAGE_ID=", SD_MESSAGE_COREDUMP_STR);
1474 (void) iovw_put_string_field(iovw, "PRIORITY=", STRINGIFY(LOG_CRIT));
1475 (void) iovw_put_string_field(iovw, "COREDUMP_FORWARDED=", "1");
1476
1477 for (int i = 0; i < _META_ARGV_MAX; i++) {
1478 int signo;
1479 char buf[DECIMAL_STR_MAX(pid_t)];
1480 const char *t = context->meta[i];
1481
1d03d970 1482 switch (i) {
a108c43e
NR
1483
1484 case META_ARGV_PID:
1485 xsprintf(buf, PID_FMT, ucred.pid);
1486 t = buf;
1487
1488 break;
1489
1490 case META_ARGV_UID:
1491 xsprintf(buf, UID_FMT, ucred.uid);
1492 t = buf;
1493 break;
1494
1495 case META_ARGV_GID:
1496 xsprintf(buf, GID_FMT, ucred.gid);
1497 t = buf;
1498 break;
1499
1500 case META_ARGV_SIGNAL:
1501 if (safe_atoi(t, &signo) >= 0 && SIGNAL_VALID(signo))
1502 (void) iovw_put_string_field(iovw,
1503 "COREDUMP_SIGNAL_NAME=SIG",
1504 signal_to_string(signo));
1505 break;
1506
1507 default:
1508 break;
1509 }
1510
1511 r = iovw_put_string_field(iovw, meta_field_names[i], t);
1512 if (r < 0) {
1513 log_debug_errno(r, "Failed to construct iovec: %m");
1514 _exit(EXIT_FAILURE);
1515 }
1516 }
1517
1518 r = save_context(&child_context, iovw);
1519 if (r < 0) {
1520 log_debug_errno(r, "Failed to save context: %m");
1521 _exit(EXIT_FAILURE);
1522 }
1523
1524 r = gather_pid_metadata_from_procfs(iovw, &child_context);
1525 if (r < 0) {
1526 log_debug_errno(r, "Failed to gather metadata from procfs: %m");
1527 _exit(EXIT_FAILURE);
1528 }
1529
1530 r = send_iovec(iovw, STDIN_FILENO);
1531 if (r < 0) {
1532 log_debug_errno(r, "Failed to send iovec to coredump socket: %m");
1533 _exit(EXIT_FAILURE);
1534 }
1535
1536 _exit(EXIT_SUCCESS);
1537 }
1538
1539 pair[1] = safe_close(pair[1]);
1540
1541 /* We need to translate the PID, UID, and GID of the crashing process
1542 * to the container's namespaces. Do this by sending an SCM_CREDENTIALS
1543 * message on a socket pair, and read the result when we join the
1544 * container. The kernel will perform the translation for us. */
1545 r = send_ucred(pair[0], &ucred);
1546 if (r < 0)
1547 return log_debug_errno(r, "Failed to send metadata to container: %m");
1548
1549 r = wait_for_terminate_and_check("(sd-coredumpns)", child, 0);
1550 if (r < 0)
1551 return log_debug_errno(r, "Failed to wait for child to terminate: %m");
1552 if (r != EXIT_SUCCESS)
1553 return log_debug_errno(SYNTHETIC_ERRNO(EPROTO), "Failed to process coredump in container: %m");
1554
1555 return 0;
1556}
1557
9aa82023 1558static int process_kernel(int argc, char* argv[]) {
6257e2fb 1559 _cleanup_(iovw_free_freep) struct iovec_wrapper *iovw = NULL;
f46c706b 1560 Context context = {};
2a9b1a76 1561 int r, signo;
9aa82023 1562
1f9d2a81
DDM
1563 /* When we're invoked by the kernel, stdout/stderr are closed which is dangerous because the fds
1564 * could get reallocated. To avoid hard to debug issues, let's instead bind stdout/stderr to
1565 * /dev/null. */
5bb1d7fb 1566 r = rearrange_stdio(STDIN_FILENO, -EBADF, -EBADF);
1f9d2a81
DDM
1567 if (r < 0)
1568 return log_error_errno(r, "Failed to connect stdout/stderr to /dev/null: %m");
1569
988e89ee
ZJS
1570 log_debug("Processing coredump received from the kernel...");
1571
9a435388
FB
1572 iovw = iovw_new();
1573 if (!iovw)
1574 return log_oom();
1575
f46c706b
FB
1576 /* Collect all process metadata passed by the kernel through argv[] */
1577 r = gather_pid_metadata_from_argv(iovw, &context, argc - 1, argv + 1);
92e92d71 1578 if (r < 0)
6257e2fb 1579 return r;
86562420 1580
f46c706b 1581 /* Collect the rest of the process metadata retrieved from the runtime */
db9ac801 1582 r = gather_pid_metadata_from_procfs(iovw, &context);
f46c706b 1583 if (r < 0)
6257e2fb 1584 return r;
f46c706b 1585
1e344c1d 1586 if (!context.is_journald)
f46c706b 1587 /* OK, now we know it's not the journal, hence we can make use of it now. */
1e344c1d 1588 log_set_target_and_open(LOG_TARGET_JOURNAL_OR_KMSG);
f46c706b 1589
2a9b1a76
HB
1590 /* Log minimal metadata now, so it is not lost if the system is about to shut down. */
1591 log_info("Process %s (%s) of user %s terminated abnormally with signal %s/%s, processing...",
1592 context.meta[META_ARGV_PID], context.meta[META_COMM],
1593 context.meta[META_ARGV_UID], context.meta[META_ARGV_SIGNAL],
1594 strna(safe_atoi(context.meta[META_ARGV_SIGNAL], &signo) >= 0 ? signal_to_string(signo) : NULL));
1595
a108c43e
NR
1596 r = in_same_namespace(getpid_cached(), context.pid, NAMESPACE_PID);
1597 if (r < 0)
1598 log_debug_errno(r, "Failed to check pidns of crashing process, ignoring: %m");
1599 if (r == 0) {
1600 /* If this fails, fallback to the old behavior so that
1601 * there is still some record of the crash. */
1602 r = forward_coredump_to_container(&context);
1603 if (r >= 0)
1604 return 0;
1605 }
1606
f46c706b
FB
1607 /* If this is PID 1 disable coredump collection, we'll unlikely be able to process
1608 * it later on.
1609 *
1610 * FIXME: maybe we should disable coredumps generation from the beginning and
1611 * re-enable it only when we know it's either safe (ie we're not running OOM) or
1612 * it's not pid1 ? */
1613 if (context.is_pid1) {
1614 log_notice("Due to PID 1 having crashed coredump collection will now be turned off.");
1615 disable_coredumps();
1616 }
34c10968 1617
a108c43e
NR
1618 (void) iovw_put_string_field(iovw, "MESSAGE_ID=", SD_MESSAGE_COREDUMP_STR);
1619 (void) iovw_put_string_field(iovw, "PRIORITY=", STRINGIFY(LOG_CRIT));
1620
f46c706b 1621 if (context.is_journald || context.is_pid1)
6257e2fb 1622 return submit_coredump(&context, iovw, STDIN_FILENO);
9aa82023 1623
6257e2fb 1624 return send_iovec(iovw, STDIN_FILENO);
3c171f0b 1625}
34c10968 1626
988e89ee 1627static int process_backtrace(int argc, char *argv[]) {
3a19fe46
YW
1628 _cleanup_(journal_importer_cleanup) JournalImporter importer = JOURNAL_IMPORTER_INIT(STDIN_FILENO);
1629 _cleanup_(iovw_free_freep) struct iovec_wrapper *iovw = NULL;
f46c706b 1630 Context context = {};
9a435388 1631 char *message;
988e89ee
ZJS
1632 int r;
1633
1634 log_debug("Processing backtrace on stdin...");
1635
9a435388
FB
1636 iovw = iovw_new();
1637 if (!iovw)
5b45a160
ZJS
1638 return log_oom();
1639
2a3bebd0
FB
1640 (void) iovw_put_string_field(iovw, "MESSAGE_ID=", SD_MESSAGE_BACKTRACE_STR);
1641 (void) iovw_put_string_field(iovw, "PRIORITY=", STRINGIFY(LOG_CRIT));
f46c706b
FB
1642
1643 /* Collect all process metadata from argv[] by making sure to skip the
1644 * '--backtrace' option */
1645 r = gather_pid_metadata_from_argv(iovw, &context, argc - 2, argv + 2);
988e89ee 1646 if (r < 0)
3a19fe46 1647 return r;
aaeb2522 1648
f46c706b 1649 /* Collect the rest of the process metadata retrieved from the runtime */
db9ac801 1650 r = gather_pid_metadata_from_procfs(iovw, &context);
f46c706b 1651 if (r < 0)
3a19fe46 1652 return r;
988e89ee 1653
86562420 1654 for (;;) {
5b45a160 1655 r = journal_importer_process_data(&importer);
3a19fe46
YW
1656 if (r < 0)
1657 return log_error_errno(r, "Failed to parse journal entry on stdin: %m");
d74dc4f2
ZJS
1658 if (r == 1 || /* complete entry */
1659 journal_importer_eof(&importer)) /* end of data */
5b45a160 1660 break;
988e89ee 1661 }
988e89ee 1662
5b45a160
ZJS
1663 if (journal_importer_eof(&importer)) {
1664 log_warning("Did not receive a full journal entry on stdin, ignoring message sent by reporter");
988e89ee 1665
f46c706b
FB
1666 message = strjoina("Process ", context.meta[META_ARGV_PID],
1667 " (", context.meta[META_COMM], ")"
1668 " of user ", context.meta[META_ARGV_UID],
1669 " failed with ", context.meta[META_ARGV_SIGNAL]);
9a435388
FB
1670
1671 r = iovw_put_string_field(iovw, "MESSAGE=", message);
1672 if (r < 0)
3a19fe46 1673 return r;
5b45a160 1674 } else {
3a19fe46
YW
1675 /* The imported iovecs are not supposed to be freed by us so let's copy and merge them at the
1676 * end of the array. */
1677 r = iovw_append(iovw, &importer.iovw);
1678 if (r < 0)
1679 return r;
9a435388 1680 }
988e89ee 1681
9a435388 1682 r = sd_journal_sendv(iovw->iovec, iovw->count);
988e89ee 1683 if (r < 0)
3a19fe46 1684 return log_error_errno(r, "Failed to log backtrace: %m");
988e89ee 1685
3a19fe46 1686 return 0;
988e89ee
ZJS
1687}
1688
4515a95e 1689static int run(int argc, char *argv[]) {
3c171f0b 1690 int r;
fee80f69 1691
9aa82023
ZJS
1692 /* First, log to a safe place, since we don't know what crashed and it might
1693 * be journald which we'd rather not log to then. */
8d4e028f 1694
1e344c1d 1695 log_set_target_and_open(LOG_TARGET_KMSG);
8d4e028f 1696
3c171f0b
LP
1697 /* Make sure we never enter a loop */
1698 (void) prctl(PR_SET_DUMPABLE, 0);
8d4e028f 1699
3c171f0b
LP
1700 /* Ignore all parse errors */
1701 (void) parse_config();
fee80f69 1702
3c171f0b
LP
1703 log_debug("Selected storage '%s'.", coredump_storage_to_string(arg_storage));
1704 log_debug("Selected compression %s.", yes_no(arg_compress));
fee80f69 1705
3c171f0b 1706 r = sd_listen_fds(false);
4515a95e
ZJS
1707 if (r < 0)
1708 return log_error_errno(r, "Failed to determine the number of file descriptors: %m");
fee80f69 1709
9aa82023
ZJS
1710 /* If we got an fd passed, we are running in coredumpd mode. Otherwise we
1711 * are invoked from the kernel as coredump handler. */
988e89ee
ZJS
1712 if (r == 0) {
1713 if (streq_ptr(argv[1], "--backtrace"))
4515a95e 1714 return process_backtrace(argc, argv);
988e89ee 1715 else
4515a95e 1716 return process_kernel(argc, argv);
988e89ee 1717 } else if (r == 1)
4515a95e 1718 return process_socket(SD_LISTEN_FDS_START);
f5e04665 1719
baaa35ad
ZJS
1720 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1721 "Received unexpected number of file descriptors.");
f5e04665 1722}
4515a95e
ZJS
1723
1724DEFINE_MAIN_FUNCTION(run);