]> git.ipfire.org Git - thirdparty/systemd.git/blame - src/coredump/coredump.c
Merge pull request #32562 from Werkov/test-cgroup-opensuse
[thirdparty/systemd.git] / src / coredump / coredump.c
CommitLineData
db9ecf05 1/* SPDX-License-Identifier: LGPL-2.1-or-later */
f5e04665
LP
2
3#include <errno.h>
803a3464
LP
4#include <stdio.h>
5#include <sys/prctl.h>
587f2a5e 6#include <sys/statvfs.h>
3e4d0f6c 7#include <sys/auxv.h>
cacd6403 8#include <sys/xattr.h>
4f5dd394 9#include <unistd.h>
f5e04665 10
73a99163 11#include "sd-daemon.h"
f11943c5
LP
12#include "sd-journal.h"
13#include "sd-login.h"
73a99163 14#include "sd-messages.h"
4f5dd394
LP
15
16#include "acl-util.h"
b5efdb8a 17#include "alloc-util.h"
587f2a5e 18#include "bus-error.h"
430f0182 19#include "capability-util.h"
ba1261bc 20#include "cgroup-util.h"
4f5dd394 21#include "compress.h"
34c10968
LP
22#include "conf-parser.h"
23#include "copy.h"
c8715007 24#include "coredump-util.h"
f11943c5 25#include "coredump-vacuum.h"
a0956174 26#include "dirent-util.h"
ea680f05 27#include "elf-util.h"
4f5dd394 28#include "escape.h"
3ffd4af2 29#include "fd-util.h"
4f5dd394 30#include "fileio.h"
f4f15635 31#include "fs-util.h"
bd1ae178 32#include "iovec-util.h"
b18453ed 33#include "journal-importer.h"
5edf875b 34#include "journal-send.h"
4f5dd394
LP
35#include "log.h"
36#include "macro.h"
5e332028 37#include "main-func.h"
0a970718 38#include "memory-util.h"
2485b7e2 39#include "memstream-util.h"
35cd0ba5 40#include "mkdir-label.h"
a108c43e 41#include "namespace-util.h"
6bedfcbb 42#include "parse-util.h"
a108c43e 43#include "path-util.h"
0b452006 44#include "process-util.h"
d14bcb4e 45#include "signal-util.h"
3c171f0b 46#include "socket-util.h"
4f5dd394 47#include "special.h"
587f2a5e 48#include "stat-util.h"
8b43440b 49#include "string-table.h"
07630cea 50#include "string-util.h"
4f5dd394 51#include "strv.h"
bf819d3a 52#include "sync-util.h"
e4de7287 53#include "tmpfile-util.h"
8e1ac16b 54#include "uid-classification.h"
b1d4f8e1 55#include "user-util.h"
34727273 56
da890466 57/* The maximum size up to which we process coredumps. We use 1G on 32-bit systems, and 32G on 64-bit systems */
e677041e
LP
/* Upper bound for coredump processing, selected by pointer width: 32G where pointers are 8 bytes wide,
 * 1G where they are 4 bytes wide. Any other pointer width is a build error. */
#if __SIZEOF_POINTER__ == 8
#  define PROCESS_SIZE_MAX ((uint64_t) (32LLU*1024LLU*1024LLU*1024LLU))
#elif __SIZEOF_POINTER__ == 4
#  define PROCESS_SIZE_MAX ((uint64_t) (1LLU*1024LLU*1024LLU*1024LLU))
#else
#  error "Unexpected pointer size"
#endif
34c10968 65
bdfd7b2c 66/* The maximum size up to which we leave the coredump around on disk */
34c10968
LP
67#define EXTERNAL_SIZE_MAX PROCESS_SIZE_MAX
68
bdfd7b2c 69/* The maximum size up to which we store the coredump in the journal */
25cad95c 70#ifndef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
34c10968 71#define JOURNAL_SIZE_MAX ((size_t) (767LU*1024LU*1024LU))
25cad95c
YW
72#else
73/* oss-fuzz limits memory usage. */
74#define JOURNAL_SIZE_MAX ((size_t) (10LU*1024LU*1024LU))
75#endif
f5e04665 76
587f2a5e
LB
77/* When checking for available memory and setting lower limits, don't
78 * go below 4MB for writing core files to storage. */
79#define PROCESS_SIZE_MIN (4U*1024U*1024U)
80
c4aa09b0 81/* Make sure to not make this larger than the maximum journal entry
27f931d1 82 * size. See DATA_SIZE_MAX in journal-importer.h. */
874bc134 83assert_cc(JOURNAL_SIZE_MAX <= DATA_SIZE_MAX);
f5e04665
LP
84
enum {
        /* Indexes into the per-process metadata cache.
         *
         * The leading entries hold the same metadata the kernel passes to us via argv[], i.e. the
         * strings produced from our pattern in /proc/sys/kernel/core_pattern (see man:core(5)). */

        META_ARGV_PID = 0,      /* %P: as seen in the initial pid namespace */
        META_ARGV_UID,          /* %u: as seen in the initial user namespace */
        META_ARGV_GID,          /* %g: as seen in the initial user namespace */
        META_ARGV_SIGNAL,       /* %s: number of signal causing dump */
        META_ARGV_TIMESTAMP,    /* %t: time of dump, expressed as seconds since the Epoch (we expand this to μs granularity) */
        META_ARGV_RLIMIT,       /* %c: core file size soft resource limit */
        META_ARGV_HOSTNAME,     /* %h: hostname */
        _META_ARGV_MAX,

        /* A couple of special fields that are cached because we need them quickly for naming
         * coredump files and attaching xattrs. Unlike the entries above, they are retrieved from
         * the runtime environment rather than from argv[]. */

        META_COMM = _META_ARGV_MAX,
        _META_MANDATORY_MAX,

        /* Like the previous group, except that we won't fail if one of these is missing. */

        META_EXE = _META_MANDATORY_MAX,
        META_UNIT,
        META_PROC_AUXV,
        _META_MAX
};
117
f46c706b 118static const char * const meta_field_names[_META_MAX] = {
510a1466
ZJS
119 [META_ARGV_PID] = "COREDUMP_PID=",
120 [META_ARGV_UID] = "COREDUMP_UID=",
121 [META_ARGV_GID] = "COREDUMP_GID=",
122 [META_ARGV_SIGNAL] = "COREDUMP_SIGNAL=",
123 [META_ARGV_TIMESTAMP] = "COREDUMP_TIMESTAMP=",
124 [META_ARGV_RLIMIT] = "COREDUMP_RLIMIT=",
125 [META_ARGV_HOSTNAME] = "COREDUMP_HOSTNAME=",
126 [META_COMM] = "COREDUMP_COMM=",
127 [META_EXE] = "COREDUMP_EXE=",
128 [META_UNIT] = "COREDUMP_UNIT=",
3e4d0f6c 129 [META_PROC_AUXV] = "COREDUMP_PROC_AUXV=",
f46c706b
FB
130};
131
132typedef struct Context {
133 const char *meta[_META_MAX];
3e4d0f6c 134 size_t meta_size[_META_MAX];
f46c706b 135 pid_t pid;
9764bca9
NR
136 uid_t uid;
137 gid_t gid;
f46c706b
FB
138 bool is_pid1;
139 bool is_journald;
140} Context;
141
34c10968
LP
/* Where coredumps are stored, as selected by the Storage= setting. */
typedef enum CoredumpStorage {
        COREDUMP_STORAGE_NONE = 0,
        COREDUMP_STORAGE_EXTERNAL,
        COREDUMP_STORAGE_JOURNAL,
        _COREDUMP_STORAGE_MAX,                  /* number of valid values, used to size tables */
        _COREDUMP_STORAGE_INVALID = -EINVAL,
} CoredumpStorage;
149
34c10968 150static const char* const coredump_storage_table[_COREDUMP_STORAGE_MAX] = {
510a1466 151 [COREDUMP_STORAGE_NONE] = "none",
34c10968 152 [COREDUMP_STORAGE_EXTERNAL] = "external",
510a1466 153 [COREDUMP_STORAGE_JOURNAL] = "journal",
34c10968
LP
154};
155
156DEFINE_PRIVATE_STRING_TABLE_LOOKUP(coredump_storage, CoredumpStorage);
8c9571d0 157static DEFINE_CONFIG_PARSE_ENUM(config_parse_coredump_storage, coredump_storage, CoredumpStorage, "Failed to parse storage setting");
34727273
ZJS
158
159static CoredumpStorage arg_storage = COREDUMP_STORAGE_EXTERNAL;
8c9571d0 160static bool arg_compress = true;
59f448cf
LP
161static uint64_t arg_process_size_max = PROCESS_SIZE_MAX;
162static uint64_t arg_external_size_max = EXTERNAL_SIZE_MAX;
6e2b4a69 163static uint64_t arg_journal_size_max = JOURNAL_SIZE_MAX;
f5fbe71d
YW
164static uint64_t arg_keep_free = UINT64_MAX;
165static uint64_t arg_max_use = UINT64_MAX;
34c10968
LP
166
167static int parse_config(void) {
34c10968 168 static const ConfigTableItem items[] = {
510a1466
ZJS
169 { "Coredump", "Storage", config_parse_coredump_storage, 0, &arg_storage },
170 { "Coredump", "Compress", config_parse_bool, 0, &arg_compress },
171 { "Coredump", "ProcessSizeMax", config_parse_iec_uint64, 0, &arg_process_size_max },
172 { "Coredump", "ExternalSizeMax", config_parse_iec_uint64_infinity, 0, &arg_external_size_max },
173 { "Coredump", "JournalSizeMax", config_parse_iec_size, 0, &arg_journal_size_max },
174 { "Coredump", "KeepFree", config_parse_iec_uint64, 0, &arg_keep_free },
175 { "Coredump", "MaxUse", config_parse_iec_uint64, 0, &arg_max_use },
34c10968
LP
176 {}
177 };
178
4a78074f
LP
179 int r;
180
6378f257 181 r = config_parse_standard_file_with_dropins(
e5abff37 182 "systemd/coredump.conf",
4a78074f
LP
183 "Coredump\0",
184 config_item_table_lookup,
185 items,
186 CONFIG_PARSE_WARN,
187 /* userdata= */ NULL);
188 if (r < 0)
189 return r;
190
191 /* Let's make sure we fix up the maximum size we send to the journal here on the client side, for
192 * efficiency reasons. journald wouldn't accept anything larger anyway. */
193 if (arg_journal_size_max > JOURNAL_SIZE_MAX) {
194 log_warning("JournalSizeMax= set to larger value (%s) than journald would accept (%s), lowering automatically.",
195 FORMAT_BYTES(arg_journal_size_max), FORMAT_BYTES(JOURNAL_SIZE_MAX));
196 arg_journal_size_max = JOURNAL_SIZE_MAX;
197 }
198
199 return 0;
34c10968
LP
200}
201
a1e92eee 202static uint64_t storage_size_max(void) {
ee0449fd
ZJS
203 if (arg_storage == COREDUMP_STORAGE_EXTERNAL)
204 return arg_external_size_max;
205 if (arg_storage == COREDUMP_STORAGE_JOURNAL)
206 return arg_journal_size_max;
207 assert(arg_storage == COREDUMP_STORAGE_NONE);
208 return 0;
73a99163
ZJS
209}
210
3e4d0f6c
ZJS
/* Adds a read ACL entry for 'uid' to the coredump open as 'fd', if ACL support is compiled in.
 *
 * Nothing is changed when allow_user is false, or when uid is a system user, a dynamic user or
 * the nobody user. Returns 0 on success (or if there was nothing to do), negative on failure. */
static int fix_acl(int fd, uid_t uid, bool allow_user) {
        assert(fd >= 0);
        assert(uid_is_valid(uid));

#if HAVE_ACL
        int q;

        /* Users may not read the coredump if the uid or capabilities were changed (that is what
         * !allow_user tells us), and we never grant access to non-regular users. */
        if (!allow_user ||
            uid_is_system(uid) ||
            uid_is_dynamic(uid) ||
            uid == UID_NOBODY)
                return 0;

        /* Regular users get to read their own coredumps, but not to write or delete them */
        q = fd_add_uid_acl_permission(fd, uid, ACL_READ);
        if (q < 0)
                return log_error_errno(q, "Failed to adjust ACL of the coredump: %m");
#endif

        return 0;
}
233
f46c706b
FB
234static int fix_xattr(int fd, const Context *context) {
235
236 static const char * const xattrs[_META_MAX] = {
510a1466
ZJS
237 [META_ARGV_PID] = "user.coredump.pid",
238 [META_ARGV_UID] = "user.coredump.uid",
239 [META_ARGV_GID] = "user.coredump.gid",
240 [META_ARGV_SIGNAL] = "user.coredump.signal",
241 [META_ARGV_TIMESTAMP] = "user.coredump.timestamp",
242 [META_ARGV_RLIMIT] = "user.coredump.rlimit",
243 [META_ARGV_HOSTNAME] = "user.coredump.hostname",
244 [META_COMM] = "user.coredump.comm",
245 [META_EXE] = "user.coredump.exe",
0cd77f97
LP
246 };
247
34c10968
LP
248 int r = 0;
249
b59233e6
LP
250 assert(fd >= 0);
251
60ecc386 252 /* Attach some metadata to coredumps via extended attributes. Just because we can. */
34c10968 253
fe96c0f8 254 for (unsigned i = 0; i < _META_MAX; i++) {
1eef15b1
ZJS
255 int k;
256
f46c706b 257 if (isempty(context->meta[i]) || !xattrs[i])
0cd77f97 258 continue;
34c10968 259
60ecc386
ZJS
260 k = RET_NERRNO(fsetxattr(fd, xattrs[i], context->meta[i], strlen(context->meta[i]), XATTR_CREATE));
261 RET_GATHER(r, k);
0cd77f97 262 }
34c10968
LP
263
264 return r;
265}
266
/* Escapes a string with xescape(), passing "./ " as the set of characters to escape; used for the
 * components of coredump file names. */
#define filename_escape(s) xescape((s), "./ ")

/* Returns 's' if it is set, and a placeholder suitable for log messages otherwise. */
static const char *coredump_tmpfile_name(const char *s) {
        if (s)
                return s;

        return "(unnamed temporary file)";
}
272
b59233e6
LP
273static int fix_permissions(
274 int fd,
275 const char *filename,
276 const char *target,
f46c706b 277 const Context *context,
3e4d0f6c 278 bool allow_user) {
b59233e6 279
03532f0a
LP
280 int r;
281
b59233e6 282 assert(fd >= 0);
b59233e6 283 assert(target);
3c171f0b 284 assert(context);
cfd652ed
ZJS
285
286 /* Ignore errors on these */
3c171f0b 287 (void) fchmod(fd, 0640);
9764bca9 288 (void) fix_acl(fd, context->uid, allow_user);
3c171f0b 289 (void) fix_xattr(fd, context);
cfd652ed 290
74402bf0 291 r = link_tmpfile(fd, filename, target, LINK_TMPFILE_SYNC);
03532f0a
LP
292 if (r < 0)
293 return log_error_errno(r, "Failed to move coredump %s into place: %m", target);
cfd652ed
ZJS
294
295 return 0;
296}
297
59f448cf 298static int maybe_remove_external_coredump(const char *filename, uint64_t size) {
cfd652ed 299
b59233e6 300 /* Returns 1 if might remove, 0 if will not remove, < 0 on error. */
cfd652ed 301
fc6cec86 302 if (arg_storage == COREDUMP_STORAGE_EXTERNAL &&
cfd652ed
ZJS
303 size <= arg_external_size_max)
304 return 0;
305
306 if (!filename)
307 return 1;
308
4a62c710
MS
309 if (unlink(filename) < 0 && errno != ENOENT)
310 return log_error_errno(errno, "Failed to unlink %s: %m", filename);
cfd652ed
ZJS
311
312 return 1;
313}
314
f46c706b 315static int make_filename(const Context *context, char **ret) {
b59233e6 316 _cleanup_free_ char *c = NULL, *u = NULL, *p = NULL, *t = NULL;
a7f7d1bd 317 sd_id128_t boot = {};
34c10968
LP
318 int r;
319
3c171f0b 320 assert(context);
34c10968 321
f46c706b 322 c = filename_escape(context->meta[META_COMM]);
34c10968 323 if (!c)
b59233e6 324 return -ENOMEM;
34c10968 325
f46c706b 326 u = filename_escape(context->meta[META_ARGV_UID]);
0dc5d23c 327 if (!u)
b59233e6 328 return -ENOMEM;
34c10968
LP
329
330 r = sd_id128_get_boot(&boot);
b59233e6 331 if (r < 0)
34c10968 332 return r;
34c10968 333
f46c706b 334 p = filename_escape(context->meta[META_ARGV_PID]);
b59233e6
LP
335 if (!p)
336 return -ENOMEM;
337
f46c706b 338 t = filename_escape(context->meta[META_ARGV_TIMESTAMP]);
b59233e6
LP
339 if (!t)
340 return -ENOMEM;
341
342 if (asprintf(ret,
64a5384f 343 "/var/lib/systemd/coredump/core.%s.%s." SD_ID128_FORMAT_STR ".%s.%s",
34c10968 344 c,
0dc5d23c 345 u,
34c10968
LP
346 SD_ID128_FORMAT_VAL(boot),
347 p,
b59233e6
LP
348 t) < 0)
349 return -ENOMEM;
350
351 return 0;
352}
353
3e4d0f6c
ZJS
354static int grant_user_access(int core_fd, const Context *context) {
355 int at_secure = -1;
356 uid_t uid = UID_INVALID, euid = UID_INVALID;
357 uid_t gid = GID_INVALID, egid = GID_INVALID;
358 int r;
359
360 assert(core_fd >= 0);
361 assert(context);
362
363 if (!context->meta[META_PROC_AUXV])
364 return log_warning_errno(SYNTHETIC_ERRNO(ENODATA), "No auxv data, not adjusting permissions.");
365
366 uint8_t elf[EI_NIDENT];
367 errno = 0;
368 if (pread(core_fd, &elf, sizeof(elf), 0) != sizeof(elf))
369 return log_warning_errno(errno_or_else(EIO),
370 "Failed to pread from coredump fd: %s", STRERROR_OR_EOF(errno));
371
372 if (elf[EI_MAG0] != ELFMAG0 ||
373 elf[EI_MAG1] != ELFMAG1 ||
374 elf[EI_MAG2] != ELFMAG2 ||
375 elf[EI_MAG3] != ELFMAG3 ||
376 elf[EI_VERSION] != EV_CURRENT)
377 return log_info_errno(SYNTHETIC_ERRNO(EUCLEAN),
378 "Core file does not have ELF header, not adjusting permissions.");
379 if (!IN_SET(elf[EI_CLASS], ELFCLASS32, ELFCLASS64) ||
380 !IN_SET(elf[EI_DATA], ELFDATA2LSB, ELFDATA2MSB))
381 return log_info_errno(SYNTHETIC_ERRNO(EUCLEAN),
382 "Core file has strange ELF class, not adjusting permissions.");
383
384 if ((elf[EI_DATA] == ELFDATA2LSB) != (__BYTE_ORDER == __LITTLE_ENDIAN))
385 return log_info_errno(SYNTHETIC_ERRNO(EUCLEAN),
386 "Core file has non-native endianness, not adjusting permissions.");
387
cb38fdbe
ZJS
388 r = parse_auxv(LOG_WARNING,
389 /* elf_class= */ elf[EI_CLASS],
390 context->meta[META_PROC_AUXV],
391 context->meta_size[META_PROC_AUXV],
392 &at_secure, &uid, &euid, &gid, &egid);
3e4d0f6c
ZJS
393 if (r < 0)
394 return r;
395
396 /* We allow access if we got all the data and at_secure is not set and
397 * the uid/gid matches euid/egid. */
398 bool ret =
399 at_secure == 0 &&
400 uid != UID_INVALID && euid != UID_INVALID && uid == euid &&
401 gid != GID_INVALID && egid != GID_INVALID && gid == egid;
402 log_debug("Will %s access (uid="UID_FMT " euid="UID_FMT " gid="GID_FMT " egid="GID_FMT " at_secure=%s)",
403 ret ? "permit" : "restrict",
404 uid, euid, gid, egid, yes_no(at_secure));
405 return ret;
406}
407
b59233e6 408static int save_external_coredump(
f46c706b 409 const Context *context,
3c171f0b 410 int input_fd,
b59233e6 411 char **ret_filename,
5f3e0a74
HW
412 int *ret_node_fd,
413 int *ret_data_fd,
0cd4e913 414 uint64_t *ret_size,
587f2a5e 415 uint64_t *ret_compressed_size,
cc4419ed 416 bool *ret_truncated) {
b59233e6 417
587f2a5e
LB
418 _cleanup_(unlink_and_freep) char *tmp = NULL;
419 _cleanup_free_ char *fn = NULL;
254d1313 420 _cleanup_close_ int fd = -EBADF;
ee0449fd 421 uint64_t rlimit, process_limit, max_size;
587f2a5e 422 bool truncated, storage_on_tmpfs;
b59233e6
LP
423 struct stat st;
424 int r;
425
3c171f0b 426 assert(context);
b59233e6 427 assert(ret_filename);
5f3e0a74
HW
428 assert(ret_node_fd);
429 assert(ret_data_fd);
b59233e6 430 assert(ret_size);
587f2a5e
LB
431 assert(ret_compressed_size);
432 assert(ret_truncated);
b59233e6 433
f46c706b 434 r = safe_atou64(context->meta[META_ARGV_RLIMIT], &rlimit);
bdfd7b2c 435 if (r < 0)
f46c706b
FB
436 return log_error_errno(r, "Failed to parse resource limit '%s': %m",
437 context->meta[META_ARGV_RLIMIT]);
d7a0f1f4 438 if (rlimit < page_size())
f46c706b 439 /* Is coredumping disabled? Then don't bother saving/processing the
3a559f22 440 * coredump. Anything below PAGE_SIZE cannot give a readable coredump
f46c706b
FB
441 * (the kernel uses ELF_EXEC_PAGESIZE which is not easily accessible, but
442 * is usually the same as PAGE_SIZE. */
baaa35ad
ZJS
443 return log_info_errno(SYNTHETIC_ERRNO(EBADSLT),
444 "Resource limits disable core dumping for process %s (%s).",
f46c706b 445 context->meta[META_ARGV_PID], context->meta[META_COMM]);
bdfd7b2c 446
ee0449fd 447 process_limit = MAX(arg_process_size_max, storage_size_max());
baaa35ad
ZJS
448 if (process_limit == 0)
449 return log_debug_errno(SYNTHETIC_ERRNO(EBADSLT),
450 "Limits for coredump processing and storage are both 0, not dumping core.");
ee0449fd 451
bdfd7b2c 452 /* Never store more than the process configured, or than we actually shall keep or process */
ee0449fd 453 max_size = MIN(rlimit, process_limit);
bdfd7b2c 454
3c171f0b 455 r = make_filename(context, &fn);
23bbb0de
MS
456 if (r < 0)
457 return log_error_errno(r, "Failed to determine coredump file name: %m");
34c10968 458
1fbe8d0c 459 (void) mkdir_parents_label(fn, 0755);
803a3464 460
03532f0a 461 fd = open_tmpfile_linkable(fn, O_RDWR|O_CLOEXEC, &tmp);
4a62c710 462 if (fd < 0)
03532f0a 463 return log_error_errno(fd, "Failed to create temporary file for coredump %s: %m", fn);
803a3464 464
587f2a5e
LB
465 /* If storage is on tmpfs, the kernel oomd might kill us if there's MemoryMax set on
466 * the service or the slice it belongs to. This is common on low-resources systems,
467 * to avoid crashing processes to take away too many system resources.
468 * Check the cgroup settings, and set max_size to a bit less than half of the
469 * available memory left to the process.
470 * Then, attempt to write the core file uncompressed first - if the write gets
471 * interrupted, we know we won't be able to write it all, so instead compress what
472 * was written so far, delete the uncompressed truncated core, and then continue
473 * compressing from STDIN. Given the compressed core cannot be larger than the
474 * uncompressed one, and 1KB for metadata is accounted for in the calculation, we
475 * should be able to at least store the full compressed core file. */
476
477 storage_on_tmpfs = fd_is_temporary_fs(fd) > 0;
478 if (storage_on_tmpfs && arg_compress) {
479 _cleanup_(sd_bus_flush_close_unrefp) sd_bus *bus = NULL;
480 uint64_t cgroup_limit = UINT64_MAX;
481 struct statvfs sv;
482
483 /* If we can't get the cgroup limit, just ignore it, but don't fail,
484 * try anyway with the config settings. */
485 r = sd_bus_default_system(&bus);
486 if (r < 0)
487 log_info_errno(r, "Failed to connect to system bus, skipping MemoryAvailable check: %m");
488 else {
489 _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL;
490
491 r = sd_bus_get_property_trivial(
492 bus,
493 "org.freedesktop.systemd1",
494 "/org/freedesktop/systemd1/unit/self",
495 "org.freedesktop.systemd1.Service",
496 "MemoryAvailable",
497 &error,
498 't', &cgroup_limit);
499 if (r < 0)
500 log_warning_errno(r,
501 "Failed to query MemoryAvailable for current unit, "
502 "falling back to static config settings: %s",
503 bus_error_message(&error, r));
504 }
803a3464 505
587f2a5e
LB
506 max_size = MIN(cgroup_limit, max_size);
507 max_size = LESS_BY(max_size, 1024U) / 2; /* Account for 1KB metadata overhead for compressing */
508 max_size = MAX(PROCESS_SIZE_MIN, max_size); /* Impose a lower minimum */
509
510 /* tmpfs might get full quickly, so check the available space too.
511 * But don't worry about errors here, failing to access the storage
512 * location will be better logged when writing to it. */
8facac5f 513 if (fstatvfs(fd, &sv) >= 0)
587f2a5e 514 max_size = MIN((uint64_t)sv.f_frsize * (uint64_t)sv.f_bfree, max_size);
34c10968 515
587f2a5e 516 log_debug("Limiting core file size to %" PRIu64 " bytes due to cgroup memory limits.", max_size);
7849c2ac
TA
517 }
518
587f2a5e
LB
519 r = copy_bytes(input_fd, fd, max_size, 0);
520 if (r < 0)
521 return log_error_errno(r, "Cannot store coredump of %s (%s): %m",
522 context->meta[META_ARGV_PID], context->meta[META_COMM]);
523 truncated = r == 1;
cfd652ed 524
3e4d0f6c
ZJS
525 bool allow_user = grant_user_access(fd, context) > 0;
526
587f2a5e
LB
527#if HAVE_COMPRESSION
528 if (arg_compress) {
529 _cleanup_(unlink_and_freep) char *tmp_compressed = NULL;
530 _cleanup_free_ char *fn_compressed = NULL;
254d1313 531 _cleanup_close_ int fd_compressed = -EBADF;
587f2a5e
LB
532 uint64_t uncompressed_size = 0;
533
86cbbc6d 534 if (lseek(fd, 0, SEEK_SET) < 0)
587f2a5e 535 return log_error_errno(errno, "Failed to seek on coredump %s: %m", fn);
cfd652ed 536
ee00684c 537 fn_compressed = strjoin(fn, default_compression_extension());
587f2a5e
LB
538 if (!fn_compressed)
539 return log_oom();
cfd652ed 540
03532f0a 541 fd_compressed = open_tmpfile_linkable(fn_compressed, O_RDWR|O_CLOEXEC, &tmp_compressed);
587f2a5e
LB
542 if (fd_compressed < 0)
543 return log_error_errno(fd_compressed, "Failed to create temporary file for coredump %s: %m", fn_compressed);
cfd652ed 544
587f2a5e
LB
545 r = compress_stream(fd, fd_compressed, max_size, &uncompressed_size);
546 if (r < 0)
547 return log_error_errno(r, "Failed to compress %s: %m", coredump_tmpfile_name(tmp_compressed));
548
549 if (truncated && storage_on_tmpfs) {
550 uint64_t partial_uncompressed_size = 0;
551
552 /* Uncompressed write was truncated and we are writing to tmpfs: delete
553 * the uncompressed core, and compress the remaining part from STDIN. */
554
555 tmp = unlink_and_free(tmp);
556 fd = safe_close(fd);
557
558 r = compress_stream(input_fd, fd_compressed, max_size, &partial_uncompressed_size);
559 if (r < 0)
560 return log_error_errno(r, "Failed to compress %s: %m", coredump_tmpfile_name(tmp_compressed));
561 uncompressed_size += partial_uncompressed_size;
b59233e6
LP
562 }
563
9764bca9 564 r = fix_permissions(fd_compressed, tmp_compressed, fn_compressed, context, allow_user);
cfd652ed 565 if (r < 0)
587f2a5e 566 return r;
b59233e6 567
587f2a5e
LB
568 if (fstat(fd_compressed, &st) < 0)
569 return log_error_errno(errno,
570 "Failed to fstat core file %s: %m",
571 coredump_tmpfile_name(tmp_compressed));
cfd652ed 572
587f2a5e
LB
573 *ret_filename = TAKE_PTR(fn_compressed); /* compressed */
574 *ret_node_fd = TAKE_FD(fd_compressed); /* compressed */
575 *ret_compressed_size = (uint64_t) st.st_size; /* compressed */
576 *ret_data_fd = TAKE_FD(fd);
577 *ret_size = uncompressed_size;
578 *ret_truncated = truncated;
579 tmp_compressed = mfree(tmp_compressed);
cfd652ed 580
cfd652ed 581 return 0;
34c10968 582 }
3b1a55e1 583#endif
5f3e0a74 584
587f2a5e
LB
585 if (truncated)
586 log_struct(LOG_INFO,
08e86b15
DDM
587 LOG_MESSAGE("Core file was truncated to %"PRIu64" bytes.", max_size),
588 "SIZE_LIMIT=%"PRIu64, max_size,
587f2a5e
LB
589 "MESSAGE_ID=" SD_MESSAGE_TRUNCATED_CORE_STR);
590
9764bca9 591 r = fix_permissions(fd, tmp, fn, context, allow_user);
cfd652ed 592 if (r < 0)
587f2a5e
LB
593 return log_error_errno(r, "Failed to fix permissions and finalize coredump %s into %s: %m", coredump_tmpfile_name(tmp), fn);
594
595 if (fstat(fd, &st) < 0)
596 return log_error_errno(errno, "Failed to fstat core file %s: %m", coredump_tmpfile_name(tmp));
597
86cbbc6d 598 if (lseek(fd, 0, SEEK_SET) < 0)
587f2a5e 599 return log_error_errno(errno, "Failed to seek on coredump %s: %m", fn);
34c10968 600
0cfb0971 601 *ret_filename = TAKE_PTR(fn);
1cc6c93a 602 *ret_data_fd = TAKE_FD(fd);
59f448cf 603 *ret_size = (uint64_t) st.st_size;
587f2a5e 604 *ret_truncated = truncated;
34c10968 605
34c10968 606 return 0;
34c10968
LP
607}
608
609static int allocate_journal_field(int fd, size_t size, char **ret, size_t *ret_size) {
610 _cleanup_free_ char *field = NULL;
611 ssize_t n;
612
8d4e028f 613 assert(fd >= 0);
34c10968
LP
614 assert(ret);
615 assert(ret_size);
616
86cbbc6d 617 if (lseek(fd, 0, SEEK_SET) < 0)
4a62c710 618 return log_warning_errno(errno, "Failed to seek: %m");
803a3464 619
34c10968 620 field = malloc(9 + size);
a73c74db
LP
621 if (!field)
622 return log_warning_errno(SYNTHETIC_ERRNO(ENOMEM),
623 "Failed to allocate memory for coredump, coredump will not be stored.");
34c10968
LP
624
625 memcpy(field, "COREDUMP=", 9);
626
a73c74db
LP
627 /* NB: simple read() would fail for overly large coredumps, since read() on Linux can only deal with
628 * 0x7ffff000 bytes max. Hence call things in a loop. */
629 n = loop_read(fd, field + 9, size, /* do_poll= */ false);
23bbb0de
MS
630 if (n < 0)
631 return log_error_errno((int) n, "Failed to read core data: %m");
baaa35ad 632 if ((size_t) n < size)
4e494e6a 633 return log_error_errno(SYNTHETIC_ERRNO(EIO), "Core data too short.");
34c10968 634
1cc6c93a 635 *ret = TAKE_PTR(field);
34c10968
LP
636 *ret_size = size + 9;
637
34c10968
LP
638 return 0;
639}
803a3464 640
3f132692
JF
641/* Joins /proc/[pid]/fd/ and /proc/[pid]/fdinfo/ into the following lines:
642 * 0:/dev/pts/23
643 * pos: 0
644 * flags: 0100002
645 *
646 * 1:/dev/pts/23
647 * pos: 0
648 * flags: 0100002
649 *
650 * 2:/dev/pts/23
651 * pos: 0
652 * flags: 0100002
653 * EOF
654 */
2485b7e2
YW
655static int compose_open_fds(pid_t pid, char **ret) {
656 _cleanup_(memstream_done) MemStream m = {};
4d84bc2f 657 _cleanup_closedir_ DIR *proc_fd_dir = NULL;
254d1313 658 _cleanup_close_ int proc_fdinfo_fd = -EBADF;
59059b4a 659 const char *fddelim = "", *path;
2485b7e2 660 FILE *stream;
7b26ea6f 661 int r;
3f132692
JF
662
663 assert(pid >= 0);
2485b7e2 664 assert(ret);
3f132692 665
59059b4a 666 path = procfs_file_alloca(pid, "fd");
3f132692 667 proc_fd_dir = opendir(path);
59059b4a
ZJS
668 if (!proc_fd_dir)
669 return -errno;
3f132692 670
4d84bc2f 671 proc_fdinfo_fd = openat(dirfd(proc_fd_dir), "../fdinfo", O_DIRECTORY|O_NOFOLLOW|O_CLOEXEC|O_PATH);
59059b4a
ZJS
672 if (proc_fdinfo_fd < 0)
673 return -errno;
3f132692 674
2485b7e2 675 stream = memstream_init(&m);
3f132692
JF
676 if (!stream)
677 return -ENOMEM;
678
af3b864d 679 FOREACH_DIRENT(de, proc_fd_dir, return -errno) {
3f132692 680 _cleanup_fclose_ FILE *fdinfo = NULL;
4d84bc2f 681 _cleanup_free_ char *fdname = NULL;
254d1313 682 _cleanup_close_ int fd = -EBADF;
3f132692 683
af3b864d 684 r = readlinkat_malloc(dirfd(proc_fd_dir), de->d_name, &fdname);
3f132692
JF
685 if (r < 0)
686 return r;
687
af3b864d 688 fprintf(stream, "%s%s:%s\n", fddelim, de->d_name, fdname);
3f132692
JF
689 fddelim = "\n";
690
691 /* Use the directory entry from /proc/[pid]/fd with /proc/[pid]/fdinfo */
af3b864d 692 fd = openat(proc_fdinfo_fd, de->d_name, O_NOFOLLOW|O_CLOEXEC|O_RDONLY);
59059b4a 693 if (fd < 0)
3f132692
JF
694 continue;
695
b46c3e49
VC
696 fdinfo = take_fdopen(&fd, "r");
697 if (!fdinfo)
3f132692
JF
698 continue;
699
7b26ea6f
LP
700 for (;;) {
701 _cleanup_free_ char *line = NULL;
702
703 r = read_line(fdinfo, LONG_LINE_MAX, &line);
704 if (r < 0)
705 return r;
706 if (r == 0)
707 break;
708
0d536673 709 fputs(line, stream);
7b26ea6f 710 fputc('\n', stream);
4d84bc2f 711 }
3f132692
JF
712 }
713
2485b7e2 714 return memstream_finalize(&m, ret, NULL);
3f132692
JF
715}
716
7ed03ce6
JF
717/* Returns 1 if the parent was found.
718 * Returns 0 if there is not a process we can call the pid's
719 * container parent (the pid's process isn't 'containerized').
720 * Returns a negative number on errors.
721 */
722static int get_process_container_parent_cmdline(pid_t pid, char** cmdline) {
7ed03ce6
JF
723 pid_t container_pid;
724 const char *proc_root_path;
725 struct stat root_stat, proc_root_stat;
83844031 726 int r;
7ed03ce6
JF
727
728 /* To compare inodes of / and /proc/[pid]/root */
729 if (stat("/", &root_stat) < 0)
730 return -errno;
731
732 proc_root_path = procfs_file_alloca(pid, "root");
733 if (stat(proc_root_path, &proc_root_stat) < 0)
734 return -errno;
735
736 /* The process uses system root. */
c20c77ef 737 if (stat_inode_same(&proc_root_stat, &root_stat)) {
7ed03ce6
JF
738 *cmdline = NULL;
739 return 0;
740 }
741
ade39d9a 742 r = namespace_get_leader(pid, NAMESPACE_MOUNT, &container_pid);
7ed03ce6
JF
743 if (r < 0)
744 return r;
745
a034620f 746 r = pid_get_cmdline(container_pid, SIZE_MAX, PROCESS_CMDLINE_QUOTE_POSIX, cmdline);
d3cba4ea
EV
747 if (r < 0)
748 return r;
749
750 return 1;
7ed03ce6
JF
751}
752
f46c706b 753static int change_uid_gid(const Context *context) {
9764bca9
NR
754 uid_t uid = context->uid;
755 gid_t gid = context->gid;
3c171f0b 756 int r;
34c10968 757
28add648 758 if (uid_is_system(uid)) {
888e378d
LP
759 const char *user = "systemd-coredump";
760
fafff8f1 761 r = get_user_creds(&user, &uid, &gid, NULL, NULL, 0);
888e378d
LP
762 if (r < 0) {
763 log_warning_errno(r, "Cannot resolve %s user. Proceeding to dump core as root: %m", user);
764 uid = gid = 0;
765 }
888e378d 766 }
3c171f0b
LP
767
768 return drop_privileges(uid, gid, 0);
769}
8c8549db 770
3c171f0b 771static int submit_coredump(
3e4d0f6c 772 const Context *context,
9a435388 773 struct iovec_wrapper *iovw,
3c171f0b 774 int input_fd) {
34c10968 775
c546154a 776 _cleanup_(json_variant_unrefp) JsonVariant *json_metadata = NULL;
254d1313 777 _cleanup_close_ int coredump_fd = -EBADF, coredump_node_fd = -EBADF;
9a435388 778 _cleanup_free_ char *filename = NULL, *coredump_data = NULL;
51d3783d 779 _cleanup_free_ char *stacktrace = NULL;
c546154a 780 const char *module_name;
587f2a5e 781 uint64_t coredump_size = UINT64_MAX, coredump_compressed_size = UINT64_MAX;
6fea39ba 782 bool truncated = false, written = false;
c546154a 783 JsonVariant *module_json;
3c171f0b 784 int r;
83844031 785
3c171f0b 786 assert(context);
9a435388 787 assert(iovw);
3c171f0b 788 assert(input_fd >= 0);
f5e04665 789
3c171f0b
LP
790 /* Vacuum before we write anything again */
791 (void) coredump_vacuum(-1, arg_keep_free, arg_max_use);
803a3464 792
3c171f0b 793 /* Always stream the coredump to disk, if that's possible */
c8e94763
LP
794 written = save_external_coredump(
795 context, input_fd,
796 &filename, &coredump_node_fd, &coredump_fd,
797 &coredump_size, &coredump_compressed_size, &truncated) >= 0;
798 if (written) {
799 /* If we could write it to disk we can now process it. */
800 /* If we don't want to keep the coredump on disk, remove it now, as later on we
801 * will lack the privileges for it. However, we keep the fd to it, so that we can
802 * still process it and log it. */
803 r = maybe_remove_external_coredump(filename, coredump_node_fd >= 0 ? coredump_compressed_size : coredump_size);
804 if (r < 0)
805 return r;
806 if (r == 0)
807 (void) iovw_put_string_field(iovw, "COREDUMP_FILENAME=", filename);
808 else if (arg_storage == COREDUMP_STORAGE_EXTERNAL)
809 log_info("The core will not be stored: size %"PRIu64" is greater than %"PRIu64" (the configured maximum)",
810 coredump_node_fd >= 0 ? coredump_compressed_size : coredump_size, arg_external_size_max);
811
812 /* Vacuum again, but exclude the coredump we just created */
813 (void) coredump_vacuum(coredump_node_fd >= 0 ? coredump_node_fd : coredump_fd, arg_keep_free, arg_max_use);
814 }
6fea39ba 815
c8e94763
LP
816 /* Now, let's drop privileges to become the user who owns the segfaulted process and allocate the
817 * coredump memory under the user's uid. This also ensures that the credentials journald will see are
818 * the ones of the coredumping user, thus making sure the user gets access to the core dump. Let's
819 * also get rid of all capabilities, if we run as root, we won't need them anymore. */
3c171f0b
LP
820 r = change_uid_gid(context);
821 if (r < 0)
822 return log_error_errno(r, "Failed to drop privileges: %m");
34c10968 823
c8e94763
LP
824 if (written) {
825 /* Try to get a stack trace if we can */
826 if (coredump_size > arg_process_size_max)
827 log_debug("Not generating stack trace: core size %"PRIu64" is greater "
828 "than %"PRIu64" (the configured maximum)",
829 coredump_size, arg_process_size_max);
830 else if (coredump_fd >= 0) {
831 bool skip = startswith(context->meta[META_COMM], "systemd-coredum"); /* COMM is 16 bytes usually */
832
833 (void) parse_elf_object(coredump_fd,
834 context->meta[META_EXE],
835 /* fork_disable_dump= */ skip, /* avoid loops */
836 &stacktrace,
837 &json_metadata);
838 }
c790632c 839 }
51d3783d 840
6fea39ba 841 _cleanup_free_ char *core_message = NULL;
6fea39ba
LP
842 core_message = strjoin(
843 "Process ", context->meta[META_ARGV_PID],
844 " (", context->meta[META_COMM],
845 ") of user ", context->meta[META_ARGV_UID],
846 written ? " dumped core." : " terminated abnormally without generating a coredump.");
847 if (!core_message)
848 return log_oom();
849
850 if (context->is_journald && filename)
851 if (!strextend(&core_message, "\nCoredump diverted to ", filename))
852 return log_oom();
51d3783d 853
6fea39ba
LP
854 if (stacktrace)
855 if (!strextend(&core_message, "\n\n", stacktrace))
856 return log_oom();
92e92d71 857
5edf875b
DDM
858 if (context->is_journald)
859 /* We might not be able to log to the journal, so let's always print the message to another
860 * log target. The target was set previously to something safe. */
9a435388 861 log_dispatch(LOG_ERR, 0, core_message);
92e92d71 862
2a3bebd0 863 (void) iovw_put_string_field(iovw, "MESSAGE=", core_message);
3c171f0b 864
0cd4e913 865 if (truncated)
2a3bebd0 866 (void) iovw_put_string_field(iovw, "COREDUMP_TRUNCATED=", "1");
0cd4e913 867
c546154a
LB
868 /* If we managed to parse any ELF metadata (build-id, ELF package meta),
869 * attach it as journal metadata. */
870 if (json_metadata) {
871 _cleanup_free_ char *formatted_json = NULL;
872
873 r = json_variant_format(json_metadata, 0, &formatted_json);
874 if (r < 0)
875 return log_error_errno(r, "Failed to format JSON package metadata: %m");
876
671769c9 877 (void) iovw_put_string_field(iovw, "COREDUMP_PACKAGE_JSON=", formatted_json);
c546154a
LB
878 }
879
c790632c
ZJS
880 /* In the unlikely scenario that context->meta[META_EXE] is not available,
881 * let's avoid guessing the module name and skip the loop. */
882 if (context->meta[META_EXE])
883 JSON_VARIANT_OBJECT_FOREACH(module_name, module_json, json_metadata) {
884 JsonVariant *t;
c546154a 885
c790632c
ZJS
886 /* We only add structured fields for the 'main' ELF module, and only if we can identify it. */
887 if (!path_equal_filename(module_name, context->meta[META_EXE]))
888 continue;
c546154a 889
c790632c
ZJS
890 t = json_variant_by_key(module_json, "name");
891 if (t)
892 (void) iovw_put_string_field(iovw, "COREDUMP_PACKAGE_NAME=", json_variant_string(t));
1f2abb79 893
c790632c
ZJS
894 t = json_variant_by_key(module_json, "version");
895 if (t)
896 (void) iovw_put_string_field(iovw, "COREDUMP_PACKAGE_VERSION=", json_variant_string(t));
897 }
c546154a 898
3c171f0b 899 /* Optionally store the entire coredump in the journal */
587f2a5e 900 if (arg_storage == COREDUMP_STORAGE_JOURNAL && coredump_fd >= 0) {
6e9ef603
ZJS
901 if (coredump_size <= arg_journal_size_max) {
902 size_t sz = 0;
903
904 /* Store the coredump itself in the journal */
905
906 r = allocate_journal_field(coredump_fd, (size_t) coredump_size, &coredump_data, &sz);
9a435388
FB
907 if (r >= 0) {
908 if (iovw_put(iovw, coredump_data, sz) >= 0)
909 TAKE_PTR(coredump_data);
910 } else
6e9ef603
ZJS
911 log_warning_errno(r, "Failed to attach the core to the journal entry: %m");
912 } else
5206a724 913 log_info("The core will not be stored: size %"PRIu64" is greater than %"PRIu64" (the configured maximum)",
6e9ef603 914 coredump_size, arg_journal_size_max);
f5e04665
LP
915 }
916
5edf875b
DDM
917 /* If journald is coredumping, we have to be careful that we don't deadlock when trying to write the
918 * coredump to the journal, so we put the journal socket in nonblocking mode before trying to write
919 * the coredump to the socket. */
920
921 if (context->is_journald) {
922 r = journal_fd_nonblock(true);
923 if (r < 0)
924 return log_error_errno(r, "Failed to make journal socket non-blocking: %m");
925 }
926
9a435388 927 r = sd_journal_sendv(iovw->iovec, iovw->count);
5edf875b
DDM
928
929 if (context->is_journald) {
930 int k;
931
932 k = journal_fd_nonblock(false);
933 if (k < 0)
934 return log_error_errno(k, "Failed to make journal socket blocking: %m");
935 }
936
937 if (r == -EAGAIN && context->is_journald)
938 log_warning_errno(r, "Failed to log journal coredump, ignoring: %m");
939 else if (r < 0)
3c171f0b
LP
940 return log_error_errno(r, "Failed to log coredump: %m");
941
942 return 0;
943}
944
f46c706b 945static int save_context(Context *context, const struct iovec_wrapper *iovw) {
f46c706b
FB
946 const char *unit;
947 int r;
3c171f0b 948
3c171f0b 949 assert(context);
f46c706b
FB
950 assert(iovw);
951 assert(iovw->count >= _META_ARGV_MAX);
3c171f0b 952
f46c706b 953 /* The context does not allocate any memory on its own */
3c171f0b 954
fe96c0f8 955 for (size_t n = 0; n < iovw->count; n++) {
f46c706b 956 struct iovec *iovec = iovw->iovec + n;
92e92d71 957
fe96c0f8 958 for (size_t i = 0; i < ELEMENTSOF(meta_field_names); i++) {
f46c706b
FB
959 /* Note that these strings are NUL terminated, because we made sure that a
960 * trailing NUL byte is in the buffer, though not included in the iov_len
961 * count (see process_socket() and gather_pid_metadata_*()) */
962 assert(((char*) iovec->iov_base)[iovec->iov_len] == 0);
3c171f0b 963
3e4d0f6c 964 const char *p = startswith(iovec->iov_base, meta_field_names[i]);
f46c706b
FB
965 if (p) {
966 context->meta[i] = p;
3e4d0f6c 967 context->meta_size[i] = iovec->iov_len - strlen(meta_field_names[i]);
f46c706b
FB
968 break;
969 }
970 }
3c171f0b 971 }
f46c706b
FB
972
973 if (!context->meta[META_ARGV_PID])
974 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
975 "Failed to find the PID of crashing process");
976
977 r = parse_pid(context->meta[META_ARGV_PID], &context->pid);
978 if (r < 0)
979 return log_error_errno(r, "Failed to parse PID \"%s\": %m", context->meta[META_ARGV_PID]);
980
9764bca9
NR
981 r = parse_uid(context->meta[META_ARGV_UID], &context->uid);
982 if (r < 0)
983 return log_error_errno(r, "Failed to parse UID \"%s\": %m", context->meta[META_ARGV_UID]);
984
985 r = parse_gid(context->meta[META_ARGV_GID], &context->gid);
986 if (r < 0)
987 return log_error_errno(r, "Failed to parse GID \"%s\": %m", context->meta[META_ARGV_GID]);
988
f46c706b
FB
989 unit = context->meta[META_UNIT];
990 context->is_pid1 = streq(context->meta[META_ARGV_PID], "1") || streq_ptr(unit, SPECIAL_INIT_SCOPE);
991 context->is_journald = streq_ptr(unit, SPECIAL_JOURNALD_SERVICE);
992
993 return 0;
3c171f0b
LP
994}
995
996static int process_socket(int fd) {
254d1313 997 _cleanup_close_ int input_fd = -EBADF;
f46c706b 998 Context context = {};
9a435388
FB
999 struct iovec_wrapper iovw = {};
1000 struct iovec iovec;
fe96c0f8 1001 int r;
3c171f0b
LP
1002
1003 assert(fd >= 0);
1004
d2acb93d 1005 log_setup();
3c171f0b 1006
988e89ee
ZJS
1007 log_debug("Processing coredump received on stdin...");
1008
3c171f0b 1009 for (;;) {
fb29cdbe 1010 CMSG_BUFFER_TYPE(CMSG_SPACE(sizeof(int))) control;
3c171f0b
LP
1011 struct msghdr mh = {
1012 .msg_control = &control,
1013 .msg_controllen = sizeof(control),
1014 .msg_iovlen = 1,
1015 };
1016 ssize_t n;
fe1ef0f8 1017 ssize_t l;
3c171f0b 1018
fe1ef0f8
EV
1019 l = next_datagram_size_fd(fd);
1020 if (l < 0) {
1021 r = log_error_errno(l, "Failed to determine datagram size to read: %m");
3c171f0b
LP
1022 goto finish;
1023 }
1024
9a435388
FB
1025 iovec.iov_len = l;
1026 iovec.iov_base = malloc(l + 1);
1027 if (!iovec.iov_base) {
3c171f0b
LP
1028 r = log_oom();
1029 goto finish;
1030 }
1031
9a435388 1032 mh.msg_iov = &iovec;
3c171f0b 1033
3691bcf3 1034 n = recvmsg_safe(fd, &mh, MSG_CMSG_CLOEXEC);
3c171f0b 1035 if (n < 0) {
9a435388 1036 free(iovec.iov_base);
3691bcf3 1037 r = log_error_errno(n, "Failed to receive datagram: %m");
3c171f0b
LP
1038 goto finish;
1039 }
1040
9a435388
FB
1041 /* The final zero-length datagram carries the file descriptor and tells us
1042 * that we're done. */
3c171f0b 1043 if (n == 0) {
dac556fa 1044 struct cmsghdr *found;
3c171f0b 1045
9a435388 1046 free(iovec.iov_base);
3c171f0b 1047
dac556fa 1048 found = cmsg_find(&mh, SOL_SOCKET, SCM_RIGHTS, CMSG_LEN(sizeof(int)));
3c171f0b 1049 if (!found) {
3691bcf3
LP
1050 cmsg_close_all(&mh);
1051 r = log_error_errno(SYNTHETIC_ERRNO(EBADMSG),
1052 "Coredump file descriptor missing.");
3c171f0b
LP
1053 goto finish;
1054 }
1055
f8540bde 1056 assert(input_fd < 0);
b1d02191 1057 input_fd = *CMSG_TYPED_DATA(found, int);
3c171f0b 1058 break;
3691bcf3
LP
1059 } else
1060 cmsg_close_all(&mh);
3c171f0b
LP
1061
1062 /* Add trailing NUL byte, in case these are strings */
9a435388
FB
1063 ((char*) iovec.iov_base)[n] = 0;
1064 iovec.iov_len = (size_t) n;
3c171f0b 1065
9a435388
FB
1066 r = iovw_put(&iovw, iovec.iov_base, iovec.iov_len);
1067 if (r < 0)
1068 goto finish;
34c10968
LP
1069 }
1070
61233823 1071 /* Make sure we got all data we really need */
f8540bde 1072 assert(input_fd >= 0);
3c171f0b 1073
f46c706b
FB
1074 r = save_context(&context, &iovw);
1075 if (r < 0)
1076 goto finish;
1077
1078 /* Make sure we received at least all fields we need. */
fe96c0f8 1079 for (int i = 0; i < _META_MANDATORY_MAX; i++)
f46c706b
FB
1080 if (!context.meta[i]) {
1081 r = log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1082 "A mandatory argument (%i) has not been sent, aborting.",
1083 i);
1084 goto finish;
1085 }
80002f66 1086
f46c706b 1087 r = submit_coredump(&context, &iovw, input_fd);
3c171f0b
LP
1088
1089finish:
9a435388 1090 iovw_free_contents(&iovw, true);
3c171f0b
LP
1091 return r;
1092}
1093
9a435388 1094static int send_iovec(const struct iovec_wrapper *iovw, int input_fd) {
254d1313 1095 _cleanup_close_ int fd = -EBADF;
3c171f0b
LP
1096 int r;
1097
9a435388 1098 assert(iovw);
3c171f0b
LP
1099 assert(input_fd >= 0);
1100
1101 fd = socket(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0);
1102 if (fd < 0)
1103 return log_error_errno(errno, "Failed to create coredump socket: %m");
1104
1861986a
LP
1105 r = connect_unix_path(fd, AT_FDCWD, "/run/systemd/coredump");
1106 if (r < 0)
1107 return log_error_errno(r, "Failed to connect to coredump service: %m");
3c171f0b 1108
fe96c0f8 1109 for (size_t i = 0; i < iovw->count; i++) {
fec603eb 1110 struct msghdr mh = {
9a435388 1111 .msg_iov = iovw->iovec + i,
fec603eb
LP
1112 .msg_iovlen = 1,
1113 };
1114 struct iovec copy[2];
1115
1116 for (;;) {
1117 if (sendmsg(fd, &mh, MSG_NOSIGNAL) >= 0)
1118 break;
1119
1120 if (errno == EMSGSIZE && mh.msg_iov[0].iov_len > 0) {
f46c706b
FB
1121 /* This field didn't fit? That's a pity. Given that this is
1122 * just metadata, let's truncate the field at half, and try
1123 * again. We append three dots, in order to show that this is
1124 * truncated. */
fec603eb
LP
1125
1126 if (mh.msg_iov != copy) {
f46c706b
FB
1127 /* We don't want to modify the caller's iovec, hence
1128 * let's create our own array, consisting of two new
1129 * iovecs, where the first is a (truncated) copy of
1130 * what we want to send, and the second one contains
1131 * the trailing dots. */
9a435388 1132 copy[0] = iovw->iovec[i];
ed0cb346 1133 copy[1] = IOVEC_MAKE(((char[]){'.', '.', '.'}), 3);
fec603eb
LP
1134
1135 mh.msg_iov = copy;
1136 mh.msg_iovlen = 2;
1137 }
1138
1139 copy[0].iov_len /= 2; /* halve it, and try again */
1140 continue;
1141 }
3c171f0b 1142
3c171f0b 1143 return log_error_errno(errno, "Failed to send coredump datagram: %m");
fec603eb 1144 }
1eef15b1
ZJS
1145 }
1146
3c171f0b
LP
1147 r = send_one_fd(fd, input_fd, 0);
1148 if (r < 0)
1149 return log_error_errno(r, "Failed to send coredump fd: %m");
1eef15b1 1150
3c171f0b
LP
1151 return 0;
1152}
1eef15b1 1153
64a5384f
LP
1154static int gather_pid_metadata_from_argv(
1155 struct iovec_wrapper *iovw,
1156 Context *context,
1157 int argc, char **argv) {
1158
f46c706b 1159 _cleanup_free_ char *free_timestamp = NULL;
fe96c0f8 1160 int r, signo;
3c171f0b 1161 char *t;
3c171f0b 1162
e6aa443f
LP
1163 assert(iovw);
1164 assert(context);
1165
f46c706b
FB
1166 /* We gather all metadata that were passed via argv[] into an array of iovecs that
1167 * we'll forward to the socket unit */
3c171f0b 1168
f46c706b
FB
1169 if (argc < _META_ARGV_MAX)
1170 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1171 "Not enough arguments passed by the kernel (%i, expected %i).",
1172 argc, _META_ARGV_MAX);
3c171f0b 1173
fe96c0f8 1174 for (int i = 0; i < _META_ARGV_MAX; i++) {
3c171f0b 1175
f46c706b 1176 t = argv[i];
3c171f0b 1177
f46c706b 1178 switch (i) {
64a5384f 1179
f46c706b
FB
1180 case META_ARGV_TIMESTAMP:
1181 /* The journal fields contain the timestamp padded with six
1182 * zeroes, so that the kernel-supplied 1s granularity timestamps
e503019b 1183 * becomes 1μs granularity, i.e. the granularity systemd usually
f46c706b
FB
1184 * operates in. */
1185 t = free_timestamp = strjoin(argv[i], "000000");
1186 if (!t)
1187 return log_oom();
1188 break;
64a5384f 1189
f46c706b
FB
1190 case META_ARGV_SIGNAL:
1191 /* For signal, record its pretty name too */
1192 if (safe_atoi(argv[i], &signo) >= 0 && SIGNAL_VALID(signo))
2a3bebd0
FB
1193 (void) iovw_put_string_field(iovw, "COREDUMP_SIGNAL_NAME=SIG",
1194 signal_to_string(signo));
f46c706b 1195 break;
64a5384f 1196
f46c706b
FB
1197 default:
1198 break;
c8091d92
LP
1199 }
1200
f46c706b
FB
1201 r = iovw_put_string_field(iovw, meta_field_names[i], t);
1202 if (r < 0)
1203 return r;
8c8549db 1204 }
803a3464 1205
f46c706b
FB
1206 /* Cache some of the process metadata we collected so far and that we'll need to
1207 * access soon */
1208 return save_context(context, iovw);
1209}
3c171f0b 1210
db9ac801 1211static int gather_pid_metadata_from_procfs(struct iovec_wrapper *iovw, Context *context) {
f46c706b
FB
1212 uid_t owner_uid;
1213 pid_t pid;
1214 char *t;
3e4d0f6c 1215 size_t size;
f46c706b
FB
1216 const char *p;
1217 int r;
f5e04665 1218
e6aa443f
LP
1219 assert(iovw);
1220 assert(context);
1221
f46c706b
FB
1222 /* Note that if we fail on oom later on, we do not roll-back changes to the iovec
1223 * structure. (It remains valid, with the first iovec fields initialized.) */
f5e04665 1224
f46c706b 1225 pid = context->pid;
f5e04665 1226
f46c706b 1227 /* The following is mandatory */
d7d74854 1228 r = pid_get_comm(pid, &t);
9a435388 1229 if (r < 0)
f46c706b 1230 return log_error_errno(r, "Failed to get COMM: %m");
f5e04665 1231
f46c706b 1232 r = iovw_put_string_field_free(iovw, "COREDUMP_COMM=", t);
9a435388
FB
1233 if (r < 0)
1234 return r;
f45b8015 1235
c790632c 1236 /* The following are optional, but we use them if present. */
2a3bebd0
FB
1237 r = get_process_exe(pid, &t);
1238 if (r >= 0)
1239 r = iovw_put_string_field_free(iovw, "COREDUMP_EXE=", t);
1240 if (r < 0)
f46c706b 1241 log_warning_errno(r, "Failed to get EXE, ignoring: %m");
bdfd7b2c 1242
f46c706b 1243 if (cg_pid_get_unit(pid, &t) >= 0)
2a3bebd0 1244 (void) iovw_put_string_field_free(iovw, "COREDUMP_UNIT=", t);
f5e04665 1245
f46c706b 1246 if (cg_pid_get_user_unit(pid, &t) >= 0)
2a3bebd0 1247 (void) iovw_put_string_field_free(iovw, "COREDUMP_USER_UNIT=", t);
f46c706b 1248
9aa82023 1249 if (sd_pid_get_session(pid, &t) >= 0)
9a435388 1250 (void) iovw_put_string_field_free(iovw, "COREDUMP_SESSION=", t);
f5e04665 1251
a035f819 1252 if (sd_pid_get_owner_uid(pid, &owner_uid) >= 0) {
9a435388 1253 r = asprintf(&t, UID_FMT, owner_uid);
7de80bfe 1254 if (r > 0)
9a435388 1255 (void) iovw_put_string_field_free(iovw, "COREDUMP_OWNER_UID=", t);
f5e04665
LP
1256 }
1257
9aa82023 1258 if (sd_pid_get_slice(pid, &t) >= 0)
2a3bebd0 1259 (void) iovw_put_string_field_free(iovw, "COREDUMP_SLICE=", t);
f5e04665 1260
a034620f 1261 if (pid_get_cmdline(pid, SIZE_MAX, PROCESS_CMDLINE_QUOTE_POSIX, &t) >= 0)
2a3bebd0 1262 (void) iovw_put_string_field_free(iovw, "COREDUMP_CMDLINE=", t);
a035f819 1263
9aa82023 1264 if (cg_pid_get_path_shifted(pid, NULL, &t) >= 0)
2a3bebd0 1265 (void) iovw_put_string_field_free(iovw, "COREDUMP_CGROUP=", t);
a035f819 1266
9aa82023 1267 if (compose_open_fds(pid, &t) >= 0)
2a3bebd0 1268 (void) iovw_put_string_field_free(iovw, "COREDUMP_OPEN_FDS=", t);
3f132692
JF
1269
1270 p = procfs_file_alloca(pid, "status");
627055ce 1271 if (read_full_virtual_file(p, &t, NULL) >= 0)
2a3bebd0 1272 (void) iovw_put_string_field_free(iovw, "COREDUMP_PROC_STATUS=", t);
3f132692
JF
1273
1274 p = procfs_file_alloca(pid, "maps");
627055ce 1275 if (read_full_virtual_file(p, &t, NULL) >= 0)
2a3bebd0 1276 (void) iovw_put_string_field_free(iovw, "COREDUMP_PROC_MAPS=", t);
3f132692
JF
1277
1278 p = procfs_file_alloca(pid, "limits");
627055ce 1279 if (read_full_virtual_file(p, &t, NULL) >= 0)
2a3bebd0 1280 (void) iovw_put_string_field_free(iovw, "COREDUMP_PROC_LIMITS=", t);
3f132692
JF
1281
1282 p = procfs_file_alloca(pid, "cgroup");
3e4d0f6c 1283 if (read_full_virtual_file(p, &t, NULL) >= 0)
2a3bebd0 1284 (void) iovw_put_string_field_free(iovw, "COREDUMP_PROC_CGROUP=", t);
3f132692 1285
d7032b1f 1286 p = procfs_file_alloca(pid, "mountinfo");
3e4d0f6c 1287 if (read_full_virtual_file(p, &t, NULL) >= 0)
2a3bebd0 1288 (void) iovw_put_string_field_free(iovw, "COREDUMP_PROC_MOUNTINFO=", t);
d7032b1f 1289
3e4d0f6c
ZJS
1290 /* We attach /proc/auxv here. ELF coredumps also contain a note for this (NT_AUXV), see elf(5). */
1291 p = procfs_file_alloca(pid, "auxv");
1292 if (read_full_virtual_file(p, &t, &size) >= 0) {
1293 char *buf = malloc(strlen("COREDUMP_PROC_AUXV=") + size + 1);
1294 if (buf) {
1295 /* Add a dummy terminator to make save_context() happy. */
1296 *((uint8_t*) mempcpy(stpcpy(buf, "COREDUMP_PROC_AUXV="), t, size)) = '\0';
1297 (void) iovw_consume(iovw, buf, size + strlen("COREDUMP_PROC_AUXV="));
1298 }
1299
1300 free(t);
1301 }
1302
9aa82023 1303 if (get_process_cwd(pid, &t) >= 0)
2a3bebd0 1304 (void) iovw_put_string_field_free(iovw, "COREDUMP_CWD=", t);
3f132692
JF
1305
1306 if (get_process_root(pid, &t) >= 0) {
9aa82023
ZJS
1307 bool proc_self_root_is_slash;
1308
1309 proc_self_root_is_slash = strcmp(t, "/") == 0;
3f132692 1310
2a3bebd0 1311 (void) iovw_put_string_field_free(iovw, "COREDUMP_ROOT=", t);
7ed03ce6
JF
1312
1313 /* If the process' root is "/", then there is a chance it has
1314 * mounted own root and hence being containerized. */
9aa82023 1315 if (proc_self_root_is_slash && get_process_container_parent_cmdline(pid, &t) > 0)
2a3bebd0 1316 (void) iovw_put_string_field_free(iovw, "COREDUMP_CONTAINER_CMDLINE=", t);
3f132692
JF
1317 }
1318
9aa82023 1319 if (get_process_environ(pid, &t) >= 0)
2a3bebd0 1320 (void) iovw_put_string_field_free(iovw, "COREDUMP_ENVIRON=", t);
9aa82023 1321
f46c706b
FB
1322 /* we successfully acquired all metadata */
1323 return save_context(context, iovw);
9aa82023 1324}
3f132692 1325
a108c43e
NR
1326static int send_ucred(int transport_fd, struct ucred *ucred) {
1327 CMSG_BUFFER_TYPE(CMSG_SPACE(sizeof(struct ucred))) control = {};
1328 struct msghdr mh = {
1329 .msg_control = &control,
1330 .msg_controllen = sizeof(control),
1331 };
1332 struct cmsghdr *cmsg;
1333
1334 assert(transport_fd >= 0);
1335
1336 cmsg = CMSG_FIRSTHDR(&mh);
1337 *cmsg = (struct cmsghdr) {
1338 .cmsg_level = SOL_SOCKET,
1339 .cmsg_type = SCM_CREDENTIALS,
1340 .cmsg_len = CMSG_LEN(sizeof(struct ucred)),
1341 };
1342 memcpy(CMSG_DATA(cmsg), ucred, sizeof(struct ucred));
1343
1344 return RET_NERRNO(sendmsg(transport_fd, &mh, MSG_NOSIGNAL));
1345}
1346
1347static int receive_ucred(int transport_fd, struct ucred *ret_ucred) {
1348 CMSG_BUFFER_TYPE(CMSG_SPACE(sizeof(struct ucred))) control = {};
1349 struct msghdr mh = {
1350 .msg_control = &control,
1351 .msg_controllen = sizeof(control),
1352 };
1353 struct cmsghdr *cmsg = NULL;
1354 struct ucred *ucred = NULL;
1355 ssize_t n;
1356
1357 assert(ret_ucred);
1358
1359 n = recvmsg_safe(transport_fd, &mh, 0);
1360 if (n < 0)
1361 return n;
1362
1363 CMSG_FOREACH(cmsg, &mh)
1364 if (cmsg->cmsg_level == SOL_SOCKET &&
1365 cmsg->cmsg_type == SCM_CREDENTIALS &&
1366 cmsg->cmsg_len == CMSG_LEN(sizeof(struct ucred))) {
1367
1368 assert(!ucred);
1369 ucred = CMSG_TYPED_DATA(cmsg, struct ucred);
1370 }
1371
1372 if (!ucred)
1373 return -EIO;
1374
1375 *ret_ucred = *ucred;
1376
1377 return 0;
1378}
1379
1380static int can_forward_coredump(pid_t pid) {
1381 _cleanup_free_ char *cgroup = NULL, *path = NULL, *unit = NULL;
1382 int r;
1383
1384 r = cg_pid_get_path(SYSTEMD_CGROUP_CONTROLLER, pid, &cgroup);
1385 if (r < 0)
1386 return r;
1387
1388 r = path_extract_directory(cgroup, &path);
1389 if (r < 0)
1390 return r;
1391
1392 r = cg_path_get_unit_path(path, &unit);
1393 if (r == -ENOMEM)
1394 return log_oom();
1395 if (r == -ENXIO)
1396 /* No valid units in this path. */
1397 return false;
1398 if (r < 0)
1399 return r;
1400
1401 /* We require that this process belongs to a delegated cgroup
1402 * (i.e. Delegate=yes), with CoredumpReceive=yes also. */
1403 r = cg_is_delegated(unit);
1404 if (r <= 0)
1405 return r;
1406
1407 return cg_has_coredump_receive(unit);
1408}
1409
1410static int forward_coredump_to_container(Context *context) {
1411 _cleanup_close_ int pidnsfd = -EBADF, mntnsfd = -EBADF, netnsfd = -EBADF, usernsfd = -EBADF, rootfd = -EBADF;
71136404 1412 _cleanup_close_pair_ int pair[2] = EBADF_PAIR;
a108c43e
NR
1413 pid_t pid, child;
1414 struct ucred ucred = {
1415 .pid = context->pid,
1416 .uid = context->uid,
1417 .gid = context->gid,
1418 };
1419 int r;
1420
1421 r = namespace_get_leader(context->pid, NAMESPACE_PID, &pid);
1422 if (r < 0)
1423 return log_debug_errno(r, "Failed to get namespace leader: %m");
1424
1425 r = can_forward_coredump(pid);
1426 if (r < 0)
1427 return log_debug_errno(r, "Failed to check if coredump can be forwarded: %m");
1428 if (r == 0)
1429 return log_debug_errno(SYNTHETIC_ERRNO(ENOENT),
1430 "Coredump will not be forwarded because no target cgroup was found.");
1431
1432 r = RET_NERRNO(socketpair(AF_UNIX, SOCK_DGRAM|SOCK_CLOEXEC, 0, pair));
1433 if (r < 0)
1434 return log_debug_errno(r, "Failed to create socket pair: %m");
1435
1436 r = setsockopt_int(pair[1], SOL_SOCKET, SO_PASSCRED, true);
1437 if (r < 0)
1438 return log_debug_errno(r, "Failed to set SO_PASSCRED: %m");
1439
1440 r = namespace_open(pid, &pidnsfd, &mntnsfd, &netnsfd, &usernsfd, &rootfd);
1441 if (r < 0)
1442 return log_debug_errno(r, "Failed to join namespaces of PID " PID_FMT ": %m", pid);
1443
1444 r = namespace_fork("(sd-coredumpns)", "(sd-coredump)", NULL, 0,
e9ccae31 1445 FORK_RESET_SIGNALS|FORK_DEATHSIG_SIGTERM,
a108c43e
NR
1446 pidnsfd, mntnsfd, netnsfd, usernsfd, rootfd, &child);
1447 if (r < 0)
1448 return log_debug_errno(r, "Failed to fork into namespaces of PID " PID_FMT ": %m", pid);
1449 if (r == 0) {
1450 _cleanup_(iovw_free_freep) struct iovec_wrapper *iovw = NULL;
1451 Context child_context = {};
1452
1453 pair[0] = safe_close(pair[0]);
1454
1455 if (laccess("/run/systemd/coredump", W_OK) < 0) {
1456 log_debug_errno(errno, "Cannot find coredump socket, exiting: %m");
1457 _exit(EXIT_FAILURE);
1458 }
1459
1460 r = receive_ucred(pair[1], &ucred);
1461 if (r < 0) {
1462 log_debug_errno(r, "Failed to receive ucred and fd: %m");
1463 _exit(EXIT_FAILURE);
1464 }
1465
1466 iovw = iovw_new();
1467 if (!iovw) {
1468 log_oom();
1469 _exit(EXIT_FAILURE);
1470 }
1471
1472 (void) iovw_put_string_field(iovw, "MESSAGE_ID=", SD_MESSAGE_COREDUMP_STR);
1473 (void) iovw_put_string_field(iovw, "PRIORITY=", STRINGIFY(LOG_CRIT));
1474 (void) iovw_put_string_field(iovw, "COREDUMP_FORWARDED=", "1");
1475
1476 for (int i = 0; i < _META_ARGV_MAX; i++) {
1477 int signo;
1478 char buf[DECIMAL_STR_MAX(pid_t)];
1479 const char *t = context->meta[i];
1480
1d03d970 1481 switch (i) {
a108c43e
NR
1482
1483 case META_ARGV_PID:
1484 xsprintf(buf, PID_FMT, ucred.pid);
1485 t = buf;
1486
1487 break;
1488
1489 case META_ARGV_UID:
1490 xsprintf(buf, UID_FMT, ucred.uid);
1491 t = buf;
1492 break;
1493
1494 case META_ARGV_GID:
1495 xsprintf(buf, GID_FMT, ucred.gid);
1496 t = buf;
1497 break;
1498
1499 case META_ARGV_SIGNAL:
1500 if (safe_atoi(t, &signo) >= 0 && SIGNAL_VALID(signo))
1501 (void) iovw_put_string_field(iovw,
1502 "COREDUMP_SIGNAL_NAME=SIG",
1503 signal_to_string(signo));
1504 break;
1505
1506 default:
1507 break;
1508 }
1509
1510 r = iovw_put_string_field(iovw, meta_field_names[i], t);
1511 if (r < 0) {
1512 log_debug_errno(r, "Failed to construct iovec: %m");
1513 _exit(EXIT_FAILURE);
1514 }
1515 }
1516
1517 r = save_context(&child_context, iovw);
1518 if (r < 0) {
1519 log_debug_errno(r, "Failed to save context: %m");
1520 _exit(EXIT_FAILURE);
1521 }
1522
1523 r = gather_pid_metadata_from_procfs(iovw, &child_context);
1524 if (r < 0) {
1525 log_debug_errno(r, "Failed to gather metadata from procfs: %m");
1526 _exit(EXIT_FAILURE);
1527 }
1528
1529 r = send_iovec(iovw, STDIN_FILENO);
1530 if (r < 0) {
1531 log_debug_errno(r, "Failed to send iovec to coredump socket: %m");
1532 _exit(EXIT_FAILURE);
1533 }
1534
1535 _exit(EXIT_SUCCESS);
1536 }
1537
1538 pair[1] = safe_close(pair[1]);
1539
1540 /* We need to translate the PID, UID, and GID of the crashing process
1541 * to the container's namespaces. Do this by sending an SCM_CREDENTIALS
1542 * message on a socket pair, and read the result when we join the
1543 * container. The kernel will perform the translation for us. */
1544 r = send_ucred(pair[0], &ucred);
1545 if (r < 0)
1546 return log_debug_errno(r, "Failed to send metadata to container: %m");
1547
1548 r = wait_for_terminate_and_check("(sd-coredumpns)", child, 0);
1549 if (r < 0)
1550 return log_debug_errno(r, "Failed to wait for child to terminate: %m");
1551 if (r != EXIT_SUCCESS)
4e494e6a 1552 return log_debug_errno(SYNTHETIC_ERRNO(EPROTO), "Failed to process coredump in container.");
a108c43e
NR
1553
1554 return 0;
1555}
1556
9aa82023 1557static int process_kernel(int argc, char* argv[]) {
6257e2fb 1558 _cleanup_(iovw_free_freep) struct iovec_wrapper *iovw = NULL;
f46c706b 1559 Context context = {};
2a9b1a76 1560 int r, signo;
9aa82023 1561
1f9d2a81
DDM
1562 /* When we're invoked by the kernel, stdout/stderr are closed which is dangerous because the fds
1563 * could get reallocated. To avoid hard to debug issues, let's instead bind stdout/stderr to
1564 * /dev/null. */
5bb1d7fb 1565 r = rearrange_stdio(STDIN_FILENO, -EBADF, -EBADF);
1f9d2a81
DDM
1566 if (r < 0)
1567 return log_error_errno(r, "Failed to connect stdout/stderr to /dev/null: %m");
1568
988e89ee
ZJS
1569 log_debug("Processing coredump received from the kernel...");
1570
9a435388
FB
1571 iovw = iovw_new();
1572 if (!iovw)
1573 return log_oom();
1574
f46c706b
FB
1575 /* Collect all process metadata passed by the kernel through argv[] */
1576 r = gather_pid_metadata_from_argv(iovw, &context, argc - 1, argv + 1);
92e92d71 1577 if (r < 0)
6257e2fb 1578 return r;
86562420 1579
f46c706b 1580 /* Collect the rest of the process metadata retrieved from the runtime */
db9ac801 1581 r = gather_pid_metadata_from_procfs(iovw, &context);
f46c706b 1582 if (r < 0)
6257e2fb 1583 return r;
f46c706b 1584
1e344c1d 1585 if (!context.is_journald)
f46c706b 1586 /* OK, now we know it's not the journal, hence we can make use of it now. */
1e344c1d 1587 log_set_target_and_open(LOG_TARGET_JOURNAL_OR_KMSG);
f46c706b 1588
2a9b1a76
HB
1589 /* Log minimal metadata now, so it is not lost if the system is about to shut down. */
1590 log_info("Process %s (%s) of user %s terminated abnormally with signal %s/%s, processing...",
1591 context.meta[META_ARGV_PID], context.meta[META_COMM],
1592 context.meta[META_ARGV_UID], context.meta[META_ARGV_SIGNAL],
1593 strna(safe_atoi(context.meta[META_ARGV_SIGNAL], &signo) >= 0 ? signal_to_string(signo) : NULL));
1594
a108c43e
NR
1595 r = in_same_namespace(getpid_cached(), context.pid, NAMESPACE_PID);
1596 if (r < 0)
1597 log_debug_errno(r, "Failed to check pidns of crashing process, ignoring: %m");
1598 if (r == 0) {
1599 /* If this fails, fallback to the old behavior so that
1600 * there is still some record of the crash. */
1601 r = forward_coredump_to_container(&context);
1602 if (r >= 0)
1603 return 0;
1604 }
1605
f46c706b
FB
1606 /* If this is PID 1 disable coredump collection, we'll unlikely be able to process
1607 * it later on.
1608 *
1609 * FIXME: maybe we should disable coredumps generation from the beginning and
1610 * re-enable it only when we know it's either safe (ie we're not running OOM) or
1611 * it's not pid1 ? */
1612 if (context.is_pid1) {
1613 log_notice("Due to PID 1 having crashed coredump collection will now be turned off.");
1614 disable_coredumps();
1615 }
34c10968 1616
a108c43e
NR
1617 (void) iovw_put_string_field(iovw, "MESSAGE_ID=", SD_MESSAGE_COREDUMP_STR);
1618 (void) iovw_put_string_field(iovw, "PRIORITY=", STRINGIFY(LOG_CRIT));
1619
f46c706b 1620 if (context.is_journald || context.is_pid1)
6257e2fb 1621 return submit_coredump(&context, iovw, STDIN_FILENO);
9aa82023 1622
6257e2fb 1623 return send_iovec(iovw, STDIN_FILENO);
3c171f0b 1624}
34c10968 1625
988e89ee 1626static int process_backtrace(int argc, char *argv[]) {
3a19fe46
YW
1627 _cleanup_(journal_importer_cleanup) JournalImporter importer = JOURNAL_IMPORTER_INIT(STDIN_FILENO);
1628 _cleanup_(iovw_free_freep) struct iovec_wrapper *iovw = NULL;
f46c706b 1629 Context context = {};
9a435388 1630 char *message;
988e89ee
ZJS
1631 int r;
1632
1633 log_debug("Processing backtrace on stdin...");
1634
9a435388
FB
1635 iovw = iovw_new();
1636 if (!iovw)
5b45a160
ZJS
1637 return log_oom();
1638
2a3bebd0
FB
1639 (void) iovw_put_string_field(iovw, "MESSAGE_ID=", SD_MESSAGE_BACKTRACE_STR);
1640 (void) iovw_put_string_field(iovw, "PRIORITY=", STRINGIFY(LOG_CRIT));
f46c706b
FB
1641
1642 /* Collect all process metadata from argv[] by making sure to skip the
1643 * '--backtrace' option */
1644 r = gather_pid_metadata_from_argv(iovw, &context, argc - 2, argv + 2);
988e89ee 1645 if (r < 0)
3a19fe46 1646 return r;
aaeb2522 1647
f46c706b 1648 /* Collect the rest of the process metadata retrieved from the runtime */
db9ac801 1649 r = gather_pid_metadata_from_procfs(iovw, &context);
f46c706b 1650 if (r < 0)
3a19fe46 1651 return r;
988e89ee 1652
86562420 1653 for (;;) {
5b45a160 1654 r = journal_importer_process_data(&importer);
3a19fe46
YW
1655 if (r < 0)
1656 return log_error_errno(r, "Failed to parse journal entry on stdin: %m");
d74dc4f2
ZJS
1657 if (r == 1 || /* complete entry */
1658 journal_importer_eof(&importer)) /* end of data */
5b45a160 1659 break;
988e89ee 1660 }
988e89ee 1661
5b45a160
ZJS
1662 if (journal_importer_eof(&importer)) {
1663 log_warning("Did not receive a full journal entry on stdin, ignoring message sent by reporter");
988e89ee 1664
f46c706b
FB
1665 message = strjoina("Process ", context.meta[META_ARGV_PID],
1666 " (", context.meta[META_COMM], ")"
1667 " of user ", context.meta[META_ARGV_UID],
1668 " failed with ", context.meta[META_ARGV_SIGNAL]);
9a435388
FB
1669
1670 r = iovw_put_string_field(iovw, "MESSAGE=", message);
1671 if (r < 0)
3a19fe46 1672 return r;
5b45a160 1673 } else {
3a19fe46
YW
1674 /* The imported iovecs are not supposed to be freed by us so let's copy and merge them at the
1675 * end of the array. */
1676 r = iovw_append(iovw, &importer.iovw);
1677 if (r < 0)
1678 return r;
9a435388 1679 }
988e89ee 1680
9a435388 1681 r = sd_journal_sendv(iovw->iovec, iovw->count);
988e89ee 1682 if (r < 0)
3a19fe46 1683 return log_error_errno(r, "Failed to log backtrace: %m");
988e89ee 1684
3a19fe46 1685 return 0;
988e89ee
ZJS
1686}
1687
4515a95e 1688static int run(int argc, char *argv[]) {
3c171f0b 1689 int r;
fee80f69 1690
9aa82023
ZJS
1691 /* First, log to a safe place, since we don't know what crashed and it might
1692 * be journald which we'd rather not log to then. */
8d4e028f 1693
1e344c1d 1694 log_set_target_and_open(LOG_TARGET_KMSG);
8d4e028f 1695
3c171f0b
LP
1696 /* Make sure we never enter a loop */
1697 (void) prctl(PR_SET_DUMPABLE, 0);
8d4e028f 1698
3c171f0b
LP
1699 /* Ignore all parse errors */
1700 (void) parse_config();
fee80f69 1701
3c171f0b
LP
1702 log_debug("Selected storage '%s'.", coredump_storage_to_string(arg_storage));
1703 log_debug("Selected compression %s.", yes_no(arg_compress));
fee80f69 1704
3c171f0b 1705 r = sd_listen_fds(false);
4515a95e
ZJS
1706 if (r < 0)
1707 return log_error_errno(r, "Failed to determine the number of file descriptors: %m");
fee80f69 1708
9aa82023
ZJS
1709 /* If we got an fd passed, we are running in coredumpd mode. Otherwise we
1710 * are invoked from the kernel as coredump handler. */
988e89ee
ZJS
1711 if (r == 0) {
1712 if (streq_ptr(argv[1], "--backtrace"))
4515a95e 1713 return process_backtrace(argc, argv);
988e89ee 1714 else
4515a95e 1715 return process_kernel(argc, argv);
988e89ee 1716 } else if (r == 1)
4515a95e 1717 return process_socket(SD_LISTEN_FDS_START);
f5e04665 1718
baaa35ad
ZJS
1719 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1720 "Received unexpected number of file descriptors.");
f5e04665 1721}
4515a95e
ZJS
1722
1723DEFINE_MAIN_FUNCTION(run);