]> git.ipfire.org Git - thirdparty/systemd.git/blob - src/coredump/coredump.c
Merge branch 'systemd-security/coredump-capabilities'
[thirdparty/systemd.git] / src / coredump / coredump.c
1 /* SPDX-License-Identifier: LGPL-2.1-or-later */
2
3 #include <errno.h>
4 #include <stdio.h>
5 #include <sys/prctl.h>
6 #include <sys/statvfs.h>
7 #include <sys/auxv.h>
8 #include <sys/xattr.h>
9 #include <unistd.h>
10
11 #include "sd-daemon.h"
12 #include "sd-journal.h"
13 #include "sd-login.h"
14 #include "sd-messages.h"
15
16 #include "acl-util.h"
17 #include "alloc-util.h"
18 #include "bus-error.h"
19 #include "capability-util.h"
20 #include "cgroup-util.h"
21 #include "compress.h"
22 #include "conf-parser.h"
23 #include "copy.h"
24 #include "coredump-util.h"
25 #include "coredump-vacuum.h"
26 #include "dirent-util.h"
27 #include "elf-util.h"
28 #include "escape.h"
29 #include "fd-util.h"
30 #include "fileio.h"
31 #include "fs-util.h"
32 #include "io-util.h"
33 #include "journal-importer.h"
34 #include "journal-send.h"
35 #include "log.h"
36 #include "macro.h"
37 #include "main-func.h"
38 #include "memory-util.h"
39 #include "mkdir-label.h"
40 #include "parse-util.h"
41 #include "process-util.h"
42 #include "signal-util.h"
43 #include "socket-util.h"
44 #include "special.h"
45 #include "stat-util.h"
46 #include "string-table.h"
47 #include "string-util.h"
48 #include "strv.h"
49 #include "sync-util.h"
50 #include "tmpfile-util.h"
51 #include "uid-alloc-range.h"
52 #include "user-util.h"
53
54 /* The maximum size up to which we process coredumps. We use 1G on 32bit systems, and 32G on 64bit systems */
55 #if __SIZEOF_POINTER__ == 4
56 #define PROCESS_SIZE_MAX ((uint64_t) (1LLU*1024LLU*1024LLU*1024LLU))
57 #elif __SIZEOF_POINTER__ == 8
58 #define PROCESS_SIZE_MAX ((uint64_t) (32LLU*1024LLU*1024LLU*1024LLU))
59 #else
60 #error "Unexpected pointer size"
61 #endif
62
63 /* The maximum size up to which we leave the coredump around on disk */
64 #define EXTERNAL_SIZE_MAX PROCESS_SIZE_MAX
65
66 /* The maximum size up to which we store the coredump in the journal */
67 #ifndef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION
68 #define JOURNAL_SIZE_MAX ((size_t) (767LU*1024LU*1024LU))
69 #else
70 /* oss-fuzz limits memory usage. */
71 #define JOURNAL_SIZE_MAX ((size_t) (10LU*1024LU*1024LU))
72 #endif
73
74 /* When checking for available memory and setting lower limits, don't
75 * go below 4MB for writing core files to storage. */
76 #define PROCESS_SIZE_MIN (4U*1024U*1024U)
77
78 /* Make sure to not make this larger than the maximum journal entry
79 * size. See DATA_SIZE_MAX in journal-importer.h. */
80 assert_cc(JOURNAL_SIZE_MAX <= DATA_SIZE_MAX);
81
/* Indexes into the per-process metadata cache (Context.meta[] / Context.meta_size[]). */
enum {
        /* First group: the values the kernel hands us on the command line, in the order given by the
         * pattern we install in /proc/sys/kernel/core_pattern (see man:core(5)). */

        META_ARGV_PID,          /* %P: as seen in the initial pid namespace */
        META_ARGV_UID,          /* %u: as seen in the initial user namespace */
        META_ARGV_GID,          /* %g: as seen in the initial user namespace */
        META_ARGV_SIGNAL,       /* %s: number of signal causing dump */
        META_ARGV_TIMESTAMP,    /* %t: time of dump, expressed as seconds since the Epoch (we expand this to µs granularity) */
        META_ARGV_RLIMIT,       /* %c: core file size soft resource limit */
        META_ARGV_HOSTNAME,     /* %h: hostname */
        _META_ARGV_MAX,

        /* Second group: fields that do not come from argv[] but are read from the runtime
         * environment. They are cached because we need them quickly when naming coredump files and
         * attaching xattrs. Everything up to _META_MANDATORY_MAX must be present. */

        META_COMM = _META_ARGV_MAX,
        _META_MANDATORY_MAX,

        /* Third group: like the second one, but a missing entry is not treated as a failure. */

        META_EXE = _META_MANDATORY_MAX,
        META_UNIT,
        META_PROC_AUXV,
        _META_MAX
};
114
115 static const char * const meta_field_names[_META_MAX] = {
116 [META_ARGV_PID] = "COREDUMP_PID=",
117 [META_ARGV_UID] = "COREDUMP_UID=",
118 [META_ARGV_GID] = "COREDUMP_GID=",
119 [META_ARGV_SIGNAL] = "COREDUMP_SIGNAL=",
120 [META_ARGV_TIMESTAMP] = "COREDUMP_TIMESTAMP=",
121 [META_ARGV_RLIMIT] = "COREDUMP_RLIMIT=",
122 [META_ARGV_HOSTNAME] = "COREDUMP_HOSTNAME=",
123 [META_COMM] = "COREDUMP_COMM=",
124 [META_EXE] = "COREDUMP_EXE=",
125 [META_UNIT] = "COREDUMP_UNIT=",
126 [META_PROC_AUXV] = "COREDUMP_PROC_AUXV=",
127 };
128
129 typedef struct Context {
130 const char *meta[_META_MAX];
131 size_t meta_size[_META_MAX];
132 pid_t pid;
133 bool is_pid1;
134 bool is_journald;
135 } Context;
136
137 typedef enum CoredumpStorage {
138 COREDUMP_STORAGE_NONE,
139 COREDUMP_STORAGE_EXTERNAL,
140 COREDUMP_STORAGE_JOURNAL,
141 _COREDUMP_STORAGE_MAX,
142 _COREDUMP_STORAGE_INVALID = -EINVAL,
143 } CoredumpStorage;
144
145 static const char* const coredump_storage_table[_COREDUMP_STORAGE_MAX] = {
146 [COREDUMP_STORAGE_NONE] = "none",
147 [COREDUMP_STORAGE_EXTERNAL] = "external",
148 [COREDUMP_STORAGE_JOURNAL] = "journal",
149 };
150
151 DEFINE_PRIVATE_STRING_TABLE_LOOKUP(coredump_storage, CoredumpStorage);
152 static DEFINE_CONFIG_PARSE_ENUM(config_parse_coredump_storage, coredump_storage, CoredumpStorage, "Failed to parse storage setting");
153
154 static CoredumpStorage arg_storage = COREDUMP_STORAGE_EXTERNAL;
155 static bool arg_compress = true;
156 static uint64_t arg_process_size_max = PROCESS_SIZE_MAX;
157 static uint64_t arg_external_size_max = EXTERNAL_SIZE_MAX;
158 static uint64_t arg_journal_size_max = JOURNAL_SIZE_MAX;
159 static uint64_t arg_keep_free = UINT64_MAX;
160 static uint64_t arg_max_use = UINT64_MAX;
161
162 static int parse_config(void) {
163 static const ConfigTableItem items[] = {
164 { "Coredump", "Storage", config_parse_coredump_storage, 0, &arg_storage },
165 { "Coredump", "Compress", config_parse_bool, 0, &arg_compress },
166 { "Coredump", "ProcessSizeMax", config_parse_iec_uint64, 0, &arg_process_size_max },
167 { "Coredump", "ExternalSizeMax", config_parse_iec_uint64_infinity, 0, &arg_external_size_max },
168 { "Coredump", "JournalSizeMax", config_parse_iec_size, 0, &arg_journal_size_max },
169 { "Coredump", "KeepFree", config_parse_iec_uint64, 0, &arg_keep_free },
170 { "Coredump", "MaxUse", config_parse_iec_uint64, 0, &arg_max_use },
171 {}
172 };
173
174 return config_parse_many_nulstr(
175 PKGSYSCONFDIR "/coredump.conf",
176 CONF_PATHS_NULSTR("systemd/coredump.conf.d"),
177 "Coredump\0",
178 config_item_table_lookup, items,
179 CONFIG_PARSE_WARN,
180 NULL,
181 NULL);
182 }
183
184 static uint64_t storage_size_max(void) {
185 if (arg_storage == COREDUMP_STORAGE_EXTERNAL)
186 return arg_external_size_max;
187 if (arg_storage == COREDUMP_STORAGE_JOURNAL)
188 return arg_journal_size_max;
189 assert(arg_storage == COREDUMP_STORAGE_NONE);
190 return 0;
191 }
192
/* Grants the crashing user read access to the coredump via an ACL entry, where that is appropriate.
 *
 * Nothing is changed (and 0 is returned) when ACL support is compiled out, when allow_user is false,
 * or when uid is a system user, a dynamic user or the nobody user. Otherwise returns 0 on success and
 * a negative errno-style value (after logging) if the ACL could not be adjusted. */
static int fix_acl(int fd, uid_t uid, bool allow_user) {
        assert(fd >= 0);
        assert(uid_is_valid(uid));

#if HAVE_ACL
        /* Users must not be able to read coredumps if the uid or capabilities were changed, and only
         * normal users get an ACL entry at all. */
        if (!allow_user || uid_is_system(uid) || uid_is_dynamic(uid) || uid == UID_NOBODY)
                return 0;

        /* Read access only: the user shall not be able to write or delete their own coredumps */
        int r = fd_add_uid_acl_permission(fd, uid, ACL_READ);
        if (r < 0)
                return log_error_errno(r, "Failed to adjust ACL of the coredump: %m");
#endif

        return 0;
}
215
216 static int fix_xattr(int fd, const Context *context) {
217
218 static const char * const xattrs[_META_MAX] = {
219 [META_ARGV_PID] = "user.coredump.pid",
220 [META_ARGV_UID] = "user.coredump.uid",
221 [META_ARGV_GID] = "user.coredump.gid",
222 [META_ARGV_SIGNAL] = "user.coredump.signal",
223 [META_ARGV_TIMESTAMP] = "user.coredump.timestamp",
224 [META_ARGV_RLIMIT] = "user.coredump.rlimit",
225 [META_ARGV_HOSTNAME] = "user.coredump.hostname",
226 [META_COMM] = "user.coredump.comm",
227 [META_EXE] = "user.coredump.exe",
228 };
229
230 int r = 0;
231
232 assert(fd >= 0);
233
234 /* Attach some metadata to coredumps via extended
235 * attributes. Just because we can. */
236
237 for (unsigned i = 0; i < _META_MAX; i++) {
238 int k;
239
240 if (isempty(context->meta[i]) || !xattrs[i])
241 continue;
242
243 k = fsetxattr(fd, xattrs[i], context->meta[i], strlen(context->meta[i]), XATTR_CREATE);
244 if (k < 0 && r == 0)
245 r = -errno;
246 }
247
248 return r;
249 }
250
/* Escapes '.', '/' and ' ' so that the string can be used as one component of a coredump file name. */
#define filename_escape(s) xescape((s), "./ ")

/* Returns a printable name for a temporary file path that may be NULL. */
static const char *coredump_tmpfile_name(const char *s) {
        if (s)
                return s;

        return "(unnamed temporary file)";
}
256
257 static int fix_permissions(
258 int fd,
259 const char *filename,
260 const char *target,
261 const Context *context,
262 uid_t uid,
263 bool allow_user) {
264
265 int r;
266
267 assert(fd >= 0);
268 assert(target);
269 assert(context);
270
271 /* Ignore errors on these */
272 (void) fchmod(fd, 0640);
273 (void) fix_acl(fd, uid, allow_user);
274 (void) fix_xattr(fd, context);
275
276 r = fsync_full(fd);
277 if (r < 0)
278 return log_error_errno(r, "Failed to sync coredump %s: %m", coredump_tmpfile_name(filename));
279
280 r = link_tmpfile(fd, filename, target);
281 if (r < 0)
282 return log_error_errno(r, "Failed to move coredump %s into place: %m", target);
283
284 return 0;
285 }
286
287 static int maybe_remove_external_coredump(const char *filename, uint64_t size) {
288
289 /* Returns 1 if might remove, 0 if will not remove, < 0 on error. */
290
291 if (arg_storage == COREDUMP_STORAGE_EXTERNAL &&
292 size <= arg_external_size_max)
293 return 0;
294
295 if (!filename)
296 return 1;
297
298 if (unlink(filename) < 0 && errno != ENOENT)
299 return log_error_errno(errno, "Failed to unlink %s: %m", filename);
300
301 return 1;
302 }
303
304 static int make_filename(const Context *context, char **ret) {
305 _cleanup_free_ char *c = NULL, *u = NULL, *p = NULL, *t = NULL;
306 sd_id128_t boot = {};
307 int r;
308
309 assert(context);
310
311 c = filename_escape(context->meta[META_COMM]);
312 if (!c)
313 return -ENOMEM;
314
315 u = filename_escape(context->meta[META_ARGV_UID]);
316 if (!u)
317 return -ENOMEM;
318
319 r = sd_id128_get_boot(&boot);
320 if (r < 0)
321 return r;
322
323 p = filename_escape(context->meta[META_ARGV_PID]);
324 if (!p)
325 return -ENOMEM;
326
327 t = filename_escape(context->meta[META_ARGV_TIMESTAMP]);
328 if (!t)
329 return -ENOMEM;
330
331 if (asprintf(ret,
332 "/var/lib/systemd/coredump/core.%s.%s." SD_ID128_FORMAT_STR ".%s.%s",
333 c,
334 u,
335 SD_ID128_FORMAT_VAL(boot),
336 p,
337 t) < 0)
338 return -ENOMEM;
339
340 return 0;
341 }
342
/* Parses an auxiliary vector made of 64-bit (key, value) pairs, as found in /proc/<pid>/auxv of a
 * 64-bit process, and extracts AT_SECURE, AT_UID, AT_EUID, AT_GID and AT_EGID.
 *
 * auxv       — start of the vector; may be NULL only if size_bytes is 0. The buffer is not required
 *              to be aligned for uint64_t.
 * size_bytes — size of the vector in bytes; must be a multiple of one pair.
 * at_secure, uid, euid, gid, egid — outputs; each is written only if the matching entry is seen, so
 *              callers should pre-initialize them. Outputs seen before an error remain set.
 *
 * Returns 0 once a well-formed AT_NULL terminator is reached, -EIO if the size is not a whole number
 * of pairs, and -ENODATA if no valid terminator is found. */
static int parse_auxv64(
                const uint64_t *auxv,
                size_t size_bytes,
                int *at_secure,
                uid_t *uid,
                uid_t *euid,
                gid_t *gid,
                gid_t *egid) {

        assert(auxv || size_bytes == 0);

        if (size_bytes % (2 * sizeof(uint64_t)) != 0)
                return log_warning_errno(SYNTHETIC_ERRNO(EIO), "Incomplete auxv structure (%zu bytes).", size_bytes);

        /* The caller passes a plain byte buffer that was merely cast to uint64_t*, hence nothing
         * guarantees that it is suitably aligned. Dereferencing it directly would be undefined
         * behaviour (and traps on strict-alignment architectures), so copy every word out with
         * memcpy() instead. */
        const uint8_t *raw = (const uint8_t*) auxv;
        size_t n_pairs = size_bytes / (2 * sizeof(uint64_t));

        /* Note that we set output variables even on error. */

        for (size_t i = 0; i < n_pairs; i++) {
                uint64_t key, value;

                memcpy(&key, raw + (2 * i) * sizeof(uint64_t), sizeof(key));
                memcpy(&value, raw + (2 * i + 1) * sizeof(uint64_t), sizeof(value));

                switch (key) {
                case AT_SECURE:
                        *at_secure = value != 0;
                        break;
                case AT_UID:
                        *uid = value;
                        break;
                case AT_EUID:
                        *euid = value;
                        break;
                case AT_GID:
                        *gid = value;
                        break;
                case AT_EGID:
                        *egid = value;
                        break;
                case AT_NULL:
                        /* A terminator carrying a value is malformed */
                        if (value != 0)
                                goto error;
                        return 0;
                default:
                        break;
                }
        }
error:
        return log_warning_errno(SYNTHETIC_ERRNO(ENODATA),
                                 "AT_NULL terminator not found, cannot parse auxv structure.");
}
387
/* Parses an auxiliary vector made of 32-bit (key, value) pairs, as found in /proc/<pid>/auxv of a
 * 32-bit process, and extracts AT_SECURE, AT_UID, AT_EUID, AT_GID and AT_EGID.
 *
 * auxv       — start of the vector; may be NULL only if size_bytes is 0. The buffer is not required
 *              to be aligned for uint32_t.
 * size_bytes — size of the vector in bytes; must be a multiple of one pair.
 * at_secure, uid, euid, gid, egid — outputs; each is written only if the matching entry is seen, so
 *              callers should pre-initialize them. Outputs seen before an error remain set.
 *
 * Returns 0 once a well-formed AT_NULL terminator is reached, -EIO if the size is not a whole number
 * of pairs, and -ENODATA if no valid terminator is found. */
static int parse_auxv32(
                const uint32_t *auxv,
                size_t size_bytes,
                int *at_secure,
                uid_t *uid,
                uid_t *euid,
                gid_t *gid,
                gid_t *egid) {

        assert(auxv || size_bytes == 0);

        if (size_bytes % (2 * sizeof(uint32_t)) != 0)
                return log_warning_errno(SYNTHETIC_ERRNO(EIO), "Incomplete auxv structure (%zu bytes).", size_bytes);

        /* The caller passes a plain byte buffer that was merely cast to uint32_t*, hence nothing
         * guarantees that it is suitably aligned. Dereferencing it directly would be undefined
         * behaviour (and traps on strict-alignment architectures), so copy every word out with
         * memcpy() instead. */
        const uint8_t *raw = (const uint8_t*) auxv;
        size_t n_pairs = size_bytes / (2 * sizeof(uint32_t));

        /* Note that we set output variables even on error. */

        for (size_t i = 0; i < n_pairs; i++) {
                uint32_t key, value;

                memcpy(&key, raw + (2 * i) * sizeof(uint32_t), sizeof(key));
                memcpy(&value, raw + (2 * i + 1) * sizeof(uint32_t), sizeof(value));

                switch (key) {
                case AT_SECURE:
                        *at_secure = value != 0;
                        break;
                case AT_UID:
                        *uid = value;
                        break;
                case AT_EUID:
                        *euid = value;
                        break;
                case AT_GID:
                        *gid = value;
                        break;
                case AT_EGID:
                        *egid = value;
                        break;
                case AT_NULL:
                        /* A terminator carrying a value is malformed */
                        if (value != 0)
                                goto error;
                        return 0;
                default:
                        break;
                }
        }
error:
        return log_warning_errno(SYNTHETIC_ERRNO(ENODATA),
                                 "AT_NULL terminator not found, cannot parse auxv structure.");
}
432
433 static int grant_user_access(int core_fd, const Context *context) {
434 int at_secure = -1;
435 uid_t uid = UID_INVALID, euid = UID_INVALID;
436 uid_t gid = GID_INVALID, egid = GID_INVALID;
437 int r;
438
439 assert(core_fd >= 0);
440 assert(context);
441
442 if (!context->meta[META_PROC_AUXV])
443 return log_warning_errno(SYNTHETIC_ERRNO(ENODATA), "No auxv data, not adjusting permissions.");
444
445 uint8_t elf[EI_NIDENT];
446 errno = 0;
447 if (pread(core_fd, &elf, sizeof(elf), 0) != sizeof(elf))
448 return log_warning_errno(errno_or_else(EIO),
449 "Failed to pread from coredump fd: %s", STRERROR_OR_EOF(errno));
450
451 if (elf[EI_MAG0] != ELFMAG0 ||
452 elf[EI_MAG1] != ELFMAG1 ||
453 elf[EI_MAG2] != ELFMAG2 ||
454 elf[EI_MAG3] != ELFMAG3 ||
455 elf[EI_VERSION] != EV_CURRENT)
456 return log_info_errno(SYNTHETIC_ERRNO(EUCLEAN),
457 "Core file does not have ELF header, not adjusting permissions.");
458 if (!IN_SET(elf[EI_CLASS], ELFCLASS32, ELFCLASS64) ||
459 !IN_SET(elf[EI_DATA], ELFDATA2LSB, ELFDATA2MSB))
460 return log_info_errno(SYNTHETIC_ERRNO(EUCLEAN),
461 "Core file has strange ELF class, not adjusting permissions.");
462
463 if ((elf[EI_DATA] == ELFDATA2LSB) != (__BYTE_ORDER == __LITTLE_ENDIAN))
464 return log_info_errno(SYNTHETIC_ERRNO(EUCLEAN),
465 "Core file has non-native endianness, not adjusting permissions.");
466
467 if (elf[EI_CLASS] == ELFCLASS64)
468 r = parse_auxv64((const uint64_t*) context->meta[META_PROC_AUXV],
469 context->meta_size[META_PROC_AUXV],
470 &at_secure, &uid, &euid, &gid, &egid);
471 else
472 r = parse_auxv32((const uint32_t*) context->meta[META_PROC_AUXV],
473 context->meta_size[META_PROC_AUXV],
474 &at_secure, &uid, &euid, &gid, &egid);
475 if (r < 0)
476 return r;
477
478 /* We allow access if we got all the data and at_secure is not set and
479 * the uid/gid matches euid/egid. */
480 bool ret =
481 at_secure == 0 &&
482 uid != UID_INVALID && euid != UID_INVALID && uid == euid &&
483 gid != GID_INVALID && egid != GID_INVALID && gid == egid;
484 log_debug("Will %s access (uid="UID_FMT " euid="UID_FMT " gid="GID_FMT " egid="GID_FMT " at_secure=%s)",
485 ret ? "permit" : "restrict",
486 uid, euid, gid, egid, yes_no(at_secure));
487 return ret;
488 }
489
490 static int save_external_coredump(
491 const Context *context,
492 int input_fd,
493 char **ret_filename,
494 int *ret_node_fd,
495 int *ret_data_fd,
496 uint64_t *ret_size,
497 uint64_t *ret_compressed_size,
498 bool *ret_truncated) {
499
500 _cleanup_(unlink_and_freep) char *tmp = NULL;
501 _cleanup_free_ char *fn = NULL;
502 _cleanup_close_ int fd = -EBADF;
503 uint64_t rlimit, process_limit, max_size;
504 bool truncated, storage_on_tmpfs;
505 struct stat st;
506 uid_t uid;
507 int r;
508
509 assert(context);
510 assert(ret_filename);
511 assert(ret_node_fd);
512 assert(ret_data_fd);
513 assert(ret_size);
514 assert(ret_compressed_size);
515 assert(ret_truncated);
516
517 r = parse_uid(context->meta[META_ARGV_UID], &uid);
518 if (r < 0)
519 return log_error_errno(r, "Failed to parse UID: %m");
520
521 r = safe_atou64(context->meta[META_ARGV_RLIMIT], &rlimit);
522 if (r < 0)
523 return log_error_errno(r, "Failed to parse resource limit '%s': %m",
524 context->meta[META_ARGV_RLIMIT]);
525 if (rlimit < page_size())
526 /* Is coredumping disabled? Then don't bother saving/processing the
527 * coredump. Anything below PAGE_SIZE cannot give a readable coredump
528 * (the kernel uses ELF_EXEC_PAGESIZE which is not easily accessible, but
529 * is usually the same as PAGE_SIZE. */
530 return log_info_errno(SYNTHETIC_ERRNO(EBADSLT),
531 "Resource limits disable core dumping for process %s (%s).",
532 context->meta[META_ARGV_PID], context->meta[META_COMM]);
533
534 process_limit = MAX(arg_process_size_max, storage_size_max());
535 if (process_limit == 0)
536 return log_debug_errno(SYNTHETIC_ERRNO(EBADSLT),
537 "Limits for coredump processing and storage are both 0, not dumping core.");
538
539 /* Never store more than the process configured, or than we actually shall keep or process */
540 max_size = MIN(rlimit, process_limit);
541
542 r = make_filename(context, &fn);
543 if (r < 0)
544 return log_error_errno(r, "Failed to determine coredump file name: %m");
545
546 (void) mkdir_parents_label(fn, 0755);
547
548 fd = open_tmpfile_linkable(fn, O_RDWR|O_CLOEXEC, &tmp);
549 if (fd < 0)
550 return log_error_errno(fd, "Failed to create temporary file for coredump %s: %m", fn);
551
552 /* If storage is on tmpfs, the kernel oomd might kill us if there's MemoryMax set on
553 * the service or the slice it belongs to. This is common on low-resources systems,
554 * to avoid crashing processes to take away too many system resources.
555 * Check the cgroup settings, and set max_size to a bit less than half of the
556 * available memory left to the process.
557 * Then, attempt to write the core file uncompressed first - if the write gets
558 * interrupted, we know we won't be able to write it all, so instead compress what
559 * was written so far, delete the uncompressed truncated core, and then continue
560 * compressing from STDIN. Given the compressed core cannot be larger than the
561 * uncompressed one, and 1KB for metadata is accounted for in the calculation, we
562 * should be able to at least store the full compressed core file. */
563
564 storage_on_tmpfs = fd_is_temporary_fs(fd) > 0;
565 if (storage_on_tmpfs && arg_compress) {
566 _cleanup_(sd_bus_flush_close_unrefp) sd_bus *bus = NULL;
567 uint64_t cgroup_limit = UINT64_MAX;
568 struct statvfs sv;
569
570 /* If we can't get the cgroup limit, just ignore it, but don't fail,
571 * try anyway with the config settings. */
572 r = sd_bus_default_system(&bus);
573 if (r < 0)
574 log_info_errno(r, "Failed to connect to system bus, skipping MemoryAvailable check: %m");
575 else {
576 _cleanup_(sd_bus_error_free) sd_bus_error error = SD_BUS_ERROR_NULL;
577
578 r = sd_bus_get_property_trivial(
579 bus,
580 "org.freedesktop.systemd1",
581 "/org/freedesktop/systemd1/unit/self",
582 "org.freedesktop.systemd1.Service",
583 "MemoryAvailable",
584 &error,
585 't', &cgroup_limit);
586 if (r < 0)
587 log_warning_errno(r,
588 "Failed to query MemoryAvailable for current unit, "
589 "falling back to static config settings: %s",
590 bus_error_message(&error, r));
591 }
592
593 max_size = MIN(cgroup_limit, max_size);
594 max_size = LESS_BY(max_size, 1024U) / 2; /* Account for 1KB metadata overhead for compressing */
595 max_size = MAX(PROCESS_SIZE_MIN, max_size); /* Impose a lower minimum */
596
597 /* tmpfs might get full quickly, so check the available space too.
598 * But don't worry about errors here, failing to access the storage
599 * location will be better logged when writing to it. */
600 if (statvfs("/var/lib/systemd/coredump/", &sv) >= 0)
601 max_size = MIN((uint64_t)sv.f_frsize * (uint64_t)sv.f_bfree, max_size);
602
603 log_debug("Limiting core file size to %" PRIu64 " bytes due to cgroup memory limits.", max_size);
604 }
605
606 r = copy_bytes(input_fd, fd, max_size, 0);
607 if (r < 0)
608 return log_error_errno(r, "Cannot store coredump of %s (%s): %m",
609 context->meta[META_ARGV_PID], context->meta[META_COMM]);
610 truncated = r == 1;
611
612 bool allow_user = grant_user_access(fd, context) > 0;
613
614 #if HAVE_COMPRESSION
615 if (arg_compress) {
616 _cleanup_(unlink_and_freep) char *tmp_compressed = NULL;
617 _cleanup_free_ char *fn_compressed = NULL;
618 _cleanup_close_ int fd_compressed = -EBADF;
619 uint64_t uncompressed_size = 0;
620
621 if (lseek(fd, 0, SEEK_SET) == (off_t) -1)
622 return log_error_errno(errno, "Failed to seek on coredump %s: %m", fn);
623
624 fn_compressed = strjoin(fn, default_compression_extension());
625 if (!fn_compressed)
626 return log_oom();
627
628 fd_compressed = open_tmpfile_linkable(fn_compressed, O_RDWR|O_CLOEXEC, &tmp_compressed);
629 if (fd_compressed < 0)
630 return log_error_errno(fd_compressed, "Failed to create temporary file for coredump %s: %m", fn_compressed);
631
632 r = compress_stream(fd, fd_compressed, max_size, &uncompressed_size);
633 if (r < 0)
634 return log_error_errno(r, "Failed to compress %s: %m", coredump_tmpfile_name(tmp_compressed));
635
636 if (truncated && storage_on_tmpfs) {
637 uint64_t partial_uncompressed_size = 0;
638
639 /* Uncompressed write was truncated and we are writing to tmpfs: delete
640 * the uncompressed core, and compress the remaining part from STDIN. */
641
642 tmp = unlink_and_free(tmp);
643 fd = safe_close(fd);
644
645 r = compress_stream(input_fd, fd_compressed, max_size, &partial_uncompressed_size);
646 if (r < 0)
647 return log_error_errno(r, "Failed to compress %s: %m", coredump_tmpfile_name(tmp_compressed));
648 uncompressed_size += partial_uncompressed_size;
649 }
650
651 r = fix_permissions(fd_compressed, tmp_compressed, fn_compressed, context, uid, allow_user);
652 if (r < 0)
653 return r;
654
655 if (fstat(fd_compressed, &st) < 0)
656 return log_error_errno(errno,
657 "Failed to fstat core file %s: %m",
658 coredump_tmpfile_name(tmp_compressed));
659
660 *ret_filename = TAKE_PTR(fn_compressed); /* compressed */
661 *ret_node_fd = TAKE_FD(fd_compressed); /* compressed */
662 *ret_compressed_size = (uint64_t) st.st_size; /* compressed */
663 *ret_data_fd = TAKE_FD(fd);
664 *ret_size = uncompressed_size;
665 *ret_truncated = truncated;
666 tmp_compressed = mfree(tmp_compressed);
667
668 return 0;
669 }
670 #endif
671
672 if (truncated)
673 log_struct(LOG_INFO,
674 LOG_MESSAGE("Core file was truncated to %"PRIu64" bytes.", max_size),
675 "SIZE_LIMIT=%"PRIu64, max_size,
676 "MESSAGE_ID=" SD_MESSAGE_TRUNCATED_CORE_STR);
677
678 r = fix_permissions(fd, tmp, fn, context, uid, allow_user);
679 if (r < 0)
680 return log_error_errno(r, "Failed to fix permissions and finalize coredump %s into %s: %m", coredump_tmpfile_name(tmp), fn);
681
682 if (fstat(fd, &st) < 0)
683 return log_error_errno(errno, "Failed to fstat core file %s: %m", coredump_tmpfile_name(tmp));
684
685 if (lseek(fd, 0, SEEK_SET) == (off_t) -1)
686 return log_error_errno(errno, "Failed to seek on coredump %s: %m", fn);
687
688 *ret_filename = TAKE_PTR(fn);
689 *ret_data_fd = TAKE_FD(fd);
690 *ret_size = (uint64_t) st.st_size;
691 *ret_truncated = truncated;
692
693 return 0;
694 }
695
696 static int allocate_journal_field(int fd, size_t size, char **ret, size_t *ret_size) {
697 _cleanup_free_ char *field = NULL;
698 ssize_t n;
699
700 assert(fd >= 0);
701 assert(ret);
702 assert(ret_size);
703
704 if (lseek(fd, 0, SEEK_SET) == (off_t) -1)
705 return log_warning_errno(errno, "Failed to seek: %m");
706
707 field = malloc(9 + size);
708 if (!field) {
709 log_warning("Failed to allocate memory for coredump, coredump will not be stored.");
710 return -ENOMEM;
711 }
712
713 memcpy(field, "COREDUMP=", 9);
714
715 n = read(fd, field + 9, size);
716 if (n < 0)
717 return log_error_errno((int) n, "Failed to read core data: %m");
718 if ((size_t) n < size)
719 return log_error_errno(SYNTHETIC_ERRNO(EIO),
720 "Core data too short.");
721
722 *ret = TAKE_PTR(field);
723 *ret_size = size + 9;
724
725 return 0;
726 }
727
728 /* Joins /proc/[pid]/fd/ and /proc/[pid]/fdinfo/ into the following lines:
729 * 0:/dev/pts/23
730 * pos: 0
731 * flags: 0100002
732 *
733 * 1:/dev/pts/23
734 * pos: 0
735 * flags: 0100002
736 *
737 * 2:/dev/pts/23
738 * pos: 0
739 * flags: 0100002
740 * EOF
741 */
742 static int compose_open_fds(pid_t pid, char **open_fds) {
743 _cleanup_closedir_ DIR *proc_fd_dir = NULL;
744 _cleanup_close_ int proc_fdinfo_fd = -EBADF;
745 _cleanup_free_ char *buffer = NULL;
746 _cleanup_fclose_ FILE *stream = NULL;
747 const char *fddelim = "", *path;
748 size_t size = 0;
749 int r;
750
751 assert(pid >= 0);
752 assert(open_fds != NULL);
753
754 path = procfs_file_alloca(pid, "fd");
755 proc_fd_dir = opendir(path);
756 if (!proc_fd_dir)
757 return -errno;
758
759 proc_fdinfo_fd = openat(dirfd(proc_fd_dir), "../fdinfo", O_DIRECTORY|O_NOFOLLOW|O_CLOEXEC|O_PATH);
760 if (proc_fdinfo_fd < 0)
761 return -errno;
762
763 stream = open_memstream_unlocked(&buffer, &size);
764 if (!stream)
765 return -ENOMEM;
766
767 FOREACH_DIRENT(de, proc_fd_dir, return -errno) {
768 _cleanup_fclose_ FILE *fdinfo = NULL;
769 _cleanup_free_ char *fdname = NULL;
770 _cleanup_close_ int fd = -EBADF;
771
772 r = readlinkat_malloc(dirfd(proc_fd_dir), de->d_name, &fdname);
773 if (r < 0)
774 return r;
775
776 fprintf(stream, "%s%s:%s\n", fddelim, de->d_name, fdname);
777 fddelim = "\n";
778
779 /* Use the directory entry from /proc/[pid]/fd with /proc/[pid]/fdinfo */
780 fd = openat(proc_fdinfo_fd, de->d_name, O_NOFOLLOW|O_CLOEXEC|O_RDONLY);
781 if (fd < 0)
782 continue;
783
784 fdinfo = take_fdopen(&fd, "r");
785 if (!fdinfo)
786 continue;
787
788 for (;;) {
789 _cleanup_free_ char *line = NULL;
790
791 r = read_line(fdinfo, LONG_LINE_MAX, &line);
792 if (r < 0)
793 return r;
794 if (r == 0)
795 break;
796
797 fputs(line, stream);
798 fputc('\n', stream);
799 }
800 }
801
802 errno = 0;
803 stream = safe_fclose(stream);
804
805 if (errno > 0)
806 return -errno;
807
808 *open_fds = TAKE_PTR(buffer);
809
810 return 0;
811 }
812
813 static int get_process_ns(pid_t pid, const char *namespace, ino_t *ns) {
814 const char *p;
815 struct stat stbuf;
816 _cleanup_close_ int proc_ns_dir_fd = -EBADF;
817
818 p = procfs_file_alloca(pid, "ns");
819
820 proc_ns_dir_fd = open(p, O_DIRECTORY | O_CLOEXEC | O_RDONLY);
821 if (proc_ns_dir_fd < 0)
822 return -errno;
823
824 if (fstatat(proc_ns_dir_fd, namespace, &stbuf, /* flags */0) < 0)
825 return -errno;
826
827 *ns = stbuf.st_ino;
828 return 0;
829 }
830
/* Walks up the parent chain of 'pid' until an ancestor living in a different mount namespace is
 * found, and stores that ancestor's PID in *ret.
 *
 * Returns 0 on success, -ENOENT if the top of the process tree is reached without finding such an
 * ancestor, and other negative errno-style values on failure. */
static int get_mount_namespace_leader(pid_t pid, pid_t *ret) {
        ino_t own_mntns;
        pid_t parent;
        int r;

        r = get_process_ns(pid, "mnt", &own_mntns);
        if (r < 0)
                return r;

        for (pid_t current = pid;; current = parent) {
                ino_t parent_mntns;

                r = get_process_ppid(current, &parent);
                if (r == -EADDRNOTAVAIL)
                        /* We hit the top: typically PID 1, but possibly also a process whose parent
                         * is not in our pidns */
                        return -ENOENT;
                if (r < 0)
                        return r;

                r = get_process_ns(parent, "mnt", &parent_mntns);
                if (r < 0)
                        return r;

                if (parent_mntns != own_mntns) {
                        *ret = parent;
                        return 0;
                }
        }
}
862
863 /* Returns 1 if the parent was found.
864 * Returns 0 if there is not a process we can call the pid's
865 * container parent (the pid's process isn't 'containerized').
866 * Returns a negative number on errors.
867 */
868 static int get_process_container_parent_cmdline(pid_t pid, char** cmdline) {
869 pid_t container_pid;
870 const char *proc_root_path;
871 struct stat root_stat, proc_root_stat;
872 int r;
873
874 /* To compare inodes of / and /proc/[pid]/root */
875 if (stat("/", &root_stat) < 0)
876 return -errno;
877
878 proc_root_path = procfs_file_alloca(pid, "root");
879 if (stat(proc_root_path, &proc_root_stat) < 0)
880 return -errno;
881
882 /* The process uses system root. */
883 if (stat_inode_same(&proc_root_stat, &root_stat)) {
884 *cmdline = NULL;
885 return 0;
886 }
887
888 r = get_mount_namespace_leader(pid, &container_pid);
889 if (r < 0)
890 return r;
891
892 r = get_process_cmdline(container_pid, SIZE_MAX, PROCESS_CMDLINE_QUOTE_POSIX, cmdline);
893 if (r < 0)
894 return r;
895
896 return 1;
897 }
898
899 static int change_uid_gid(const Context *context) {
900 uid_t uid;
901 gid_t gid;
902 int r;
903
904 r = parse_uid(context->meta[META_ARGV_UID], &uid);
905 if (r < 0)
906 return r;
907
908 if (uid_is_system(uid)) {
909 const char *user = "systemd-coredump";
910
911 r = get_user_creds(&user, &uid, &gid, NULL, NULL, 0);
912 if (r < 0) {
913 log_warning_errno(r, "Cannot resolve %s user. Proceeding to dump core as root: %m", user);
914 uid = gid = 0;
915 }
916 } else {
917 r = parse_gid(context->meta[META_ARGV_GID], &gid);
918 if (r < 0)
919 return r;
920 }
921
922 return drop_privileges(uid, gid, 0);
923 }
924
925 static int submit_coredump(
926 const Context *context,
927 struct iovec_wrapper *iovw,
928 int input_fd) {
929
930 _cleanup_(json_variant_unrefp) JsonVariant *json_metadata = NULL;
931 _cleanup_close_ int coredump_fd = -EBADF, coredump_node_fd = -EBADF;
932 _cleanup_free_ char *filename = NULL, *coredump_data = NULL;
933 _cleanup_free_ char *stacktrace = NULL;
934 char *core_message;
935 const char *module_name;
936 uint64_t coredump_size = UINT64_MAX, coredump_compressed_size = UINT64_MAX;
937 bool truncated = false;
938 JsonVariant *module_json;
939 int r;
940
941 assert(context);
942 assert(iovw);
943 assert(input_fd >= 0);
944
945 /* Vacuum before we write anything again */
946 (void) coredump_vacuum(-1, arg_keep_free, arg_max_use);
947
948 /* Always stream the coredump to disk, if that's possible */
949 r = save_external_coredump(context, input_fd,
950 &filename, &coredump_node_fd, &coredump_fd,
951 &coredump_size, &coredump_compressed_size, &truncated);
952 if (r < 0)
953 /* Skip whole core dumping part */
954 goto log;
955
956 /* If we don't want to keep the coredump on disk, remove it now, as later on we
957 * will lack the privileges for it. However, we keep the fd to it, so that we can
958 * still process it and log it. */
959 r = maybe_remove_external_coredump(filename, coredump_node_fd >= 0 ? coredump_compressed_size : coredump_size);
960 if (r < 0)
961 return r;
962 if (r == 0)
963 (void) iovw_put_string_field(iovw, "COREDUMP_FILENAME=", filename);
964 else if (arg_storage == COREDUMP_STORAGE_EXTERNAL)
965 log_info("The core will not be stored: size %"PRIu64" is greater than %"PRIu64" (the configured maximum)",
966 coredump_node_fd >= 0 ? coredump_compressed_size : coredump_size, arg_external_size_max);
967
968 /* Vacuum again, but exclude the coredump we just created */
969 (void) coredump_vacuum(coredump_node_fd >= 0 ? coredump_node_fd : coredump_fd, arg_keep_free, arg_max_use);
970
971 /* Now, let's drop privileges to become the user who owns the segfaulted process
972 * and allocate the coredump memory under the user's uid. This also ensures that
973 * the credentials journald will see are the ones of the coredumping user, thus
974 * making sure the user gets access to the core dump. Let's also get rid of all
975 * capabilities, if we run as root, we won't need them anymore. */
976 r = change_uid_gid(context);
977 if (r < 0)
978 return log_error_errno(r, "Failed to drop privileges: %m");
979
980 /* Try to get a stack trace if we can */
981 if (coredump_size > arg_process_size_max)
982 log_debug("Not generating stack trace: core size %"PRIu64" is greater "
983 "than %"PRIu64" (the configured maximum)",
984 coredump_size, arg_process_size_max);
985 else if (coredump_fd >= 0) {
986 bool skip = startswith(context->meta[META_COMM], "systemd-coredum"); /* COMM is 16 bytes usually */
987
988 (void) parse_elf_object(coredump_fd,
989 context->meta[META_EXE],
990 /* fork_disable_dump= */ skip, /* avoid loops */
991 &stacktrace,
992 &json_metadata);
993 }
994
995 log:
996 core_message = strjoina("Process ", context->meta[META_ARGV_PID],
997 " (", context->meta[META_COMM], ") of user ",
998 context->meta[META_ARGV_UID], " dumped core.",
999 context->is_journald && filename ? "\nCoredump diverted to " : NULL,
1000 context->is_journald && filename ? filename : NULL);
1001
1002 core_message = strjoina(core_message, stacktrace ? "\n\n" : NULL, stacktrace);
1003
1004 if (context->is_journald)
1005 /* We might not be able to log to the journal, so let's always print the message to another
1006 * log target. The target was set previously to something safe. */
1007 log_dispatch(LOG_ERR, 0, core_message);
1008
1009 (void) iovw_put_string_field(iovw, "MESSAGE=", core_message);
1010
1011 if (truncated)
1012 (void) iovw_put_string_field(iovw, "COREDUMP_TRUNCATED=", "1");
1013
1014 /* If we managed to parse any ELF metadata (build-id, ELF package meta),
1015 * attach it as journal metadata. */
1016 if (json_metadata) {
1017 _cleanup_free_ char *formatted_json = NULL;
1018
1019 r = json_variant_format(json_metadata, 0, &formatted_json);
1020 if (r < 0)
1021 return log_error_errno(r, "Failed to format JSON package metadata: %m");
1022
1023 (void) iovw_put_string_field(iovw, "COREDUMP_PACKAGE_JSON=", formatted_json);
1024 }
1025
1026 /* In the unlikely scenario that context->meta[META_EXE] is not available,
1027 * let's avoid guessing the module name and skip the loop. */
1028 if (context->meta[META_EXE])
1029 JSON_VARIANT_OBJECT_FOREACH(module_name, module_json, json_metadata) {
1030 JsonVariant *t;
1031
1032 /* We only add structured fields for the 'main' ELF module, and only if we can identify it. */
1033 if (!path_equal_filename(module_name, context->meta[META_EXE]))
1034 continue;
1035
1036 t = json_variant_by_key(module_json, "name");
1037 if (t)
1038 (void) iovw_put_string_field(iovw, "COREDUMP_PACKAGE_NAME=", json_variant_string(t));
1039
1040 t = json_variant_by_key(module_json, "version");
1041 if (t)
1042 (void) iovw_put_string_field(iovw, "COREDUMP_PACKAGE_VERSION=", json_variant_string(t));
1043 }
1044
1045 /* Optionally store the entire coredump in the journal */
1046 if (arg_storage == COREDUMP_STORAGE_JOURNAL && coredump_fd >= 0) {
1047 if (coredump_size <= arg_journal_size_max) {
1048 size_t sz = 0;
1049
1050 /* Store the coredump itself in the journal */
1051
1052 r = allocate_journal_field(coredump_fd, (size_t) coredump_size, &coredump_data, &sz);
1053 if (r >= 0) {
1054 if (iovw_put(iovw, coredump_data, sz) >= 0)
1055 TAKE_PTR(coredump_data);
1056 } else
1057 log_warning_errno(r, "Failed to attach the core to the journal entry: %m");
1058 } else
1059 log_info("The core will not be stored: size %"PRIu64" is greater than %"PRIu64" (the configured maximum)",
1060 coredump_size, arg_journal_size_max);
1061 }
1062
1063 /* If journald is coredumping, we have to be careful that we don't deadlock when trying to write the
1064 * coredump to the journal, so we put the journal socket in nonblocking mode before trying to write
1065 * the coredump to the socket. */
1066
1067 if (context->is_journald) {
1068 r = journal_fd_nonblock(true);
1069 if (r < 0)
1070 return log_error_errno(r, "Failed to make journal socket non-blocking: %m");
1071 }
1072
1073 r = sd_journal_sendv(iovw->iovec, iovw->count);
1074
1075 if (context->is_journald) {
1076 int k;
1077
1078 k = journal_fd_nonblock(false);
1079 if (k < 0)
1080 return log_error_errno(k, "Failed to make journal socket blocking: %m");
1081 }
1082
1083 if (r == -EAGAIN && context->is_journald)
1084 log_warning_errno(r, "Failed to log journal coredump, ignoring: %m");
1085 else if (r < 0)
1086 return log_error_errno(r, "Failed to log coredump: %m");
1087
1088 return 0;
1089 }
1090
1091 static int save_context(Context *context, const struct iovec_wrapper *iovw) {
1092 const char *unit;
1093 int r;
1094
1095 assert(context);
1096 assert(iovw);
1097 assert(iovw->count >= _META_ARGV_MAX);
1098
1099 /* The context does not allocate any memory on its own */
1100
1101 for (size_t n = 0; n < iovw->count; n++) {
1102 struct iovec *iovec = iovw->iovec + n;
1103
1104 for (size_t i = 0; i < ELEMENTSOF(meta_field_names); i++) {
1105 /* Note that these strings are NUL terminated, because we made sure that a
1106 * trailing NUL byte is in the buffer, though not included in the iov_len
1107 * count (see process_socket() and gather_pid_metadata_*()) */
1108 assert(((char*) iovec->iov_base)[iovec->iov_len] == 0);
1109
1110 const char *p = startswith(iovec->iov_base, meta_field_names[i]);
1111 if (p) {
1112 context->meta[i] = p;
1113 context->meta_size[i] = iovec->iov_len - strlen(meta_field_names[i]);
1114 break;
1115 }
1116 }
1117 }
1118
1119 if (!context->meta[META_ARGV_PID])
1120 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1121 "Failed to find the PID of crashing process");
1122
1123 r = parse_pid(context->meta[META_ARGV_PID], &context->pid);
1124 if (r < 0)
1125 return log_error_errno(r, "Failed to parse PID \"%s\": %m", context->meta[META_ARGV_PID]);
1126
1127 unit = context->meta[META_UNIT];
1128 context->is_pid1 = streq(context->meta[META_ARGV_PID], "1") || streq_ptr(unit, SPECIAL_INIT_SCOPE);
1129 context->is_journald = streq_ptr(unit, SPECIAL_JOURNALD_SERVICE);
1130
1131 return 0;
1132 }
1133
1134 static int process_socket(int fd) {
1135 _cleanup_close_ int input_fd = -EBADF;
1136 Context context = {};
1137 struct iovec_wrapper iovw = {};
1138 struct iovec iovec;
1139 int r;
1140
1141 assert(fd >= 0);
1142
1143 log_setup();
1144
1145 log_debug("Processing coredump received on stdin...");
1146
1147 for (;;) {
1148 CMSG_BUFFER_TYPE(CMSG_SPACE(sizeof(int))) control;
1149 struct msghdr mh = {
1150 .msg_control = &control,
1151 .msg_controllen = sizeof(control),
1152 .msg_iovlen = 1,
1153 };
1154 ssize_t n;
1155 ssize_t l;
1156
1157 l = next_datagram_size_fd(fd);
1158 if (l < 0) {
1159 r = log_error_errno(l, "Failed to determine datagram size to read: %m");
1160 goto finish;
1161 }
1162
1163 iovec.iov_len = l;
1164 iovec.iov_base = malloc(l + 1);
1165 if (!iovec.iov_base) {
1166 r = log_oom();
1167 goto finish;
1168 }
1169
1170 mh.msg_iov = &iovec;
1171
1172 n = recvmsg_safe(fd, &mh, MSG_CMSG_CLOEXEC);
1173 if (n < 0) {
1174 free(iovec.iov_base);
1175 r = log_error_errno(n, "Failed to receive datagram: %m");
1176 goto finish;
1177 }
1178
1179 /* The final zero-length datagram carries the file descriptor and tells us
1180 * that we're done. */
1181 if (n == 0) {
1182 struct cmsghdr *found;
1183
1184 free(iovec.iov_base);
1185
1186 found = cmsg_find(&mh, SOL_SOCKET, SCM_RIGHTS, CMSG_LEN(sizeof(int)));
1187 if (!found) {
1188 cmsg_close_all(&mh);
1189 r = log_error_errno(SYNTHETIC_ERRNO(EBADMSG),
1190 "Coredump file descriptor missing.");
1191 goto finish;
1192 }
1193
1194 assert(input_fd < 0);
1195 input_fd = *(int*) CMSG_DATA(found);
1196 break;
1197 } else
1198 cmsg_close_all(&mh);
1199
1200 /* Add trailing NUL byte, in case these are strings */
1201 ((char*) iovec.iov_base)[n] = 0;
1202 iovec.iov_len = (size_t) n;
1203
1204 r = iovw_put(&iovw, iovec.iov_base, iovec.iov_len);
1205 if (r < 0)
1206 goto finish;
1207 }
1208
1209 /* Make sure we got all data we really need */
1210 assert(input_fd >= 0);
1211
1212 r = save_context(&context, &iovw);
1213 if (r < 0)
1214 goto finish;
1215
1216 /* Make sure we received at least all fields we need. */
1217 for (int i = 0; i < _META_MANDATORY_MAX; i++)
1218 if (!context.meta[i]) {
1219 r = log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1220 "A mandatory argument (%i) has not been sent, aborting.",
1221 i);
1222 goto finish;
1223 }
1224
1225 r = submit_coredump(&context, &iovw, input_fd);
1226
1227 finish:
1228 iovw_free_contents(&iovw, true);
1229 return r;
1230 }
1231
1232 static int send_iovec(const struct iovec_wrapper *iovw, int input_fd) {
1233 _cleanup_close_ int fd = -EBADF;
1234 int r;
1235
1236 assert(iovw);
1237 assert(input_fd >= 0);
1238
1239 fd = socket(AF_UNIX, SOCK_SEQPACKET|SOCK_CLOEXEC, 0);
1240 if (fd < 0)
1241 return log_error_errno(errno, "Failed to create coredump socket: %m");
1242
1243 r = connect_unix_path(fd, AT_FDCWD, "/run/systemd/coredump");
1244 if (r < 0)
1245 return log_error_errno(r, "Failed to connect to coredump service: %m");
1246
1247 for (size_t i = 0; i < iovw->count; i++) {
1248 struct msghdr mh = {
1249 .msg_iov = iovw->iovec + i,
1250 .msg_iovlen = 1,
1251 };
1252 struct iovec copy[2];
1253
1254 for (;;) {
1255 if (sendmsg(fd, &mh, MSG_NOSIGNAL) >= 0)
1256 break;
1257
1258 if (errno == EMSGSIZE && mh.msg_iov[0].iov_len > 0) {
1259 /* This field didn't fit? That's a pity. Given that this is
1260 * just metadata, let's truncate the field at half, and try
1261 * again. We append three dots, in order to show that this is
1262 * truncated. */
1263
1264 if (mh.msg_iov != copy) {
1265 /* We don't want to modify the caller's iovec, hence
1266 * let's create our own array, consisting of two new
1267 * iovecs, where the first is a (truncated) copy of
1268 * what we want to send, and the second one contains
1269 * the trailing dots. */
1270 copy[0] = iovw->iovec[i];
1271 copy[1] = IOVEC_MAKE(((char[]){'.', '.', '.'}), 3);
1272
1273 mh.msg_iov = copy;
1274 mh.msg_iovlen = 2;
1275 }
1276
1277 copy[0].iov_len /= 2; /* halve it, and try again */
1278 continue;
1279 }
1280
1281 return log_error_errno(errno, "Failed to send coredump datagram: %m");
1282 }
1283 }
1284
1285 r = send_one_fd(fd, input_fd, 0);
1286 if (r < 0)
1287 return log_error_errno(r, "Failed to send coredump fd: %m");
1288
1289 return 0;
1290 }
1291
1292 static int gather_pid_metadata_from_argv(
1293 struct iovec_wrapper *iovw,
1294 Context *context,
1295 int argc, char **argv) {
1296
1297 _cleanup_free_ char *free_timestamp = NULL;
1298 int r, signo;
1299 char *t;
1300
1301 /* We gather all metadata that were passed via argv[] into an array of iovecs that
1302 * we'll forward to the socket unit */
1303
1304 if (argc < _META_ARGV_MAX)
1305 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1306 "Not enough arguments passed by the kernel (%i, expected %i).",
1307 argc, _META_ARGV_MAX);
1308
1309 for (int i = 0; i < _META_ARGV_MAX; i++) {
1310
1311 t = argv[i];
1312
1313 switch (i) {
1314
1315 case META_ARGV_TIMESTAMP:
1316 /* The journal fields contain the timestamp padded with six
1317 * zeroes, so that the kernel-supplied 1s granularity timestamps
1318 * becomes 1µs granularity, i.e. the granularity systemd usually
1319 * operates in. */
1320 t = free_timestamp = strjoin(argv[i], "000000");
1321 if (!t)
1322 return log_oom();
1323 break;
1324
1325 case META_ARGV_SIGNAL:
1326 /* For signal, record its pretty name too */
1327 if (safe_atoi(argv[i], &signo) >= 0 && SIGNAL_VALID(signo))
1328 (void) iovw_put_string_field(iovw, "COREDUMP_SIGNAL_NAME=SIG",
1329 signal_to_string(signo));
1330 break;
1331
1332 default:
1333 break;
1334 }
1335
1336 r = iovw_put_string_field(iovw, meta_field_names[i], t);
1337 if (r < 0)
1338 return r;
1339 }
1340
1341 /* Cache some of the process metadata we collected so far and that we'll need to
1342 * access soon */
1343 return save_context(context, iovw);
1344 }
1345
1346 static int gather_pid_metadata(struct iovec_wrapper *iovw, Context *context) {
1347 uid_t owner_uid;
1348 pid_t pid;
1349 char *t;
1350 size_t size;
1351 const char *p;
1352 int r;
1353
1354 /* Note that if we fail on oom later on, we do not roll-back changes to the iovec
1355 * structure. (It remains valid, with the first iovec fields initialized.) */
1356
1357 pid = context->pid;
1358
1359 /* The following is mandatory */
1360 r = get_process_comm(pid, &t);
1361 if (r < 0)
1362 return log_error_errno(r, "Failed to get COMM: %m");
1363
1364 r = iovw_put_string_field_free(iovw, "COREDUMP_COMM=", t);
1365 if (r < 0)
1366 return r;
1367
1368 /* The following are optional, but we use them if present. */
1369 r = get_process_exe(pid, &t);
1370 if (r >= 0)
1371 r = iovw_put_string_field_free(iovw, "COREDUMP_EXE=", t);
1372 if (r < 0)
1373 log_warning_errno(r, "Failed to get EXE, ignoring: %m");
1374
1375 if (cg_pid_get_unit(pid, &t) >= 0)
1376 (void) iovw_put_string_field_free(iovw, "COREDUMP_UNIT=", t);
1377
1378 if (cg_pid_get_user_unit(pid, &t) >= 0)
1379 (void) iovw_put_string_field_free(iovw, "COREDUMP_USER_UNIT=", t);
1380
1381 if (sd_pid_get_session(pid, &t) >= 0)
1382 (void) iovw_put_string_field_free(iovw, "COREDUMP_SESSION=", t);
1383
1384 if (sd_pid_get_owner_uid(pid, &owner_uid) >= 0) {
1385 r = asprintf(&t, UID_FMT, owner_uid);
1386 if (r > 0)
1387 (void) iovw_put_string_field_free(iovw, "COREDUMP_OWNER_UID=", t);
1388 }
1389
1390 if (sd_pid_get_slice(pid, &t) >= 0)
1391 (void) iovw_put_string_field_free(iovw, "COREDUMP_SLICE=", t);
1392
1393 if (get_process_cmdline(pid, SIZE_MAX, PROCESS_CMDLINE_QUOTE_POSIX, &t) >= 0)
1394 (void) iovw_put_string_field_free(iovw, "COREDUMP_CMDLINE=", t);
1395
1396 if (cg_pid_get_path_shifted(pid, NULL, &t) >= 0)
1397 (void) iovw_put_string_field_free(iovw, "COREDUMP_CGROUP=", t);
1398
1399 if (compose_open_fds(pid, &t) >= 0)
1400 (void) iovw_put_string_field_free(iovw, "COREDUMP_OPEN_FDS=", t);
1401
1402 p = procfs_file_alloca(pid, "status");
1403 if (read_full_virtual_file(p, &t, NULL) >= 0)
1404 (void) iovw_put_string_field_free(iovw, "COREDUMP_PROC_STATUS=", t);
1405
1406 p = procfs_file_alloca(pid, "maps");
1407 if (read_full_virtual_file(p, &t, NULL) >= 0)
1408 (void) iovw_put_string_field_free(iovw, "COREDUMP_PROC_MAPS=", t);
1409
1410 p = procfs_file_alloca(pid, "limits");
1411 if (read_full_virtual_file(p, &t, NULL) >= 0)
1412 (void) iovw_put_string_field_free(iovw, "COREDUMP_PROC_LIMITS=", t);
1413
1414 p = procfs_file_alloca(pid, "cgroup");
1415 if (read_full_virtual_file(p, &t, NULL) >= 0)
1416 (void) iovw_put_string_field_free(iovw, "COREDUMP_PROC_CGROUP=", t);
1417
1418 p = procfs_file_alloca(pid, "mountinfo");
1419 if (read_full_virtual_file(p, &t, NULL) >= 0)
1420 (void) iovw_put_string_field_free(iovw, "COREDUMP_PROC_MOUNTINFO=", t);
1421
1422 /* We attach /proc/auxv here. ELF coredumps also contain a note for this (NT_AUXV), see elf(5). */
1423 p = procfs_file_alloca(pid, "auxv");
1424 if (read_full_virtual_file(p, &t, &size) >= 0) {
1425 char *buf = malloc(strlen("COREDUMP_PROC_AUXV=") + size + 1);
1426 if (buf) {
1427 /* Add a dummy terminator to make save_context() happy. */
1428 *((uint8_t*) mempcpy(stpcpy(buf, "COREDUMP_PROC_AUXV="), t, size)) = '\0';
1429 (void) iovw_consume(iovw, buf, size + strlen("COREDUMP_PROC_AUXV="));
1430 }
1431
1432 free(t);
1433 }
1434
1435 if (get_process_cwd(pid, &t) >= 0)
1436 (void) iovw_put_string_field_free(iovw, "COREDUMP_CWD=", t);
1437
1438 if (get_process_root(pid, &t) >= 0) {
1439 bool proc_self_root_is_slash;
1440
1441 proc_self_root_is_slash = strcmp(t, "/") == 0;
1442
1443 (void) iovw_put_string_field_free(iovw, "COREDUMP_ROOT=", t);
1444
1445 /* If the process' root is "/", then there is a chance it has
1446 * mounted own root and hence being containerized. */
1447 if (proc_self_root_is_slash && get_process_container_parent_cmdline(pid, &t) > 0)
1448 (void) iovw_put_string_field_free(iovw, "COREDUMP_CONTAINER_CMDLINE=", t);
1449 }
1450
1451 if (get_process_environ(pid, &t) >= 0)
1452 (void) iovw_put_string_field_free(iovw, "COREDUMP_ENVIRON=", t);
1453
1454 /* we successfully acquired all metadata */
1455 return save_context(context, iovw);
1456 }
1457
1458 static int process_kernel(int argc, char* argv[]) {
1459 Context context = {};
1460 struct iovec_wrapper *iovw;
1461 int r;
1462
1463 /* When we're invoked by the kernel, stdout/stderr are closed which is dangerous because the fds
1464 * could get reallocated. To avoid hard to debug issues, let's instead bind stdout/stderr to
1465 * /dev/null. */
1466 r = rearrange_stdio(STDIN_FILENO, -1, -1);
1467 if (r < 0)
1468 return log_error_errno(r, "Failed to connect stdout/stderr to /dev/null: %m");
1469
1470 log_debug("Processing coredump received from the kernel...");
1471
1472 iovw = iovw_new();
1473 if (!iovw)
1474 return log_oom();
1475
1476 (void) iovw_put_string_field(iovw, "MESSAGE_ID=", SD_MESSAGE_COREDUMP_STR);
1477 (void) iovw_put_string_field(iovw, "PRIORITY=", STRINGIFY(LOG_CRIT));
1478
1479 /* Collect all process metadata passed by the kernel through argv[] */
1480 r = gather_pid_metadata_from_argv(iovw, &context, argc - 1, argv + 1);
1481 if (r < 0)
1482 goto finish;
1483
1484 /* Collect the rest of the process metadata retrieved from the runtime */
1485 r = gather_pid_metadata(iovw, &context);
1486 if (r < 0)
1487 goto finish;
1488
1489 if (!context.is_journald) {
1490 /* OK, now we know it's not the journal, hence we can make use of it now. */
1491 log_set_target(LOG_TARGET_JOURNAL_OR_KMSG);
1492 log_open();
1493 }
1494
1495 /* If this is PID 1 disable coredump collection, we'll unlikely be able to process
1496 * it later on.
1497 *
1498 * FIXME: maybe we should disable coredumps generation from the beginning and
1499 * re-enable it only when we know it's either safe (ie we're not running OOM) or
1500 * it's not pid1 ? */
1501 if (context.is_pid1) {
1502 log_notice("Due to PID 1 having crashed coredump collection will now be turned off.");
1503 disable_coredumps();
1504 }
1505
1506 if (context.is_journald || context.is_pid1)
1507 r = submit_coredump(&context, iovw, STDIN_FILENO);
1508 else
1509 r = send_iovec(iovw, STDIN_FILENO);
1510
1511 finish:
1512 iovw = iovw_free_free(iovw);
1513 return r;
1514 }
1515
1516 static int process_backtrace(int argc, char *argv[]) {
1517 Context context = {};
1518 struct iovec_wrapper *iovw;
1519 char *message;
1520 int r;
1521 _cleanup_(journal_importer_cleanup) JournalImporter importer = JOURNAL_IMPORTER_INIT(STDIN_FILENO);
1522
1523 log_debug("Processing backtrace on stdin...");
1524
1525 iovw = iovw_new();
1526 if (!iovw)
1527 return log_oom();
1528
1529 (void) iovw_put_string_field(iovw, "MESSAGE_ID=", SD_MESSAGE_BACKTRACE_STR);
1530 (void) iovw_put_string_field(iovw, "PRIORITY=", STRINGIFY(LOG_CRIT));
1531
1532 /* Collect all process metadata from argv[] by making sure to skip the
1533 * '--backtrace' option */
1534 r = gather_pid_metadata_from_argv(iovw, &context, argc - 2, argv + 2);
1535 if (r < 0)
1536 goto finish;
1537
1538 /* Collect the rest of the process metadata retrieved from the runtime */
1539 r = gather_pid_metadata(iovw, &context);
1540 if (r < 0)
1541 goto finish;
1542
1543 for (;;) {
1544 r = journal_importer_process_data(&importer);
1545 if (r < 0) {
1546 log_error_errno(r, "Failed to parse journal entry on stdin: %m");
1547 goto finish;
1548 }
1549 if (r == 1 || /* complete entry */
1550 journal_importer_eof(&importer)) /* end of data */
1551 break;
1552 }
1553
1554 if (journal_importer_eof(&importer)) {
1555 log_warning("Did not receive a full journal entry on stdin, ignoring message sent by reporter");
1556
1557 message = strjoina("Process ", context.meta[META_ARGV_PID],
1558 " (", context.meta[META_COMM], ")"
1559 " of user ", context.meta[META_ARGV_UID],
1560 " failed with ", context.meta[META_ARGV_SIGNAL]);
1561
1562 r = iovw_put_string_field(iovw, "MESSAGE=", message);
1563 if (r < 0)
1564 return r;
1565 } else {
1566 /* The imported iovecs are not supposed to be freed by us so let's store
1567 * them at the end of the array so we can skip them while freeing the
1568 * rest. */
1569 for (size_t i = 0; i < importer.iovw.count; i++) {
1570 struct iovec *iovec = importer.iovw.iovec + i;
1571
1572 iovw_put(iovw, iovec->iov_base, iovec->iov_len);
1573 }
1574 }
1575
1576 r = sd_journal_sendv(iovw->iovec, iovw->count);
1577 if (r < 0)
1578 log_error_errno(r, "Failed to log backtrace: %m");
1579
1580 finish:
1581 iovw->count -= importer.iovw.count;
1582 iovw = iovw_free_free(iovw);
1583 return r;
1584 }
1585
1586 static int run(int argc, char *argv[]) {
1587 int r;
1588
1589 /* First, log to a safe place, since we don't know what crashed and it might
1590 * be journald which we'd rather not log to then. */
1591
1592 log_set_target(LOG_TARGET_KMSG);
1593 log_open();
1594
1595 /* Make sure we never enter a loop */
1596 (void) prctl(PR_SET_DUMPABLE, 0);
1597
1598 /* Ignore all parse errors */
1599 (void) parse_config();
1600
1601 log_debug("Selected storage '%s'.", coredump_storage_to_string(arg_storage));
1602 log_debug("Selected compression %s.", yes_no(arg_compress));
1603
1604 r = sd_listen_fds(false);
1605 if (r < 0)
1606 return log_error_errno(r, "Failed to determine the number of file descriptors: %m");
1607
1608 /* If we got an fd passed, we are running in coredumpd mode. Otherwise we
1609 * are invoked from the kernel as coredump handler. */
1610 if (r == 0) {
1611 if (streq_ptr(argv[1], "--backtrace"))
1612 return process_backtrace(argc, argv);
1613 else
1614 return process_kernel(argc, argv);
1615 } else if (r == 1)
1616 return process_socket(SD_LISTEN_FDS_START);
1617
1618 return log_error_errno(SYNTHETIC_ERRNO(EINVAL),
1619 "Received unexpected number of file descriptors.");
1620 }
1621
1622 DEFINE_MAIN_FUNCTION(run);