src/core/exec-credential.c

   1 /* SPDX-License-Identifier: LGPL-2.1-or-later */
   2
   3 #include <sys/mount.h>
   4 #include <unistd.h>
   5
   6 #include "acl-util.h"
   7 #include "cgroup.h"
   8 #include "creds-util.h"
   9 #include "errno-util.h"
  10 #include "exec-credential.h"
  11 #include "execute.h"
  12 #include "fileio.h"
  13 #include "fs-util.h"
  14 #include "glob-util.h"
  15 #include "io-util.h"
  16 #include "iovec-util.h"
  17 #include "label-util.h"
  18 #include "log.h"
  19 #include "mkdir-label.h"
  20 #include "mount-util.h"
  21 #include "mountpoint-util.h"
  22 #include "ordered-set.h"
  23 #include "path-lookup.h"
  24 #include "path-util.h"
  25 #include "process-util.h"
  26 #include "random-util.h"
  27 #include "recurse-dir.h"
  28 #include "rm-rf.h"
  29 #include "siphash24.h"
  30 #include "stat-util.h"
  31 #include "strv.h"
  32 #include "tmpfile-util.h"
  33 #include "user-util.h"
  34
  35 ExecSetCredential* exec_set_credential_free(ExecSetCredential *sc) {
  36         if (!sc)
  37                 return NULL;
  38
  39         free(sc->id);
  40         free(sc->data);
  41         return mfree(sc);
  42 }
  43
  44 ExecLoadCredential* exec_load_credential_free(ExecLoadCredential *lc) {
  45         if (!lc)
  46                 return NULL;
  47
  48         free(lc->id);
  49         free(lc->path);
  50         return mfree(lc);
  51 }
  52
  53 ExecImportCredential* exec_import_credential_free(ExecImportCredential *ic) {
  54         if (!ic)
  55                 return NULL;
  56
  57         free(ic->glob);
  58         free(ic->rename);
  59         return mfree(ic);
  60 }
  61
  62 static void exec_import_credential_hash_func(const ExecImportCredential *ic, struct siphash *state) {
  63         assert(ic);
  64         assert(state);
  65
  66         siphash24_compress_string(ic->glob, state);
  67         if (ic->rename)
  68                 siphash24_compress_string(ic->rename, state);
  69 }
  70
  71 static int exec_import_credential_compare_func(const ExecImportCredential *a, const ExecImportCredential *b) {
  72         int r;
  73
  74         assert(a);
  75         assert(b);
  76
  77         r = strcmp(a->glob, b->glob);
  78         if (r != 0)
  79                 return r;
  80
  81         return strcmp_ptr(a->rename, b->rename);
  82 }
  83
/* Hash ops used for ExecContext.set_credentials: string keys, ExecSetCredential values. Going by the
 * macro name, values are released with exec_set_credential_free() when the container drops them. */
DEFINE_PRIVATE_HASH_OPS_WITH_VALUE_DESTRUCTOR(
        exec_set_credential_hash_ops,
        char, string_hash_func, string_compare_func,
        ExecSetCredential, exec_set_credential_free);

/* Hash ops used for ExecContext.load_credentials: string keys, ExecLoadCredential values. Going by the
 * macro name, values are released with exec_load_credential_free() when the container drops them. */
DEFINE_PRIVATE_HASH_OPS_WITH_VALUE_DESTRUCTOR(
        exec_load_credential_hash_ops,
        char, string_hash_func, string_compare_func,
        ExecLoadCredential, exec_load_credential_free);

/* Hash ops used for ExecContext.import_credentials: the ExecImportCredential object itself is the key,
 * hashed and compared on its glob and rename fields. Going by the macro name, keys are released with
 * exec_import_credential_free() when the container drops them. */
DEFINE_PRIVATE_HASH_OPS_WITH_KEY_DESTRUCTOR(
        exec_import_credential_hash_ops,
        ExecImportCredential,
        exec_import_credential_hash_func,
        exec_import_credential_compare_func,
        exec_import_credential_free);
 100
 101 int exec_context_put_load_credential(ExecContext *c, const char *id, const char *path, bool encrypted) {
 102         ExecLoadCredential *old;
 103         int r;
 104
 105         assert(c);
 106         assert(id);
 107         assert(path);
 108
 109         old = hashmap_get(c->load_credentials, id);
 110         if (old) {
 111                 r = free_and_strdup(&old->path, path);
 112                 if (r < 0)
 113                         return r;
 114
 115                 old->encrypted = encrypted;
 116         } else {
 117                 _cleanup_(exec_load_credential_freep) ExecLoadCredential *lc = NULL;
 118
 119                 lc = new(ExecLoadCredential, 1);
 120                 if (!lc)
 121                         return -ENOMEM;
 122
 123                 *lc = (ExecLoadCredential) {
 124                         .id = strdup(id),
 125                         .path = strdup(path),
 126                         .encrypted = encrypted,
 127                 };
 128                 if (!lc->id || !lc->path)
 129                         return -ENOMEM;
 130
 131                 r = hashmap_ensure_put(&c->load_credentials, &exec_load_credential_hash_ops, lc->id, lc);
 132                 assert(r != -EEXIST);
 133                 if (r < 0)
 134                         return r;
 135
 136                 TAKE_PTR(lc);
 137         }
 138
 139         return 0;
 140 }
 141
 142 int exec_context_put_set_credential(
 143                 ExecContext *c,
 144                 const char *id,
 145                 void *data_consume,
 146                 size_t size,
 147                 bool encrypted) {
 148
 149         _cleanup_free_ void *data = data_consume;
 150         ExecSetCredential *old;
 151         int r;
 152
 153         /* Takes the ownership of data both on success and failure */
 154
 155         assert(c);
 156         assert(id);
 157         assert(data || size == 0);
 158
 159         old = hashmap_get(c->set_credentials, id);
 160         if (old) {
 161                 free_and_replace(old->data, data);
 162                 old->size = size;
 163                 old->encrypted = encrypted;
 164         } else {
 165                 _cleanup_(exec_set_credential_freep) ExecSetCredential *sc = NULL;
 166
 167                 sc = new(ExecSetCredential, 1);
 168                 if (!sc)
 169                         return -ENOMEM;
 170
 171                 *sc = (ExecSetCredential) {
 172                         .id = strdup(id),
 173                         .data = TAKE_PTR(data),
 174                         .size = size,
 175                         .encrypted = encrypted,
 176                 };
 177                 if (!sc->id)
 178                         return -ENOMEM;
 179
 180                 r = hashmap_ensure_put(&c->set_credentials, &exec_set_credential_hash_ops, sc->id, sc);
 181                 assert(r != -EEXIST);
 182                 if (r < 0)
 183                         return r;
 184
 185                 TAKE_PTR(sc);
 186         }
 187
 188         return 0;
 189 }
 190
 191 int exec_context_put_import_credential(ExecContext *c, const char *glob, const char *rename) {
 192         _cleanup_(exec_import_credential_freep) ExecImportCredential *ic = NULL;
 193         int r;
 194
 195         assert(c);
 196         assert(glob);
 197
 198         rename = empty_to_null(rename);
 199
 200         ic = new(ExecImportCredential, 1);
 201         if (!ic)
 202                 return -ENOMEM;
 203
 204         *ic = (ExecImportCredential) {
 205                 .glob = strdup(glob),
 206         };
 207         if (!ic->glob)
 208                 return -ENOMEM;
 209         if (rename) {
 210                 ic->rename = strdup(rename);
 211                 if (!ic->rename)
 212                         return -ENOMEM;
 213         }
 214
 215         if (ordered_set_contains(c->import_credentials, ic))
 216                 return 0;
 217
 218         r = ordered_set_ensure_put(&c->import_credentials, &exec_import_credential_hash_ops, ic);
 219         assert(r != -EEXIST);
 220         if (r < 0)
 221                 return r;
 222
 223         TAKE_PTR(ic);
 224
 225         return 0;
 226 }
 227
 228 bool exec_params_need_credentials(const ExecParameters *p) {
 229         assert(p);
 230
 231         return p->flags & (EXEC_SETUP_CREDENTIALS|EXEC_SETUP_CREDENTIALS_FRESH);
 232 }
 233
 234 bool exec_context_has_credentials(const ExecContext *c) {
 235         assert(c);
 236
 237         return !hashmap_isempty(c->set_credentials) ||
 238                 !hashmap_isempty(c->load_credentials) ||
 239                 !ordered_set_isempty(c->import_credentials);
 240 }
 241
 242 bool mount_point_is_credentials(const char *runtime_prefix, const char *path) {
 243         const char *e;
 244
 245         assert(runtime_prefix);
 246         assert(path);
 247
 248         e = path_startswith(path, runtime_prefix);
 249         if (!e)
 250                 return false;
 251
 252         return path_startswith(e, "credentials");
 253 }
 254
 255 static int get_credential_directory(
 256                 const char *runtime_prefix,
 257                 const char *unit,
 258                 char **ret) {
 259
 260         char *p;
 261
 262         assert(ret);
 263
 264         if (!runtime_prefix || !unit) {
 265                 *ret = NULL;
 266                 return 0;
 267         }
 268
 269         p = path_join(runtime_prefix, "credentials", unit);
 270         if (!p)
 271                 return -ENOMEM;
 272
 273         *ret = p;
 274         return 1;
 275 }
 276
 277 int exec_context_get_credential_directory(
 278                 const ExecContext *context,
 279                 const ExecParameters *params,
 280                 const char *unit,
 281                 char **ret) {
 282
 283         assert(context);
 284         assert(params);
 285         assert(unit);
 286         assert(ret);
 287
 288         if (!exec_params_need_credentials(params) || !exec_context_has_credentials(context)) {
 289                 *ret = NULL;
 290                 return 0;
 291         }
 292
 293         return get_credential_directory(params->prefix[EXEC_DIRECTORY_RUNTIME], unit, ret);
 294 }
 295
 296 int exec_context_destroy_credentials(const ExecContext *c, const char *runtime_prefix, const char *unit) {
 297         _cleanup_free_ char *p = NULL;
 298         int r;
 299
 300         assert(c);
 301
 302         r = get_credential_directory(runtime_prefix, unit, &p);
 303         if (r <= 0)
 304                 return r;
 305
 306         /* This is either a tmpfs/ramfs of its own, or a plain directory. Either way, let's first try to
 307          * unmount it, and afterwards remove the mount point */
 308         (void) umount2(p, MNT_DETACH|UMOUNT_NOFOLLOW);
 309         (void) rm_rf(p, REMOVE_ROOT|REMOVE_CHMOD);
 310
 311         return 0;
 312 }
 313
 314 static int write_credential(
 315                 int dfd,
 316                 const char *id,
 317                 const void *data,
 318                 size_t size,
 319                 uid_t uid,
 320                 gid_t gid,
 321                 bool ownership_ok) {
 322
 323         _cleanup_free_ char *tmp = NULL;
 324         _cleanup_close_ int fd = -EBADF;
 325         int r;
 326
 327         assert(dfd >= 0);
 328         assert(id);
 329         assert(data || size == 0);
 330
 331         r = tempfn_random_child("", "cred", &tmp);
 332         if (r < 0)
 333                 return r;
 334
 335         fd = openat(dfd, tmp, O_CREAT|O_RDWR|O_CLOEXEC|O_EXCL|O_NOFOLLOW|O_NOCTTY, 0600);
 336         if (fd < 0)
 337                 return -errno;
 338
 339         r = loop_write(fd, data, size);
 340         if (r < 0)
 341                 goto fail;
 342
 343         r = RET_NERRNO(fchmod(fd, 0400)); /* Take away "w" bit */
 344         if (r < 0)
 345                 goto fail;
 346
 347         if (uid_is_valid(uid) && uid != getuid()) {
 348                 r = fd_add_uid_acl_permission(fd, uid, ACL_READ);
 349                 if (r < 0) {
 350                         /* Ideally we use ACLs, since we can neatly express what we want to express:
 351                          * the user gets read access and nothing else. But if the backing fs can't
 352                          * support that (e.g. ramfs), then we can use file ownership instead. But that's
 353                          * only safe if we can then re-mount the whole thing read-only, so that the user
 354                          * can no longer chmod() the file to gain write access. */
 355                         if (!ownership_ok || (!ERRNO_IS_NOT_SUPPORTED(r) && !ERRNO_IS_PRIVILEGE(r)))
 356                                 goto fail;
 357
 358                         r = RET_NERRNO(fchown(fd, uid, gid));
 359                         if (r < 0)
 360                                 goto fail;
 361                 }
 362         }
 363
 364         r = RET_NERRNO(renameat(dfd, tmp, dfd, id));
 365         if (r < 0)
 366                 goto fail;
 367
 368         return 0;
 369
 370 fail:
 371         (void) unlinkat(dfd, tmp, /* flags = */ 0);
 372         return r;
 373 }
 374
/* Selects which directories credential_search_path() assembles into a search path. */
typedef enum CredentialSearchPath {
        CREDENTIAL_SEARCH_PATH_TRUSTED,     /* received (non-encrypted) credentials directory + plain credential store */
        CREDENTIAL_SEARCH_PATH_ENCRYPTED,   /* received encrypted credentials directory + encrypted credential store */
        CREDENTIAL_SEARCH_PATH_ALL,         /* both of the above, encrypted directories first */
        _CREDENTIAL_SEARCH_PATH_MAX,        /* number of valid values, used for range checks */
        _CREDENTIAL_SEARCH_PATH_INVALID = -EINVAL,
} CredentialSearchPath;
 382
 383 static int credential_search_path(const ExecParameters *params, CredentialSearchPath path, char ***ret) {
 384         _cleanup_strv_free_ char **l = NULL;
 385         int r;
 386
 387         assert(params);
 388         assert(path >= 0 && path < _CREDENTIAL_SEARCH_PATH_MAX);
 389         assert(ret);
 390
 391         /* Assemble a search path to find credentials in. For non-encrypted credentials, We'll look in
 392          * /etc/credstore/ (and similar directories in /usr/lib/ + /run/). If we're looking for encrypted
 393          * credentials, we'll look in /etc/credstore.encrypted/ (and similar dirs). */
 394
 395         if (IN_SET(path, CREDENTIAL_SEARCH_PATH_ENCRYPTED, CREDENTIAL_SEARCH_PATH_ALL)) {
 396                 r = strv_extend(&l, params->received_encrypted_credentials_directory);
 397                 if (r < 0)
 398                         return r;
 399
 400                 _cleanup_strv_free_ char **add = NULL;
 401                 r = credential_store_path_encrypted(params->runtime_scope, &add);
 402                 if (r < 0)
 403                         return r;
 404
 405                 r = strv_extend_strv_consume(&l, TAKE_PTR(add), /* filter_duplicates= */ false);
 406                 if (r < 0)
 407                         return r;
 408         }
 409
 410         if (IN_SET(path, CREDENTIAL_SEARCH_PATH_TRUSTED, CREDENTIAL_SEARCH_PATH_ALL)) {
 411                 r = strv_extend(&l, params->received_credentials_directory);
 412                 if (r < 0)
 413                         return r;
 414
 415                 _cleanup_strv_free_ char **add = NULL;
 416                 r = credential_store_path(params->runtime_scope, &add);
 417                 if (r < 0)
 418                         return r;
 419
 420                 r = strv_extend_strv_consume(&l, TAKE_PTR(add), /* filter_duplicates= */ false);
 421                 if (r < 0)
 422                         return r;
 423         }
 424
 425         if (DEBUG_LOGGING) {
 426                 _cleanup_free_ char *t = strv_join(l, ":");
 427                 log_debug("Credential search path is: %s", strempty(t));
 428         }
 429
 430         *ret = TAKE_PTR(l);
 431         return 0;
 432 }
 433
 434 static bool device_nodes_restricted(
 435                 const ExecContext *c,
 436                 const CGroupContext *cgroup_context) {
 437
 438         assert(c);
 439         assert(cgroup_context);
 440
 441         /* Returns true if we have any reason to believe we might not be able to access the TPM device
 442          * directly, even if we run as root/PID 1. This could be because /dev/ is replaced by a private
 443          * version, or because a device node access list is configured. */
 444
 445         if (c->private_devices)
 446                 return true;
 447
 448         if (cgroup_context->device_policy != CGROUP_DEVICE_POLICY_AUTO ||
 449             cgroup_context->device_allow)
 450                 return true;
 451
 452         return false;
 453 }
 454
/* State shared by the credential loading helpers while populating one credential directory. */
struct load_cred_args {
        const ExecContext *context;           /* execution context; consulted for SetCredential fallbacks and device sandboxing */
        const CGroupContext *cgroup_context;  /* cgroup context; consulted for device access restrictions */
        const ExecParameters *params;         /* execution parameters; provide runtime scope and received credential dirs */
        const char *unit;                     /* unit name; embedded in the bind name when reading from an absolute path */
        bool encrypted;                       /* whether the credential currently being loaded needs decryption */
        int write_dfd;                        /* directory fd the credential files are written into */
        uid_t uid;                            /* passed on to write_credential(): user to grant read access to */
        gid_t gid;                            /* passed on to write_credential(): group used if ownership is changed */
        bool ownership_ok;                    /* passed on to write_credential(): whether the chown() fallback is allowed */
        uint64_t left;                        /* remaining size budget; reduced by strlen(id) + size per written credential */
};
 467
 468 static int maybe_decrypt_and_write_credential(
 469                 struct load_cred_args *args,
 470                 const char *id,
 471                 const char *data,
 472                 size_t size) {
 473
 474         _cleanup_(iovec_done_erase) struct iovec plaintext = {};
 475         size_t add;
 476         int r;
 477
 478         assert(args);
 479         assert(args->write_dfd >= 0);
 480         assert(id);
 481         assert(data || size == 0);
 482
 483         if (args->encrypted) {
 484                 CredentialFlags flags = 0; /* only allow user creds in user scope */
 485
 486                 switch (args->params->runtime_scope) {
 487
 488                 case RUNTIME_SCOPE_SYSTEM:
 489                         /* In system mode talk directly to the TPM – unless we live in a device sandbox
 490                          * which might block TPM device access. */
 491
 492                         flags |= CREDENTIAL_ANY_SCOPE;
 493
 494                         if (!device_nodes_restricted(args->context, args->cgroup_context)) {
 495                                 r = decrypt_credential_and_warn(
 496                                                 id,
 497                                                 now(CLOCK_REALTIME),
 498                                                 /* tpm2_device= */ NULL,
 499                                                 /* tpm2_signature_path= */ NULL,
 500                                                 getuid(),
 501                                                 &IOVEC_MAKE(data, size),
 502                                                 flags,
 503                                                 &plaintext);
 504                                 break;
 505                         }
 506
 507                         _fallthrough_;
 508
 509                 case RUNTIME_SCOPE_USER:
 510                         /* In per user mode we'll not have access to the machine secret, nor to the TPM (most
 511                          * likely), hence go via the IPC service instead. Do this if we are run in root's
 512                          * per-user invocation too, to minimize differences and because isolating this logic
 513                          * into a separate process is generally a good thing anyway. */
 514                         r = ipc_decrypt_credential(
 515                                         id,
 516                                         now(CLOCK_REALTIME),
 517                                         getuid(),
 518                                         &IOVEC_MAKE(data, size),
 519                                         flags,
 520                                         &plaintext);
 521                         break;
 522
 523                 default:
 524                         assert_not_reached();
 525                 }
 526                 if (r < 0)
 527                         return r;
 528
 529                 data = plaintext.iov_base;
 530                 size = plaintext.iov_len;
 531         }
 532
 533         add = strlen(id) + size;
 534         if (add > args->left)
 535                 return -E2BIG;
 536
 537         r = write_credential(args->write_dfd, id, data, size, args->uid, args->gid, args->ownership_ok);
 538         if (r < 0)
 539                 return log_debug_errno(r, "Failed to write credential '%s': %m", id);
 540
 541         args->left -= add;
 542
 543         return 0;
 544 }
 545
/* Imports all credentials matching the glob of 'ic' from each directory in 'search_path', in order.
 * Each match is read, optionally renamed, and handed to maybe_decrypt_and_write_credential().
 * Matches with an invalid resulting name, or whose name already exists in args->write_dfd, are
 * skipped. Returns 0 on success, negative errno on the first hard failure. */
static int load_credential_glob(
                struct load_cred_args *args,
                const ExecImportCredential *ic,
                char * const *search_path,
                ReadFullFileFlags flags) {

        int r;

        assert(args);
        assert(args->write_dfd >= 0);
        assert(ic);
        assert(search_path);

        STRV_FOREACH(d, search_path) {
                _cleanup_strv_free_ char **paths = NULL;
                _cleanup_free_ char *j = NULL;

                j = path_join(*d, ic->glob);
                if (!j)
                        return -ENOMEM;

                /* No match in this directory is not an error, just try the next one */
                r = safe_glob(j, /* flags = */ 0, &paths);
                if (r == -ENOENT)
                        continue;
                if (r < 0)
                        return r;

                STRV_FOREACH(p, paths) {
                        _cleanup_free_ char *fn = NULL;
                        _cleanup_(erase_and_freep) char *data = NULL;
                        size_t size;

                        r = path_extract_filename(*p, &fn);
                        if (r < 0)
                                return log_debug_errno(r, "Failed to extract filename from '%s': %m", *p);

                        if (ic->rename) {
                                _cleanup_free_ char *renamed = NULL;

                                /* Replace the literal part of the glob (everything but a trailing "*")
                                 * at the start of the filename with the rename string.
                                 * NOTE(review): this assumes the filename is at least as long as that
                                 * literal part, i.e. that the glob carries no wildcard other than a
                                 * trailing "*" — presumably enforced where the glob is parsed; confirm. */
                                renamed = strjoin(ic->rename, fn + strlen(ic->glob) - !!endswith(ic->glob, "*"));
                                if (!renamed)
                                        return log_oom_debug();

                                free_and_replace(fn, renamed);
                        }

                        if (!credential_name_valid(fn)) {
                                log_debug("Skipping credential with invalid name: %s", fn);
                                continue;
                        }

                        /* A credential of this name that was written earlier takes precedence */
                        if (faccessat(args->write_dfd, fn, F_OK, AT_SYMLINK_NOFOLLOW) >= 0) {
                                log_debug("Skipping credential with duplicated ID %s at %s", fn, *p);
                                continue;
                        }
                        /* errno is still the one set by the failed faccessat() above */
                        if (errno != ENOENT)
                                return log_debug_errno(errno, "Failed to test if credential %s exists: %m", fn);

                        /* path is absolute, hence pass AT_FDCWD as nop dir fd here */
                        r = read_full_file_full(
                                        AT_FDCWD,
                                        *p,
                                        UINT64_MAX,
                                        args->encrypted ? CREDENTIAL_ENCRYPTED_SIZE_MAX : CREDENTIAL_SIZE_MAX,
                                        flags,
                                        NULL,
                                        &data, &size);
                        if (r < 0)
                                return log_debug_errno(r, "Failed to read credential '%s': %m", *p);

                        r = maybe_decrypt_and_write_credential(args, fn, data, size);
                        if (r < 0)
                                return r;
                }
        }

        return 0;
}
 624
 625 static int load_credential(
 626                 struct load_cred_args *args,
 627                 const char *id,
 628                 int read_dfd,
 629                 const char *path) {
 630
 631         ReadFullFileFlags flags = READ_FULL_FILE_SECURE|READ_FULL_FILE_FAIL_WHEN_LARGER;
 632         _cleanup_strv_free_ char **search_path = NULL;
 633         _cleanup_free_ char *bindname = NULL;
 634         const char *source = NULL;
 635         bool missing_ok;
 636         _cleanup_(erase_and_freep) char *data = NULL;
 637         size_t size, maxsz;
 638         int r;
 639
 640         assert(args);
 641         assert(args->context);
 642         assert(args->params);
 643         assert(args->unit);
 644         assert(args->write_dfd >= 0);
 645         assert(id);
 646         assert(read_dfd >= 0 || read_dfd == AT_FDCWD);
 647         assert(path);
 648
 649         if (read_dfd >= 0) {
 650                 /* If a directory fd is specified, then read the file directly from that dir. In this case we
 651                  * won't do AF_UNIX stuff (we simply don't want to recursively iterate down a tree of AF_UNIX
 652                  * IPC sockets). It's OK if a file vanishes here in the time we enumerate it and intend to
 653                  * open it. */
 654
 655                 if (!filename_is_valid(path)) /* safety check */
 656                         return -EINVAL;
 657
 658                 missing_ok = true;
 659                 source = path;
 660
 661         } else if (path_is_absolute(path)) {
 662                 /* If this is an absolute path, read the data directly from it, and support AF_UNIX
 663                  * sockets */
 664
 665                 if (!path_is_valid(path)) /* safety check */
 666                         return -EINVAL;
 667
 668                 flags |= READ_FULL_FILE_CONNECT_SOCKET;
 669
 670                 /* Pass some minimal info about the unit and the credential name we are looking to acquire
 671                  * via the source socket address in case we read off an AF_UNIX socket. */
 672                 if (asprintf(&bindname, "@%" PRIx64 "/unit/%s/%s", random_u64(), args->unit, id) < 0)
 673                         return -ENOMEM;
 674
 675                 missing_ok = false;
 676                 source = path;
 677
 678         } else if (credential_name_valid(path)) {
 679                 /* If this is a relative path, take it as credential name relative to the credentials
 680                  * directory we received ourselves. We don't support the AF_UNIX stuff in this mode, since we
 681                  * are operating on a credential store, i.e. this is guaranteed to be regular files. */
 682
 683                 r = credential_search_path(args->params, CREDENTIAL_SEARCH_PATH_ALL, &search_path);
 684                 if (r < 0)
 685                         return r;
 686
 687                 missing_ok = true;
 688         } else
 689                 return -EINVAL;
 690
 691         if (args->encrypted) {
 692                 flags |= READ_FULL_FILE_UNBASE64;
 693                 maxsz = CREDENTIAL_ENCRYPTED_SIZE_MAX;
 694         } else
 695                 maxsz = CREDENTIAL_SIZE_MAX;
 696
 697         if (search_path)
 698                 STRV_FOREACH(d, search_path) {
 699                         _cleanup_free_ char *j = NULL;
 700
 701                         j = path_join(*d, path);
 702                         if (!j)
 703                                 return -ENOMEM;
 704
 705                         r = read_full_file_full(
 706                                         AT_FDCWD, j, /* path is absolute, hence pass AT_FDCWD as nop dir fd here */
 707                                         UINT64_MAX,
 708                                         maxsz,
 709                                         flags,
 710                                         NULL,
 711                                         &data, &size);
 712                         if (r != -ENOENT)
 713                                 break;
 714                 }
 715         else if (source)
 716                 r = read_full_file_full(
 717                                 read_dfd, source,
 718                                 UINT64_MAX,
 719                                 maxsz,
 720                                 flags,
 721                                 bindname,
 722                                 &data, &size);
 723         else
 724                 assert_not_reached();
 725
 726         if (r == -ENOENT && (missing_ok || hashmap_contains(args->context->set_credentials, id))) {
 727                 /* Make a missing inherited credential non-fatal, let's just continue. After all apps
 728                  * will get clear errors if we don't pass such a missing credential on as they
 729                  * themselves will get ENOENT when trying to read them, which should not be much
 730                  * worse than when we handle the error here and make it fatal.
 731                  *
 732                  * Also, if the source file doesn't exist, but a fallback is set via SetCredentials=
 733                  * we are fine, too. */
 734                 log_full_errno(hashmap_contains(args->context->set_credentials, id) ? LOG_DEBUG : LOG_INFO,
 735                                r, "Couldn't read inherited credential '%s', skipping: %m", path);
 736                 return 0;
 737         }
 738         if (r < 0)
 739                 return log_debug_errno(r, "Failed to read credential '%s': %m", path);
 740
 741         return maybe_decrypt_and_write_credential(args, id, data, size);
 742 }
 743
 744 static int load_cred_recurse_dir_cb(
 745                 RecurseDirEvent event,
 746                 const char *path,
 747                 int dir_fd,
 748                 int inode_fd,
 749                 const struct dirent *de,
 750                 const struct statx *sx,
 751                 void *userdata) {
 752
 753         struct load_cred_args *args = ASSERT_PTR(userdata);
 754         _cleanup_free_ char *sub_id = NULL;
 755         int r;
 756
 757         assert(path);
 758         assert(de);
 759
 760         if (event != RECURSE_DIR_ENTRY)
 761                 return RECURSE_DIR_CONTINUE;
 762
 763         if (!IN_SET(de->d_type, DT_REG, DT_SOCK))
 764                 return RECURSE_DIR_CONTINUE;
 765
 766         sub_id = strreplace(path, "/", "_");
 767         if (!sub_id)
 768                 return -ENOMEM;
 769
 770         if (!credential_name_valid(sub_id))
 771                 return log_debug_errno(SYNTHETIC_ERRNO(EINVAL), "Credential would get ID '%s', which is not valid, refusing.", sub_id);
 772
 773         if (faccessat(args->write_dfd, sub_id, F_OK, AT_SYMLINK_NOFOLLOW) >= 0) {
 774                 log_debug("Skipping credential with duplicated ID %s at %s", sub_id, path);
 775                 return RECURSE_DIR_CONTINUE;
 776         }
 777         if (errno != ENOENT)
 778                 return log_debug_errno(errno, "Failed to test if credential %s exists: %m", sub_id);
 779
 780         r = load_credential(args,
 781                             sub_id,
 782                             dir_fd, de->d_name);
 783         if (r < 0)
 784                 return r;
 785
 786         return RECURSE_DIR_CONTINUE;
 787 }
 788
 789 static int acquire_credentials(
 790                 const ExecContext *context,
 791                 const CGroupContext *cgroup_context,
 792                 const ExecParameters *params,
 793                 const char *unit,
 794                 const char *p,
 795                 uid_t uid,
 796                 gid_t gid,
 797                 bool ownership_ok) {
 798
 799         _cleanup_close_ int dfd = -EBADF;
 800         int r;
 801
 802         assert(context);
 803         assert(cgroup_context);
 804         assert(params);
 805         assert(unit);
 806         assert(p);
 807
 808         dfd = open(p, O_DIRECTORY|O_CLOEXEC);
 809         if (dfd < 0)
 810                 return -errno;
 811
 812         r = fd_acl_make_writable(dfd); /* Add the "w" bit, if we are reusing an already set up credentials dir where it was unset */
 813         if (r < 0)
 814                 return r;
 815
 816         struct load_cred_args args = {
 817                 .context = context,
 818                 .cgroup_context = cgroup_context,
 819                 .params = params,
 820                 .unit = unit,
 821                 .write_dfd = dfd,
 822                 .uid = uid,
 823                 .gid = gid,
 824                 .ownership_ok = ownership_ok,
 825                 .left = CREDENTIALS_TOTAL_SIZE_MAX,
 826         };
 827
 828         /* First, load credentials off disk (or acquire via AF_UNIX socket) */
 829         ExecLoadCredential *lc;
 830         HASHMAP_FOREACH(lc, context->load_credentials) {
 831                 _cleanup_close_ int sub_fd = -EBADF;
 832
 833                 args.encrypted = lc->encrypted;
 834
 835                 /* If this is an absolute path, then try to open it as a directory. If that works, then we'll
 836                  * recurse into it. If it is an absolute path but it isn't a directory, then we'll open it as
 837                  * a regular file. Finally, if it's a relative path we will use it as a credential name to
 838                  * propagate a credential passed to us from further up. */
 839
 840                 if (path_is_absolute(lc->path)) {
 841                         sub_fd = open(lc->path, O_DIRECTORY|O_CLOEXEC);
 842                         if (sub_fd < 0 && !IN_SET(errno,
 843                                                   ENOTDIR,  /* Not a directory */
 844                                                   ENOENT))  /* Doesn't exist? */
 845                                 return log_debug_errno(errno, "Failed to open credential source '%s': %m", lc->path);
 846                 }
 847
 848                 if (sub_fd < 0)
 849                         /* Regular file (incl. a credential passed in from higher up) */
 850                         r = load_credential(&args,
 851                                             lc->id,
 852                                             AT_FDCWD, lc->path);
 853                 else
 854                         /* Directory */
 855                         r = recurse_dir(sub_fd,
 856                                         /* path= */ lc->id, /* recurse_dir() will suffix the subdir paths from here to the top-level id */
 857                                         /* statx_mask= */ 0,
 858                                         /* n_depth_max= */ UINT_MAX,
 859                                         RECURSE_DIR_SORT|RECURSE_DIR_IGNORE_DOT|RECURSE_DIR_ENSURE_TYPE,
 860                                         load_cred_recurse_dir_cb,
 861                                         &args);
 862                 if (r < 0)
 863                         return r;
 864         }
 865
 866         /* Next, look for system credentials and credentials in the credentials store. Note that these do not
 867          * override any credentials found earlier. */
 868         ExecImportCredential *ic;
 869         ORDERED_SET_FOREACH(ic, context->import_credentials) {
 870                 _cleanup_free_ char **search_path = NULL;
 871
 872                 r = credential_search_path(params, CREDENTIAL_SEARCH_PATH_TRUSTED, &search_path);
 873                 if (r < 0)
 874                         return r;
 875
 876                 args.encrypted = false;
 877
 878                 r = load_credential_glob(&args,
 879                                          ic,
 880                                          search_path,
 881                                          READ_FULL_FILE_SECURE|READ_FULL_FILE_FAIL_WHEN_LARGER);
 882                 if (r < 0)
 883                         return r;
 884
 885                 search_path = strv_free(search_path);
 886
 887                 r = credential_search_path(params, CREDENTIAL_SEARCH_PATH_ENCRYPTED, &search_path);
 888                 if (r < 0)
 889                         return r;
 890
 891                 args.encrypted = true;
 892
 893                 r = load_credential_glob(&args,
 894                                          ic,
 895                                          search_path,
 896                                          READ_FULL_FILE_SECURE|READ_FULL_FILE_FAIL_WHEN_LARGER|READ_FULL_FILE_UNBASE64);
 897                 if (r < 0)
 898                         return r;
 899         }
 900
 901         /* Finally, we add in literally specified credentials. If the credentials already exist, we'll not
 902          * add them, so that they can act as a "default" if the same credential is specified multiple times. */
 903         ExecSetCredential *sc;
 904         HASHMAP_FOREACH(sc, context->set_credentials) {
 905                 args.encrypted = sc->encrypted;
 906
 907                 if (faccessat(dfd, sc->id, F_OK, AT_SYMLINK_NOFOLLOW) >= 0) {
 908                         log_debug("Skipping credential with duplicated ID %s", sc->id);
 909                         continue;
 910                 }
 911                 if (errno != ENOENT)
 912                         return log_debug_errno(errno, "Failed to test if credential %s exists: %m", sc->id);
 913
 914                 r = maybe_decrypt_and_write_credential(&args, sc->id, sc->data, sc->size);
 915                 if (r < 0)
 916                         return r;
 917         }
 918
 919         r = fd_acl_make_read_only(dfd); /* Now take away the "w" bit */
 920         if (r < 0)
 921                 return r;
 922
 923         /* After we created all keys with the right perms, also make sure the credential store as a whole is
 924          * accessible */
 925
 926         if (uid_is_valid(uid) && uid != getuid()) {
 927                 r = fd_add_uid_acl_permission(dfd, uid, ACL_READ | ACL_EXECUTE);
 928                 if (r < 0) {
 929                         if (!ERRNO_IS_NOT_SUPPORTED(r) && !ERRNO_IS_PRIVILEGE(r))
 930                                 return r;
 931
 932                         if (!ownership_ok)
 933                                 return r;
 934
 935                         if (fchown(dfd, uid, gid) < 0)
 936                                 return -errno;
 937                 }
 938         }
 939
 940         return 0;
 941 }
 942
 943 static int setup_credentials_internal(
 944                 const ExecContext *context,
 945                 const CGroupContext *cgroup_context,
 946                 const ExecParameters *params,
 947                 const char *unit,
 948                 const char *final,        /* This is where the credential store shall eventually end up at */
 949                 const char *workspace,    /* This is where we can prepare it before moving it to the final place */
 950                 bool reuse_workspace,     /* Whether to reuse any existing workspace mount if it already is a mount */
 951                 bool must_mount,          /* Whether to require that we mount something, it's not OK to use the plain directory fall back */
 952                 uid_t uid,
 953                 gid_t gid) {
 954
 955         bool final_mounted;
 956         int r, workspace_mounted; /* negative if we don't know yet whether we have/can mount something; true
 957                                    * if we mounted something; false if we definitely can't mount anything */
 958
 959         assert(context);
 960         assert(params);
 961         assert(unit);
 962         assert(final);
 963         assert(workspace);
 964
 965         r = path_is_mount_point(final);
 966         if (r < 0)
 967                 return log_debug_errno(r, "Failed to determine if '%s' is a mountpoint: %m", final);
 968         final_mounted = r > 0;
 969
 970         if (final_mounted) {
 971                 if (FLAGS_SET(params->flags, EXEC_SETUP_CREDENTIALS_FRESH)) {
 972                         r = umount_verbose(LOG_DEBUG, final, MNT_DETACH|UMOUNT_NOFOLLOW);
 973                         if (r < 0)
 974                                 return r;
 975
 976                         final_mounted = false;
 977                 } else {
 978                         /* We can reuse the previous credential dir */
 979                         r = dir_is_empty(final, /* ignore_hidden_or_backup = */ false);
 980                         if (r < 0)
 981                                 return r;
 982                         if (r == 0) {
 983                                 log_debug("Credential dir for unit '%s' already set up, skipping.", unit);
 984                                 return 0;
 985                         }
 986                 }
 987         }
 988
 989         if (reuse_workspace) {
 990                 r = path_is_mount_point(workspace);
 991                 if (r < 0)
 992                         return r;
 993                 if (r > 0)
 994                         workspace_mounted = true; /* If this is already a mount, and we are supposed to reuse
 995                                                    * it, let's keep this in mind */
 996                 else
 997                         workspace_mounted = -1; /* We need to figure out if we can mount something to the workspace */
 998         } else
 999                 workspace_mounted = -1; /* ditto */
1000
1001         /* If both the final place and the workspace are mounted, we have no mounts to set up, based on
1002          * the assumption that they're actually the same tmpfs (but the latter with MS_RDONLY different).
1003          * If the workspace is not mounted, we just bind the final place over and make it writable. */
1004         must_mount = must_mount || final_mounted;
1005
1006         if (workspace_mounted < 0) {
1007                 if (!final_mounted)
1008                         /* Nothing is mounted on the workspace yet, let's try to mount a new tmpfs if
1009                          * not using the final place. */
1010                         r = mount_credentials_fs(workspace, CREDENTIALS_TOTAL_SIZE_MAX, /* ro= */ false);
1011                 if (final_mounted || r < 0) {
1012                         /* If using final place or failed to mount new tmpfs, make a bind mount from
1013                          * the final to the workspace, so that we can make it writable there. */
1014                         r = mount_nofollow_verbose(LOG_DEBUG, final, workspace, NULL, MS_BIND|MS_REC, NULL);
1015                         if (r < 0) {
1016                                 if (!ERRNO_IS_PRIVILEGE(r))
1017                                         /* Propagate anything that isn't a permission problem. */
1018                                         return r;
1019
1020                                 if (must_mount)
1021                                         /* If it's not OK to use the plain directory fallback, propagate all
1022                                          * errors too. */
1023                                         return r;
1024
1025                                 /* If we lack privileges to bind mount stuff, then let's gracefully proceed
1026                                  * for compat with container envs, and just use the final dir as is.
1027                                  * Final place must not be mounted in this case (refused by must_mount
1028                                  * above) */
1029
1030                                 workspace_mounted = false;
1031                         } else {
1032                                 /* Make the new bind mount writable (i.e. drop MS_RDONLY) */
1033                                 r = mount_nofollow_verbose(LOG_DEBUG,
1034                                                            NULL,
1035                                                            workspace,
1036                                                            NULL,
1037                                                            MS_BIND|MS_REMOUNT|credentials_fs_mount_flags(/* ro= */ false),
1038                                                            NULL);
1039                                 if (r < 0)
1040                                         return r;
1041
1042                                 workspace_mounted = true;
1043                         }
1044                 } else
1045                         workspace_mounted = true;
1046         }
1047
1048         assert(workspace_mounted >= 0);
1049         assert(!must_mount || workspace_mounted);
1050
1051         const char *where = workspace_mounted ? workspace : final;
1052
1053         (void) label_fix_full(AT_FDCWD, where, final, 0);
1054
1055         r = acquire_credentials(context, cgroup_context, params, unit, where, uid, gid, workspace_mounted);
1056         if (r < 0) {
1057                 /* If we're using final place as workspace, and failed to acquire credentials, we might
1058                  * have left half-written creds there. Let's get rid of the whole mount, so future
1059                  * calls won't reuse it. */
1060                 if (final_mounted)
1061                         (void) umount_verbose(LOG_DEBUG, final, MNT_DETACH|UMOUNT_NOFOLLOW);
1062
1063                 return r;
1064         }
1065
1066         if (workspace_mounted) {
1067                 if (!final_mounted) {
1068                         /* Make workspace read-only now, so that any bind mount we make from it defaults to
1069                          * read-only too */
1070                         r = mount_nofollow_verbose(LOG_DEBUG, NULL, workspace, NULL, MS_BIND|MS_REMOUNT|credentials_fs_mount_flags(/* ro= */ true), NULL);
1071                         if (r < 0)
1072                                 return r;
1073
1074                         /* And mount it to the final place, read-only */
1075                         r = mount_nofollow_verbose(LOG_DEBUG, workspace, final, NULL, MS_MOVE, NULL);
1076                 } else
1077                         /* Otherwise we just get rid of the bind mount of final place */
1078                         r = umount_verbose(LOG_DEBUG, workspace, MNT_DETACH|UMOUNT_NOFOLLOW);
1079                 if (r < 0)
1080                         return r;
1081         } else {
1082                 _cleanup_free_ char *parent = NULL;
1083
1084                 /* If we do not have our own mount put used the plain directory fallback, then we need to
1085                  * open access to the top-level credential directory and the per-service directory now */
1086
1087                 r = path_extract_directory(final, &parent);
1088                 if (r < 0)
1089                         return r;
1090                 if (chmod(parent, 0755) < 0)
1091                         return -errno;
1092         }
1093
1094         return 0;
1095 }
1096
1097 int exec_setup_credentials(
1098                 const ExecContext *context,
1099                 const CGroupContext *cgroup_context,
1100                 const ExecParameters *params,
1101                 const char *unit,
1102                 uid_t uid,
1103                 gid_t gid) {
1104
1105         _cleanup_free_ char *p = NULL, *q = NULL;
1106         int r;
1107
1108         assert(context);
1109         assert(params);
1110         assert(unit);
1111
1112         if (!exec_params_need_credentials(params) || !exec_context_has_credentials(context))
1113                 return 0;
1114
1115         if (!params->prefix[EXEC_DIRECTORY_RUNTIME])
1116                 return -EINVAL;
1117
1118         /* This is where we'll place stuff when we are done; the main credentials directory is world-readable,
1119          * and the subdir we mount over with a read-only file system readable by the service's user. */
1120         q = path_join(params->prefix[EXEC_DIRECTORY_RUNTIME], "credentials");
1121         if (!q)
1122                 return -ENOMEM;
1123
1124         r = mkdir_label(q, 0755); /* top-level dir: world readable/searchable */
1125         if (r < 0 && r != -EEXIST)
1126                 return r;
1127
1128         p = path_join(q, unit);
1129         if (!p)
1130                 return -ENOMEM;
1131
1132         r = mkdir_label(p, 0700); /* per-unit dir: private to user */
1133         if (r < 0 && r != -EEXIST)
1134                 return r;
1135
1136         r = safe_fork("(sd-mkdcreds)", FORK_DEATHSIG_SIGTERM|FORK_WAIT|FORK_NEW_MOUNTNS, NULL);
1137         if (r < 0) {
1138                 _cleanup_(rmdir_and_freep) char *u = NULL; /* remove the temporary workspace if we can */
1139                 _cleanup_free_ char *t = NULL;
1140
1141                 /* If this is not a privilege or support issue then propagate the error */
1142                 if (!ERRNO_IS_NOT_SUPPORTED(r) && !ERRNO_IS_PRIVILEGE(r))
1143                         return r;
1144
1145                 /* Temporary workspace, that remains inaccessible all the time. We prepare stuff there before moving
1146                  * it into place, so that users can't access half-initialized credential stores. */
1147                 t = path_join(params->prefix[EXEC_DIRECTORY_RUNTIME], "systemd/temporary-credentials");
1148                 if (!t)
1149                         return -ENOMEM;
1150
1151                 /* We can't set up a mount namespace. In that case operate on a fixed, inaccessible per-unit
1152                  * directory outside of /run/credentials/ first, and then move it over to /run/credentials/
1153                  * after it is fully set up */
1154                 u = path_join(t, unit);
1155                 if (!u)
1156                         return -ENOMEM;
1157
1158                 FOREACH_STRING(i, t, u) {
1159                         r = mkdir_label(i, 0700);
1160                         if (r < 0 && r != -EEXIST)
1161                                 return log_debug_errno(r, "Failed to make directory '%s': %m", i);
1162                 }
1163
1164                 r = setup_credentials_internal(
1165                                 context,
1166                                 cgroup_context,
1167                                 params,
1168                                 unit,
1169                                 p,       /* final mount point */
1170                                 u,       /* temporary workspace to overmount */
1171                                 true,    /* reuse the workspace if it is already a mount */
1172                                 false,   /* it's OK to fall back to a plain directory if we can't mount anything */
1173                                 uid,
1174                                 gid);
1175                 if (r < 0)
1176                         return r;
1177
1178         } else if (r == 0) {
1179
1180                 /* We managed to set up a mount namespace, and are now in a child. That's great. In this case
1181                  * we can use the same directory for all cases, after turning off propagation. Question
1182                  * though is: where do we turn off propagation exactly, and where do we place the workspace
1183                  * directory? We need some place that is guaranteed to be a mount point in the host, and
1184                  * which is guaranteed to have a subdir we can mount over. /run/ is not suitable for this,
1185                  * since we ultimately want to move the resulting file system there, i.e. we need propagation
1186                  * for /run/ eventually. We could use our own /run/systemd/bind mount on itself, but that
1187                  * would be visible in the host mount table all the time, which we want to avoid. Hence, what
1188                  * we do here instead we use /dev/ and /dev/shm/ for our purposes. We know for sure that
1189                  * /dev/ is a mount point and we now for sure that /dev/shm/ exists. Hence we can turn off
1190                  * propagation on the former, and then overmount the latter.
1191                  *
1192                  * Yes it's nasty playing games with /dev/ and /dev/shm/ like this, since it does not exist
1193                  * for this purpose, but there are few other candidates that work equally well for us, and
1194                  * given that we do this in a privately namespaced short-lived single-threaded process that
1195                  * no one else sees this should be OK to do. */
1196
1197                 /* Turn off propagation from our namespace to host */
1198                 r = mount_nofollow_verbose(LOG_DEBUG, NULL, "/dev", NULL, MS_SLAVE|MS_REC, NULL);
1199                 if (r < 0)
1200                         goto child_fail;
1201
1202                 r = setup_credentials_internal(
1203                                 context,
1204                                 cgroup_context,
1205                                 params,
1206                                 unit,
1207                                 p,           /* final mount point */
1208                                 "/dev/shm",  /* temporary workspace to overmount */
1209                                 false,       /* do not reuse /dev/shm if it is already a mount, under no circumstances */
1210                                 true,        /* insist that something is mounted, do not allow fallback to plain directory */
1211                                 uid,
1212                                 gid);
1213                 if (r < 0)
1214                         goto child_fail;
1215
1216                 _exit(EXIT_SUCCESS);
1217
1218         child_fail:
1219                 _exit(EXIT_FAILURE);
1220         }
1221
1222         /* If the credentials dir is empty and not a mount point, then there's no point in having it. Let's
1223          * try to remove it. This matters in particular if we created the dir as mount point but then didn't
1224          * actually end up mounting anything on it. In that case we'd rather have ENOENT than EACCESS being
1225          * seen by users when trying access this inode. */
1226         (void) rmdir(p);
1227         return 0;
1228 }