src/core/credential.c

   1 /* SPDX-License-Identifier: LGPL-2.1-or-later */
   2
   3 #include <sys/mount.h>
   4
   5 #include "acl-util.h"
   6 #include "credential.h"
   7 #include "creds-util.h"
   8 #include "execute.h"
   9 #include "fileio.h"
  10 #include "glob-util.h"
  11 #include "io-util.h"
  12 #include "label-util.h"
  13 #include "mkdir-label.h"
  14 #include "mount-util.h"
  15 #include "mountpoint-util.h"
  16 #include "process-util.h"
  17 #include "random-util.h"
  18 #include "recurse-dir.h"
  19 #include "rm-rf.h"
  20 #include "tmpfile-util.h"
  21
  22 ExecSetCredential *exec_set_credential_free(ExecSetCredential *sc) {
  23         if (!sc)
  24                 return NULL;
  25
  26         free(sc->id);
  27         free(sc->data);
  28         return mfree(sc);
  29 }
  30
  31 ExecLoadCredential *exec_load_credential_free(ExecLoadCredential *lc) {
  32         if (!lc)
  33                 return NULL;
  34
  35         free(lc->id);
  36         free(lc->path);
  37         return mfree(lc);
  38 }
  39
  40 DEFINE_HASH_OPS_WITH_VALUE_DESTRUCTOR(
  41         exec_set_credential_hash_ops,
  42         char, string_hash_func, string_compare_func,
  43         ExecSetCredential, exec_set_credential_free);
  44
  45 DEFINE_HASH_OPS_WITH_VALUE_DESTRUCTOR(
  46         exec_load_credential_hash_ops,
  47         char, string_hash_func, string_compare_func,
  48         ExecLoadCredential, exec_load_credential_free);
  49
  50 bool exec_context_has_credentials(const ExecContext *c) {
  51         assert(c);
  52
  53         return !hashmap_isempty(c->set_credentials) ||
  54                 !hashmap_isempty(c->load_credentials) ||
  55                 !set_isempty(c->import_credentials);
  56 }
  57
  58 bool exec_context_has_encrypted_credentials(ExecContext *c) {
  59         ExecLoadCredential *load_cred;
  60         ExecSetCredential *set_cred;
  61
  62         assert(c);
  63
  64         HASHMAP_FOREACH(load_cred, c->load_credentials)
  65                 if (load_cred->encrypted)
  66                         return true;
  67
  68         HASHMAP_FOREACH(set_cred, c->set_credentials)
  69                 if (set_cred->encrypted)
  70                         return true;
  71
  72         return false;
  73 }
  74
  75 static int get_credential_directory(
  76                 const char *runtime_prefix,
  77                 const char *unit,
  78                 char **ret) {
  79
  80         char *p;
  81
  82         assert(ret);
  83
  84         if (!runtime_prefix || !unit) {
  85                 *ret = NULL;
  86                 return 0;
  87         }
  88
  89         p = path_join(runtime_prefix, "credentials", unit);
  90         if (!p)
  91                 return -ENOMEM;
  92
  93         *ret = p;
  94         return 1;
  95 }
  96
  97 int unit_add_default_credential_dependencies(Unit *u, const ExecContext *c) {
  98         _cleanup_free_ char *p = NULL, *m = NULL;
  99         int r;
 100
 101         assert(u);
 102         assert(c);
 103
 104         if (!exec_context_has_credentials(c))
 105                 return 0;
 106
 107         /* Let's make sure the credentials directory of this service is unmounted *after* the service itself
 108          * shuts down. This only matters if mount namespacing is not used for the service, and hence the
 109          * credentials mount appears on the host. */
 110
 111         r = get_credential_directory(u->manager->prefix[EXEC_DIRECTORY_RUNTIME], u->id, &p);
 112         if (r <= 0)
 113                 return r;
 114
 115         r = unit_name_from_path(p, ".mount", &m);
 116         if (r < 0)
 117                 return r;
 118
 119         return unit_add_dependency_by_name(u, UNIT_AFTER, m, /* add_reference= */ true, UNIT_DEPENDENCY_FILE);
 120 }
 121
 122 int exec_context_destroy_credentials(const ExecContext *c, const char *runtime_prefix, const char *unit) {
 123         _cleanup_free_ char *p = NULL;
 124         int r;
 125
 126         assert(c);
 127
 128         r = get_credential_directory(runtime_prefix, unit, &p);
 129         if (r <= 0)
 130                 return r;
 131
 132         /* This is either a tmpfs/ramfs of its own, or a plain directory. Either way, let's first try to
 133          * unmount it, and afterwards remove the mount point */
 134         (void) umount2(p, MNT_DETACH|UMOUNT_NOFOLLOW);
 135         (void) rm_rf(p, REMOVE_ROOT|REMOVE_CHMOD);
 136
 137         return 0;
 138 }
 139
 140 static int write_credential(
 141                 int dfd,
 142                 const char *id,
 143                 const void *data,
 144                 size_t size,
 145                 uid_t uid,
 146                 gid_t gid,
 147                 bool ownership_ok) {
 148
 149         _cleanup_(unlink_and_freep) char *tmp = NULL;
 150         _cleanup_close_ int fd = -EBADF;
 151         int r;
 152
 153         r = tempfn_random_child("", "cred", &tmp);
 154         if (r < 0)
 155                 return r;
 156
 157         fd = openat(dfd, tmp, O_CREAT|O_RDWR|O_CLOEXEC|O_EXCL|O_NOFOLLOW|O_NOCTTY, 0600);
 158         if (fd < 0) {
 159                 tmp = mfree(tmp);
 160                 return -errno;
 161         }
 162
 163         r = loop_write(fd, data, size, /* do_poll = */ false);
 164         if (r < 0)
 165                 return r;
 166
 167         if (fchmod(fd, 0400) < 0) /* Take away "w" bit */
 168                 return -errno;
 169
 170         if (uid_is_valid(uid) && uid != getuid()) {
 171                 r = fd_add_uid_acl_permission(fd, uid, ACL_READ);
 172                 if (r < 0) {
 173                         if (!ERRNO_IS_NOT_SUPPORTED(r) && !ERRNO_IS_PRIVILEGE(r))
 174                                 return r;
 175
 176                         if (!ownership_ok) /* Ideally we use ACLs, since we can neatly express what we want
 177                                             * to express: that the user gets read access and nothing
 178                                             * else. But if the backing fs can't support that (e.g. ramfs)
 179                                             * then we can use file ownership instead. But that's only safe if
 180                                             * we can then re-mount the whole thing read-only, so that the
 181                                             * user can no longer chmod() the file to gain write access. */
 182                                 return r;
 183
 184                         if (fchown(fd, uid, gid) < 0)
 185                                 return -errno;
 186                 }
 187         }
 188
 189         if (renameat(dfd, tmp, dfd, id) < 0)
 190                 return -errno;
 191
 192         tmp = mfree(tmp);
 193         return 0;
 194 }
 195
 196 typedef enum CredentialSearchPath {
 197         CREDENTIAL_SEARCH_PATH_TRUSTED,
 198         CREDENTIAL_SEARCH_PATH_ENCRYPTED,
 199         CREDENTIAL_SEARCH_PATH_ALL,
 200         _CREDENTIAL_SEARCH_PATH_MAX,
 201         _CREDENTIAL_SEARCH_PATH_INVALID = -EINVAL,
 202 } CredentialSearchPath;
 203
 204 static char **credential_search_path(const ExecParameters *params, CredentialSearchPath path) {
 205
 206         _cleanup_strv_free_ char **l = NULL;
 207
 208         assert(params);
 209         assert(path >= 0 && path < _CREDENTIAL_SEARCH_PATH_MAX);
 210
 211         /* Assemble a search path to find credentials in. For non-encrypted credentials, We'll look in
 212          * /etc/credstore/ (and similar directories in /usr/lib/ + /run/). If we're looking for encrypted
 213          * credentials, we'll look in /etc/credstore.encrypted/ (and similar dirs). */
 214
 215         if (IN_SET(path, CREDENTIAL_SEARCH_PATH_ENCRYPTED, CREDENTIAL_SEARCH_PATH_ALL)) {
 216                 if (strv_extend(&l, params->received_encrypted_credentials_directory) < 0)
 217                         return NULL;
 218
 219                 if (strv_extend_strv(&l, CONF_PATHS_STRV("credstore.encrypted"), /* filter_duplicates= */ true) < 0)
 220                         return NULL;
 221         }
 222
 223         if (IN_SET(path, CREDENTIAL_SEARCH_PATH_TRUSTED, CREDENTIAL_SEARCH_PATH_ALL)) {
 224                 if (params->received_credentials_directory)
 225                         if (strv_extend(&l, params->received_credentials_directory) < 0)
 226                                 return NULL;
 227
 228                 if (strv_extend_strv(&l, CONF_PATHS_STRV("credstore"), /* filter_duplicates= */ true) < 0)
 229                         return NULL;
 230         }
 231
 232         if (DEBUG_LOGGING) {
 233                 _cleanup_free_ char *t = strv_join(l, ":");
 234
 235                 log_debug("Credential search path is: %s", strempty(t));
 236         }
 237
 238         return TAKE_PTR(l);
 239 }
 240
 241 static int maybe_decrypt_and_write_credential(
 242                 int dir_fd,
 243                 const char *id,
 244                 bool encrypted,
 245                 uid_t uid,
 246                 gid_t gid,
 247                 bool ownership_ok,
 248                 const char *data,
 249                 size_t size,
 250                 uint64_t *left) {
 251
 252         _cleanup_free_ void *plaintext = NULL;
 253         size_t add;
 254         int r;
 255
 256         if (encrypted) {
 257                 size_t plaintext_size = 0;
 258
 259                 r = decrypt_credential_and_warn(id, now(CLOCK_REALTIME), NULL, NULL, data, size,
 260                                                 &plaintext, &plaintext_size);
 261                 if (r < 0)
 262                         return r;
 263
 264                 data = plaintext;
 265                 size = plaintext_size;
 266         }
 267
 268         add = strlen(id) + size;
 269         if (add > *left)
 270                 return -E2BIG;
 271
 272         r = write_credential(dir_fd, id, data, size, uid, gid, ownership_ok);
 273         if (r < 0)
 274                 return log_debug_errno(r, "Failed to write credential '%s': %m", id);
 275
 276         *left -= add;
 277         return 0;
 278 }
 279
 280 static int load_credential_glob(
 281                 const char *path,
 282                 bool encrypted,
 283                 char **search_path,
 284                 ReadFullFileFlags flags,
 285                 int write_dfd,
 286                 uid_t uid,
 287                 gid_t gid,
 288                 bool ownership_ok,
 289                 uint64_t *left) {
 290
 291         int r;
 292
 293         STRV_FOREACH(d, search_path) {
 294                 _cleanup_globfree_ glob_t pglob = {};
 295                 _cleanup_free_ char *j = NULL;
 296
 297                 j = path_join(*d, path);
 298                 if (!j)
 299                         return -ENOMEM;
 300
 301                 r = safe_glob(j, 0, &pglob);
 302                 if (r == -ENOENT)
 303                         continue;
 304                 if (r < 0)
 305                         return r;
 306
 307                 for (size_t n = 0; n < pglob.gl_pathc; n++) {
 308                         _cleanup_free_ char *fn = NULL;
 309                         _cleanup_(erase_and_freep) char *data = NULL;
 310                         size_t size;
 311
 312                         /* path is absolute, hence pass AT_FDCWD as nop dir fd here */
 313                         r = read_full_file_full(
 314                                 AT_FDCWD,
 315                                 pglob.gl_pathv[n],
 316                                 UINT64_MAX,
 317                                 encrypted ? CREDENTIAL_ENCRYPTED_SIZE_MAX : CREDENTIAL_SIZE_MAX,
 318                                 flags,
 319                                 NULL,
 320                                 &data, &size);
 321                         if (r < 0)
 322                                 return log_debug_errno(r, "Failed to read credential '%s': %m",
 323                                                         pglob.gl_pathv[n]);
 324
 325                         r = path_extract_filename(pglob.gl_pathv[n], &fn);
 326                         if (r < 0)
 327                                 return log_debug_errno(r, "Failed to extract filename from '%s': %m",
 328                                                         pglob.gl_pathv[n]);
 329
 330                         r = maybe_decrypt_and_write_credential(
 331                                 write_dfd,
 332                                 fn,
 333                                 encrypted,
 334                                 uid,
 335                                 gid,
 336                                 ownership_ok,
 337                                 data, size,
 338                                 left);
 339                         if (r == -EEXIST)
 340                                 continue;
 341                         if (r < 0)
 342                                 return r;
 343                 }
 344         }
 345
 346         return 0;
 347 }
 348
 349 static int load_credential(
 350                 const ExecContext *context,
 351                 const ExecParameters *params,
 352                 const char *id,
 353                 const char *path,
 354                 bool encrypted,
 355                 const char *unit,
 356                 int read_dfd,
 357                 int write_dfd,
 358                 uid_t uid,
 359                 gid_t gid,
 360                 bool ownership_ok,
 361                 uint64_t *left) {
 362
 363         ReadFullFileFlags flags = READ_FULL_FILE_SECURE|READ_FULL_FILE_FAIL_WHEN_LARGER;
 364         _cleanup_strv_free_ char **search_path = NULL;
 365         _cleanup_(erase_and_freep) char *data = NULL;
 366         _cleanup_free_ char *bindname = NULL;
 367         const char *source = NULL;
 368         bool missing_ok = true;
 369         size_t size, maxsz;
 370         int r;
 371
 372         assert(context);
 373         assert(params);
 374         assert(id);
 375         assert(path);
 376         assert(unit);
 377         assert(read_dfd >= 0 || read_dfd == AT_FDCWD);
 378         assert(write_dfd >= 0);
 379         assert(left);
 380
 381         if (read_dfd >= 0) {
 382                 /* If a directory fd is specified, then read the file directly from that dir. In this case we
 383                  * won't do AF_UNIX stuff (we simply don't want to recursively iterate down a tree of AF_UNIX
 384                  * IPC sockets). It's OK if a file vanishes here in the time we enumerate it and intend to
 385                  * open it. */
 386
 387                 if (!filename_is_valid(path)) /* safety check */
 388                         return -EINVAL;
 389
 390                 missing_ok = true;
 391                 source = path;
 392
 393         } else if (path_is_absolute(path)) {
 394                 /* If this is an absolute path, read the data directly from it, and support AF_UNIX
 395                  * sockets */
 396
 397                 if (!path_is_valid(path)) /* safety check */
 398                         return -EINVAL;
 399
 400                 flags |= READ_FULL_FILE_CONNECT_SOCKET;
 401
 402                 /* Pass some minimal info about the unit and the credential name we are looking to acquire
 403                  * via the source socket address in case we read off an AF_UNIX socket. */
 404                 if (asprintf(&bindname, "@%" PRIx64"/unit/%s/%s", random_u64(), unit, id) < 0)
 405                         return -ENOMEM;
 406
 407                 missing_ok = false;
 408                 source = path;
 409
 410         } else if (credential_name_valid(path)) {
 411                 /* If this is a relative path, take it as credential name relative to the credentials
 412                  * directory we received ourselves. We don't support the AF_UNIX stuff in this mode, since we
 413                  * are operating on a credential store, i.e. this is guaranteed to be regular files. */
 414
 415                 search_path = credential_search_path(params, CREDENTIAL_SEARCH_PATH_ALL);
 416                 if (!search_path)
 417                         return -ENOMEM;
 418
 419                 missing_ok = true;
 420         } else
 421                 source = NULL;
 422
 423         if (encrypted)
 424                 flags |= READ_FULL_FILE_UNBASE64;
 425
 426         maxsz = encrypted ? CREDENTIAL_ENCRYPTED_SIZE_MAX : CREDENTIAL_SIZE_MAX;
 427
 428         if (search_path) {
 429                 STRV_FOREACH(d, search_path) {
 430                         _cleanup_free_ char *j = NULL;
 431
 432                         j = path_join(*d, path);
 433                         if (!j)
 434                                 return -ENOMEM;
 435
 436                         r = read_full_file_full(
 437                                         AT_FDCWD, j, /* path is absolute, hence pass AT_FDCWD as nop dir fd here */
 438                                         UINT64_MAX,
 439                                         maxsz,
 440                                         flags,
 441                                         NULL,
 442                                         &data, &size);
 443                         if (r != -ENOENT)
 444                                 break;
 445                 }
 446         } else if (source)
 447                 r = read_full_file_full(
 448                                 read_dfd, source,
 449                                 UINT64_MAX,
 450                                 maxsz,
 451                                 flags,
 452                                 bindname,
 453                                 &data, &size);
 454         else
 455                 r = -ENOENT;
 456
 457         if (r == -ENOENT && (missing_ok || hashmap_contains(context->set_credentials, id))) {
 458                 /* Make a missing inherited credential non-fatal, let's just continue. After all apps
 459                  * will get clear errors if we don't pass such a missing credential on as they
 460                  * themselves will get ENOENT when trying to read them, which should not be much
 461                  * worse than when we handle the error here and make it fatal.
 462                  *
 463                  * Also, if the source file doesn't exist, but a fallback is set via SetCredentials=
 464                  * we are fine, too. */
 465                 log_debug_errno(r, "Couldn't read inherited credential '%s', skipping: %m", path);
 466                 return 0;
 467         }
 468         if (r < 0)
 469                 return log_debug_errno(r, "Failed to read credential '%s': %m", path);
 470
 471         return maybe_decrypt_and_write_credential(write_dfd, id, encrypted, uid, gid, ownership_ok, data, size, left);
 472 }
 473
 474 struct load_cred_args {
 475         const ExecContext *context;
 476         const ExecParameters *params;
 477         bool encrypted;
 478         const char *unit;
 479         int dfd;
 480         uid_t uid;
 481         gid_t gid;
 482         bool ownership_ok;
 483         uint64_t *left;
 484 };
 485
 486 static int load_cred_recurse_dir_cb(
 487                 RecurseDirEvent event,
 488                 const char *path,
 489                 int dir_fd,
 490                 int inode_fd,
 491                 const struct dirent *de,
 492                 const struct statx *sx,
 493                 void *userdata) {
 494
 495         struct load_cred_args *args = ASSERT_PTR(userdata);
 496         _cleanup_free_ char *sub_id = NULL;
 497         int r;
 498
 499         if (event != RECURSE_DIR_ENTRY)
 500                 return RECURSE_DIR_CONTINUE;
 501
 502         if (!IN_SET(de->d_type, DT_REG, DT_SOCK))
 503                 return RECURSE_DIR_CONTINUE;
 504
 505         sub_id = strreplace(path, "/", "_");
 506         if (!sub_id)
 507                 return -ENOMEM;
 508
 509         if (!credential_name_valid(sub_id))
 510                 return log_debug_errno(SYNTHETIC_ERRNO(EINVAL), "Credential would get ID %s, which is not valid, refusing", sub_id);
 511
 512         if (faccessat(args->dfd, sub_id, F_OK, AT_SYMLINK_NOFOLLOW) >= 0) {
 513                 log_debug("Skipping credential with duplicated ID %s at %s", sub_id, path);
 514                 return RECURSE_DIR_CONTINUE;
 515         }
 516         if (errno != ENOENT)
 517                 return log_debug_errno(errno, "Failed to test if credential %s exists: %m", sub_id);
 518
 519         r = load_credential(
 520                         args->context,
 521                         args->params,
 522                         sub_id,
 523                         de->d_name,
 524                         args->encrypted,
 525                         args->unit,
 526                         dir_fd,
 527                         args->dfd,
 528                         args->uid,
 529                         args->gid,
 530                         args->ownership_ok,
 531                         args->left);
 532         if (r < 0)
 533                 return r;
 534
 535         return RECURSE_DIR_CONTINUE;
 536 }
 537
 538 static int acquire_credentials(
 539                 const ExecContext *context,
 540                 const ExecParameters *params,
 541                 const char *unit,
 542                 const char *p,
 543                 uid_t uid,
 544                 gid_t gid,
 545                 bool ownership_ok) {
 546
 547         uint64_t left = CREDENTIALS_TOTAL_SIZE_MAX;
 548         _cleanup_close_ int dfd = -EBADF;
 549         const char *ic;
 550         ExecLoadCredential *lc;
 551         ExecSetCredential *sc;
 552         int r;
 553
 554         assert(context);
 555         assert(p);
 556
 557         dfd = open(p, O_DIRECTORY|O_CLOEXEC);
 558         if (dfd < 0)
 559                 return -errno;
 560
 561         r = fd_acl_make_writable(dfd); /* Add the "w" bit, if we are reusing an already set up credentials dir where it was unset */
 562         if (r < 0)
 563                 return r;
 564
 565         /* First, load credentials off disk (or acquire via AF_UNIX socket) */
 566         HASHMAP_FOREACH(lc, context->load_credentials) {
 567                 _cleanup_close_ int sub_fd = -EBADF;
 568
 569                 /* If this is an absolute path, then try to open it as a directory. If that works, then we'll
 570                  * recurse into it. If it is an absolute path but it isn't a directory, then we'll open it as
 571                  * a regular file. Finally, if it's a relative path we will use it as a credential name to
 572                  * propagate a credential passed to us from further up. */
 573
 574                 if (path_is_absolute(lc->path)) {
 575                         sub_fd = open(lc->path, O_DIRECTORY|O_CLOEXEC|O_RDONLY);
 576                         if (sub_fd < 0 && !IN_SET(errno,
 577                                                   ENOTDIR,  /* Not a directory */
 578                                                   ENOENT))  /* Doesn't exist? */
 579                                 return log_debug_errno(errno, "Failed to open '%s': %m", lc->path);
 580                 }
 581
 582                 if (sub_fd < 0)
 583                         /* Regular file (incl. a credential passed in from higher up) */
 584                         r = load_credential(
 585                                         context,
 586                                         params,
 587                                         lc->id,
 588                                         lc->path,
 589                                         lc->encrypted,
 590                                         unit,
 591                                         AT_FDCWD,
 592                                         dfd,
 593                                         uid,
 594                                         gid,
 595                                         ownership_ok,
 596                                         &left);
 597                 else
 598                         /* Directory */
 599                         r = recurse_dir(
 600                                         sub_fd,
 601                                         /* path= */ lc->id, /* recurse_dir() will suffix the subdir paths from here to the top-level id */
 602                                         /* statx_mask= */ 0,
 603                                         /* n_depth_max= */ UINT_MAX,
 604                                         RECURSE_DIR_SORT|RECURSE_DIR_IGNORE_DOT|RECURSE_DIR_ENSURE_TYPE,
 605                                         load_cred_recurse_dir_cb,
 606                                         &(struct load_cred_args) {
 607                                                 .context = context,
 608                                                 .params = params,
 609                                                 .encrypted = lc->encrypted,
 610                                                 .unit = unit,
 611                                                 .dfd = dfd,
 612                                                 .uid = uid,
 613                                                 .gid = gid,
 614                                                 .ownership_ok = ownership_ok,
 615                                                 .left = &left,
 616                                         });
 617                 if (r < 0)
 618                         return r;
 619         }
 620
 621         /* Next, look for system credentials and credentials in the credentials store. Note that these do not
 622          * override any credentials found earlier. */
 623         SET_FOREACH(ic, context->import_credentials) {
 624                 _cleanup_free_ char **search_path = NULL;
 625
 626                 search_path = credential_search_path(params, CREDENTIAL_SEARCH_PATH_TRUSTED);
 627                 if (!search_path)
 628                         return -ENOMEM;
 629
 630                 r = load_credential_glob(
 631                                 ic,
 632                                 /* encrypted = */ false,
 633                                 search_path,
 634                                 READ_FULL_FILE_SECURE|READ_FULL_FILE_FAIL_WHEN_LARGER,
 635                                 dfd,
 636                                 uid,
 637                                 gid,
 638                                 ownership_ok,
 639                                 &left);
 640                 if (r < 0)
 641                         return r;
 642
 643                 search_path = strv_free(search_path);
 644                 search_path = credential_search_path(params, CREDENTIAL_SEARCH_PATH_ENCRYPTED);
 645                 if (!search_path)
 646                         return -ENOMEM;
 647
 648                 r = load_credential_glob(
 649                                 ic,
 650                                 /* encrypted = */ true,
 651                                 search_path,
 652                                 READ_FULL_FILE_SECURE|READ_FULL_FILE_FAIL_WHEN_LARGER|READ_FULL_FILE_UNBASE64,
 653                                 dfd,
 654                                 uid,
 655                                 gid,
 656                                 ownership_ok,
 657                                 &left);
 658                 if (r < 0)
 659                         return r;
 660         }
 661
 662         /* Finally, we add in literally specified credentials. If the credentials already exist, we'll not
 663          * add them, so that they can act as a "default" if the same credential is specified multiple times. */
 664         HASHMAP_FOREACH(sc, context->set_credentials) {
 665                 _cleanup_(erase_and_freep) void *plaintext = NULL;
 666                 const char *data;
 667                 size_t size, add;
 668
 669                 /* Note that we check ahead of time here instead of relying on O_EXCL|O_CREAT later to return
 670                  * EEXIST if the credential already exists. That's because the TPM2-based decryption is kinda
 671                  * slow and involved, hence it's nice to be able to skip that if the credential already
 672                  * exists anyway. */
 673                 if (faccessat(dfd, sc->id, F_OK, AT_SYMLINK_NOFOLLOW) >= 0)
 674                         continue;
 675                 if (errno != ENOENT)
 676                         return log_debug_errno(errno, "Failed to test if credential %s exists: %m", sc->id);
 677
 678                 if (sc->encrypted) {
 679                         r = decrypt_credential_and_warn(sc->id, now(CLOCK_REALTIME), NULL, NULL, sc->data, sc->size, &plaintext, &size);
 680                         if (r < 0)
 681                                 return r;
 682
 683                         data = plaintext;
 684                 } else {
 685                         data = sc->data;
 686                         size = sc->size;
 687                 }
 688
 689                 add = strlen(sc->id) + size;
 690                 if (add > left)
 691                         return -E2BIG;
 692
 693                 r = write_credential(dfd, sc->id, data, size, uid, gid, ownership_ok);
 694                 if (r < 0)
 695                         return r;
 696
 697                 left -= add;
 698         }
 699
 700         r = fd_acl_make_read_only(dfd); /* Now take away the "w" bit */
 701         if (r < 0)
 702                 return r;
 703
 704         /* After we created all keys with the right perms, also make sure the credential store as a whole is
 705          * accessible */
 706
 707         if (uid_is_valid(uid) && uid != getuid()) {
 708                 r = fd_add_uid_acl_permission(dfd, uid, ACL_READ | ACL_EXECUTE);
 709                 if (r < 0) {
 710                         if (!ERRNO_IS_NOT_SUPPORTED(r) && !ERRNO_IS_PRIVILEGE(r))
 711                                 return r;
 712
 713                         if (!ownership_ok)
 714                                 return r;
 715
 716                         if (fchown(dfd, uid, gid) < 0)
 717                                 return -errno;
 718                 }
 719         }
 720
 721         return 0;
 722 }
 723
 724 static int setup_credentials_internal(
 725                 const ExecContext *context,
 726                 const ExecParameters *params,
 727                 const char *unit,
 728                 const char *final,        /* This is where the credential store shall eventually end up at */
 729                 const char *workspace,    /* This is where we can prepare it before moving it to the final place */
 730                 bool reuse_workspace,     /* Whether to reuse any existing workspace mount if it already is a mount */
 731                 bool must_mount,          /* Whether to require that we mount something, it's not OK to use the plain directory fall back */
 732                 uid_t uid,
 733                 gid_t gid) {
 734
 735         int r, workspace_mounted; /* negative if we don't know yet whether we have/can mount something; true
 736                                    * if we mounted something; false if we definitely can't mount anything */
 737         bool final_mounted;
 738         const char *where;
 739
 740         assert(context);
 741         assert(final);
 742         assert(workspace);
 743
 744         if (reuse_workspace) {
 745                 r = path_is_mount_point(workspace, NULL, 0);
 746                 if (r < 0)
 747                         return r;
 748                 if (r > 0)
 749                         workspace_mounted = true; /* If this is already a mount, and we are supposed to reuse
 750                                                    * it, let's keep this in mind */
 751                 else
 752                         workspace_mounted = -1; /* We need to figure out if we can mount something to the workspace */
 753         } else
 754                 workspace_mounted = -1; /* ditto */
 755
 756         r = path_is_mount_point(final, NULL, 0);
 757         if (r < 0)
 758                 return r;
 759         if (r > 0) {
 760                 /* If the final place already has something mounted, we use that. If the workspace also has
 761                  * something mounted we assume it's actually the same mount (but with MS_RDONLY
 762                  * different). */
 763                 final_mounted = true;
 764
 765                 if (workspace_mounted < 0) {
 766                         /* If the final place is mounted, but the workspace isn't, then let's bind mount
 767                          * the final version to the workspace, and make it writable, so that we can make
 768                          * changes */
 769
 770                         r = mount_nofollow_verbose(LOG_DEBUG, final, workspace, NULL, MS_BIND|MS_REC, NULL);
 771                         if (r < 0)
 772                                 return r;
 773
 774                         r = mount_nofollow_verbose(LOG_DEBUG, NULL, workspace, NULL, MS_BIND|MS_REMOUNT|credentials_fs_mount_flags(/* ro= */ false), NULL);
 775                         if (r < 0)
 776                                 return r;
 777
 778                         workspace_mounted = true;
 779                 }
 780         } else
 781                 final_mounted = false;
 782
 783         if (workspace_mounted < 0) {
 784                 /* Nothing is mounted on the workspace yet, let's try to mount something now */
 785
 786                 r = mount_credentials_fs(workspace, CREDENTIALS_TOTAL_SIZE_MAX, /* ro= */ false);
 787                 if (r < 0) {
 788                         /* If that didn't work, try to make a bind mount from the final to the workspace, so
 789                          * that we can make it writable there. */
 790                         r = mount_nofollow_verbose(LOG_DEBUG, final, workspace, NULL, MS_BIND|MS_REC, NULL);
 791                         if (r < 0) {
 792                                 if (!ERRNO_IS_PRIVILEGE(r))
 793                                         /* Propagate anything that isn't a permission problem. */
 794                                         return r;
 795
 796                                 if (must_mount)
 797                                         /* If it's not OK to use the plain directory fallback, propagate all
 798                                          * errors too. */
 799                                         return r;
 800
 801                                 /* If we lack privileges to bind mount stuff, then let's gracefully proceed
 802                                  * for compat with container envs, and just use the final dir as is. */
 803
 804                                 workspace_mounted = false;
 805                         } else {
 806                                 /* Make the new bind mount writable (i.e. drop MS_RDONLY) */
 807                                 r = mount_nofollow_verbose(LOG_DEBUG, NULL, workspace, NULL, MS_BIND|MS_REMOUNT|credentials_fs_mount_flags(/* ro= */ false), NULL);
 808                                 if (r < 0)
 809                                         return r;
 810
 811                                 workspace_mounted = true;
 812                         }
 813                 } else
 814                         workspace_mounted = true;
 815         }
 816
 817         assert(!must_mount || workspace_mounted > 0);
 818         where = workspace_mounted ? workspace : final;
 819
 820         (void) label_fix_full(AT_FDCWD, where, final, 0);
 821
 822         r = acquire_credentials(context, params, unit, where, uid, gid, workspace_mounted);
 823         if (r < 0)
 824                 return r;
 825
 826         if (workspace_mounted) {
 827                 bool install;
 828
 829                 /* Determine if we should actually install the prepared mount in the final location by bind
 830                  * mounting it there. We do so only if the mount is not established there already, and if the
 831                  * mount is actually non-empty (i.e. carries at least one credential). Not that in the best
 832                  * case we are doing all this in a mount namespace, thus no one else will see that we
 833                  * allocated a file system we are getting rid of again here. */
 834                 if (final_mounted)
 835                         install = false; /* already installed */
 836                 else {
 837                         r = dir_is_empty(where, /* ignore_hidden_or_backup= */ false);
 838                         if (r < 0)
 839                                 return r;
 840
 841                         install = r == 0; /* install only if non-empty */
 842                 }
 843
 844                 if (install) {
 845                         /* Make workspace read-only now, so that any bind mount we make from it defaults to
 846                          * read-only too */
 847                         r = mount_nofollow_verbose(LOG_DEBUG, NULL, workspace, NULL, MS_BIND|MS_REMOUNT|credentials_fs_mount_flags(/* ro= */ true), NULL);
 848                         if (r < 0)
 849                                 return r;
 850
 851                         /* And mount it to the final place, read-only */
 852                         r = mount_nofollow_verbose(LOG_DEBUG, workspace, final, NULL, MS_MOVE, NULL);
 853                 } else
 854                         /* Otherwise get rid of it */
 855                         r = umount_verbose(LOG_DEBUG, workspace, MNT_DETACH|UMOUNT_NOFOLLOW);
 856                 if (r < 0)
 857                         return r;
 858         } else {
 859                 _cleanup_free_ char *parent = NULL;
 860
 861                 /* If we do not have our own mount put used the plain directory fallback, then we need to
 862                  * open access to the top-level credential directory and the per-service directory now */
 863
 864                 r = path_extract_directory(final, &parent);
 865                 if (r < 0)
 866                         return r;
 867                 if (chmod(parent, 0755) < 0)
 868                         return -errno;
 869         }
 870
 871         return 0;
 872 }
 873
 874 int setup_credentials(
 875                 const ExecContext *context,
 876                 const ExecParameters *params,
 877                 const char *unit,
 878                 uid_t uid,
 879                 gid_t gid) {
 880
 881         _cleanup_free_ char *p = NULL, *q = NULL;
 882         int r;
 883
 884         assert(context);
 885         assert(params);
 886
 887         if (!exec_context_has_credentials(context))
 888                 return 0;
 889
 890         if (!params->prefix[EXEC_DIRECTORY_RUNTIME])
 891                 return -EINVAL;
 892
 893         /* This where we'll place stuff when we are done; this main credentials directory is world-readable,
 894          * and the subdir we mount over with a read-only file system readable by the service's user */
 895         q = path_join(params->prefix[EXEC_DIRECTORY_RUNTIME], "credentials");
 896         if (!q)
 897                 return -ENOMEM;
 898
 899         r = mkdir_label(q, 0755); /* top-level dir: world readable/searchable */
 900         if (r < 0 && r != -EEXIST)
 901                 return r;
 902
 903         p = path_join(q, unit);
 904         if (!p)
 905                 return -ENOMEM;
 906
 907         r = mkdir_label(p, 0700); /* per-unit dir: private to user */
 908         if (r < 0 && r != -EEXIST)
 909                 return r;
 910
 911         r = safe_fork("(sd-mkdcreds)", FORK_DEATHSIG|FORK_WAIT|FORK_NEW_MOUNTNS, NULL);
 912         if (r < 0) {
 913                 _cleanup_free_ char *t = NULL, *u = NULL;
 914
 915                 /* If this is not a privilege or support issue then propagate the error */
 916                 if (!ERRNO_IS_NOT_SUPPORTED(r) && !ERRNO_IS_PRIVILEGE(r))
 917                         return r;
 918
 919                 /* Temporary workspace, that remains inaccessible all the time. We prepare stuff there before moving
 920                  * it into place, so that users can't access half-initialized credential stores. */
 921                 t = path_join(params->prefix[EXEC_DIRECTORY_RUNTIME], "systemd/temporary-credentials");
 922                 if (!t)
 923                         return -ENOMEM;
 924
 925                 /* We can't set up a mount namespace. In that case operate on a fixed, inaccessible per-unit
 926                  * directory outside of /run/credentials/ first, and then move it over to /run/credentials/
 927                  * after it is fully set up */
 928                 u = path_join(t, unit);
 929                 if (!u)
 930                         return -ENOMEM;
 931
 932                 FOREACH_STRING(i, t, u) {
 933                         r = mkdir_label(i, 0700);
 934                         if (r < 0 && r != -EEXIST)
 935                                 return r;
 936                 }
 937
 938                 r = setup_credentials_internal(
 939                                 context,
 940                                 params,
 941                                 unit,
 942                                 p,       /* final mount point */
 943                                 u,       /* temporary workspace to overmount */
 944                                 true,    /* reuse the workspace if it is already a mount */
 945                                 false,   /* it's OK to fall back to a plain directory if we can't mount anything */
 946                                 uid,
 947                                 gid);
 948
 949                 (void) rmdir(u); /* remove the workspace again if we can. */
 950
 951                 if (r < 0)
 952                         return r;
 953
 954         } else if (r == 0) {
 955
 956                 /* We managed to set up a mount namespace, and are now in a child. That's great. In this case
 957                  * we can use the same directory for all cases, after turning off propagation. Question
 958                  * though is: where do we turn off propagation exactly, and where do we place the workspace
 959                  * directory? We need some place that is guaranteed to be a mount point in the host, and
 960                  * which is guaranteed to have a subdir we can mount over. /run/ is not suitable for this,
 961                  * since we ultimately want to move the resulting file system there, i.e. we need propagation
 962                  * for /run/ eventually. We could use our own /run/systemd/bind mount on itself, but that
 963                  * would be visible in the host mount table all the time, which we want to avoid. Hence, what
 964                  * we do here instead we use /dev/ and /dev/shm/ for our purposes. We know for sure that
 965                  * /dev/ is a mount point and we now for sure that /dev/shm/ exists. Hence we can turn off
 966                  * propagation on the former, and then overmount the latter.
 967                  *
 968                  * Yes it's nasty playing games with /dev/ and /dev/shm/ like this, since it does not exist
 969                  * for this purpose, but there are few other candidates that work equally well for us, and
 970                  * given that we do this in a privately namespaced short-lived single-threaded process that
 971                  * no one else sees this should be OK to do. */
 972
 973                 /* Turn off propagation from our namespace to host */
 974                 r = mount_nofollow_verbose(LOG_DEBUG, NULL, "/dev", NULL, MS_SLAVE|MS_REC, NULL);
 975                 if (r < 0)
 976                         goto child_fail;
 977
 978                 r = setup_credentials_internal(
 979                                 context,
 980                                 params,
 981                                 unit,
 982                                 p,           /* final mount point */
 983                                 "/dev/shm",  /* temporary workspace to overmount */
 984                                 false,       /* do not reuse /dev/shm if it is already a mount, under no circumstances */
 985                                 true,        /* insist that something is mounted, do not allow fallback to plain directory */
 986                                 uid,
 987                                 gid);
 988                 if (r < 0)
 989                         goto child_fail;
 990
 991                 _exit(EXIT_SUCCESS);
 992
 993         child_fail:
 994                 _exit(EXIT_FAILURE);
 995         }
 996
 997         /* If the credentials dir is empty and not a mount point, then there's no point in having it. Let's
 998          * try to remove it. This matters in particular if we created the dir as mount point but then didn't
 999          * actually end up mounting anything on it. In that case we'd rather have ENOENT than EACCESS being
1000          * seen by users when trying access this inode. */
1001         (void) rmdir(p);
1002         return 0;
1003 }