/* SPDX-License-Identifier: LGPL-2.1+ */
/***
  This file is part of systemd.

  Copyright 2016 Lennart Poettering

  systemd is free software; you can redistribute it and/or modify it
  under the terms of the GNU Lesser General Public License as published by
  the Free Software Foundation; either version 2.1 of the License, or
  (at your option) any later version.

  systemd is distributed in the hope that it will be useful, but
  WITHOUT ANY WARRANTY; without even the implied warranty of
  MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  Lesser General Public License for more details.

  You should have received a copy of the GNU Lesser General Public License
  along with systemd; If not, see <http://www.gnu.org/licenses/>.
***/

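/* Implements the recursive UID/GID shifting ("patching") of an OS tree that nspawn performs so that a
 * container image can be used with a user namespace UID range; see the comment in fd_patch_uid_internal()
 * below for the UID layout this assumes. */
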
#include <fcntl.h>
#include <linux/magic.h>
#if HAVE_ACL
#include <sys/acl.h>
#endif
#include <sys/stat.h>
#include <sys/statvfs.h>
#include <sys/vfs.h>
#include <unistd.h>

#include "acl-util.h"
#include "dirent-util.h"
#include "fd-util.h"
#include "fs-util.h"
#include "missing.h"
#include "nspawn-def.h"
#include "nspawn-patch-uid.h"
#include "stat-util.h"
#include "stdio-util.h"
#include "string-util.h"
#include "strv.h"
#include "user-util.h"

#if HAVE_ACL

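/* Reads the ACL of the specified type, either of fd itself or of the entry "name" below it. Children are
 * pinned with an O_PATH fd and addressed via /proc/self/fd/, since libacl offers no *at() variants; the
 * default ACL of fd itself also has to go through /proc/self/fd/, as acl_get_fd() only ever returns the
 * access ACL. */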
static int get_acl(int fd, const char *name, acl_type_t type, acl_t *ret) {
        char procfs_path[STRLEN("/proc/self/fd/") + DECIMAL_STR_MAX(int) + 1];
        acl_t acl;

        assert(fd >= 0);
        assert(ret);

        if (name) {
                _cleanup_close_ int child_fd = -1;

                child_fd = openat(fd, name, O_PATH|O_CLOEXEC|O_NOFOLLOW);
                if (child_fd < 0)
                        return -errno;

                xsprintf(procfs_path, "/proc/self/fd/%i", child_fd);
                acl = acl_get_file(procfs_path, type);
        } else if (type == ACL_TYPE_ACCESS)
                acl = acl_get_fd(fd);
        else {
                xsprintf(procfs_path, "/proc/self/fd/%i", fd);
                acl = acl_get_file(procfs_path, type);
        }
        if (!acl)
                return -errno;

        *ret = acl;
        return 0;
}

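/* Counterpart to get_acl(): writes an ACL back to fd or to the entry "name" below it, again going through
 * /proc/self/fd/ wherever acl_set_fd() (which only handles the access ACL) cannot be used. */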
static int set_acl(int fd, const char *name, acl_type_t type, acl_t acl) {
        char procfs_path[STRLEN("/proc/self/fd/") + DECIMAL_STR_MAX(int) + 1];
        int r;

        assert(fd >= 0);
        assert(acl);

        if (name) {
                _cleanup_close_ int child_fd = -1;

                child_fd = openat(fd, name, O_PATH|O_CLOEXEC|O_NOFOLLOW);
                if (child_fd < 0)
                        return -errno;

                xsprintf(procfs_path, "/proc/self/fd/%i", child_fd);
                r = acl_set_file(procfs_path, type, acl);
        } else if (type == ACL_TYPE_ACCESS)
                r = acl_set_fd(fd, acl);
        else {
                xsprintf(procfs_path, "/proc/self/fd/%i", fd);
                r = acl_set_file(procfs_path, type, acl);
        }
        if (r < 0)
                return -errno;

        return 0;
}

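/* Remaps the qualifier of every ACL_USER/ACL_GROUP entry into the UID range starting at "shift": the upper
 * 16 bits are replaced, the lower 16 bits (the UID/GID within the container) are kept. A modified copy is
 * only allocated if at least one entry actually needs changing; returns > 0 with the copy in *ret in that
 * case, otherwise 0 with *ret set to NULL, meaning the original ACL can be used as-is. */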
static int shift_acl(acl_t acl, uid_t shift, acl_t *ret) {
        _cleanup_(acl_freep) acl_t copy = NULL;
        acl_entry_t i;
        int r;

        assert(acl);
        assert(ret);

        r = acl_get_entry(acl, ACL_FIRST_ENTRY, &i);
        if (r < 0)
                return -errno;
        while (r > 0) {
                uid_t *old_uid, new_uid;
                bool modify = false;
                acl_tag_t tag;

                if (acl_get_tag_type(i, &tag) < 0)
                        return -errno;

                if (IN_SET(tag, ACL_USER, ACL_GROUP)) {

                        /* We don't distinguish here between uid_t and gid_t; let's make sure the compiler checks that
                         * this is actually OK */
                        assert_cc(sizeof(uid_t) == sizeof(gid_t));

                        old_uid = acl_get_qualifier(i);
                        if (!old_uid)
                                return -errno;

                        new_uid = shift | (*old_uid & UINT32_C(0xFFFF));
                        if (!uid_is_valid(new_uid))
                                return -EINVAL;

                        modify = new_uid != *old_uid;
                        if (modify && !copy) {
                                int n;

                                /* There's no copy of the ACL yet? If so, let's create one, and start the loop from the
                                 * beginning, so that this time we copy all entries, starting from the first. */

                                n = acl_entries(acl);
                                if (n < 0)
                                        return -errno;

                                copy = acl_init(n);
                                if (!copy)
                                        return -errno;

                                /* Seek back to the beginning */
                                r = acl_get_entry(acl, ACL_FIRST_ENTRY, &i);
                                if (r < 0)
                                        return -errno;
                                continue;
                        }
                }

                if (copy) {
                        acl_entry_t new_entry;

                        if (acl_create_entry(&copy, &new_entry) < 0)
                                return -errno;

                        if (acl_copy_entry(new_entry, i) < 0)
                                return -errno;

                        if (modify)
                                if (acl_set_qualifier(new_entry, &new_uid) < 0)
                                        return -errno;
                }

                r = acl_get_entry(acl, ACL_NEXT_ENTRY, &i);
                if (r < 0)
                        return -errno;
        }

        *ret = TAKE_PTR(copy);

        return !!*ret;
}

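/* Applies shift_acl() to the access ACL of the given file, and for directories also to the default ACL.
 * Returns > 0 if either ACL was updated. (When ACL support is compiled out, the stub below makes this a
 * no-op.) */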
static int patch_acls(int fd, const char *name, const struct stat *st, uid_t shift) {
        _cleanup_(acl_freep) acl_t acl = NULL, shifted = NULL;
        bool changed = false;
        int r;

        assert(fd >= 0);
        assert(st);

        /* ACLs are not supported on symlinks; there's no point in trying */
        if (S_ISLNK(st->st_mode))
                return 0;

        r = get_acl(fd, name, ACL_TYPE_ACCESS, &acl);
        if (r == -EOPNOTSUPP)
                return 0;
        if (r < 0)
                return r;

        r = shift_acl(acl, shift, &shifted);
        if (r < 0)
                return r;
        if (r > 0) {
                r = set_acl(fd, name, ACL_TYPE_ACCESS, shifted);
                if (r < 0)
                        return r;

                changed = true;
        }

        if (S_ISDIR(st->st_mode)) {
                acl_free(acl);
                acl_free(shifted);

                acl = shifted = NULL;

                r = get_acl(fd, name, ACL_TYPE_DEFAULT, &acl);
                if (r < 0)
                        return r;

                r = shift_acl(acl, shift, &shifted);
                if (r < 0)
                        return r;
                if (r > 0) {
                        r = set_acl(fd, name, ACL_TYPE_DEFAULT, shifted);
                        if (r < 0)
                                return r;

                        changed = true;
                }
        }

        return changed;
}

#else

static int patch_acls(int fd, const char *name, const struct stat *st, uid_t shift) {
        return 0;
}

#endif

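/* Chowns a single file (fd itself or the entry "name" below it) into the target UID range, keeping the lower
 * 16 bits of the original owner/group, restores the original mode afterwards (the kernel alters the mode in
 * some cases of chown()), and shifts the file's ACLs along with it. Returns > 0 if anything was modified. */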
static int patch_fd(int fd, const char *name, const struct stat *st, uid_t shift) {
        uid_t new_uid;
        gid_t new_gid;
        bool changed = false;
        int r;

        assert(fd >= 0);
        assert(st);

        new_uid =         shift | (st->st_uid & UINT32_C(0xFFFF));
        new_gid = (gid_t) shift | (st->st_gid & UINT32_C(0xFFFF));

        if (!uid_is_valid(new_uid) || !gid_is_valid(new_gid))
                return -EINVAL;

        if (st->st_uid != new_uid || st->st_gid != new_gid) {
                if (name)
                        r = fchownat(fd, name, new_uid, new_gid, AT_SYMLINK_NOFOLLOW);
                else
                        r = fchown(fd, new_uid, new_gid);
                if (r < 0)
                        return -errno;

                /* The Linux kernel alters the mode in some cases of chown(). Let's undo this. */
                if (name) {
                        if (!S_ISLNK(st->st_mode))
                                r = fchmodat(fd, name, st->st_mode, 0);
                        else /* AT_SYMLINK_NOFOLLOW is not available for fchmodat() */
                                r = 0;
                } else
                        r = fchmod(fd, st->st_mode);
                if (r < 0)
                        return -errno;

                changed = true;
        }

        r = patch_acls(fd, name, st, shift);
        if (r < 0)
                return r;

        return r > 0 || changed;
}

/*
 * Check if the filesystem is fully compatible with user namespaces or
 * UID/GID patching. Some filesystems in this list can be fully mounted inside
 * user namespaces, however their inodes may relate to host resources or may
 * only be valid in the global user namespace; therefore no patching should be
 * applied.
 */
static int is_fs_fully_userns_compatible(const struct statfs *sfs) {

        assert(sfs);

        return F_TYPE_EQUAL(sfs->f_type, BINFMTFS_MAGIC) ||
               F_TYPE_EQUAL(sfs->f_type, CGROUP_SUPER_MAGIC) ||
               F_TYPE_EQUAL(sfs->f_type, CGROUP2_SUPER_MAGIC) ||
               F_TYPE_EQUAL(sfs->f_type, DEBUGFS_MAGIC) ||
               F_TYPE_EQUAL(sfs->f_type, DEVPTS_SUPER_MAGIC) ||
               F_TYPE_EQUAL(sfs->f_type, EFIVARFS_MAGIC) ||
               F_TYPE_EQUAL(sfs->f_type, HUGETLBFS_MAGIC) ||
               F_TYPE_EQUAL(sfs->f_type, MQUEUE_MAGIC) ||
               F_TYPE_EQUAL(sfs->f_type, PROC_SUPER_MAGIC) ||
               F_TYPE_EQUAL(sfs->f_type, PSTOREFS_MAGIC) ||
               F_TYPE_EQUAL(sfs->f_type, SELINUX_MAGIC) ||
               F_TYPE_EQUAL(sfs->f_type, SMACK_MAGIC) ||
               F_TYPE_EQUAL(sfs->f_type, SECURITYFS_MAGIC) ||
               F_TYPE_EQUAL(sfs->f_type, BPF_FS_MAGIC) ||
               F_TYPE_EQUAL(sfs->f_type, TRACEFS_MAGIC) ||
               F_TYPE_EQUAL(sfs->f_type, SYSFS_MAGIC);
}

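/* Recursively patches everything below fd, and then fd itself. If donate_fd is true, ownership of fd passes
 * to this function and it is closed before returning, on success and failure alike. Patching the top-level
 * directory last means its owner only ends up in the target range once the whole tree is done, which
 * fd_patch_uid_internal() below relies on as a quick "already converted" check. */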
static int recurse_fd(int fd, bool donate_fd, const struct stat *st, uid_t shift, bool is_toplevel) {
        _cleanup_closedir_ DIR *d = NULL;
        bool changed = false;
        struct statfs sfs;
        int r;

        assert(fd >= 0);

        if (fstatfs(fd, &sfs) < 0)
                return -errno;

        /* We generally want to permit crossing of mount boundaries when patching the UIDs/GIDs. However, we probably
         * shouldn't do this for /proc and /sys if that is already mounted into place. Hence, let's stop the recursion
         * when we hit procfs, sysfs or some other special file systems. */

        r = is_fs_fully_userns_compatible(&sfs);
        if (r < 0)
                goto finish;
        if (r > 0) {
                r = 0; /* don't recurse */
                goto finish;
        }

        /* Also, if we hit a read-only file system, then don't bother, skip the whole subtree */
        if ((sfs.f_flags & ST_RDONLY) ||
            access_fd(fd, W_OK) == -EROFS)
                goto read_only;

        if (S_ISDIR(st->st_mode)) {
                struct dirent *de;

                if (!donate_fd) {
                        int copy;

                        copy = fcntl(fd, F_DUPFD_CLOEXEC, 3);
                        if (copy < 0) {
                                r = -errno;
                                goto finish;
                        }

                        fd = copy;
                        donate_fd = true;
                }

                d = fdopendir(fd);
                if (!d) {
                        r = -errno;
                        goto finish;
                }
                fd = -1;

                FOREACH_DIRENT_ALL(de, d, r = -errno; goto finish) {
                        struct stat fst;

                        if (dot_or_dot_dot(de->d_name))
                                continue;

                        if (fstatat(dirfd(d), de->d_name, &fst, AT_SYMLINK_NOFOLLOW) < 0) {
                                r = -errno;
                                goto finish;
                        }

                        if (S_ISDIR(fst.st_mode)) {
                                int subdir_fd;

                                subdir_fd = openat(dirfd(d), de->d_name, O_RDONLY|O_NONBLOCK|O_DIRECTORY|O_CLOEXEC|O_NOFOLLOW|O_NOATIME);
                                if (subdir_fd < 0) {
                                        r = -errno;
                                        goto finish;
                                }

                                r = recurse_fd(subdir_fd, true, &fst, shift, false);
                                if (r < 0)
                                        goto finish;
                                if (r > 0)
                                        changed = true;

                        } else {
                                r = patch_fd(dirfd(d), de->d_name, &fst, shift);
                                if (r < 0)
                                        goto finish;
                                if (r > 0)
                                        changed = true;
                        }
                }
        }

        /* After we descended, also patch the directory itself. It's key to do this in this order, so that the
         * top-level directory is patched as the very last object in the tree, which lets us use it as a quick
         * indicator whether the tree is already properly chown()ed. */
        r = patch_fd(d ? dirfd(d) : fd, NULL, st, shift);
        if (r == -EROFS)
                goto read_only;
        if (r > 0)
                changed = true;

        r = changed;
        goto finish;

read_only:
        if (!is_toplevel) {
                _cleanup_free_ char *name = NULL;

                /* When we hit a read-only subtree we simply skip it, but log about it. */
                (void) fd_get_path(fd, &name);
                log_debug("Skipping read-only file or directory %s.", strna(name));
                r = changed;
        }

finish:
        if (donate_fd)
                safe_close(fd);

        return r;
}

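/* Common implementation behind fd_patch_uid() and path_patch_uid(). If donate_fd is true, the passed fd is
 * closed before returning. Returns > 0 if any file was changed, 0 if nothing needed to be (or could be)
 * changed, and a negative errno-style error otherwise. */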
static int fd_patch_uid_internal(int fd, bool donate_fd, uid_t shift, uid_t range) {
        struct stat st;
        int r;

        assert(fd >= 0);

        /* Recursively adjusts the UID/GIDs of all files of a directory tree. This is used to automatically fix up an
         * OS tree to the used user namespace UID range. Note that this automatic adjustment only works for UID ranges
         * following the concept that the upper 16 bits of a UID identify the container, and the lower 16 bits are the
         * actual UID within the container. */

        if ((shift & 0xFFFF) != 0) {
                /* We only support containers where the shift starts at a 2^16 boundary */
                r = -EOPNOTSUPP;
                goto finish;
        }

        if (shift == UID_BUSY_BASE) {
                r = -EINVAL;
                goto finish;
        }

        if (range != 0x10000) {
                /* We only support containers with 16bit UID ranges for the patching logic */
                r = -EOPNOTSUPP;
                goto finish;
        }

        if (fstat(fd, &st) < 0) {
                r = -errno;
                goto finish;
        }

        if ((uint32_t) st.st_uid >> 16 != (uint32_t) st.st_gid >> 16) {
                /* We only support containers where the uid/gid container IDs match */
                r = -EBADE;
                goto finish;
        }

        /* Try to detect if the range is already right. Of course, this is a pretty drastic optimization, as we assume
         * that if the top-level dir has the right upper 16 bits assigned, then everything below will have them too... */
        if (((uint32_t) (st.st_uid ^ shift) >> 16) == 0) {
                r = 0;
                goto finish;
        }

        /* Before we start recursively chowning, mark the top-level dir as "busy" by chowning it to the "busy"
         * range. Should we be interrupted in the middle of our work, we'll see it owned by this user and will start
         * chown()ing it again, unconditionally, as the busy UID is not a valid UID we'd ever pick for ourselves. */

        if ((st.st_uid & UID_BUSY_MASK) != UID_BUSY_BASE) {
                if (fchown(fd,
                           UID_BUSY_BASE | (st.st_uid & ~UID_BUSY_MASK),
                           (gid_t) UID_BUSY_BASE | (st.st_gid & ~(gid_t) UID_BUSY_MASK)) < 0) {
                        r = -errno;
                        goto finish;
                }
        }

        return recurse_fd(fd, donate_fd, &st, shift, true);

finish:
        if (donate_fd)
                safe_close(fd);

        return r;
}

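/* Public entry points: fd_patch_uid() leaves the caller's fd open, while path_patch_uid() opens the
 * directory itself and has the fd closed internally (donate_fd). As a purely illustrative example (the path
 * and UID values are hypothetical), a caller shifting an OS tree into the 64K UID block starting at 0x10000
 * might do:
 *
 *         r = path_patch_uid("/var/lib/machines/mycontainer", UINT32_C(0x10000), UINT32_C(0x10000));
 *         if (r < 0)
 *                 log_error_errno(r, "Failed to shift UIDs: %m");
 *         else if (r > 0)
 *                 log_info("UIDs of the tree were adjusted.");
 */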
int fd_patch_uid(int fd, uid_t shift, uid_t range) {
        return fd_patch_uid_internal(fd, false, shift, range);
}

int path_patch_uid(const char *path, uid_t shift, uid_t range) {
        int fd;

        fd = open(path, O_RDONLY|O_NONBLOCK|O_DIRECTORY|O_CLOEXEC|O_NOFOLLOW|O_NOATIME);
        if (fd < 0)
                return -errno;

        return fd_patch_uid_internal(fd, true, shift, range);
}