src/nspawn/nspawn-patch-uid.c

   1 /* SPDX-License-Identifier: LGPL-2.1+ */
   2 /***
   3   This file is part of systemd.
   4
   5   Copyright 2016 Lennart Poettering
   6
   7   systemd is free software; you can redistribute it and/or modify it
   8   under the terms of the GNU Lesser General Public License as published by
   9   the Free Software Foundation; either version 2.1 of the License, or
  10   (at your option) any later version.
  11
  12   systemd is distributed in the hope that it will be useful, but
  13   WITHOUT ANY WARRANTY; without even the implied warranty of
  14   MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
  15   Lesser General Public License for more details.
  16
  17   You should have received a copy of the GNU Lesser General Public License
  18   along with systemd; If not, see <http://www.gnu.org/licenses/>.
  19 ***/
  20
  21 #include <fcntl.h>
  22 #include <linux/magic.h>
  23 #if HAVE_ACL
  24 #include <sys/acl.h>
  25 #endif
  26 #include <sys/stat.h>
  27 #include <sys/statvfs.h>
  28 #include <sys/vfs.h>
  29 #include <unistd.h>
  30
  31 #include "acl-util.h"
  32 #include "dirent-util.h"
  33 #include "fd-util.h"
  34 #include "fs-util.h"
  35 #include "missing.h"
  36 #include "nspawn-def.h"
  37 #include "nspawn-patch-uid.h"
  38 #include "stat-util.h"
  39 #include "stdio-util.h"
  40 #include "string-util.h"
  41 #include "strv.h"
  42 #include "user-util.h"
  43
  44 #if HAVE_ACL
  45
  46 static int get_acl(int fd, const char *name, acl_type_t type, acl_t *ret) {
  47         char procfs_path[STRLEN("/proc/self/fd/") + DECIMAL_STR_MAX(int) + 1];
  48         acl_t acl;
  49
  50         assert(fd >= 0);
  51         assert(ret);
  52
  53         if (name) {
  54                 _cleanup_close_ int child_fd = -1;
  55
  56                 child_fd = openat(fd, name, O_PATH|O_CLOEXEC|O_NOFOLLOW);
  57                 if (child_fd < 0)
  58                         return -errno;
  59
  60                 xsprintf(procfs_path, "/proc/self/fd/%i", child_fd);
  61                 acl = acl_get_file(procfs_path, type);
  62         } else if (type == ACL_TYPE_ACCESS)
  63                 acl = acl_get_fd(fd);
  64         else {
  65                 xsprintf(procfs_path, "/proc/self/fd/%i", fd);
  66                 acl = acl_get_file(procfs_path, type);
  67         }
  68         if (!acl)
  69                 return -errno;
  70
  71         *ret = acl;
  72         return 0;
  73 }
  74
  75 static int set_acl(int fd, const char *name, acl_type_t type, acl_t acl) {
  76         char procfs_path[STRLEN("/proc/self/fd/") + DECIMAL_STR_MAX(int) + 1];
  77         int r;
  78
  79         assert(fd >= 0);
  80         assert(acl);
  81
  82         if (name) {
  83                 _cleanup_close_ int child_fd = -1;
  84
  85                 child_fd = openat(fd, name, O_PATH|O_CLOEXEC|O_NOFOLLOW);
  86                 if (child_fd < 0)
  87                         return -errno;
  88
  89                 xsprintf(procfs_path, "/proc/self/fd/%i", child_fd);
  90                 r = acl_set_file(procfs_path, type, acl);
  91         } else if (type == ACL_TYPE_ACCESS)
  92                 r = acl_set_fd(fd, acl);
  93         else {
  94                 xsprintf(procfs_path, "/proc/self/fd/%i", fd);
  95                 r = acl_set_file(procfs_path, type, acl);
  96         }
  97         if (r < 0)
  98                 return -errno;
  99
 100         return 0;
 101 }
 102
 103 static int shift_acl(acl_t acl, uid_t shift, acl_t *ret) {
 104         _cleanup_(acl_freep) acl_t copy = NULL;
 105         acl_entry_t i;
 106         int r;
 107
 108         assert(acl);
 109         assert(ret);
 110
 111         r = acl_get_entry(acl, ACL_FIRST_ENTRY, &i);
 112         if (r < 0)
 113                 return -errno;
 114         while (r > 0) {
 115                 uid_t *old_uid, new_uid;
 116                 bool modify = false;
 117                 acl_tag_t tag;
 118
 119                 if (acl_get_tag_type(i, &tag) < 0)
 120                         return -errno;
 121
 122                 if (IN_SET(tag, ACL_USER, ACL_GROUP)) {
 123
 124                         /* We don't distuingish here between uid_t and gid_t, let's make sure the compiler checks that
 125                          * this is actually OK */
 126                         assert_cc(sizeof(uid_t) == sizeof(gid_t));
 127
 128                         old_uid = acl_get_qualifier(i);
 129                         if (!old_uid)
 130                                 return -errno;
 131
 132                         new_uid = shift | (*old_uid & UINT32_C(0xFFFF));
 133                         if (!uid_is_valid(new_uid))
 134                                 return -EINVAL;
 135
 136                         modify = new_uid != *old_uid;
 137                         if (modify && !copy) {
 138                                 int n;
 139
 140                                 /* There's no copy of the ACL yet? if so, let's create one, and start the loop from the
 141                                  * beginning, so that we copy all entries, starting from the first, this time. */
 142
 143                                 n = acl_entries(acl);
 144                                 if (n < 0)
 145                                         return -errno;
 146
 147                                 copy = acl_init(n);
 148                                 if (!copy)
 149                                         return -errno;
 150
 151                                 /* Seek back to the beginning */
 152                                 r = acl_get_entry(acl, ACL_FIRST_ENTRY, &i);
 153                                 if (r < 0)
 154                                         return -errno;
 155                                 continue;
 156                         }
 157                 }
 158
 159                 if (copy) {
 160                         acl_entry_t new_entry;
 161
 162                         if (acl_create_entry(&copy, &new_entry) < 0)
 163                                 return -errno;
 164
 165                         if (acl_copy_entry(new_entry, i) < 0)
 166                                 return -errno;
 167
 168                         if (modify)
 169                                 if (acl_set_qualifier(new_entry, &new_uid) < 0)
 170                                         return -errno;
 171                 }
 172
 173                 r = acl_get_entry(acl, ACL_NEXT_ENTRY, &i);
 174                 if (r < 0)
 175                         return -errno;
 176         }
 177
 178         *ret = copy;
 179         copy = NULL;
 180
 181         return !!*ret;
 182 }
 183
 184 static int patch_acls(int fd, const char *name, const struct stat *st, uid_t shift) {
 185         _cleanup_(acl_freep) acl_t acl = NULL, shifted = NULL;
 186         bool changed = false;
 187         int r;
 188
 189         assert(fd >= 0);
 190         assert(st);
 191
 192         /* ACLs are not supported on symlinks, there's no point in trying */
 193         if (S_ISLNK(st->st_mode))
 194                 return 0;
 195
 196         r = get_acl(fd, name, ACL_TYPE_ACCESS, &acl);
 197         if (r == -EOPNOTSUPP)
 198                 return 0;
 199         if (r < 0)
 200                 return r;
 201
 202         r = shift_acl(acl, shift, &shifted);
 203         if (r < 0)
 204                 return r;
 205         if (r > 0) {
 206                 r = set_acl(fd, name, ACL_TYPE_ACCESS, shifted);
 207                 if (r < 0)
 208                         return r;
 209
 210                 changed = true;
 211         }
 212
 213         if (S_ISDIR(st->st_mode)) {
 214                 acl_free(acl);
 215                 acl_free(shifted);
 216
 217                 acl = shifted = NULL;
 218
 219                 r = get_acl(fd, name, ACL_TYPE_DEFAULT, &acl);
 220                 if (r < 0)
 221                         return r;
 222
 223                 r = shift_acl(acl, shift, &shifted);
 224                 if (r < 0)
 225                         return r;
 226                 if (r > 0) {
 227                         r = set_acl(fd, name, ACL_TYPE_DEFAULT, shifted);
 228                         if (r < 0)
 229                                 return r;
 230
 231                         changed = true;
 232                 }
 233         }
 234
 235         return changed;
 236 }
 237
 238 #else
 239
 240 static int patch_acls(int fd, const char *name, const struct stat *st, uid_t shift) {
 241         return 0;
 242 }
 243
 244 #endif
 245
 246 static int patch_fd(int fd, const char *name, const struct stat *st, uid_t shift) {
 247         uid_t new_uid;
 248         gid_t new_gid;
 249         bool changed = false;
 250         int r;
 251
 252         assert(fd >= 0);
 253         assert(st);
 254
 255         new_uid =         shift | (st->st_uid & UINT32_C(0xFFFF));
 256         new_gid = (gid_t) shift | (st->st_gid & UINT32_C(0xFFFF));
 257
 258         if (!uid_is_valid(new_uid) || !gid_is_valid(new_gid))
 259                 return -EINVAL;
 260
 261         if (st->st_uid != new_uid || st->st_gid != new_gid) {
 262                 if (name)
 263                         r = fchownat(fd, name, new_uid, new_gid, AT_SYMLINK_NOFOLLOW);
 264                 else
 265                         r = fchown(fd, new_uid, new_gid);
 266                 if (r < 0)
 267                         return -errno;
 268
 269                 /* The Linux kernel alters the mode in some cases of chown(). Let's undo this. */
 270                 if (name) {
 271                         if (!S_ISLNK(st->st_mode))
 272                                 r = fchmodat(fd, name, st->st_mode, 0);
 273                         else /* AT_SYMLINK_NOFOLLOW is not available for fchmodat() */
 274                                 r = 0;
 275                 } else
 276                         r = fchmod(fd, st->st_mode);
 277                 if (r < 0)
 278                         return -errno;
 279
 280                 changed = true;
 281         }
 282
 283         r = patch_acls(fd, name, st, shift);
 284         if (r < 0)
 285                 return r;
 286
 287         return r > 0 || changed;
 288 }
 289
 290 /*
 291  * Check if the filesystem is fully compatible with user namespaces or
 292  * UID/GID patching. Some filesystems in this list can be fully mounted inside
 293  * user namespaces, however their inodes may relate to host resources or only
 294  * valid in the global user namespace, therefore no patching should be applied.
 295  */
 296 static int is_fs_fully_userns_compatible(const struct statfs *sfs) {
 297
 298         assert(sfs);
 299
 300         return F_TYPE_EQUAL(sfs->f_type, BINFMTFS_MAGIC) ||
 301                F_TYPE_EQUAL(sfs->f_type, CGROUP_SUPER_MAGIC) ||
 302                F_TYPE_EQUAL(sfs->f_type, CGROUP2_SUPER_MAGIC) ||
 303                F_TYPE_EQUAL(sfs->f_type, DEBUGFS_MAGIC) ||
 304                F_TYPE_EQUAL(sfs->f_type, DEVPTS_SUPER_MAGIC) ||
 305                F_TYPE_EQUAL(sfs->f_type, EFIVARFS_MAGIC) ||
 306                F_TYPE_EQUAL(sfs->f_type, HUGETLBFS_MAGIC) ||
 307                F_TYPE_EQUAL(sfs->f_type, MQUEUE_MAGIC) ||
 308                F_TYPE_EQUAL(sfs->f_type, PROC_SUPER_MAGIC) ||
 309                F_TYPE_EQUAL(sfs->f_type, PSTOREFS_MAGIC) ||
 310                F_TYPE_EQUAL(sfs->f_type, SELINUX_MAGIC) ||
 311                F_TYPE_EQUAL(sfs->f_type, SMACK_MAGIC) ||
 312                F_TYPE_EQUAL(sfs->f_type, SECURITYFS_MAGIC) ||
 313                F_TYPE_EQUAL(sfs->f_type, BPF_FS_MAGIC) ||
 314                F_TYPE_EQUAL(sfs->f_type, TRACEFS_MAGIC) ||
 315                F_TYPE_EQUAL(sfs->f_type, SYSFS_MAGIC);
 316 }
 317
 318 static int recurse_fd(int fd, bool donate_fd, const struct stat *st, uid_t shift, bool is_toplevel) {
 319         _cleanup_closedir_ DIR *d = NULL;
 320         bool changed = false;
 321         struct statfs sfs;
 322         int r;
 323
 324         assert(fd >= 0);
 325
 326         if (fstatfs(fd, &sfs) < 0)
 327                 return -errno;
 328
 329         /* We generally want to permit crossing of mount boundaries when patching the UIDs/GIDs. However, we probably
 330          * shouldn't do this for /proc and /sys if that is already mounted into place. Hence, let's stop the recursion
 331          * when we hit procfs, sysfs or some other special file systems. */
 332
 333         r = is_fs_fully_userns_compatible(&sfs);
 334         if (r < 0)
 335                 goto finish;
 336         if (r > 0) {
 337                 r = 0; /* don't recurse */
 338                 goto finish;
 339         }
 340
 341         /* Also, if we hit a read-only file system, then don't bother, skip the whole subtree */
 342         if ((sfs.f_flags & ST_RDONLY) ||
 343             access_fd(fd, W_OK) == -EROFS)
 344                 goto read_only;
 345
 346         if (S_ISDIR(st->st_mode)) {
 347                 struct dirent *de;
 348
 349                 if (!donate_fd) {
 350                         int copy;
 351
 352                         copy = fcntl(fd, F_DUPFD_CLOEXEC, 3);
 353                         if (copy < 0) {
 354                                 r = -errno;
 355                                 goto finish;
 356                         }
 357
 358                         fd = copy;
 359                         donate_fd = true;
 360                 }
 361
 362                 d = fdopendir(fd);
 363                 if (!d) {
 364                         r = -errno;
 365                         goto finish;
 366                 }
 367                 fd = -1;
 368
 369                 FOREACH_DIRENT_ALL(de, d, r = -errno; goto finish) {
 370                         struct stat fst;
 371
 372                         if (dot_or_dot_dot(de->d_name))
 373                                 continue;
 374
 375                         if (fstatat(dirfd(d), de->d_name, &fst, AT_SYMLINK_NOFOLLOW) < 0) {
 376                                 r = -errno;
 377                                 goto finish;
 378                         }
 379
 380                         if (S_ISDIR(fst.st_mode)) {
 381                                 int subdir_fd;
 382
 383                                 subdir_fd = openat(dirfd(d), de->d_name, O_RDONLY|O_NONBLOCK|O_DIRECTORY|O_CLOEXEC|O_NOFOLLOW|O_NOATIME);
 384                                 if (subdir_fd < 0) {
 385                                         r = -errno;
 386                                         goto finish;
 387
 388                                 }
 389
 390                                 r = recurse_fd(subdir_fd, true, &fst, shift, false);
 391                                 if (r < 0)
 392                                         goto finish;
 393                                 if (r > 0)
 394                                         changed = true;
 395
 396                         } else {
 397                                 r = patch_fd(dirfd(d), de->d_name, &fst, shift);
 398                                 if (r < 0)
 399                                         goto finish;
 400                                 if (r > 0)
 401                                         changed = true;
 402                         }
 403                 }
 404         }
 405
 406         /* After we descended, also patch the directory itself. It's key to do this in this order so that the top-level
 407          * directory is patched as very last object in the tree, so that we can use it as quick indicator whether the
 408          * tree is properly chown()ed already. */
 409         r = patch_fd(d ? dirfd(d) : fd, NULL, st, shift);
 410         if (r == -EROFS)
 411                 goto read_only;
 412         if (r > 0)
 413                 changed = true;
 414
 415         r = changed;
 416         goto finish;
 417
 418 read_only:
 419         if (!is_toplevel) {
 420                 _cleanup_free_ char *name = NULL;
 421
 422                 /* When we hit a ready-only subtree we simply skip it, but log about it. */
 423                 (void) fd_get_path(fd, &name);
 424                 log_debug("Skippping read-only file or directory %s.", strna(name));
 425                 r = changed;
 426         }
 427
 428 finish:
 429         if (donate_fd)
 430                 safe_close(fd);
 431
 432         return r;
 433 }
 434
 435 static int fd_patch_uid_internal(int fd, bool donate_fd, uid_t shift, uid_t range) {
 436         struct stat st;
 437         int r;
 438
 439         assert(fd >= 0);
 440
 441         /* Recursively adjusts the UID/GIDs of all files of a directory tree. This is used to automatically fix up an
 442          * OS tree to the used user namespace UID range. Note that this automatic adjustment only works for UID ranges
 443          * following the concept that the upper 16bit of a UID identify the container, and the lower 16bit are the actual
 444          * UID within the container. */
 445
 446         if ((shift & 0xFFFF) != 0) {
 447                 /* We only support containers where the shift starts at a 2^16 boundary */
 448                 r = -EOPNOTSUPP;
 449                 goto finish;
 450         }
 451
 452         if (shift == UID_BUSY_BASE) {
 453                 r = -EINVAL;
 454                 goto finish;
 455         }
 456
 457         if (range != 0x10000) {
 458                 /* We only support containers with 16bit UID ranges for the patching logic */
 459                 r = -EOPNOTSUPP;
 460                 goto finish;
 461         }
 462
 463         if (fstat(fd, &st) < 0) {
 464                 r = -errno;
 465                 goto finish;
 466         }
 467
 468         if ((uint32_t) st.st_uid >> 16 != (uint32_t) st.st_gid >> 16) {
 469                 /* We only support containers where the uid/gid container ID match */
 470                 r = -EBADE;
 471                 goto finish;
 472         }
 473
 474         /* Try to detect if the range is already right. Of course, this a pretty drastic optimization, as we assume
 475          * that if the top-level dir has the right upper 16bit assigned, then everything below will have too... */
 476         if (((uint32_t) (st.st_uid ^ shift) >> 16) == 0)
 477                 return 0;
 478
 479         /* Before we start recursively chowning, mark the top-level dir as "busy" by chowning it to the "busy"
 480          * range. Should we be interrupted in the middle of our work, we'll see it owned by this user and will start
 481          * chown()ing it again, unconditionally, as the busy UID is not a valid UID we'd everpick for ourselves. */
 482
 483         if ((st.st_uid & UID_BUSY_MASK) != UID_BUSY_BASE) {
 484                 if (fchown(fd,
 485                            UID_BUSY_BASE | (st.st_uid & ~UID_BUSY_MASK),
 486                            (gid_t) UID_BUSY_BASE | (st.st_gid & ~(gid_t) UID_BUSY_MASK)) < 0) {
 487                         r = -errno;
 488                         goto finish;
 489                 }
 490         }
 491
 492         return recurse_fd(fd, donate_fd, &st, shift, true);
 493
 494 finish:
 495         if (donate_fd)
 496                 safe_close(fd);
 497
 498         return r;
 499 }
 500
 501 int fd_patch_uid(int fd, uid_t shift, uid_t range) {
 502         return fd_patch_uid_internal(fd, false, shift, range);
 503 }
 504
 505 int path_patch_uid(const char *path, uid_t shift, uid_t range) {
 506         int fd;
 507
 508         fd = open(path, O_RDONLY|O_NONBLOCK|O_DIRECTORY|O_CLOEXEC|O_NOFOLLOW|O_NOATIME);
 509         if (fd < 0)
 510                 return -errno;
 511
 512         return fd_patch_uid_internal(fd, true, shift, range);
 513 }