man2/mount_setattr.2

   1 .\" Copyright (c) 2021 by Christian Brauner <christian.brauner@ubuntu.com>
   2 .\"
   3 .\" SPDX-License-Identifier: Linux-man-pages-copyleft
   4 .\"
   5 .TH MOUNT_SETATTR 2 2021-08-27 "Linux" "Linux Programmer's Manual"
   6 .SH NAME
   7 mount_setattr \- change properties of a mount or mount tree
   8 .SH LIBRARY
   9 Standard C library
  10 .RI ( libc ", " \-lc )
  11 .SH SYNOPSIS
  12 .nf
  13
  14 .PP
  15 .BR "#include <linux/fcntl.h>" " /* Definition of " AT_* " constants */"
  16 .BR "#include <linux/mount.h>" " /* Definition of " MOUNT_ATTR_* " constants */"
  17 .BR "#include <sys/syscall.h>" " /* Definition of " SYS_* " constants */"
  18 .B #include <unistd.h>
  19 .PP
  20 .BI "int syscall(SYS_mount_setattr, int " dirfd ", const char *" pathname ,
  21 .BI "            unsigned int " flags ", struct mount_attr *" attr \
  22 ", size_t " size );
  23 .fi
  24 .PP
  25 .IR Note :
  26 glibc provides no wrapper for
  27 .BR mount_setattr (),
  28 necessitating the use of
  29 .BR syscall (2).
  30 .SH DESCRIPTION
  31 The
  32 .BR mount_setattr ()
  33 system call changes the mount properties of a mount or an entire mount tree.
  34 If
  35 .I pathname
  36 is a relative pathname,
  37 then it is interpreted relative to
  38 the directory referred to by the file descriptor
  39 .IR dirfd .
  40 If
  41 .I dirfd
  42 is the special value
  43 .BR AT_FDCWD ,
  44 then
  45 .I pathname
  46 is interpreted relative to
  47 the current working directory of the calling process.
  48 If
  49 .I pathname
  50 is the empty string and
  51 .B AT_EMPTY_PATH
  52 is specified in
  53 .IR flags ,
  54 then the mount properties of the mount identified by
  55 .I dirfd
  56 are changed.
  57 (See
  58 .BR openat (2)
  59 for an explanation of why the
  60 .I dirfd
  61 argument is useful.)
  62 .PP
  63 The
  64 .BR mount_setattr ()
  65 system call uses an extensible structure
  66 .RI ( "struct mount_attr" )
  67 to allow for future extensions.
  68 Any non-flag extensions to
  69 .BR mount_setattr ()
  70 will be implemented as new fields appended to this structure,
  71 with a zero value in a new field resulting in the kernel behaving
  72 as though that extension field was not present.
  73 Therefore,
  74 the caller
  75 .I must
  76 zero-fill this structure on initialization.
  77 See the "Extensibility" subsection under
  78 .B NOTES
  79 for more details.
  80 .PP
  81 The
  82 .I size
  83 argument should usually be specified as
  84 .IR "sizeof(struct mount_attr)" .
  85 However, if the caller is using a kernel that supports an extended
  86 .IR "struct mount_attr" ,
  87 but the caller does not intend to make use of these features,
  88 it is possible to pass the size of an earlier
  89 version of the structure together with the extended structure.
  90 This allows the kernel to not copy later parts of the structure
  91 that aren't used anyway.
  92 With each extension that changes the size of
  93 .IR "struct mount_attr" ,
  94 the kernel will expose a definition of the form
  95 .BI MOUNT_ATTR_SIZE_VER number\c
  96 \&.
  97 For example, the macro for the size of the initial version of
  98 .I struct mount_attr
  99 is
 100 .BR MOUNT_ATTR_SIZE_VER0 .
 101 .PP
 102 The
 103 .I flags
 104 argument can be used to alter the pathname resolution behavior.
 105 The supported values are:
 106 .TP
 107 .B AT_EMPTY_PATH
 108 If
 109 .I pathname
 110 is the empty string,
 111 change the mount properties on
 112 .I dirfd
 113 itself.
 114 .TP
 115 .B AT_RECURSIVE
 116 Change the mount properties of the entire mount tree.
 117 .TP
 118 .B AT_SYMLINK_NOFOLLOW
 119 Don't follow trailing symbolic links.
 120 .TP
 121 .B AT_NO_AUTOMOUNT
 122 Don't trigger automounts.
 123 .PP
 124 The
 125 .I attr
 126 argument of
 127 .BR mount_setattr ()
 128 is a structure of the following form:
 129 .PP
 130 .in +4n
 131 .EX
 132 struct mount_attr {
 133     __u64 attr_set;     /* Mount properties to set */
 134     __u64 attr_clr;     /* Mount properties to clear */
 135     __u64 propagation;  /* Mount propagation type */
 136     __u64 userns_fd;    /* User namespace file descriptor */
 137 };
 138 .EE
 139 .in
 140 .PP
 141 The
 142 .I attr_set
 143 and
 144 .I attr_clr
 145 members are used to specify the mount properties that
 146 are supposed to be set or cleared for a mount or mount tree.
 147 Flags set in
 148 .I attr_set
 149 enable a property on a mount or mount tree,
 150 and flags set in
 151 .I attr_clr
 152 remove a property from a mount or mount tree.
 153 .PP
 154 When changing mount properties,
 155 the kernel will first clear the flags specified
 156 in the
 157 .I attr_clr
 158 field,
 159 and then set the flags specified in the
 160 .I attr_set
 161 field.
 162 For example, these settings:
 163 .PP
 164 .in +4n
 165 .EX
 166 struct mount_attr attr = {
 167     .attr_clr = MOUNT_ATTR_NOEXEC | MOUNT_ATTR_NODEV,
 168     .attr_set = MOUNT_ATTR_RDONLY | MOUNT_ATTR_NOSUID,
 169 };
 170 .EE
 171 .in
 172 .PP
 173 are equivalent to the following steps:
 174 .PP
 175 .in +4n
 176 .EX
 177 unsigned int current_mnt_flags = mnt->mnt_flags;
 178
 179 /*
 180  * Clear all flags set in .attr_clr,
 181  * clearing MOUNT_ATTR_NOEXEC and MOUNT_ATTR_NODEV.
 182  */
 183 current_mnt_flags &= ~attr->attr_clr;
 184
 185 /*
 186  * Now set all flags set in .attr_set,
 187  * applying MOUNT_ATTR_RDONLY and MOUNT_ATTR_NOSUID.
 188  */
 189 current_mnt_flags |= attr->attr_set;
 190
 191 mnt->mnt_flags = current_mnt_flags;
 192 .EE
 193 .in
 194 .PP
 195 As a result of this change, the mount or mount tree (a) is read-only;
 196 (b) blocks the execution of set-user-ID and set-group-ID programs;
 197 (c) allows execution of programs; and (d) allows access to devices.
 198 .PP
 199 Multiple changes with the same set of flags requested
 200 in
 201 .I attr_clr
 202 and
 203 .I attr_set
 204 are guaranteed to be idempotent after the changes have been applied.
 205 .PP
 206 The following mount attributes can be specified in the
 207 .I attr_set
 208 or
 209 .I attr_clr
 210 fields:
 211 .TP
 212 .B MOUNT_ATTR_RDONLY
 213 If set in
 214 .IR attr_set ,
 215 makes the mount read-only.
 216 If set in
 217 .IR attr_clr ,
 218 removes the read-only setting if set on the mount.
 219 .TP
 220 .B MOUNT_ATTR_NOSUID
 221 If set in
 222 .IR attr_set ,
 223 causes the mount not to honor the set-user-ID and set-group-ID mode bits and
 224 file capabilities when executing programs.
 225 If set in
 226 .IR attr_clr ,
 227 clears the set-user-ID, set-group-ID,
 228 and file capability restriction if set on this mount.
 229 .TP
 230 .B MOUNT_ATTR_NODEV
 231 If set in
 232 .IR attr_set ,
 233 prevents access to devices on this mount.
 234 If set in
 235 .IR attr_clr ,
 236 removes the restriction that prevented accessing devices on this mount.
 237 .TP
 238 .B MOUNT_ATTR_NOEXEC
 239 If set in
 240 .IR attr_set ,
 241 prevents executing programs on this mount.
 242 If set in
 243 .IR attr_clr ,
 244 removes the restriction that prevented executing programs on this mount.
 245 .TP
 246 .B MOUNT_ATTR_NOSYMFOLLOW
 247 If set in
 248 .IR attr_set ,
 249 prevents following symbolic links on this mount.
 250 If set in
 251 .IR attr_clr ,
 252 removes the restriction that prevented following symbolic links on this mount.
 253 .TP
 254 .B MOUNT_ATTR_NODIRATIME
 255 If set in
 256 .IR attr_set ,
 257 prevents updating access time for directories on this mount.
 258 If set in
 259 .IR attr_clr ,
 260 removes the restriction that prevented updating access time for directories.
 261 Note that
 262 .B MOUNT_ATTR_NODIRATIME
 263 can be combined with other access-time settings
 264 and is implied by the noatime setting.
 265 All other access-time settings are mutually exclusive.
 266 .TP
 267 .BR MOUNT_ATTR__ATIME " - changing access-time settings"
 268 The access-time values listed below are an enumeration that
 269 includes the value zero, expressed in the bits defined by the mask
 270 .BR MOUNT_ATTR__ATIME .
 271 Even though these bits are an enumeration
 272 (in contrast to the other mount flags such as
 273 .BR MOUNT_ATTR_NOEXEC ),
 274 they are nonetheless passed in
 275 .I attr_set
 276 and
 277 .I attr_clr
 278 for consistency with
 279 .BR fsmount (2),
 280 which introduced this behavior.
 281 .IP
 282 Note that,
 283 since the access-time values are an enumeration rather than bit values,
 284 a caller wanting to transition to a different access-time setting
 285 cannot simply specify the access-time setting in
 286 .IR attr_set ,
 287 but must also include
 288 .B MOUNT_ATTR__ATIME
 289 in the
 290 .I attr_clr
 291 field.
 292 The kernel will verify that
 293 .B MOUNT_ATTR__ATIME
 294 isn't partially set in
 295 .I attr_clr
 296 (i.e., all bits in the
 297 .B MOUNT_ATTR__ATIME
 298 bit field are either set or clear), and that
 299 .I attr_set
 300 doesn't have any access-time bits set if
 301 .B MOUNT_ATTR__ATIME
 302 isn't set in
 303 .IR attr_clr .
 304 .RS
 305 .TP
 306 .B MOUNT_ATTR_RELATIME
 307 When a file is accessed via this mount,
 308 update the file's last access time (atime)
 309 only if the current value of atime is less than or equal to
 310 the file's last modification time (mtime) or last status change time (ctime).
 311 .IP
 312 To enable this access-time setting on a mount or mount tree,
 313 .B MOUNT_ATTR_RELATIME
 314 must be set in
 315 .I attr_set
 316 and
 317 .B MOUNT_ATTR__ATIME
 318 must be set in the
 319 .I attr_clr
 320 field.
 321 .TP
 322 .B MOUNT_ATTR_NOATIME
 323 Do not update access times for (all types of) files on this mount.
 324 .IP
 325 To enable this access-time setting on a mount or mount tree,
 326 .B MOUNT_ATTR_NOATIME
 327 must be set in
 328 .I attr_set
 329 and
 330 .B MOUNT_ATTR__ATIME
 331 must be set in the
 332 .I attr_clr
 333 field.
 334 .TP
 335 .B MOUNT_ATTR_STRICTATIME
 336 Always update the last access time (atime)
 337 when files are accessed on this mount.
 338 .IP
 339 To enable this access-time setting on a mount or mount tree,
 340 .B MOUNT_ATTR_STRICTATIME
 341 must be set in
 342 .I attr_set
 343 and
 344 .B MOUNT_ATTR__ATIME
 345 must be set in the
 346 .I attr_clr
 347 field.
 348 .RE
 349 .TP
 350 .B MOUNT_ATTR_IDMAP
 351 If set in
 352 .IR attr_set ,
 353 creates an ID-mapped mount.
 354 The ID mapping is taken from the user namespace specified in
 355 .I userns_fd
 356 and attached to the mount.
 357 .IP
 358 Since it is not supported to
 359 change the ID mapping of a mount after it has been ID mapped,
 360 it is invalid to specify
 361 .B MOUNT_ATTR_IDMAP
 362 in
 363 .IR attr_clr .
 364 .IP
 365 For further details, see the subsection "ID-mapped mounts" under NOTES.
 366 .PP
 367 The
 368 .I propagation
 369 field is used to specify the propagation type of the mount or mount tree.
 370 This field either has the value zero,
 371 meaning leave the propagation type unchanged, or it has one of
 372 the following values:
 373 .TP
 374 .B MS_PRIVATE
 375 Turn all mounts into private mounts.
 376 .TP
 377 .B MS_SHARED
 378 Turn all mounts into shared mounts.
 379 .TP
 380 .B MS_SLAVE
 381 Turn all mounts into dependent mounts.
 382 .TP
 383 .B MS_UNBINDABLE
 384 Turn all mounts into unbindable mounts.
 385 .PP
 386 For further details on the above propagation types, see
 387 .BR mount_namespaces (7).
 388 .SH RETURN VALUE
 389 On success,
 390 .BR mount_setattr ()
 391 returns zero.
 392 On error,
 393 \-1 is returned and
 394 .I errno
 395 is set to indicate the cause of the error.
 396 .SH ERRORS
 397 .TP
 398 .B EBADF
 399 .I pathname
 400 is relative but
 401 .I dirfd
 402 is neither
 403 .B AT_FDCWD
 404 nor a valid file descriptor.
 405 .TP
 406 .B EBADF
 407 .I userns_fd
 408 is not a valid file descriptor.
 409 .TP
 410 .B EBUSY
 411 The caller tried to change the mount to
 412 .BR MOUNT_ATTR_RDONLY ,
 413 but the mount still holds files open for writing.
 414 .TP
 415 .B EINVAL
 416 The pathname specified via the
 417 .I dirfd
 418 and
 419 .I pathname
 420 arguments to
 421 .BR mount_setattr ()
 422 isn't a mount point.
 423 .TP
 424 .B EINVAL
 425 An unsupported value was set in
 426 .IR flags .
 427 .TP
 428 .B EINVAL
 429 An unsupported value was specified in the
 430 .I attr_set
 431 field of
 432 .IR mount_attr .
 433 .TP
 434 .B EINVAL
 435 An unsupported value was specified in the
 436 .I attr_clr
 437 field of
 438 .IR mount_attr .
 439 .TP
 440 .B EINVAL
 441 An unsupported value was specified in the
 442 .I propagation
 443 field of
 444 .IR mount_attr .
 445 .TP
 446 .B EINVAL
 447 More than one of
 448 .BR MS_SHARED ,
 449 .BR MS_SLAVE ,
 450 .BR MS_PRIVATE ,
 451 or
 452 .B MS_UNBINDABLE
 453 was set in the
 454 .I propagation
 455 field of
 456 .IR mount_attr .
 457 .TP
 458 .B EINVAL
 459 An access-time setting was specified in the
 460 .I attr_set
 461 field without
 462 .B MOUNT_ATTR__ATIME
 463 being set in the
 464 .I attr_clr
 465 field.
 466 .TP
 467 .B EINVAL
 468 .B MOUNT_ATTR_IDMAP
 469 was specified in
 470 .IR attr_clr .
 471 .TP
 472 .B EINVAL
 473 A file descriptor value was specified in
 474 .I userns_fd
 475 which exceeds
 476 .BR INT_MAX .
 477 .TP
 478 .B EINVAL
 479 A valid file descriptor value was specified in
 480 .IR userns_fd ,
 481 but the file descriptor did not refer to a user namespace.
 482 .TP
 483 .B EINVAL
 484 The underlying filesystem does not support ID-mapped mounts.
 485 .TP
 486 .B EINVAL
 487 The mount that is to be ID mapped is not a detached mount;
 488 that is, the mount has already been visible in a mount namespace.
 489 .TP
 490 .B EINVAL
 491 A partial access-time setting was specified in
 492 .I attr_clr
 493 instead of
 494 .B MOUNT_ATTR__ATIME
 495 being set.
 496 .TP
 497 .B EINVAL
 498 The mount is located outside the caller's mount namespace.
 499 .TP
 500 .B EINVAL
 501 The underlying filesystem has been mounted in a mount namespace that is
 502 owned by a noninitial user namespace.
 503 .TP
 504 .B ENOENT
 505 A pathname was empty or had a nonexistent component.
 506 .TP
 507 .B ENOMEM
 508 When changing mount propagation to
 509 .BR MS_SHARED ,
 510 a new peer group ID needs to be allocated for all mounts without a peer group
 511 ID set.
 512 This allocation failed because there was not
 513 enough memory to allocate the relevant internal structures.
 514 .TP
 515 .B ENOSPC
 516 When changing mount propagation to
 517 .BR MS_SHARED ,
 518 a new peer group ID needs to be allocated for all mounts without a peer group
 519 ID set.
 520 This allocation failed because
 521 the kernel has run out of IDs.
 522 .\" Christian Brauner: i.e. someone has somehow managed to
 523 .\" allocate so many peer groups and managed to keep the kernel running
 524 .\" (???) that the ida has run out of ids
 525 .\" Note that technically further error codes are possible that are
 526 .\" specific to the ID allocation implementation used.
 527 .TP
 528 .B EPERM
 529 One of the mounts had at least one of
 530 .BR MOUNT_ATTR_NOATIME ,
 531 .BR MOUNT_ATTR_NODEV ,
 532 .BR MOUNT_ATTR_NODIRATIME ,
 533 .BR MOUNT_ATTR_NOEXEC ,
 534 .BR MOUNT_ATTR_NOSUID ,
 535 or
 536 .B MOUNT_ATTR_RDONLY
 537 set and the flag is locked.
 538 Mount attributes become locked on a mount if:
 539 .RS
 540 .IP \(bu 3
 541 A new mount or mount tree is created causing mount propagation across user
 542 namespaces
 543 (i.e., propagation to a mount namespace owned by a different user namespace).
 544 The kernel will lock the aforementioned flags to prevent these sensitive
 545 properties from being altered.
 546 .IP \(bu
 547 A new mount and user namespace pair is created.
 548 This happens for example when specifying
 549 .B CLONE_NEWUSER | CLONE_NEWNS
 550 in
 551 .BR unshare (2),
 552 .BR clone (2),
 553 or
 554 .BR clone3 (2).
 555 The aforementioned flags become locked in the new mount namespace
 556 to prevent sensitive mount properties from being altered.
 557 Since the newly created mount namespace will be owned by the
 558 newly created user namespace,
 559 a calling process that is privileged in the new
 560 user namespace would\(emin the absence of such locking\(embe
 561 able to alter sensitive mount properties (e.g., to remount a mount
 562 that was marked read-only as read-write in the new mount namespace).
 563 .RE
 564 .TP
 565 .B EPERM
 566 A valid file descriptor value was specified in
 567 .IR userns_fd ,
 568 but the file descriptor refers to the initial user namespace.
 569 .TP
 570 .B EPERM
 571 An attempt was made to add an ID mapping to a mount that is already ID mapped.
 572 .TP
 573 .B EPERM
 574 The caller does not have
 575 .B CAP_SYS_ADMIN
 576 in the initial user namespace.
 577 .SH VERSIONS
 578 .BR mount_setattr ()
 579 first appeared in Linux 5.12.
 580 .\" commit 7d6beb71da3cc033649d641e1e608713b8220290
 581 .\" commit 2a1867219c7b27f928e2545782b86daaf9ad50bd
 582 .\" commit 9caccd41541a6f7d6279928d9f971f6642c361af
 583 .SH CONFORMING TO
 584 .BR mount_setattr ()
 585 is Linux-specific.
 586 .SH NOTES
 587 .SS ID-mapped mounts
 588 Creating an ID-mapped mount makes it possible to
 589 change the ownership of all files located under a mount.
 590 Thus, ID-mapped mounts make it possible to
 591 change ownership in a temporary and localized way.
 592 It is a localized change because the ownership changes are
 593 visible only via a specific mount.
 594 All other users and locations where the filesystem is exposed are unaffected.
 595 It is a temporary change because
 596 the ownership changes are tied to the lifetime of the mount.
 597 .PP
 598 Whenever callers interact with the filesystem through an ID-mapped mount,
 599 the ID mapping of the mount will be applied to
 600 user and group IDs associated with filesystem objects.
 601 This encompasses the user and group IDs associated with inodes
 602 and also the following
 603 .BR xattr (7)
 604 keys:
 605 .IP \(bu 3
 606 .IR security.capability ,
 607 whenever filesystem capabilities
 608 are stored or returned in the
 609 .B VFS_CAP_REVISION_3
 610 format,
 611 which stores a root user ID alongside the capabilities
 612 (see
 613 .BR capabilities (7)).
 614 .IP \(bu
 615 .I system.posix_acl_access
 616 and
 617 .IR system.posix_acl_default ,
 618 whenever user IDs or group IDs are stored in
 619 .B ACL_USER
 620 or
 621 .B ACL_GROUP
 622 entries.
 623 .PP
 624 The following conditions must be met in order to create an ID-mapped mount:
 625 .IP \(bu 3
 626 The caller must have the
 627 .B CAP_SYS_ADMIN
 628 capability in the initial user namespace.
 629 .IP \(bu
 630 The filesystem must be mounted in a mount namespace
 631 that is owned by the initial user namespace.
 632 .IP \(bu
 633 The underlying filesystem must support ID-mapped mounts.
 634 Currently, the
 635 .BR xfs (5),
 636 .BR ext4 (5),
 637 and
 638 .B FAT
 639 filesystems support ID-mapped mounts
 640 with more filesystems being actively worked on.
 641 .IP \(bu
 642 The mount must not already be ID-mapped.
 643 This also implies that the ID mapping of a mount cannot be altered.
 644 .IP \(bu
 645 The mount must be a detached mount;
 646 that is,
 647 it must have been created by calling
 648 .BR open_tree (2)
 649 with the
 650 .B OPEN_TREE_CLONE
 651 flag and it must not already have been visible in a mount namespace.
 652 (To put things another way:
 653 the mount must not have been attached to the filesystem hierarchy
 654 with a system call such as
 655 .BR move_mount (2).)
 656 .PP
 657 ID mappings can be created for user IDs, group IDs, and project IDs.
 658 An ID mapping is essentially a mapping of a range of user or group IDs into
 659 another or the same range of user or group IDs.
 660 ID mappings are written to map files as three numbers
 661 separated by white space.
 662 The first two numbers specify the starting user or group ID
 663 in each of the two user namespaces.
 664 The third number specifies the range of the ID mapping.
 665 For example,
 666 a mapping for user IDs such as "1000\ 1001\ 1" would indicate that
 667 user ID 1000 in the caller's user namespace is mapped to
 668 user ID 1001 in its ancestor user namespace.
 669 Since the map range is 1,
 670 only user ID 1000 is mapped.
 671 .PP
 672 It is possible to specify up to 340 ID mappings for each ID mapping type.
 673 If any user IDs or group IDs are not mapped,
 674 all files owned by that unmapped user or group ID will appear as
 675 being owned by the overflow user ID or overflow group ID respectively.
 676 .PP
 677 Further details on setting up ID mappings can be found in
 678 .BR user_namespaces (7).
 679 .PP
 680 In the common case, the user namespace passed in
 681 .I userns_fd
 682 (together with
 683 .B MOUNT_ATTR_IDMAP
 684 in
 685 .IR attr_set )
 686 to create an ID-mapped mount will be the user namespace of a container.
 687 In other scenarios it will be a dedicated user namespace associated with
 688 a user's login session as is the case for portable home directories in
 689 .BR systemd-homed.service (8).
 690 It is also perfectly fine to create a dedicated user namespace
 691 for the sake of ID mapping a mount.
 692 .PP
 693 ID-mapped mounts can be useful in the following
 694 and a variety of other scenarios:
 695 .IP \(bu 3
 696 Sharing files or filesystems
 697 between multiple users or multiple machines,
 698 especially in complex scenarios.
 699 For example,
 700 ID-mapped mounts are used to implement portable home directories in
 701 .BR systemd-homed.service (8),
 702 where they allow users to move their home directory
 703 to an external storage device
 704 and use it on multiple computers
 705 where they are assigned different user IDs and group IDs.
 706 This effectively makes it possible to
 707 assign random user IDs and group IDs at login time.
 708 .IP \(bu
 709 Sharing files or filesystems
 710 from the host with unprivileged containers.
 711 This allows a user to avoid having to change ownership permanently through
 712 .BR chown (2).
 713 .IP \(bu
 714 ID mapping a container's root filesystem.
 715 Users don't need to change ownership permanently through
 716 .BR chown (2).
 717 Especially for large root filesystems, using
 718 .BR chown (2)
 719 can be prohibitively expensive.
 720 .IP \(bu
 721 Sharing files or filesystems
 722 between containers with non-overlapping ID mappings.
 723 .IP \(bu
 724 Implementing discretionary access control (DAC) permission checking
 725 for filesystems lacking a concept of ownership.
 726 .IP \(bu
 727 Efficiently changing ownership on a per-mount basis.
 728 In contrast to
 729 .BR chown (2),
 730 changing ownership of large sets of files is instantaneous with
 731 ID-mapped mounts.
 732 This is especially useful when ownership of
 733 an entire root filesystem of a virtual machine or container
 734 is to be changed as mentioned above.
 735 With ID-mapped mounts,
 736 a single
 737 .BR mount_setattr ()
 738 system call will be sufficient to change the ownership of all files.
 739 .IP \(bu
 740 Taking the current ownership into account.
 741 ID mappings specify precisely
 742 what a user or group ID is supposed to be mapped to.
 743 This contrasts with the
 744 .BR chown (2)
 745 system call which cannot by itself
 746 take the current ownership of the files it changes into account.
 747 It simply changes the ownership to the specified user ID and group ID.
 748 .IP \(bu
 749 Locally and temporarily restricted ownership changes.
 750 ID-mapped mounts make it possible to change ownership locally,
 751 restricting the ownership changes to specific mounts,
 752 and temporarily as the ownership changes only apply as long as the mount exists.
 753 By contrast,
 754 changing ownership via the
 755 .BR chown (2)
 756 system call changes the ownership globally and permanently.
 757 .\"
 758 .SS Extensibility
 759 In order to allow for future extensibility,
 760 .BR mount_setattr ()
 761 requires the user-space application to specify the size of the
 762 .I mount_attr
 763 structure that it is passing.
 764 By providing this information, it is possible for
 765 .BR mount_setattr ()
 766 to provide both forwards- and backwards-compatibility, with
 767 .I size
 768 acting as an implicit version number.
 769 (Because new extension fields will always
 770 be appended, the structure size will always increase.)
 771 This extensibility design is very similar to other system calls such as
 772 .BR sched_setattr (2),
 773 .BR perf_event_open (2),
 774 .BR clone3 (2)
 775 and
 776 .BR openat2 (2).
 777 .PP
 778 Let
 779 .I usize
 780 be the size of the structure as specified by the user-space application,
 781 and let
 782 .I ksize
 783 be the size of the structure which the kernel supports,
 784 then there are three cases to consider:
 785 .IP \(bu 3
 786 If
 787 .I ksize
 788 equals
 789 .IR usize ,
 790 then there is no version mismatch and
 791 .I attr
 792 can be used verbatim.
 793 .IP \(bu
 794 If
 795 .I ksize
 796 is larger than
 797 .IR usize ,
 798 then there are some extension fields that the kernel supports
 799 which the user-space application is unaware of.
 800 Because a zero value in any added extension field signifies a no-op,
 801 the kernel treats all of the extension fields
 802 not provided by the user-space application
 803 as having zero values.
 804 This provides backwards-compatibility.
 805 .IP \(bu
 806 If
 807 .I ksize
 808 is smaller than
 809 .IR usize ,
 810 then there are some extension fields which the user-space application is aware
 811 of but which the kernel does not support.
 812 Because any extension field must have its zero values signify a no-op,
 813 the kernel can safely ignore the unsupported extension fields
 814 if they are all zero.
 815 If any unsupported extension fields are non-zero,
 816 then \-1 is returned and
 817 .I errno
 818 is set to
 819 .BR E2BIG .
 820 This provides forwards-compatibility.
 821 .PP
 822 Because the definition of
 823 .I struct mount_attr
 824 may change in the future
 825 (with new fields being added when system headers are updated),
 826 user-space applications should zero-fill
 827 .I struct mount_attr
 828 to ensure that recompiling the program with new headers will not result in
 829 spurious errors at runtime.
 830 The simplest way is to use a designated initializer:
 831 .PP
 832 .in +4n
 833 .EX
 834 struct mount_attr attr = {
 835     .attr_set = MOUNT_ATTR_RDONLY,
 836     .attr_clr = MOUNT_ATTR_NODEV
 837 };
 838 .EE
 839 .in
 840 .PP
 841 Alternatively, the structure can be zero-filled using
 842 .BR memset (3)
 843 or similar functions:
 844 .PP
 845 .in +4n
 846 .EX
 847 struct mount_attr attr;
 848 memset(&attr, 0, sizeof(attr));
 849 attr.attr_set = MOUNT_ATTR_RDONLY;
 850 attr.attr_clr = MOUNT_ATTR_NODEV;
 851 .EE
 852 .in
 853 .PP
 854 A user-space application that wishes to determine which extensions the running
 855 kernel supports can do so by conducting a binary search on
 856 .I size
 857 with a structure which has every byte nonzero
 858 (to find the largest value which doesn't produce an error of
 859 .BR E2BIG ).
 860 .SH EXAMPLES
 861 .EX
 862 /*
 863  * This program allows the caller to create a new detached mount
 864  * and set various properties on it.
 865  */
 866 #define _GNU_SOURCE
 867 #include <errno.h>
 868 #include <fcntl.h>
 869 #include <getopt.h>
 870 #include <linux/mount.h>
 871 #include <linux/types.h>
 872 #include <stdbool.h>
 873 #include <stdio.h>
 874 #include <stdlib.h>
 875 #include <string.h>
 876 #include <sys/syscall.h>
 877 #include <unistd.h>
 878
 879 static inline int
 880 mount_setattr(int dirfd, const char *pathname, unsigned int flags,
 881               struct mount_attr *attr, size_t size)
 882 {
 883     return syscall(SYS_mount_setattr, dirfd, pathname, flags,
 884                    attr, size);
 885 }
 886
 887 static inline int
 888 open_tree(int dirfd, const char *filename, unsigned int flags)
 889 {
 890     return syscall(SYS_open_tree, dirfd, filename, flags);
 891 }
 892
 893 static inline int
 894 move_mount(int from_dirfd, const char *from_pathname,
 895            int to_dirfd, const char *to_pathname, unsigned int flags)
 896 {
 897     return syscall(SYS_move_mount, from_dirfd, from_pathname,
 898                    to_dirfd, to_pathname, flags);
 899 }
 900
 901 static const struct option longopts[] = {
 902     {"map\-mount",       required_argument,  NULL,  'a'},
 903     {"recursive",       no_argument,        NULL,  'b'},
 904     {"read\-only",       no_argument,        NULL,  'c'},
 905     {"block\-setid",     no_argument,        NULL,  'd'},
 906     {"block\-devices",   no_argument,        NULL,  'e'},
 907     {"block\-exec",      no_argument,        NULL,  'f'},
 908     {"no\-access\-time",  no_argument,        NULL,  'g'},
 909     { NULL,             0,                  NULL,   0 },
 910 };
 911
 912 #define exit_log(format, ...)  do           \e
 913 {                                           \e
 914     fprintf(stderr, format, ##__VA_ARGS__); \e
 915     exit(EXIT_FAILURE);                     \e
 916 } while (0)
 917
 918 int
 919 main(int argc, char *argv[])
 920 {
 921     struct mount_attr *attr = &(struct mount_attr){};
 922     int fd_userns = \-1;
 923     bool recursive = false;
 924     int index = 0;
 925     int ret;
 926
 927     while ((ret = getopt_long_only(argc, argv, "",
 928                                    longopts, &index)) != \-1) {
 929         switch (ret) {
 930         case 'a':
 931             fd_userns = open(optarg, O_RDONLY | O_CLOEXEC);
 932             if (fd_userns == \-1)
 933                 exit_log("%m \- Failed to open %s\en", optarg);
 934             break;
 935         case 'b':
 936             recursive = true;
 937             break;
 938         case 'c':
 939             attr\->attr_set |= MOUNT_ATTR_RDONLY;
 940             break;
 941         case 'd':
 942             attr\->attr_set |= MOUNT_ATTR_NOSUID;
 943             break;
 944         case 'e':
 945             attr\->attr_set |= MOUNT_ATTR_NODEV;
 946             break;
 947         case 'f':
 948             attr\->attr_set |= MOUNT_ATTR_NOEXEC;
 949             break;
 950         case 'g':
 951             attr\->attr_set |= MOUNT_ATTR_NOATIME;
 952             attr\->attr_clr |= MOUNT_ATTR__ATIME;
 953             break;
 954         default:
 955             exit_log("Invalid argument specified");
 956         }
 957     }
 958
 959     if ((argc \- optind) < 2)
 960         exit_log("Missing source or target mount point\en");
 961
 962     const char *source = argv[optind];
 963     const char *target = argv[optind + 1];
 964
 965     /* In the following, \-1 as the \(aqdirfd\(aq argument ensures that
 966        open_tree() fails if \(aqsource\(aq is not an absolute pathname. */
 967 .\" Christian Brauner
 968 .\"     When writing programs I like to never use relative paths with AT_FDCWD
 969 .\"     because. Because making assumptions about the current working directory
 970 .\"     of the calling process is just too easy to get wrong; especially when
 971 .\"     pivot_root() or chroot() are in play.
 972 .\"     My absolut preference (joke intended) is to open a well-known starting
 973 .\"     point with an absolute path to get a dirfd and then scope all future
 974 .\"     operations beneath that dirfd. This already works with old-style
 975 .\"     openat() and _very_ cautious programming but openat2() and its
 976 .\"     resolve-flag space have made this **chef's kiss**.
 977 .\"     If I can't operate based on a well-known dirfd I use absolute paths
 978 .\"     with a -EBADF dirfd passed to *at() functions.
 979
 980     int fd_tree = open_tree(\-1, source,
 981                        OPEN_TREE_CLONE | OPEN_TREE_CLOEXEC |
 982                        AT_EMPTY_PATH | (recursive ? AT_RECURSIVE : 0));
 983     if (fd_tree == \-1)
 984         exit_log("%m \- Failed to open %s\en", source);
 985
 986     if (fd_userns >= 0) {
 987         attr\->attr_set  |= MOUNT_ATTR_IDMAP;
 988         attr\->userns_fd = fd_userns;
 989     }
 990
 991     ret = mount_setattr(fd_tree, "",
 992                         AT_EMPTY_PATH | (recursive ? AT_RECURSIVE : 0),
 993                         attr, sizeof(struct mount_attr));
 994     if (ret == \-1)
 995         exit_log("%m \- Failed to change mount attributes\en");
 996
 997     close(fd_userns);
 998
 999     /* In the following, \-1 as the \(aqto_dirfd\(aq argument ensures that
1000        move_mount() fails if \(aqtarget\(aq is not an absolute pathname. */
1001
1002     ret = move_mount(fd_tree, "", \-1, target,
1003                      MOVE_MOUNT_F_EMPTY_PATH);
1004     if (ret == \-1)
1005         exit_log("%m \- Failed to attach mount to %s\en", target);
1006
1007     close(fd_tree);
1008
1009     exit(EXIT_SUCCESS);
1010 }
1011 .EE
1012 .SH SEE ALSO
1013 .BR newgidmap (1),
1014 .BR newuidmap (1),
1015 .BR clone (2),
1016 .BR mount (2),
1017 .BR unshare (2),
1018 .BR proc (5),
1019 .BR capabilities (7),
1020 .BR mount_namespaces (7),
1021 .BR user_namespaces (7),
1022 .BR xattr (7)