src/shared/copy.c
1 /* SPDX-License-Identifier: LGPL-2.1-or-later */
2
3 #include <errno.h>
4 #include <fcntl.h>
5 #include <linux/btrfs.h>
6 #include <stddef.h>
7 #include <stdio.h>
8 #include <stdlib.h>
9 #include <sys/file.h>
10 #include <sys/ioctl.h>
11 #include <sys/sendfile.h>
12 #include <sys/xattr.h>
13 #include <unistd.h>
14
15 #include "alloc-util.h"
16 #include "btrfs-util.h"
17 #include "chattr-util.h"
18 #include "copy.h"
19 #include "dirent-util.h"
20 #include "fd-util.h"
21 #include "fileio.h"
22 #include "fs-util.h"
23 #include "io-util.h"
24 #include "macro.h"
25 #include "missing_fs.h"
26 #include "missing_syscall.h"
27 #include "mkdir-label.h"
28 #include "mountpoint-util.h"
29 #include "nulstr-util.h"
30 #include "rm-rf.h"
31 #include "selinux-util.h"
32 #include "signal-util.h"
33 #include "stat-util.h"
34 #include "stdio-util.h"
35 #include "string-util.h"
36 #include "strv.h"
37 #include "sync-util.h"
38 #include "time-util.h"
39 #include "tmpfile-util.h"
40 #include "umask-util.h"
41 #include "user-util.h"
42 #include "xattr-util.h"
43
44 #define COPY_BUFFER_SIZE (16U*1024U)
45
46 /* A safety net for descending recursively into file system trees to copy. On Linux PATH_MAX is 4096, which means the
47 * deepest valid path one can build is around 2048, which we hence use as a safety net here, to not spin endlessly in
48 * case of bind mount cycles and suchlike. */
49 #define COPY_DEPTH_MAX 2048U
50
51 static ssize_t try_copy_file_range(
52 int fd_in, loff_t *off_in,
53 int fd_out, loff_t *off_out,
54 size_t len,
55 unsigned flags) {
56
57 static int have = -1;
58 ssize_t r;
59
60 if (have == 0)
61 return -ENOSYS;
62
63 r = copy_file_range(fd_in, off_in, fd_out, off_out, len, flags);
64 if (have < 0)
65 have = r >= 0 || errno != ENOSYS;
66 if (r < 0)
67 return -errno;
68
69 return r;
70 }
71
72 enum {
73 FD_IS_NO_PIPE,
74 FD_IS_BLOCKING_PIPE,
75 FD_IS_NONBLOCKING_PIPE,
76 };
77
78 static int fd_is_nonblock_pipe(int fd) {
79 struct stat st;
80 int flags;
81
82         /* Checks whether the specified file descriptor refers to a pipe, and if so, whether O_NONBLOCK is set. */
83
84 if (fstat(fd, &st) < 0)
85 return -errno;
86
87 if (!S_ISFIFO(st.st_mode))
88 return FD_IS_NO_PIPE;
89
90 flags = fcntl(fd, F_GETFL);
91 if (flags < 0)
92 return -errno;
93
94 return FLAGS_SET(flags, O_NONBLOCK) ? FD_IS_NONBLOCKING_PIPE : FD_IS_BLOCKING_PIPE;
95 }
96
97 static int look_for_signals(CopyFlags copy_flags) {
98 int r;
99
100 if ((copy_flags & (COPY_SIGINT|COPY_SIGTERM)) == 0)
101 return 0;
102
103 r = pop_pending_signal(copy_flags & COPY_SIGINT ? SIGINT : 0,
104 copy_flags & COPY_SIGTERM ? SIGTERM : 0);
105 if (r < 0)
106 return r;
107 if (r != 0)
108 return log_debug_errno(SYNTHETIC_ERRNO(EINTR),
109 "Got %s, cancelling copy operation.", signal_to_string(r));
110
111 return 0;
112 }
113
114 static int create_hole(int fd, off_t size) {
115 off_t offset;
116 off_t end;
117
118 offset = lseek(fd, 0, SEEK_CUR);
119 if (offset < 0)
120 return -errno;
121
122 end = lseek(fd, 0, SEEK_END);
123 if (end < 0)
124 return -errno;
125
126 /* If we're not at the end of the target file, try to punch a hole in the existing space using fallocate(). */
127
128 if (offset < end &&
129 fallocate(fd, FALLOC_FL_PUNCH_HOLE | FALLOC_FL_KEEP_SIZE, offset, MIN(size, end - offset)) < 0 &&
130 !ERRNO_IS_NOT_SUPPORTED(errno))
131 return -errno;
132
133 if (end - offset >= size) {
134 /* If we've created the full hole, set the file pointer to the end of the hole we created and exit. */
135 if (lseek(fd, offset + size, SEEK_SET) < 0)
136 return -errno;
137
138 return 0;
139 }
140
141 /* If we haven't created the full hole, use ftruncate() to grow the file (and the hole) to the
142 * required size and move the file pointer to the end of the file. */
143
144 size -= end - offset;
145
146 if (ftruncate(fd, end + size) < 0)
147 return -errno;
148
149 if (lseek(fd, 0, SEEK_END) < 0)
150 return -errno;
151
152 return 0;
153 }
154
155 int copy_bytes_full(
156 int fdf, int fdt,
157 uint64_t max_bytes,
158 CopyFlags copy_flags,
159 void **ret_remains,
160 size_t *ret_remains_size,
161 copy_progress_bytes_t progress,
162 void *userdata) {
163
164 _cleanup_close_ int fdf_opened = -EBADF, fdt_opened = -EBADF;
165 bool try_cfr = true, try_sendfile = true, try_splice = true, copied_something = false;
166 int r, nonblock_pipe = -1;
167 size_t m = SSIZE_MAX; /* that is the maximum that sendfile and c_f_r accept */
168
169 assert(fdf >= 0);
170 assert(fdt >= 0);
171 assert(!FLAGS_SET(copy_flags, COPY_LOCK_BSD));
172
173         /* Tries to copy bytes from the file descriptor 'fdf' to 'fdt' in the smartest possible way. Copies a maximum
174          * of 'max_bytes', which may be specified as UINT64_MAX, in which case no maximum is applied. Returns negative
175          * on error, zero if EOF is hit before the byte limit is reached, and positive otherwise. If the copy fails for
176          * some reason, but some data was already read and not yet written, and ret_remains/ret_remains_size are not
177          * NULL, then they will be initialized with an allocated buffer containing this "remaining" data. Note that
178          * these two parameters are initialized with a valid buffer only on failure, and only if some data was actually
179          * already read; otherwise, if non-NULL, they are set to NULL. (A short usage sketch follows this function.) */
180
181 if (ret_remains)
182 *ret_remains = NULL;
183 if (ret_remains_size)
184 *ret_remains_size = 0;
185
186 fdf = fd_reopen_condition(fdf, O_CLOEXEC | O_NOCTTY | O_RDONLY, O_PATH, &fdf_opened);
187 if (fdf < 0)
188 return fdf;
189 fdt = fd_reopen_condition(fdt, O_CLOEXEC | O_NOCTTY | O_RDWR, O_PATH, &fdt_opened);
190 if (fdt < 0)
191 return fdt;
192
193 /* Try btrfs reflinks first. This only works on regular, seekable files, hence let's check the file offsets of
194 * source and destination first. */
195 if ((copy_flags & COPY_REFLINK)) {
196 off_t foffset;
197
198 foffset = lseek(fdf, 0, SEEK_CUR);
199 if (foffset >= 0) {
200 off_t toffset;
201
202 toffset = lseek(fdt, 0, SEEK_CUR);
203 if (toffset >= 0) {
204
205 if (foffset == 0 && toffset == 0 && max_bytes == UINT64_MAX)
206 r = reflink(fdf, fdt); /* full file reflink */
207 else
208 r = reflink_range(fdf, foffset, fdt, toffset, max_bytes == UINT64_MAX ? 0 : max_bytes); /* partial reflink */
209 if (r >= 0) {
210 off_t t;
211
212 /* This worked, yay! Now — to be fully correct — let's adjust the file pointers */
213 if (max_bytes == UINT64_MAX) {
214
215 /* We cloned to the end of the source file, let's position the read
216 * pointer there, and query it at the same time. */
217 t = lseek(fdf, 0, SEEK_END);
218 if (t < 0)
219 return -errno;
220 if (t < foffset)
221 return -ESPIPE;
222
223 /* Let's adjust the destination file write pointer by the same number
224 * of bytes. */
225 t = lseek(fdt, toffset + (t - foffset), SEEK_SET);
226 if (t < 0)
227 return -errno;
228
229 return 0; /* we copied the whole thing, hence hit EOF, return 0 */
230 } else {
231 t = lseek(fdf, foffset + max_bytes, SEEK_SET);
232 if (t < 0)
233 return -errno;
234
235 t = lseek(fdt, toffset + max_bytes, SEEK_SET);
236 if (t < 0)
237 return -errno;
238
239 return 1; /* we copied only some number of bytes, which worked, but this means we didn't hit EOF, return 1 */
240 }
241 }
242 }
243 }
244 }
245
246 for (;;) {
247 ssize_t n;
248
249 if (max_bytes <= 0)
250 break;
251
252 r = look_for_signals(copy_flags);
253 if (r < 0)
254 return r;
255
256 if (max_bytes != UINT64_MAX && m > max_bytes)
257 m = max_bytes;
258
259 if (copy_flags & COPY_HOLES) {
260 off_t c, e;
261
262 c = lseek(fdf, 0, SEEK_CUR);
263 if (c < 0)
264 return -errno;
265
266 /* To see if we're in a hole, we search for the next data offset. */
267 e = lseek(fdf, c, SEEK_DATA);
268 if (e < 0 && errno == ENXIO)
269 /* If errno == ENXIO, that means we've reached the final hole of the file and
270 * that hole isn't followed by more data. */
271 e = lseek(fdf, 0, SEEK_END);
272 if (e < 0)
273 return -errno;
274
275 /* If we're in a hole (current offset is not a data offset), create a hole of the
276 * same size in the target file. */
277 if (e > c) {
278 /* Make sure our new hole doesn't go over the maximum size we're allowed to copy. */
279 n = MIN(max_bytes, (uint64_t) e - c);
280 r = create_hole(fdt, n);
281 if (r < 0)
282 return r;
283
284 /* Make sure holes are taken into account in the maximum size we're supposed to copy. */
285 if (max_bytes != UINT64_MAX) {
286 max_bytes -= n;
287 if (max_bytes <= 0)
288 break;
289 }
290
291 /* Update the size we're supposed to copy in this iteration if needed. */
292 if (m > max_bytes)
293 m = max_bytes;
294 }
295
296 c = e; /* Set c to the start of the data segment. */
297
298 /* After copying a potential hole, find the end of the data segment by looking for
299 * the next hole. If we get ENXIO, we're at EOF. */
300 e = lseek(fdf, c, SEEK_HOLE);
301 if (e < 0) {
302 if (errno == ENXIO)
303 break;
304 return -errno;
305 }
306
307 /* SEEK_HOLE modifies the file offset so we need to move back to the initial offset. */
308 if (lseek(fdf, c, SEEK_SET) < 0)
309 return -errno;
310
311 /* Make sure we're not copying more than the current data segment. */
312 m = MIN(m, (size_t) e - c);
313 }
314
315 /* First try copy_file_range(), unless we already tried */
316 if (try_cfr) {
317 n = try_copy_file_range(fdf, NULL, fdt, NULL, m, 0u);
318 if (n < 0) {
319 if (!IN_SET(n, -EINVAL, -ENOSYS, -EXDEV, -EBADF))
320 return n;
321
322 try_cfr = false;
323 /* use fallback below */
324 } else if (n == 0) { /* likely EOF */
325
326 if (copied_something)
327 break;
328
329 /* So, we hit EOF immediately, without having copied a single byte. This
330 * could indicate two things: the file is actually empty, or we are on some
331 * virtual file system such as procfs/sysfs where the syscall actually
332 * doesn't work but doesn't return an error. Try to handle that, by falling
333                                  * doesn't work but doesn't return an error. Try to handle that by falling
334 *
335 * See: https://lwn.net/Articles/846403/ */
336 try_cfr = try_sendfile = try_splice = false;
337 } else
338 /* Success! */
339 goto next;
340 }
341
342                 /* Then try sendfile(), unless we already tried */
343 if (try_sendfile) {
344 n = sendfile(fdt, fdf, NULL, m);
345 if (n < 0) {
346 if (!IN_SET(errno, EINVAL, ENOSYS))
347 return -errno;
348
349 try_sendfile = false;
350 /* use fallback below */
351 } else if (n == 0) { /* likely EOF */
352
353 if (copied_something)
354 break;
355
356 try_sendfile = try_splice = false; /* same logic as above for copy_file_range() */
357 } else
358 /* Success! */
359 goto next;
360 }
361
362 /* Then try splice, unless we already tried. */
363 if (try_splice) {
364
365 /* splice()'s asynchronous I/O support is a bit weird. When it encounters a pipe file
366 * descriptor, then it will ignore its O_NONBLOCK flag and instead only honour the
367 * SPLICE_F_NONBLOCK flag specified in its flag parameter. Let's hide this behaviour
368                          * here, and check if either of the specified fds is a pipe, and if so, let's pass
369 * the flag automatically, depending on O_NONBLOCK being set.
370 *
371 * Here's a twist though: when we use it to move data between two pipes of which one
372 * has O_NONBLOCK set and the other has not, then we have no individual control over
373 * O_NONBLOCK behaviour. Hence in that case we can't use splice() and still guarantee
374 * systematic O_NONBLOCK behaviour, hence don't. */
375
376 if (nonblock_pipe < 0) {
377 int a, b;
378
379 /* Check if either of these fds is a pipe, and if so non-blocking or not */
380 a = fd_is_nonblock_pipe(fdf);
381 if (a < 0)
382 return a;
383
384 b = fd_is_nonblock_pipe(fdt);
385 if (b < 0)
386 return b;
387
388 if ((a == FD_IS_NO_PIPE && b == FD_IS_NO_PIPE) ||
389 (a == FD_IS_BLOCKING_PIPE && b == FD_IS_NONBLOCKING_PIPE) ||
390 (a == FD_IS_NONBLOCKING_PIPE && b == FD_IS_BLOCKING_PIPE))
391
392 /* splice() only works if one of the fds is a pipe. If neither is,
393                                                          * let's skip this step right away. As mentioned above, if one of the
394                                                          * two fds refers to a blocking pipe and the other to a non-blocking
395                                                          * pipe, we can't use splice() either, hence don't try it. This means
396                                                          * we can only use splice() if either just one of the two
397 * fds is a pipe, or if both are pipes with the same nonblocking flag
398 * setting. */
399
400 try_splice = false;
401 else
402 nonblock_pipe = a == FD_IS_NONBLOCKING_PIPE || b == FD_IS_NONBLOCKING_PIPE;
403 }
404 }
405
406 if (try_splice) {
407 n = splice(fdf, NULL, fdt, NULL, m, nonblock_pipe ? SPLICE_F_NONBLOCK : 0);
408 if (n < 0) {
409 if (!IN_SET(errno, EINVAL, ENOSYS))
410 return -errno;
411
412 try_splice = false;
413 /* use fallback below */
414 } else if (n == 0) { /* likely EOF */
415
416 if (copied_something)
417 break;
418
419 try_splice = false; /* same logic as above for copy_file_range() + sendfile() */
420 } else
421 /* Success! */
422 goto next;
423 }
424
425 /* As a fallback just copy bits by hand */
426 {
427 uint8_t buf[MIN(m, COPY_BUFFER_SIZE)], *p = buf;
428 ssize_t z;
429
430 n = read(fdf, buf, sizeof buf);
431 if (n < 0)
432 return -errno;
433 if (n == 0) /* EOF */
434 break;
435
436 z = (size_t) n;
437 do {
438 ssize_t k;
439
440 k = write(fdt, p, z);
441 if (k < 0) {
442 r = -errno;
443
444 if (ret_remains) {
445 void *copy;
446
447 copy = memdup(p, z);
448 if (!copy)
449 return -ENOMEM;
450
451 *ret_remains = copy;
452 }
453
454 if (ret_remains_size)
455 *ret_remains_size = z;
456
457 return r;
458 }
459
460 assert(k <= z);
461 z -= k;
462 p += k;
463 } while (z > 0);
464 }
465
466 next:
467 if (progress) {
468 r = progress(n, userdata);
469 if (r < 0)
470 return r;
471 }
472
473 if (max_bytes != UINT64_MAX) {
474 assert(max_bytes >= (uint64_t) n);
475 max_bytes -= n;
476 }
477
478 /* sendfile accepts at most SSIZE_MAX-offset bytes to copy, so reduce our maximum by the
479                          * amount we already copied, but don't go below our copy buffer size, unless we are close to the
480 * limit of bytes we are allowed to copy. */
481 m = MAX(MIN(COPY_BUFFER_SIZE, max_bytes), m - n);
482
483 copied_something = true;
484 }
485
486 if (copy_flags & COPY_TRUNCATE) {
487 off_t off = lseek(fdt, 0, SEEK_CUR);
488 if (off < 0)
489 return -errno;
490
491 if (ftruncate(fdt, off) < 0)
492 return -errno;
493 }
494
495 return max_bytes <= 0; /* return 0 if we hit EOF earlier than the size limit */
496 }
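
/* An illustrative usage sketch of copy_bytes_full() (not taken from the callers in this file; the fd
 * names are placeholders): copy everything from one already-open fd to another, preferring reflinks and
 * reproducing holes, without collecting the "remaining" buffer or reporting progress. */
static int example_copy_everything(int from_fd, int to_fd) {
        int r;

        r = copy_bytes_full(from_fd, to_fd, UINT64_MAX,
                            COPY_REFLINK|COPY_HOLES,
                            /* ret_remains= */ NULL, /* ret_remains_size= */ NULL,
                            /* progress= */ NULL, /* userdata= */ NULL);
        if (r < 0)
                return r;

        /* With UINT64_MAX as the limit, hitting EOF (r == 0) is the expected outcome. */
        return 0;
}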
497
498 static int fd_copy_symlink(
499 int df,
500 const char *from,
501 const struct stat *st,
502 int dt,
503 const char *to,
504 uid_t override_uid,
505 gid_t override_gid,
506 CopyFlags copy_flags) {
507
508 _cleanup_free_ char *target = NULL;
509 int r;
510
511 assert(from);
512 assert(st);
513 assert(to);
514
515 r = readlinkat_malloc(df, from, &target);
516 if (r < 0)
517 return r;
518
519 if (copy_flags & COPY_MAC_CREATE) {
520 r = mac_selinux_create_file_prepare_at(dt, to, S_IFLNK);
521 if (r < 0)
522 return r;
523 }
524 r = RET_NERRNO(symlinkat(target, dt, to));
525 if (copy_flags & COPY_MAC_CREATE)
526 mac_selinux_create_file_clear();
527 if (r < 0) {
528 if (FLAGS_SET(copy_flags, COPY_GRACEFUL_WARN) && (ERRNO_IS_PRIVILEGE(r) || ERRNO_IS_NOT_SUPPORTED(r))) {
529 log_notice_errno(r, "Failed to copy symlink '%s', ignoring: %m", from);
530 return 0;
531 }
532
533 return r;
534 }
535
536 if (fchownat(dt, to,
537 uid_is_valid(override_uid) ? override_uid : st->st_uid,
538 gid_is_valid(override_gid) ? override_gid : st->st_gid,
539 AT_SYMLINK_NOFOLLOW) < 0)
540 r = -errno;
541
542 (void) copy_xattr(df, from, dt, to, copy_flags);
543 (void) utimensat(dt, to, (struct timespec[]) { st->st_atim, st->st_mtim }, AT_SYMLINK_NOFOLLOW);
544 return r;
545 }
546
547 /* Encapsulates the database we store potential hardlink targets in */
548 typedef struct HardlinkContext {
549 int dir_fd; /* An fd to the directory we use as lookup table. Never AT_FDCWD. Lazily created, when
550 * we add the first entry. */
551
552 /* These two fields are used to create the hardlink repository directory above — via
553 * mkdirat(parent_fd, subdir) — and are kept so that we can automatically remove the directory again
554 * when we are done. */
555 int parent_fd; /* Possibly AT_FDCWD */
556 char *subdir;
557 } HardlinkContext;
558
559 static int hardlink_context_setup(
560 HardlinkContext *c,
561 int dt,
562 const char *to,
563 CopyFlags copy_flags) {
564
565 _cleanup_close_ int dt_copy = -EBADF;
566 int r;
567
568 assert(c);
569 assert(c->dir_fd < 0 && c->dir_fd != AT_FDCWD);
570 assert(c->parent_fd < 0);
571 assert(!c->subdir);
572
573 /* If hardlink recreation is requested we have to maintain a database of inodes that are potential
574 * hardlink sources. Given that generally disk sizes have to be assumed to be larger than what fits
575 * into physical RAM we cannot maintain that database in dynamic memory alone. Here we opt to
576 * maintain it on disk, to simplify things: inside the destination directory we'll maintain a
577 * temporary directory consisting of hardlinks of every inode we copied that might be subject of
578 * hardlinks. We can then use that as hardlink source later on. Yes, this means additional disk IO
579 * but thankfully Linux is optimized for this kind of thing. If this ever becomes a performance
580 * bottleneck we can certainly place an in-memory hash table in front of this, but for the beginning,
581 * let's keep things simple, and just use the disk as lookup table for inodes.
582 *
583 * Note that this should have zero performance impact as long as .n_link of all files copied remains
584                  * <= 1, because in that case we will not actually allocate the hardlink inode lookup table directory
585 * on disk (we do so lazily, when the first candidate with .n_link > 1 is seen). This means, in the
586 * common case where hardlinks are not used at all or only for few files the fact that we store the
587                  * table on disk shouldn't matter performance-wise. */
588
589 if (!FLAGS_SET(copy_flags, COPY_HARDLINKS))
590 return 0;
591
592 if (dt == AT_FDCWD)
593 dt_copy = AT_FDCWD;
594 else if (dt < 0)
595 return -EBADF;
596 else {
597 dt_copy = fcntl(dt, F_DUPFD_CLOEXEC, 3);
598 if (dt_copy < 0)
599 return -errno;
600 }
601
602 r = tempfn_random_child(to, "hardlink", &c->subdir);
603 if (r < 0)
604 return r;
605
606 c->parent_fd = TAKE_FD(dt_copy);
607
608 /* We don't actually create the directory we keep the table in here, that's done on-demand when the
609 * first entry is added, using hardlink_context_realize() below. */
610 return 1;
611 }
612
613 static int hardlink_context_realize(HardlinkContext *c) {
614 if (!c)
615 return 0;
616
617 if (c->dir_fd >= 0) /* Already realized */
618 return 1;
619
620 if (c->parent_fd < 0 && c->parent_fd != AT_FDCWD) /* Not configured */
621 return 0;
622
623 assert(c->subdir);
624
625 c->dir_fd = open_mkdir_at(c->parent_fd, c->subdir, O_EXCL|O_CLOEXEC, 0700);
626 if (c->dir_fd < 0)
627 return c->dir_fd;
628
629 return 1;
630 }
631
632 static void hardlink_context_destroy(HardlinkContext *c) {
633 int r;
634
635 assert(c);
636
637 /* Automatically remove the hardlink lookup table directory again after we are done. This is used via
638 * _cleanup_() so that we really delete this, even on failure. */
639
640 if (c->dir_fd >= 0) {
641                 /* <dir_fd> might already have been used for reading, so we need to rewind it. */
642 if (lseek(c->dir_fd, 0, SEEK_SET) < 0)
643 log_debug_errno(errno, "Failed to lseek on file descriptor, ignoring: %m");
644
645 r = rm_rf_children(TAKE_FD(c->dir_fd), REMOVE_PHYSICAL, NULL); /* consumes dir_fd in all cases, even on failure */
646 if (r < 0)
647 log_debug_errno(r, "Failed to remove hardlink store (%s) contents, ignoring: %m", c->subdir);
648
649 assert(c->parent_fd >= 0 || c->parent_fd == AT_FDCWD);
650 assert(c->subdir);
651
652 if (unlinkat(c->parent_fd, c->subdir, AT_REMOVEDIR) < 0)
653 log_debug_errno(errno, "Failed to remove hardlink store (%s) directory, ignoring: %m", c->subdir);
654 }
655
656 assert_cc(AT_FDCWD < 0);
657 c->parent_fd = safe_close(c->parent_fd);
658
659 c->subdir = mfree(c->subdir);
660 }
661
662 static int try_hardlink(
663 HardlinkContext *c,
664 const struct stat *st,
665 int dt,
666 const char *to) {
667
668 char dev_ino[DECIMAL_STR_MAX(dev_t)*2 + DECIMAL_STR_MAX(uint64_t) + 4];
669
670 assert(st);
671 assert(dt >= 0 || dt == AT_FDCWD);
672 assert(to);
673
674 if (!c) /* No temporary hardlink directory, don't bother */
675 return 0;
676
677 if (st->st_nlink <= 1) /* Source not hardlinked, don't bother */
678 return 0;
679
680 if (c->dir_fd < 0) /* not yet realized, hence empty */
681 return 0;
682
683 xsprintf(dev_ino, "%u:%u:%" PRIu64, major(st->st_dev), minor(st->st_dev), (uint64_t) st->st_ino);
684 if (linkat(c->dir_fd, dev_ino, dt, to, 0) < 0) {
685 if (errno != ENOENT) /* doesn't exist in store yet */
686 log_debug_errno(errno, "Failed to hardlink %s to %s, ignoring: %m", dev_ino, to);
687 return 0;
688 }
689
690 return 1;
691 }
692
693 static int memorize_hardlink(
694 HardlinkContext *c,
695 const struct stat *st,
696 int dt,
697 const char *to) {
698
699 char dev_ino[DECIMAL_STR_MAX(dev_t)*2 + DECIMAL_STR_MAX(uint64_t) + 4];
700 int r;
701
702 assert(st);
703 assert(dt >= 0 || dt == AT_FDCWD);
704 assert(to);
705
706 if (!c) /* No temporary hardlink directory, don't bother */
707 return 0;
708
709 if (st->st_nlink <= 1) /* Source not hardlinked, don't bother */
710 return 0;
711
712 r = hardlink_context_realize(c); /* Create the hardlink store lazily */
713 if (r < 0)
714 return r;
715
716 xsprintf(dev_ino, "%u:%u:%" PRIu64, major(st->st_dev), minor(st->st_dev), (uint64_t) st->st_ino);
717 if (linkat(dt, to, c->dir_fd, dev_ino, 0) < 0) {
718 log_debug_errno(errno, "Failed to hardlink %s to %s, ignoring: %m", to, dev_ino);
719 return 0;
720 }
721
722 return 1;
723 }
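
/* A sketch of the intended call pattern around the hardlink store, as used by the fd_copy_*() helpers
 * below (the helper name here is made up for illustration): first ask try_hardlink() whether this inode
 * was copied before and can simply be linked again; otherwise copy the payload and memorize_hardlink()
 * the result so that later occurrences of the same inode become hardlinks. */
static int example_copy_or_link(HardlinkContext *c, const struct stat *st, int dt, const char *to) {
        int r;

        r = try_hardlink(c, st, dt, to);
        if (r != 0)
                return r < 0 ? r : 0; /* error, or an existing copy of this inode was linked into place */

        /* … copy the inode's payload here, e.g. via copy_bytes_full() … */

        (void) memorize_hardlink(c, st, dt, to);
        return 0;
}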
724
725 static int fd_copy_tree_generic(
726 int df,
727 const char *from,
728 const struct stat *st,
729 int dt,
730 const char *to,
731 dev_t original_device,
732 unsigned depth_left,
733 uid_t override_uid,
734 gid_t override_gid,
735 CopyFlags copy_flags,
736 Hashmap *denylist,
737 Set *subvolumes,
738 HardlinkContext *hardlink_context,
739 const char *display_path,
740 copy_progress_path_t progress_path,
741 copy_progress_bytes_t progress_bytes,
742 void *userdata);
743
744 static int fd_copy_regular(
745 int df,
746 const char *from,
747 const struct stat *st,
748 int dt,
749 const char *to,
750 uid_t override_uid,
751 gid_t override_gid,
752 CopyFlags copy_flags,
753 HardlinkContext *hardlink_context,
754 copy_progress_bytes_t progress,
755 void *userdata) {
756
757 _cleanup_close_ int fdf = -EBADF, fdt = -EBADF;
758 int r, q;
759
760 assert(from);
761 assert(st);
762 assert(to);
763
764 r = try_hardlink(hardlink_context, st, dt, to);
765 if (r < 0)
766 return r;
767 if (r > 0) /* worked! */
768 return 0;
769
770 fdf = openat(df, from, O_RDONLY|O_CLOEXEC|O_NOCTTY|O_NOFOLLOW);
771 if (fdf < 0)
772 return -errno;
773
774 if (copy_flags & COPY_MAC_CREATE) {
775 r = mac_selinux_create_file_prepare_at(dt, to, S_IFREG);
776 if (r < 0)
777 return r;
778 }
779 fdt = openat(dt, to, O_WRONLY|O_CREAT|O_EXCL|O_CLOEXEC|O_NOCTTY|O_NOFOLLOW, st->st_mode & 07777);
780 if (copy_flags & COPY_MAC_CREATE)
781 mac_selinux_create_file_clear();
782 if (fdt < 0)
783 return -errno;
784
785 r = copy_bytes_full(fdf, fdt, UINT64_MAX, copy_flags, NULL, NULL, progress, userdata);
786 if (r < 0)
787 goto fail;
788
789 if (fchown(fdt,
790 uid_is_valid(override_uid) ? override_uid : st->st_uid,
791 gid_is_valid(override_gid) ? override_gid : st->st_gid) < 0)
792 r = -errno;
793
794 if (fchmod(fdt, st->st_mode & 07777) < 0)
795 r = -errno;
796
797 (void) futimens(fdt, (struct timespec[]) { st->st_atim, st->st_mtim });
798 (void) copy_xattr(fdf, NULL, fdt, NULL, copy_flags);
799
800 if (copy_flags & COPY_FSYNC) {
801 if (fsync(fdt) < 0) {
802 r = -errno;
803 goto fail;
804 }
805 }
806
807 q = close_nointr(TAKE_FD(fdt)); /* even if this fails, the fd is now invalidated */
808 if (q < 0) {
809 r = q;
810 goto fail;
811 }
812
813 (void) memorize_hardlink(hardlink_context, st, dt, to);
814 return r;
815
816 fail:
817 (void) unlinkat(dt, to, 0);
818 return r;
819 }
820
821 static int fd_copy_fifo(
822 int df,
823 const char *from,
824 const struct stat *st,
825 int dt,
826 const char *to,
827 uid_t override_uid,
828 gid_t override_gid,
829 CopyFlags copy_flags,
830 HardlinkContext *hardlink_context) {
831 int r;
832
833 assert(from);
834 assert(st);
835 assert(to);
836
837 r = try_hardlink(hardlink_context, st, dt, to);
838 if (r < 0)
839 return r;
840 if (r > 0) /* worked! */
841 return 0;
842
843 if (copy_flags & COPY_MAC_CREATE) {
844 r = mac_selinux_create_file_prepare_at(dt, to, S_IFIFO);
845 if (r < 0)
846 return r;
847 }
848 r = RET_NERRNO(mkfifoat(dt, to, st->st_mode & 07777));
849 if (copy_flags & COPY_MAC_CREATE)
850 mac_selinux_create_file_clear();
851 if (FLAGS_SET(copy_flags, COPY_GRACEFUL_WARN) && (ERRNO_IS_NEG_PRIVILEGE(r) || ERRNO_IS_NEG_NOT_SUPPORTED(r))) {
852 log_notice_errno(r, "Failed to copy fifo '%s', ignoring: %m", from);
853 return 0;
854 } else if (r < 0)
855 return r;
856
857 if (fchownat(dt, to,
858 uid_is_valid(override_uid) ? override_uid : st->st_uid,
859 gid_is_valid(override_gid) ? override_gid : st->st_gid,
860 AT_SYMLINK_NOFOLLOW) < 0)
861 r = -errno;
862
863 if (fchmodat(dt, to, st->st_mode & 07777, 0) < 0)
864 r = -errno;
865
866 (void) utimensat(dt, to, (struct timespec[]) { st->st_atim, st->st_mtim }, AT_SYMLINK_NOFOLLOW);
867
868 (void) memorize_hardlink(hardlink_context, st, dt, to);
869 return r;
870 }
871
872 static int fd_copy_node(
873 int df,
874 const char *from,
875 const struct stat *st,
876 int dt,
877 const char *to,
878 uid_t override_uid,
879 gid_t override_gid,
880 CopyFlags copy_flags,
881 HardlinkContext *hardlink_context) {
882 int r;
883
884 assert(from);
885 assert(st);
886 assert(to);
887
888 r = try_hardlink(hardlink_context, st, dt, to);
889 if (r < 0)
890 return r;
891 if (r > 0) /* worked! */
892 return 0;
893
894 if (copy_flags & COPY_MAC_CREATE) {
895 r = mac_selinux_create_file_prepare_at(dt, to, st->st_mode & S_IFMT);
896 if (r < 0)
897 return r;
898 }
899 r = RET_NERRNO(mknodat(dt, to, st->st_mode, st->st_rdev));
900 if (copy_flags & COPY_MAC_CREATE)
901 mac_selinux_create_file_clear();
902 if (FLAGS_SET(copy_flags, COPY_GRACEFUL_WARN) && (ERRNO_IS_NEG_PRIVILEGE(r) || ERRNO_IS_NEG_NOT_SUPPORTED(r))) {
903 log_notice_errno(r, "Failed to copy node '%s', ignoring: %m", from);
904 return 0;
905 } else if (r < 0)
906 return r;
907
908 if (fchownat(dt, to,
909 uid_is_valid(override_uid) ? override_uid : st->st_uid,
910 gid_is_valid(override_gid) ? override_gid : st->st_gid,
911 AT_SYMLINK_NOFOLLOW) < 0)
912 r = -errno;
913
914 if (fchmodat(dt, to, st->st_mode & 07777, 0) < 0)
915 r = -errno;
916
917 (void) utimensat(dt, to, (struct timespec[]) { st->st_atim, st->st_mtim }, AT_SYMLINK_NOFOLLOW);
918
919 (void) memorize_hardlink(hardlink_context, st, dt, to);
920 return r;
921 }
922
923 static int fd_copy_directory(
924 int df,
925 const char *from,
926 const struct stat *st,
927 int dt,
928 const char *to,
929 dev_t original_device,
930 unsigned depth_left,
931 uid_t override_uid,
932 gid_t override_gid,
933 CopyFlags copy_flags,
934 Hashmap *denylist,
935 Set *subvolumes,
936 HardlinkContext *hardlink_context,
937 const char *display_path,
938 copy_progress_path_t progress_path,
939 copy_progress_bytes_t progress_bytes,
940 void *userdata) {
941
942 _cleanup_(hardlink_context_destroy) HardlinkContext our_hardlink_context = {
943 .dir_fd = -EBADF,
944 .parent_fd = -EBADF,
945 };
946
947 _cleanup_close_ int fdf = -EBADF, fdt = -EBADF;
948 _cleanup_closedir_ DIR *d = NULL;
949 bool exists;
950 int r;
951
952 assert(st);
953 assert(to);
954
955 if (depth_left == 0)
956 return -ENAMETOOLONG;
957
958 if (from)
959 fdf = openat(df, from, O_RDONLY|O_DIRECTORY|O_CLOEXEC|O_NOCTTY|O_NOFOLLOW);
960 else
961 fdf = fcntl(df, F_DUPFD_CLOEXEC, 3);
962 if (fdf < 0)
963 return -errno;
964
965 if (!hardlink_context) {
966 /* If recreating hardlinks is requested let's set up a context for that now. */
967 r = hardlink_context_setup(&our_hardlink_context, dt, to, copy_flags);
968 if (r < 0)
969 return r;
970 if (r > 0) /* It's enabled and allocated, let's now use the same context for all recursive
971 * invocations from here down */
972 hardlink_context = &our_hardlink_context;
973 }
974
975 d = take_fdopendir(&fdf);
976 if (!d)
977 return -errno;
978
979 r = dir_is_empty_at(dt, to, /* ignore_hidden_or_backup= */ false);
980 if (r < 0 && r != -ENOENT)
981 return r;
982 if ((r > 0 && !(copy_flags & (COPY_MERGE|COPY_MERGE_EMPTY))) || (r == 0 && !FLAGS_SET(copy_flags, COPY_MERGE)))
983 return -EEXIST;
984
985 exists = r >= 0;
986
987 fdt = xopenat_lock(dt, to,
988 O_RDONLY|O_DIRECTORY|O_CLOEXEC|O_NOCTTY|O_NOFOLLOW|(exists ? 0 : O_CREAT|O_EXCL),
989 (copy_flags & COPY_MAC_CREATE ? XO_LABEL : 0)|(set_contains(subvolumes, st) ? XO_SUBVOLUME : 0),
990 st->st_mode & 07777,
991 copy_flags & COPY_LOCK_BSD ? LOCK_BSD : LOCK_NONE,
992 LOCK_EX);
993 if (fdt < 0)
994 return fdt;
995
996 r = 0;
997
998 if (PTR_TO_INT(hashmap_get(denylist, st)) == DENY_CONTENTS) {
999 log_debug("%s is in the denylist, not recursing", from);
1000 goto finish;
1001 }
1002
1003 FOREACH_DIRENT_ALL(de, d, return -errno) {
1004 const char *child_display_path = NULL;
1005 _cleanup_free_ char *dp = NULL;
1006 struct stat buf;
1007 int q;
1008
1009 if (dot_or_dot_dot(de->d_name))
1010 continue;
1011
1012 r = look_for_signals(copy_flags);
1013 if (r < 0)
1014 return r;
1015
1016 if (fstatat(dirfd(d), de->d_name, &buf, AT_SYMLINK_NOFOLLOW) < 0) {
1017 r = -errno;
1018 continue;
1019 }
1020
1021 if (progress_path) {
1022 if (display_path)
1023 child_display_path = dp = path_join(display_path, de->d_name);
1024 else
1025 child_display_path = de->d_name;
1026
1027 r = progress_path(child_display_path, &buf, userdata);
1028 if (r < 0)
1029 return r;
1030 }
1031
1032 if (PTR_TO_INT(hashmap_get(denylist, &buf)) == DENY_INODE) {
1033 log_debug("%s/%s is in the denylist, ignoring", from, de->d_name);
1034 continue;
1035 }
1036
1037 if (S_ISDIR(buf.st_mode)) {
1038 /*
1039 * Don't descend into directories on other file systems, if this is requested. We do a simple
1040 * .st_dev check here, which basically comes for free. Note that we do this check only on
1041 * directories, not other kind of file system objects, for two reason:
1042                          * directories, not other kinds of file system objects, for two reasons:
1043 * • The kernel's overlayfs pseudo file system that overlays multiple real file systems
1044 * propagates the .st_dev field of the file system a file originates from all the way up
1045 * through the stack to stat(). It doesn't do that for directories however. This means that
1046 * comparing .st_dev on non-directories suggests that they all are mount points. To avoid
1047 * confusion we hence avoid relying on this check for regular files.
1048 *
1049 * • The main reason we do this check at all is to protect ourselves from bind mount cycles,
1050 * where we really want to avoid descending down in all eternity. However the .st_dev check
1051 * is usually not sufficient for this protection anyway, as bind mount cycles from the same
1052 * file system onto itself can't be detected that way. (Note we also do a recursion depth
1053 * check, which is probably the better protection in this regard, which is why
1054 * COPY_SAME_MOUNT is optional).
1055 */
1056
1057 if (FLAGS_SET(copy_flags, COPY_SAME_MOUNT)) {
1058 if (buf.st_dev != original_device)
1059 continue;
1060
1061 r = fd_is_mount_point(dirfd(d), de->d_name, 0);
1062 if (r < 0)
1063 return r;
1064 if (r > 0)
1065 continue;
1066 }
1067 }
1068
1069 q = fd_copy_tree_generic(dirfd(d), de->d_name, &buf, fdt, de->d_name, original_device,
1070 depth_left-1, override_uid, override_gid, copy_flags & ~COPY_LOCK_BSD,
1071 denylist, subvolumes, hardlink_context, child_display_path, progress_path,
1072 progress_bytes, userdata);
1073
1074 if (q == -EINTR) /* Propagate SIGINT/SIGTERM up instantly */
1075 return q;
1076 if (q == -EEXIST && (copy_flags & COPY_MERGE))
1077 q = 0;
1078 if (q < 0)
1079 r = q;
1080 }
1081
1082 finish:
1083 if (!exists) {
1084 if (fchown(fdt,
1085 uid_is_valid(override_uid) ? override_uid : st->st_uid,
1086 gid_is_valid(override_gid) ? override_gid : st->st_gid) < 0)
1087 r = -errno;
1088
1089 if (fchmod(fdt, st->st_mode & 07777) < 0)
1090 r = -errno;
1091
1092 (void) copy_xattr(dirfd(d), NULL, fdt, NULL, copy_flags);
1093 (void) futimens(fdt, (struct timespec[]) { st->st_atim, st->st_mtim });
1094 }
1095
1096 if (copy_flags & COPY_FSYNC_FULL) {
1097 if (fsync(fdt) < 0)
1098 return -errno;
1099 }
1100
1101 if (r < 0)
1102 return r;
1103
1104 return copy_flags & COPY_LOCK_BSD ? TAKE_FD(fdt) : 0;
1105 }
1106
1107 static int fd_copy_leaf(
1108 int df,
1109 const char *from,
1110 const struct stat *st,
1111 int dt,
1112 const char *to,
1113 uid_t override_uid,
1114 gid_t override_gid,
1115 CopyFlags copy_flags,
1116 HardlinkContext *hardlink_context,
1117 const char *display_path,
1118 copy_progress_bytes_t progress_bytes,
1119 void *userdata) {
1120 int r;
1121
1122 if (S_ISREG(st->st_mode))
1123 r = fd_copy_regular(df, from, st, dt, to, override_uid, override_gid, copy_flags, hardlink_context, progress_bytes, userdata);
1124 else if (S_ISLNK(st->st_mode))
1125 r = fd_copy_symlink(df, from, st, dt, to, override_uid, override_gid, copy_flags);
1126 else if (S_ISFIFO(st->st_mode))
1127 r = fd_copy_fifo(df, from, st, dt, to, override_uid, override_gid, copy_flags, hardlink_context);
1128 else if (S_ISBLK(st->st_mode) || S_ISCHR(st->st_mode) || S_ISSOCK(st->st_mode))
1129 r = fd_copy_node(df, from, st, dt, to, override_uid, override_gid, copy_flags, hardlink_context);
1130 else
1131 r = -EOPNOTSUPP;
1132
1133 return r;
1134 }
1135
1136 static int fd_copy_tree_generic(
1137 int df,
1138 const char *from,
1139 const struct stat *st,
1140 int dt,
1141 const char *to,
1142 dev_t original_device,
1143 unsigned depth_left,
1144 uid_t override_uid,
1145 gid_t override_gid,
1146 CopyFlags copy_flags,
1147 Hashmap *denylist,
1148 Set *subvolumes,
1149 HardlinkContext *hardlink_context,
1150 const char *display_path,
1151 copy_progress_path_t progress_path,
1152 copy_progress_bytes_t progress_bytes,
1153 void *userdata) {
1154
1155 int r;
1156
1157 assert(!FLAGS_SET(copy_flags, COPY_LOCK_BSD));
1158
1159 if (S_ISDIR(st->st_mode))
1160 return fd_copy_directory(df, from, st, dt, to, original_device, depth_left-1, override_uid,
1161 override_gid, copy_flags, denylist, subvolumes, hardlink_context,
1162 display_path, progress_path, progress_bytes, userdata);
1163
1164 DenyType t = PTR_TO_INT(hashmap_get(denylist, st));
1165 if (t == DENY_INODE) {
1166 log_debug("%s is in the denylist, ignoring", from);
1167 return 0;
1168 } else if (t == DENY_CONTENTS)
1169 log_debug("%s is configured to have its contents excluded, but is not a directory", from);
1170
1171 r = fd_copy_leaf(df, from, st, dt, to, override_uid, override_gid, copy_flags, hardlink_context, display_path, progress_bytes, userdata);
1172 /* We just tried to copy a leaf node of the tree. If it failed because the node already exists *and* the COPY_REPLACE flag has been provided, we should unlink the node and re-copy. */
1173 if (r == -EEXIST && (copy_flags & COPY_REPLACE)) {
1174                 /* We are already handling a failed copy here; if the unlink fails as well, let's just return the original error. */
1175 if (unlinkat(dt, to, 0) < 0)
1176 return r;
1177
1178 r = fd_copy_leaf(df, from, st, dt, to, override_uid, override_gid, copy_flags, hardlink_context, display_path, progress_bytes, userdata);
1179 }
1180
1181 return r;
1182 }
1183
1184 int copy_tree_at_full(
1185 int fdf,
1186 const char *from,
1187 int fdt,
1188 const char *to,
1189 uid_t override_uid,
1190 gid_t override_gid,
1191 CopyFlags copy_flags,
1192 Hashmap *denylist,
1193 Set *subvolumes,
1194 copy_progress_path_t progress_path,
1195 copy_progress_bytes_t progress_bytes,
1196 void *userdata) {
1197
1198 struct stat st;
1199 int r;
1200
1201 assert(from);
1202 assert(to);
1203 assert(!FLAGS_SET(copy_flags, COPY_LOCK_BSD));
1204
1205 if (fstatat(fdf, from, &st, AT_SYMLINK_NOFOLLOW) < 0)
1206 return -errno;
1207
1208 r = fd_copy_tree_generic(fdf, from, &st, fdt, to, st.st_dev, COPY_DEPTH_MAX, override_uid,
1209 override_gid, copy_flags, denylist, subvolumes, NULL, NULL, progress_path,
1210 progress_bytes, userdata);
1211 if (r < 0)
1212 return r;
1213
1214 if (S_ISDIR(st.st_mode) && (copy_flags & COPY_SYNCFS)) {
1215                 /* If the top-level inode is a directory, run syncfs() now. */
1216 r = syncfs_path(fdt, to);
1217 if (r < 0)
1218 return r;
1219 } else if ((copy_flags & (COPY_FSYNC_FULL|COPY_SYNCFS)) != 0) {
1220 /* fsync() the parent dir of what we just copied if COPY_FSYNC_FULL is set. Also do this in
1221 * case COPY_SYNCFS is set but the top-level inode wasn't actually a directory. We do this so that
1222 * COPY_SYNCFS provides reasonable synchronization semantics on any kind of inode: when the
1223 * copy operation is done the whole inode — regardless of its type — and all its children
1224 * will be synchronized to disk. */
1225 r = fsync_parent_at(fdt, to);
1226 if (r < 0)
1227 return r;
1228 }
1229
1230 return 0;
1231 }
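
/* An illustrative caller sketch for copy_tree_at_full() (the paths are placeholders): recursively copy
 * a tree relative to the current working directory, keeping the original ownership, recreating
 * hardlinks, merging into an existing destination, and syncing the file system when done. */
static int example_copy_tree(void) {
        return copy_tree_at_full(AT_FDCWD, "src-tree",
                                 AT_FDCWD, "dst-tree",
                                 UID_INVALID, GID_INVALID,
                                 COPY_REFLINK|COPY_HARDLINKS|COPY_MERGE|COPY_SYNCFS,
                                 /* denylist= */ NULL, /* subvolumes= */ NULL,
                                 /* progress_path= */ NULL, /* progress_bytes= */ NULL,
                                 /* userdata= */ NULL);
}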
1232
1233 static int sync_dir_by_flags(int dir_fd, const char *path, CopyFlags copy_flags) {
1234 assert(dir_fd >= 0 || dir_fd == AT_FDCWD);
1235 assert(path);
1236
1237 if (copy_flags & COPY_SYNCFS)
1238 return syncfs_path(dir_fd, path);
1239 if (copy_flags & COPY_FSYNC_FULL)
1240 return fsync_parent_at(dir_fd, path);
1241
1242 return 0;
1243 }
1244
1245 int copy_directory_at_full(
1246 int dir_fdf,
1247 const char *from,
1248 int dir_fdt,
1249 const char *to,
1250 CopyFlags copy_flags,
1251 copy_progress_path_t progress_path,
1252 copy_progress_bytes_t progress_bytes,
1253 void *userdata) {
1254
1255 _cleanup_close_ int fdt = -EBADF;
1256 struct stat st;
1257 int r;
1258
1259 assert(dir_fdf >= 0 || dir_fdf == AT_FDCWD);
1260 assert(dir_fdt >= 0 || dir_fdt == AT_FDCWD);
1261 assert(to);
1262
1263 if (fstatat(dir_fdf, strempty(from), &st, AT_SYMLINK_NOFOLLOW|(isempty(from) ? AT_EMPTY_PATH : 0)) < 0)
1264 return -errno;
1265
1266 r = stat_verify_directory(&st);
1267 if (r < 0)
1268 return r;
1269
1270 r = fd_copy_directory(
1271 dir_fdf, from,
1272 &st,
1273 dir_fdt, to,
1274 st.st_dev,
1275 COPY_DEPTH_MAX,
1276 UID_INVALID, GID_INVALID,
1277 copy_flags,
1278 NULL, NULL, NULL, NULL,
1279 progress_path,
1280 progress_bytes,
1281 userdata);
1282 if (r < 0)
1283 return r;
1284
1285 if (FLAGS_SET(copy_flags, COPY_LOCK_BSD))
1286 fdt = r;
1287
1288 r = sync_dir_by_flags(dir_fdt, to, copy_flags);
1289 if (r < 0)
1290 return r;
1291
1292 return FLAGS_SET(copy_flags, COPY_LOCK_BSD) ? TAKE_FD(fdt) : 0;
1293 }
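
/* A minimal sketch of copy_directory_at_full() with COPY_LOCK_BSD (the paths are placeholders): on
 * success the call returns the destination directory fd with a BSD LOCK_EX lock held, so other
 * lock-aware users stay out until the caller closes it. */
static int example_copy_directory_locked(void) {
        _cleanup_close_ int locked_fd = -EBADF;

        locked_fd = copy_directory_at_full(AT_FDCWD, "some-dir",
                                           AT_FDCWD, "some-dir.copy",
                                           COPY_REFLINK|COPY_MERGE|COPY_LOCK_BSD,
                                           /* progress_path= */ NULL, /* progress_bytes= */ NULL,
                                           /* userdata= */ NULL);
        if (locked_fd < 0)
                return locked_fd;

        /* … work on the freshly copied directory while the lock is held … */
        return 0;
}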
1294
1295 int copy_file_fd_at_full(
1296 int dir_fdf,
1297 const char *from,
1298 int fdt,
1299 CopyFlags copy_flags,
1300 copy_progress_bytes_t progress_bytes,
1301 void *userdata) {
1302
1303 _cleanup_close_ int fdf = -EBADF;
1304 struct stat st;
1305 int r;
1306
1307 assert(dir_fdf >= 0 || dir_fdf == AT_FDCWD);
1308 assert(from);
1309 assert(fdt >= 0);
1310 assert(!FLAGS_SET(copy_flags, COPY_LOCK_BSD));
1311
1312 fdf = openat(dir_fdf, from, O_RDONLY|O_CLOEXEC|O_NOCTTY);
1313 if (fdf < 0)
1314 return -errno;
1315
1316 r = fd_verify_regular(fdf);
1317 if (r < 0)
1318 return r;
1319
1320 if (fstat(fdt, &st) < 0)
1321 return -errno;
1322
1323 r = copy_bytes_full(fdf, fdt, UINT64_MAX, copy_flags, NULL, NULL, progress_bytes, userdata);
1324 if (r < 0)
1325 return r;
1326
1327         /* Make sure to copy file attributes over only if the target is a regular
1328 * file (so that copying a file to /dev/null won't alter the access
1329 * mode/ownership of that device node...) */
1330 if (S_ISREG(st.st_mode)) {
1331 (void) copy_times(fdf, fdt, copy_flags);
1332 (void) copy_xattr(fdf, NULL, fdt, NULL, copy_flags);
1333 }
1334
1335 if (copy_flags & COPY_FSYNC_FULL) {
1336 r = fsync_full(fdt);
1337 if (r < 0)
1338 return r;
1339 } else if (copy_flags & COPY_FSYNC) {
1340 if (fsync(fdt) < 0)
1341 return -errno;
1342 }
1343
1344 return 0;
1345 }
1346
1347 int copy_file_at_full(
1348 int dir_fdf,
1349 const char *from,
1350 int dir_fdt,
1351 const char *to,
1352 int flags,
1353 mode_t mode,
1354 unsigned chattr_flags,
1355 unsigned chattr_mask,
1356 CopyFlags copy_flags,
1357 copy_progress_bytes_t progress_bytes,
1358 void *userdata) {
1359
1360 _cleanup_close_ int fdf = -EBADF, fdt = -EBADF;
1361 struct stat st;
1362 int r;
1363
1364 assert(dir_fdf >= 0 || dir_fdf == AT_FDCWD);
1365 assert(dir_fdt >= 0 || dir_fdt == AT_FDCWD);
1366 assert(from);
1367 assert(to);
1368
1369 fdf = openat(dir_fdf, from, O_RDONLY|O_CLOEXEC|O_NOCTTY);
1370 if (fdf < 0)
1371 return -errno;
1372
1373 if (fstat(fdf, &st) < 0)
1374 return -errno;
1375
1376 r = stat_verify_regular(&st);
1377 if (r < 0)
1378 return r;
1379
1380 WITH_UMASK(0000) {
1381 fdt = xopenat_lock(dir_fdt, to,
1382 flags|O_WRONLY|O_CREAT|O_CLOEXEC|O_NOCTTY,
1383 (copy_flags & COPY_MAC_CREATE ? XO_LABEL : 0),
1384 mode != MODE_INVALID ? mode : st.st_mode,
1385 copy_flags & COPY_LOCK_BSD ? LOCK_BSD : LOCK_NONE, LOCK_EX);
1386 if (fdt < 0)
1387 return fdt;
1388 }
1389
1390 if (!FLAGS_SET(flags, O_EXCL)) { /* if O_EXCL was used we created the thing as regular file, no need to check again */
1391 r = fd_verify_regular(fdt);
1392 if (r < 0)
1393 goto fail;
1394 }
1395
1396 if (chattr_mask != 0)
1397 (void) chattr_fd(fdt, chattr_flags, chattr_mask & CHATTR_EARLY_FL, NULL);
1398
1399 r = copy_bytes_full(fdf, fdt, UINT64_MAX, copy_flags & ~COPY_LOCK_BSD, NULL, NULL, progress_bytes, userdata);
1400 if (r < 0)
1401 goto fail;
1402
1403 (void) copy_times(fdf, fdt, copy_flags);
1404 (void) copy_xattr(fdf, NULL, fdt, NULL, copy_flags);
1405
1406 if (chattr_mask != 0)
1407 (void) chattr_fd(fdt, chattr_flags, chattr_mask & ~CHATTR_EARLY_FL, NULL);
1408
1409 if (copy_flags & (COPY_FSYNC|COPY_FSYNC_FULL)) {
1410 if (fsync(fdt) < 0) {
1411 r = -errno;
1412 goto fail;
1413 }
1414 }
1415
1416 if (!FLAGS_SET(copy_flags, COPY_LOCK_BSD)) {
1417 r = close_nointr(TAKE_FD(fdt)); /* even if this fails, the fd is now invalidated */
1418 if (r < 0)
1419 goto fail;
1420 }
1421
1422 if (copy_flags & COPY_FSYNC_FULL) {
1423 r = fsync_parent_at(dir_fdt, to);
1424 if (r < 0)
1425 goto fail;
1426 }
1427
1428 return copy_flags & COPY_LOCK_BSD ? TAKE_FD(fdt) : 0;
1429
1430 fail:
1431 /* Only unlink if we definitely are the ones who created the file */
1432 if (FLAGS_SET(flags, O_EXCL))
1433 (void) unlinkat(dir_fdt, to, 0);
1434
1435 return r;
1436 }
1437
1438 int copy_file_atomic_at_full(
1439 int dir_fdf,
1440 const char *from,
1441 int dir_fdt,
1442 const char *to,
1443 mode_t mode,
1444 unsigned chattr_flags,
1445 unsigned chattr_mask,
1446 CopyFlags copy_flags,
1447 copy_progress_bytes_t progress_bytes,
1448 void *userdata) {
1449
1450 _cleanup_(unlink_and_freep) char *t = NULL;
1451 _cleanup_close_ int fdt = -EBADF;
1452 int r;
1453
1454 assert(from);
1455 assert(to);
1456 assert(!FLAGS_SET(copy_flags, COPY_LOCK_BSD));
1457
1458 if (copy_flags & COPY_MAC_CREATE) {
1459 r = mac_selinux_create_file_prepare_at(dir_fdt, to, S_IFREG);
1460 if (r < 0)
1461 return r;
1462 }
1463 fdt = open_tmpfile_linkable_at(dir_fdt, to, O_WRONLY|O_CLOEXEC, &t);
1464 if (copy_flags & COPY_MAC_CREATE)
1465 mac_selinux_create_file_clear();
1466 if (fdt < 0)
1467 return fdt;
1468
1469 if (chattr_mask != 0)
1470 (void) chattr_fd(fdt, chattr_flags, chattr_mask & CHATTR_EARLY_FL, NULL);
1471
1472 r = copy_file_fd_at_full(dir_fdf, from, fdt, copy_flags, progress_bytes, userdata);
1473 if (r < 0)
1474 return r;
1475
1476 if (fchmod(fdt, mode) < 0)
1477 return -errno;
1478
1479 if ((copy_flags & (COPY_FSYNC|COPY_FSYNC_FULL))) {
1480 /* Sync the file */
1481 if (fsync(fdt) < 0)
1482 return -errno;
1483 }
1484
1485 r = link_tmpfile_at(fdt, dir_fdt, t, to, (copy_flags & COPY_REPLACE) ? LINK_TMPFILE_REPLACE : 0);
1486 if (r < 0)
1487 return r;
1488
1489 t = mfree(t);
1490
1491 if (chattr_mask != 0)
1492 (void) chattr_fd(fdt, chattr_flags, chattr_mask & ~CHATTR_EARLY_FL, NULL);
1493
1494 r = close_nointr(TAKE_FD(fdt)); /* even if this fails, the fd is now invalidated */
1495 if (r < 0)
1496 goto fail;
1497
1498 if (copy_flags & COPY_FSYNC_FULL) {
1499 /* Sync the parent directory */
1500 r = fsync_parent_at(dir_fdt, to);
1501 if (r < 0)
1502 goto fail;
1503 }
1504
1505 return 0;
1506
1507 fail:
1508 (void) unlinkat(dir_fdt, to, 0);
1509 return r;
1510 }
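
/* An illustrative sketch for copy_file_atomic_at_full() (the file names are placeholders): the data is
 * written to a temporary file first and only linked into place at the end, so readers never observe a
 * partially written destination; COPY_REPLACE allows overwriting an existing file. */
static int example_copy_file_atomically(void) {
        return copy_file_atomic_at_full(AT_FDCWD, "input.conf",
                                        AT_FDCWD, "output.conf",
                                        0644,
                                        /* chattr_flags= */ 0, /* chattr_mask= */ 0,
                                        COPY_REPLACE|COPY_FSYNC_FULL,
                                        /* progress_bytes= */ NULL, /* userdata= */ NULL);
}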
1511
1512 int copy_times(int fdf, int fdt, CopyFlags flags) {
1513 struct stat st;
1514
1515 assert(fdf >= 0);
1516 assert(fdt >= 0);
1517
1518 if (fstat(fdf, &st) < 0)
1519 return -errno;
1520
1521 if (futimens(fdt, (struct timespec[2]) { st.st_atim, st.st_mtim }) < 0)
1522 return -errno;
1523
1524 if (FLAGS_SET(flags, COPY_CRTIME)) {
1525 usec_t crtime;
1526
1527 if (fd_getcrtime(fdf, &crtime) >= 0)
1528 (void) fd_setcrtime(fdt, crtime);
1529 }
1530
1531 return 0;
1532 }
1533
1534 int copy_access(int fdf, int fdt) {
1535 struct stat st;
1536
1537 assert(fdf >= 0);
1538 assert(fdt >= 0);
1539
1540 /* Copies just the access mode (and not the ownership) from fdf to fdt */
1541
1542 if (fstat(fdf, &st) < 0)
1543 return -errno;
1544
1545 return RET_NERRNO(fchmod(fdt, st.st_mode & 07777));
1546 }
1547
1548 int copy_rights_with_fallback(int fdf, int fdt, const char *patht) {
1549 struct stat st;
1550
1551 assert(fdf >= 0);
1552 assert(fdt >= 0);
1553
1554 /* Copies both access mode and ownership from fdf to fdt */
1555
1556 if (fstat(fdf, &st) < 0)
1557 return -errno;
1558
1559 return fchmod_and_chown_with_fallback(fdt, patht, st.st_mode & 07777, st.st_uid, st.st_gid);
1560 }
1561
1562 int copy_xattr(int df, const char *from, int dt, const char *to, CopyFlags copy_flags) {
1563 _cleanup_free_ char *names = NULL;
1564 int ret = 0, r;
1565
1566 r = listxattr_at_malloc(df, from, 0, &names);
1567 if (r < 0)
1568 return r;
1569
1570 NULSTR_FOREACH(p, names) {
1571 _cleanup_free_ char *value = NULL;
1572
1573 if (!FLAGS_SET(copy_flags, COPY_ALL_XATTRS) && !startswith(p, "user."))
1574 continue;
1575
1576 r = getxattr_at_malloc(df, from, p, 0, &value);
1577 if (r == -ENODATA)
1578 continue; /* gone by now */
1579 if (r < 0)
1580 return r;
1581
1582 if (xsetxattr(dt, to, p, value, r, 0) < 0)
1583 ret = -errno;
1584 }
1585
1586 return ret;
1587 }
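
/* An illustrative use of copy_xattr() between two already-open fds (passing NULL for both path
 * arguments operates on the fds themselves, as the callers above do): without COPY_ALL_XATTRS only the
 * user.* namespace is copied. */
static int example_copy_user_xattrs(int from_fd, int to_fd) {
        return copy_xattr(from_fd, /* from= */ NULL, to_fd, /* to= */ NULL, /* copy_flags= */ 0);
}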
1588
1589 int reflink(int infd, int outfd) {
1590 int r;
1591
1592 assert(infd >= 0);
1593 assert(outfd >= 0);
1594
1595 /* Make sure we invoke the ioctl on a regular file, so that no device driver accidentally gets it. */
1596
1597 r = fd_verify_regular(outfd);
1598 if (r < 0)
1599 return r;
1600
1601 /* FICLONE was introduced in Linux 4.5 but it uses the same number as BTRFS_IOC_CLONE introduced earlier */
1602
1603 assert_cc(FICLONE == BTRFS_IOC_CLONE);
1604
1605 return RET_NERRNO(ioctl(outfd, FICLONE, infd));
1606 }
1607
1608 assert_cc(sizeof(struct file_clone_range) == sizeof(struct btrfs_ioctl_clone_range_args));
1609
1610 int reflink_range(int infd, uint64_t in_offset, int outfd, uint64_t out_offset, uint64_t sz) {
1611 struct file_clone_range args = {
1612 .src_fd = infd,
1613 .src_offset = in_offset,
1614 .src_length = sz,
1615 .dest_offset = out_offset,
1616 };
1617 int r;
1618
1619 assert(infd >= 0);
1620 assert(outfd >= 0);
1621
1622         /* Inside the kernel, FICLONE is identical to FICLONERANGE with offsets and size set to zero, hence let's
1623 * simplify things and use the simple ioctl in that case. Also, do the same if the size is
1624 * UINT64_MAX, which is how we usually encode "everything". */
1625 if (in_offset == 0 && out_offset == 0 && IN_SET(sz, 0, UINT64_MAX))
1626 return reflink(infd, outfd);
1627
1628 r = fd_verify_regular(outfd);
1629 if (r < 0)
1630 return r;
1631
1632 assert_cc(FICLONERANGE == BTRFS_IOC_CLONE_RANGE);
1633
1634 return RET_NERRNO(ioctl(outfd, FICLONERANGE, &args));
1635 }
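
/* A small illustrative sketch of reflink_range() (the fds and offsets are arbitrary): clone 1 MiB
 * starting at offset 0 of the source into offset 4096 of the destination. This only succeeds on file
 * systems supporting FICLONERANGE (e.g. btrfs, or XFS with reflink enabled); otherwise an error such as
 * -EOPNOTSUPP or -EXDEV is returned. */
static int example_reflink_some(int from_fd, int to_fd) {
        return reflink_range(from_fd, /* in_offset= */ 0, to_fd, /* out_offset= */ 4096, 1024U * 1024U);
}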