fs/btrfs/ioctl.c

   1 // SPDX-License-Identifier: GPL-2.0
   2 /*
   3  * Copyright (C) 2007 Oracle.  All rights reserved.
   4  */
   5
   6 #include <linux/kernel.h>
   7 #include <linux/bio.h>
   8 #include <linux/file.h>
   9 #include <linux/fs.h>
  10 #include <linux/fsnotify.h>
  11 #include <linux/pagemap.h>
  12 #include <linux/highmem.h>
  13 #include <linux/time.h>
  14 #include <linux/string.h>
  15 #include <linux/backing-dev.h>
  16 #include <linux/mount.h>
  17 #include <linux/namei.h>
  18 #include <linux/writeback.h>
  19 #include <linux/compat.h>
  20 #include <linux/security.h>
  21 #include <linux/xattr.h>
  22 #include <linux/mm.h>
  23 #include <linux/slab.h>
  24 #include <linux/blkdev.h>
  25 #include <linux/uuid.h>
  26 #include <linux/btrfs.h>
  27 #include <linux/uaccess.h>
  28 #include <linux/iversion.h>
  29 #include <linux/fileattr.h>
  30 #include <linux/fsverity.h>
  31 #include <linux/sched/xacct.h>
  32 #include <linux/io_uring/cmd.h>
  33 #include "ctree.h"
  34 #include "disk-io.h"
  35 #include "export.h"
  36 #include "transaction.h"
  37 #include "btrfs_inode.h"
  38 #include "volumes.h"
  39 #include "locking.h"
  40 #include "backref.h"
  41 #include "send.h"
  42 #include "dev-replace.h"
  43 #include "props.h"
  44 #include "sysfs.h"
  45 #include "qgroup.h"
  46 #include "tree-log.h"
  47 #include "compression.h"
  48 #include "space-info.h"
  49 #include "block-group.h"
  50 #include "fs.h"
  51 #include "accessors.h"
  52 #include "extent-tree.h"
  53 #include "root-tree.h"
  54 #include "defrag.h"
  55 #include "dir-item.h"
  56 #include "uuid-tree.h"
  57 #include "ioctl.h"
  58 #include "file.h"
  59 #include "scrub.h"
  60 #include "super.h"
  61
  62 #ifdef CONFIG_64BIT
  63 /* If we have a 32-bit userspace and 64-bit kernel, then the UAPI
  64  * structures are incorrect, as the timespec structure from userspace
  65  * is 4 bytes too small. We define these alternatives here to teach
  66  * the kernel about the 32-bit struct packing.
  67  */
     /*
      * Timespec in the 32-bit userspace layout: the structure is packed, so the
      * __u32 nsec directly follows the __u64 sec and no tail padding is added.
      */
  68 struct btrfs_ioctl_timespec_32 {
  69         __u64 sec;
  70         __u32 nsec;
  71 } __attribute__ ((__packed__));
  72
     /*
      * Packed 32-bit userspace layout of the "set received subvol" ioctl
      * argument. It embeds btrfs_ioctl_timespec_32 for both timestamps; the
      * in/out note on each member gives the direction of the data.
      */
  73 struct btrfs_ioctl_received_subvol_args_32 {
  74         char    uuid[BTRFS_UUID_SIZE];  /* in */
  75         __u64   stransid;               /* in */
  76         __u64   rtransid;               /* out */
  77         struct btrfs_ioctl_timespec_32 stime; /* in */
  78         struct btrfs_ioctl_timespec_32 rtime; /* out */
  79         __u64   flags;                  /* in */
  80         __u64   reserved[16];           /* in */
  81 } __attribute__ ((__packed__));
  82
     /* Read/write ioctl, command number 37, sized for the packed struct above. */
  83 #define BTRFS_IOC_SET_RECEIVED_SUBVOL_32 _IOWR(BTRFS_IOCTL_MAGIC, 37, \
  84                                 struct btrfs_ioctl_received_subvol_args_32)
  85 #endif
  86
  87 #if defined(CONFIG_64BIT) && defined(CONFIG_COMPAT)
     /*
      * Packed compat layout of the send ioctl argument. The only member whose
      * type is compat-specific is clone_sources, a compat_uptr_t user pointer.
      */
  88 struct btrfs_ioctl_send_args_32 {
  89         __s64 send_fd;                  /* in */
  90         __u64 clone_sources_count;      /* in */
  91         compat_uptr_t clone_sources;    /* in */
  92         __u64 parent_root;              /* in */
  93         __u64 flags;                    /* in */
  94         __u32 version;                  /* in */
  95         __u8  reserved[28];             /* in */
  96 } __attribute__ ((__packed__));
  97
     /* Write-direction ioctl, command number 38, sized for the packed struct. */
  98 #define BTRFS_IOC_SEND_32 _IOW(BTRFS_IOCTL_MAGIC, 38, \
  99                                struct btrfs_ioctl_send_args_32)
 100
     /*
      * Compat layout of the encoded I/O ioctl argument. The iovec pointer and
      * count use compat types (compat_uptr_t / compat_ulong_t). Note that this
      * structure is declared without the packed attribute.
      */
 101 struct btrfs_ioctl_encoded_io_args_32 {
 102         compat_uptr_t iov;
 103         compat_ulong_t iovcnt;
 104         __s64 offset;
 105         __u64 flags;
 106         __u64 len;
 107         __u64 unencoded_len;
 108         __u64 unencoded_offset;
 109         __u32 compression;
 110         __u32 encryption;
 111         __u8 reserved[64];
 112 };
 113
     /*
      * Both ioctls share command number 64 and the same argument struct; they
      * differ only in direction (_IOR for read, _IOW for write).
      */
 114 #define BTRFS_IOC_ENCODED_READ_32 _IOR(BTRFS_IOCTL_MAGIC, 64, \
 115                                        struct btrfs_ioctl_encoded_io_args_32)
 116 #define BTRFS_IOC_ENCODED_WRITE_32 _IOW(BTRFS_IOCTL_MAGIC, 64, \
 117                                         struct btrfs_ioctl_encoded_io_args_32)
 118 #endif
 119
 120 /* Mask out flags that are inappropriate for the given type of inode. */
 121 static unsigned int btrfs_mask_fsflags_for_type(const struct inode *inode,
 122                                                 unsigned int flags)
 123 {
 124         if (S_ISDIR(inode->i_mode))
 125                 return flags;
 126         else if (S_ISREG(inode->i_mode))
 127                 return flags & ~FS_DIRSYNC_FL;
 128         else
 129                 return flags & (FS_NODUMP_FL | FS_NOATIME_FL);
 130 }
 131
 132 /*
 133  * Export internal inode flags to the format expected by the FS_IOC_GETFLAGS
 134  * ioctl.
 135  */
 136 static unsigned int btrfs_inode_flags_to_fsflags(const struct btrfs_inode *inode)
 137 {
 138         unsigned int iflags = 0;
 139         u32 flags = inode->flags;
 140         u32 ro_flags = inode->ro_flags;
 141
 142         if (flags & BTRFS_INODE_SYNC)
 143                 iflags |= FS_SYNC_FL;
 144         if (flags & BTRFS_INODE_IMMUTABLE)
 145                 iflags |= FS_IMMUTABLE_FL;
 146         if (flags & BTRFS_INODE_APPEND)
 147                 iflags |= FS_APPEND_FL;
 148         if (flags & BTRFS_INODE_NODUMP)
 149                 iflags |= FS_NODUMP_FL;
 150         if (flags & BTRFS_INODE_NOATIME)
 151                 iflags |= FS_NOATIME_FL;
 152         if (flags & BTRFS_INODE_DIRSYNC)
 153                 iflags |= FS_DIRSYNC_FL;
 154         if (flags & BTRFS_INODE_NODATACOW)
 155                 iflags |= FS_NOCOW_FL;
 156         if (ro_flags & BTRFS_INODE_RO_VERITY)
 157                 iflags |= FS_VERITY_FL;
 158
 159         if (flags & BTRFS_INODE_NOCOMPRESS)
 160                 iflags |= FS_NOCOMP_FL;
 161         else if (flags & BTRFS_INODE_COMPRESS)
 162                 iflags |= FS_COMPR_FL;
 163
 164         return iflags;
 165 }
 166
 167 /*
 168  * Update inode->i_flags based on the btrfs internal flags.
 169  */
 170 void btrfs_sync_inode_flags_to_i_flags(struct btrfs_inode *inode)
 171 {
 172         unsigned int new_fl = 0;
 173
 174         if (inode->flags & BTRFS_INODE_SYNC)
 175                 new_fl |= S_SYNC;
 176         if (inode->flags & BTRFS_INODE_IMMUTABLE)
 177                 new_fl |= S_IMMUTABLE;
 178         if (inode->flags & BTRFS_INODE_APPEND)
 179                 new_fl |= S_APPEND;
 180         if (inode->flags & BTRFS_INODE_NOATIME)
 181                 new_fl |= S_NOATIME;
 182         if (inode->flags & BTRFS_INODE_DIRSYNC)
 183                 new_fl |= S_DIRSYNC;
 184         if (inode->ro_flags & BTRFS_INODE_RO_VERITY)
 185                 new_fl |= S_VERITY;
 186
 187         set_mask_bits(&inode->vfs_inode.i_flags,
 188                       S_SYNC | S_APPEND | S_IMMUTABLE | S_NOATIME | S_DIRSYNC |
 189                       S_VERITY, new_fl);
 190 }
 191
 192 /*
 193  * Check if @flags are a supported and valid set of FS_*_FL flags and that
 194  * the old and new flags are not conflicting
 195  */
 196 static int check_fsflags(unsigned int old_flags, unsigned int flags)
 197 {
 198         if (flags & ~(FS_IMMUTABLE_FL | FS_APPEND_FL | \
 199                       FS_NOATIME_FL | FS_NODUMP_FL | \
 200                       FS_SYNC_FL | FS_DIRSYNC_FL | \
 201                       FS_NOCOMP_FL | FS_COMPR_FL |
 202                       FS_NOCOW_FL))
 203                 return -EOPNOTSUPP;
 204
 205         /* COMPR and NOCOMP on new/old are valid */
 206         if ((flags & FS_NOCOMP_FL) && (flags & FS_COMPR_FL))
 207                 return -EINVAL;
 208
 209         if ((flags & FS_COMPR_FL) && (flags & FS_NOCOW_FL))
 210                 return -EINVAL;
 211
 212         /* NOCOW and compression options are mutually exclusive */
 213         if ((old_flags & FS_NOCOW_FL) && (flags & (FS_COMPR_FL | FS_NOCOMP_FL)))
 214                 return -EINVAL;
 215         if ((flags & FS_NOCOW_FL) && (old_flags & (FS_COMPR_FL | FS_NOCOMP_FL)))
 216                 return -EINVAL;
 217
 218         return 0;
 219 }
 220
 221 static int check_fsflags_compatible(const struct btrfs_fs_info *fs_info,
 222                                     unsigned int flags)
 223 {
 224         if (btrfs_is_zoned(fs_info) && (flags & FS_NOCOW_FL))
 225                 return -EPERM;
 226
 227         return 0;
 228 }
 229
 230 int btrfs_check_ioctl_vol_args_path(const struct btrfs_ioctl_vol_args *vol_args)
 231 {
 232         if (memchr(vol_args->name, 0, sizeof(vol_args->name)) == NULL)
 233                 return -ENAMETOOLONG;
 234         return 0;
 235 }
 236
 237 static int btrfs_check_ioctl_vol_args2_subvol_name(const struct btrfs_ioctl_vol_args_v2 *vol_args2)
 238 {
 239         if (memchr(vol_args2->name, 0, sizeof(vol_args2->name)) == NULL)
 240                 return -ENAMETOOLONG;
 241         return 0;
 242 }
 243
 244 /*
 245  * Set flags/xflags from the internal inode flags. The remaining items of
 246  * fsxattr are zeroed.
 247  */
 248 int btrfs_fileattr_get(struct dentry *dentry, struct fileattr *fa)
 249 {
 250         const struct btrfs_inode *inode = BTRFS_I(d_inode(dentry));
 251
 252         fileattr_fill_flags(fa, btrfs_inode_flags_to_fsflags(inode));
 253         return 0;
 254 }
 255
     /*
      * Apply the FS_*_FL flags carried in @fa to the inode behind @dentry.
      *
      * @idmap is not used by this implementation.
      *
      * The requested flags are first masked for the inode type and validated,
      * then translated into BTRFS_INODE_* bits on a local copy of inode->flags.
      * The copy is stored back, and the inode item updated, only inside a
      * transaction at the update_flags label.
      *
      * Returns 0 on success or a negative errno: -EROFS for a read-only root,
      * -EOPNOTSUPP when @fa carries fsx attributes, the error from
      * check_fsflags() or check_fsflags_compatible(), -ETXTBSY when compression
      * is requested for a swapfile, or an error from starting the transaction,
      * setting the "btrfs.compression" property or updating the inode.
      */
 256 int btrfs_fileattr_set(struct mnt_idmap *idmap,
 257                        struct dentry *dentry, struct fileattr *fa)
 258 {
 259         struct btrfs_inode *inode = BTRFS_I(d_inode(dentry));
 260         struct btrfs_root *root = inode->root;
 261         struct btrfs_fs_info *fs_info = root->fs_info;
 262         struct btrfs_trans_handle *trans;
 263         unsigned int fsflags, old_fsflags;
 264         int ret;
 265         const char *comp = NULL;
 266         u32 inode_flags;
 267
 268         if (btrfs_root_readonly(root))
 269                 return -EROFS;
 270
 271         if (fileattr_has_fsx(fa))
 272                 return -EOPNOTSUPP;
 273
 274         fsflags = btrfs_mask_fsflags_for_type(&inode->vfs_inode, fa->flags);
 275         old_fsflags = btrfs_inode_flags_to_fsflags(inode);
 276         ret = check_fsflags(old_fsflags, fsflags);
 277         if (ret)
 278                 return ret;
 279
 280         ret = check_fsflags_compatible(fs_info, fsflags);
 281         if (ret)
 282                 return ret;
 283
             /* Build the new value on a copy; inode->flags is untouched for now. */
 284         inode_flags = inode->flags;
 285         if (fsflags & FS_SYNC_FL)
 286                 inode_flags |= BTRFS_INODE_SYNC;
 287         else
 288                 inode_flags &= ~BTRFS_INODE_SYNC;
 289         if (fsflags & FS_IMMUTABLE_FL)
 290                 inode_flags |= BTRFS_INODE_IMMUTABLE;
 291         else
 292                 inode_flags &= ~BTRFS_INODE_IMMUTABLE;
 293         if (fsflags & FS_APPEND_FL)
 294                 inode_flags |= BTRFS_INODE_APPEND;
 295         else
 296                 inode_flags &= ~BTRFS_INODE_APPEND;
 297         if (fsflags & FS_NODUMP_FL)
 298                 inode_flags |= BTRFS_INODE_NODUMP;
 299         else
 300                 inode_flags &= ~BTRFS_INODE_NODUMP;
 301         if (fsflags & FS_NOATIME_FL)
 302                 inode_flags |= BTRFS_INODE_NOATIME;
 303         else
 304                 inode_flags &= ~BTRFS_INODE_NOATIME;
 305
 306         /* If coming from FS_IOC_FSSETXATTR then skip unconverted flags */
 307         if (!fa->flags_valid) {
 308                 /* 1 item for the inode */
 309                 trans = btrfs_start_transaction(root, 1);
 310                 if (IS_ERR(trans))
 311                         return PTR_ERR(trans);
 312                 goto update_flags;
 313         }
 314
 315         if (fsflags & FS_DIRSYNC_FL)
 316                 inode_flags |= BTRFS_INODE_DIRSYNC;
 317         else
 318                 inode_flags &= ~BTRFS_INODE_DIRSYNC;
 319         if (fsflags & FS_NOCOW_FL) {
 320                 if (S_ISREG(inode->vfs_inode.i_mode)) {
 321                         /*
 322                          * It's safe to turn csums off here, no extents exist.
 323                          * Otherwise we want the flag to reflect the real COW
 324                          * status of the file and will not set it.
 325                          */
 326                         if (inode->vfs_inode.i_size == 0)
 327                                 inode_flags |= BTRFS_INODE_NODATACOW |
 328                                                BTRFS_INODE_NODATASUM;
 329                 } else {
 330                         inode_flags |= BTRFS_INODE_NODATACOW;
 331                 }
 332         } else {
 333                 /*
 334                  * Revert back under same assumptions as above
 335                  */
 336                 if (S_ISREG(inode->vfs_inode.i_mode)) {
 337                         if (inode->vfs_inode.i_size == 0)
 338                                 inode_flags &= ~(BTRFS_INODE_NODATACOW |
 339                                                  BTRFS_INODE_NODATASUM);
 340                 } else {
 341                         inode_flags &= ~BTRFS_INODE_NODATACOW;
 342                 }
 343         }
 344
 345         /*
 346          * The COMPRESS flag can only be changed by users, while the NOCOMPRESS
 347          * flag may be changed automatically if compression code won't make
 348          * things smaller.
 349          */
 350         if (fsflags & FS_NOCOMP_FL) {
 351                 inode_flags &= ~BTRFS_INODE_COMPRESS;
 352                 inode_flags |= BTRFS_INODE_NOCOMPRESS;
 353         } else if (fsflags & FS_COMPR_FL) {
 354
                     /* No transaction has been started yet, so nothing to undo. */
 355                 if (IS_SWAPFILE(&inode->vfs_inode))
 356                         return -ETXTBSY;
 357
 358                 inode_flags |= BTRFS_INODE_COMPRESS;
 359                 inode_flags &= ~BTRFS_INODE_NOCOMPRESS;
 360
                     /*
                      * Use the name of the filesystem's configured compression
                      * type, falling back to zlib when that yields no name.
                      */
 361                 comp = btrfs_compress_type2str(fs_info->compress_type);
 362                 if (!comp || comp[0] == 0)
 363                         comp = btrfs_compress_type2str(BTRFS_COMPRESS_ZLIB);
 364         } else {
 365                 inode_flags &= ~(BTRFS_INODE_COMPRESS | BTRFS_INODE_NOCOMPRESS);
 366         }
 367
 368         /*
 369          * 1 for inode item
 370          * 2 for properties
 371          */
 372         trans = btrfs_start_transaction(root, 3);
 373         if (IS_ERR(trans))
 374                 return PTR_ERR(trans);
 375
             /*
              * comp is only set on the FS_COMPR_FL path: store the algorithm name
              * in the property. Otherwise call btrfs_set_prop() with a NULL
              * value; -ENODATA from that call is not treated as a failure.
              */
 376         if (comp) {
 377                 ret = btrfs_set_prop(trans, inode, "btrfs.compression",
 378                                      comp, strlen(comp), 0);
 379                 if (ret) {
 380                         btrfs_abort_transaction(trans, ret);
 381                         goto out_end_trans;
 382                 }
 383         } else {
 384                 ret = btrfs_set_prop(trans, inode, "btrfs.compression", NULL, 0, 0);
 385                 if (ret && ret != -ENODATA) {
 386                         btrfs_abort_transaction(trans, ret);
 387                         goto out_end_trans;
 388                 }
 389         }
 390
             /* Reached with a running transaction from both paths above. */
 391 update_flags:
 392         inode->flags = inode_flags;
 393         btrfs_update_inode_mapping_flags(inode);
 394         btrfs_sync_inode_flags_to_i_flags(inode);
 395         inode_inc_iversion(&inode->vfs_inode);
 396         inode_set_ctime_current(&inode->vfs_inode);
 397         ret = btrfs_update_inode(trans, inode);
 398
 399  out_end_trans:
 400         btrfs_end_transaction(trans);
 401         return ret;
 402 }
 403
 404 static int btrfs_ioctl_getversion(const struct inode *inode, int __user *arg)
 405 {
 406         return put_user(inode->i_generation, arg);
 407 }
 408
     /*
      * Handle a trim request covering the filesystem.
      *
      * @arg points to a userspace struct fstrim_range which is read, passed to
      * btrfs_trim_fs() and then written back.
      *
      * Returns -EPERM without CAP_SYS_ADMIN, -EOPNOTSUPP on a zoned filesystem
      * or when no device supports discard, -EROFS when mounted with
      * nologreplay, -EFAULT if @arg cannot be copied in or out, -EINVAL if
      * range.len is smaller than the sector size, and otherwise the result of
      * btrfs_trim_fs().
      */
 409 static noinline int btrfs_ioctl_fitrim(struct btrfs_fs_info *fs_info,
 410                                         void __user *arg)
 411 {
 412         struct btrfs_device *device;
 413         struct fstrim_range range;
 414         u64 minlen = ULLONG_MAX;
 415         u64 num_devices = 0;
 416         int ret;
 417
 418         if (!capable(CAP_SYS_ADMIN))
 419                 return -EPERM;
 420
 421         /*
 422          * btrfs_trim_block_group() depends on space cache, which is not
 423          * available in zoned filesystem. So, disallow fitrim on a zoned
 424          * filesystem for now.
 425          */
 426         if (btrfs_is_zoned(fs_info))
 427                 return -EOPNOTSUPP;
 428
 429         /*
 430          * If the fs is mounted with nologreplay, which requires it to be
 431          * mounted in RO mode as well, we can not allow discard on free space
 432          * inside block groups, because log trees refer to extents that are not
 433          * pinned in a block group's free space cache (pinning the extents is
 434          * precisely the first phase of replaying a log tree).
 435          */
 436         if (btrfs_test_opt(fs_info, NOLOGREPLAY))
 437                 return -EROFS;
 438
             /*
              * Count the devices that have a bdev and support discard, and track
              * the smallest discard granularity among them.
              */
 439         rcu_read_lock();
 440         list_for_each_entry_rcu(device, &fs_info->fs_devices->devices,
 441                                 dev_list) {
 442                 if (!device->bdev || !bdev_max_discard_sectors(device->bdev))
 443                         continue;
 444                 num_devices++;
 445                 minlen = min_t(u64, bdev_discard_granularity(device->bdev),
 446                                     minlen);
 447         }
 448         rcu_read_unlock();
 449
 450         if (!num_devices)
 451                 return -EOPNOTSUPP;
 452         if (copy_from_user(&range, arg, sizeof(range)))
 453                 return -EFAULT;
 454
 455         /*
 456          * NOTE: Don't truncate the range using super->total_bytes.  Bytenr of
 457          * block group is in the logical address space, which can be any
 458          * sectorsize aligned bytenr in  the range [0, U64_MAX].
 459          */
 460         if (range.len < fs_info->sectorsize)
 461                 return -EINVAL;
 462
             /* Raise the caller's minlen to at least the granularity found above. */
 463         range.minlen = max(range.minlen, minlen);
 464         ret = btrfs_trim_fs(fs_info, &range);
 465
             /*
              * The range is copied back even when btrfs_trim_fs() failed; a
              * failing copy takes precedence over ret.
              */
 466         if (copy_to_user(arg, &range, sizeof(range)))
 467                 return -EFAULT;
 468
 469         return ret;
 470 }
 471
 472 /*
 473  * Calculate the number of transaction items to reserve for creating a subvolume
 474  * or snapshot, not including the inode, directory entries, or parent directory.
 475  */
 476 static unsigned int create_subvol_num_items(const struct btrfs_qgroup_inherit *inherit)
 477 {
 478         /*
 479          * 1 to add root block
 480          * 1 to add root item
 481          * 1 to add root ref
 482          * 1 to add root backref
 483          * 1 to add UUID item
 484          * 1 to add qgroup info
 485          * 1 to add qgroup limit
 486          *
 487          * Ideally the last two would only be accounted if qgroups are enabled,
 488          * but that can change between now and the time we would insert them.
 489          */
 490         unsigned int num_items = 7;
 491
 492         if (inherit) {
 493                 /* 2 to add qgroup relations for each inherited qgroup */
 494                 num_items += 2 * inherit->num_qgroups;
 495         }
 496         return num_items;
 497 }
 498
     /*
      * Create a new subvolume named by @dentry inside directory @dir.
      *
      * @idmap is passed on to btrfs_new_subvol_inode(). @inherit may be NULL; it
      * sizes the metadata reservation and is passed to btrfs_qgroup_inherit().
      *
      * The function allocates a root item, an objectid, an anonymous device and
      * the subvolume's inode, reserves metadata, and then, inside one
      * transaction, allocates the root's tree block, inserts the root item,
      * registers the UUID and creates the inode before instantiating @dentry.
      *
      * Returns 0 on success or a negative errno. Whatever was acquired before a
      * failure is released through the chain of out* labels at the end.
      */
 499 static noinline int create_subvol(struct mnt_idmap *idmap,
 500                                   struct inode *dir, struct dentry *dentry,
 501                                   struct btrfs_qgroup_inherit *inherit)
 502 {
 503         struct btrfs_fs_info *fs_info = inode_to_fs_info(dir);
 504         struct btrfs_trans_handle *trans;
 505         struct btrfs_key key;
 506         struct btrfs_root_item *root_item;
 507         struct btrfs_inode_item *inode_item;
 508         struct extent_buffer *leaf;
 509         struct btrfs_root *root = BTRFS_I(dir)->root;
 510         struct btrfs_root *new_root;
 511         struct btrfs_block_rsv block_rsv;
 512         struct timespec64 cur_time = current_time(dir);
 513         struct btrfs_new_inode_args new_inode_args = {
 514                 .dir = dir,
 515                 .dentry = dentry,
 516                 .subvol = true,
 517         };
 518         unsigned int trans_num_items;
 519         int ret;
 520         dev_t anon_dev;
 521         u64 objectid;
 522         u64 qgroup_reserved = 0;
 523
 524         root_item = kzalloc(sizeof(*root_item), GFP_KERNEL);
 525         if (!root_item)
 526                 return -ENOMEM;
 527
 528         ret = btrfs_get_free_objectid(fs_info->tree_root, &objectid);
 529         if (ret)
 530                 goto out_root_item;
 531
 532         /*
 533          * Don't create subvolume whose level is not zero. Or qgroup will be
 534          * screwed up since it assumes subvolume qgroup's level to be 0.
 535          */
 536         if (btrfs_qgroup_level(objectid)) {
 537                 ret = -ENOSPC;
 538                 goto out_root_item;
 539         }
 540
 541         ret = get_anon_bdev(&anon_dev);
 542         if (ret < 0)
 543                 goto out_root_item;
 544
 545         new_inode_args.inode = btrfs_new_subvol_inode(idmap, dir);
 546         if (!new_inode_args.inode) {
 547                 ret = -ENOMEM;
 548                 goto out_anon_dev;
 549         }
 550         ret = btrfs_new_inode_prepare(&new_inode_args, &trans_num_items);
 551         if (ret)
 552                 goto out_inode;
             /* Add the subvolume-specific items to what the inode itself needs. */
 553         trans_num_items += create_subvol_num_items(inherit);
 554
 555         btrfs_init_block_rsv(&block_rsv, BTRFS_BLOCK_RSV_TEMP);
 556         ret = btrfs_subvolume_reserve_metadata(root, &block_rsv,
 557                                                trans_num_items, false);
 558         if (ret)
 559                 goto out_new_inode_args;
 560         qgroup_reserved = block_rsv.qgroup_rsv_reserved;
 561
             /* Space was reserved above, so the handle itself reserves 0 items. */
 562         trans = btrfs_start_transaction(root, 0);
 563         if (IS_ERR(trans)) {
 564                 ret = PTR_ERR(trans);
 565                 goto out_release_rsv;
 566         }
             /*
              * The qgroup reservation has been converted; zero the local count so
              * out_release_rsv does not free it as prealloc again.
              */
 567         btrfs_qgroup_convert_reserved_meta(root, qgroup_reserved);
 568         qgroup_reserved = 0;
             /* Let the transaction draw from the temporary reserve. */
 569         trans->block_rsv = &block_rsv;
 570         trans->bytes_reserved = block_rsv.size;
 571
 572         ret = btrfs_qgroup_inherit(trans, 0, objectid, btrfs_root_id(root), inherit);
 573         if (ret)
 574                 goto out;
 575
             /* Tree block whose start is recorded below as the new root's bytenr. */
 576         leaf = btrfs_alloc_tree_block(trans, root, 0, objectid, NULL, 0, 0, 0,
 577                                       0, BTRFS_NESTING_NORMAL);
 578         if (IS_ERR(leaf)) {
 579                 ret = PTR_ERR(leaf);
 580                 goto out;
 581         }
 582
 583         btrfs_mark_buffer_dirty(trans, leaf);
 584
             /* Fill in the inode item embedded in the root item. */
 585         inode_item = &root_item->inode;
 586         btrfs_set_stack_inode_generation(inode_item, 1);
 587         btrfs_set_stack_inode_size(inode_item, 3);
 588         btrfs_set_stack_inode_nlink(inode_item, 1);
 589         btrfs_set_stack_inode_nbytes(inode_item,
 590                                      fs_info->nodesize);
 591         btrfs_set_stack_inode_mode(inode_item, S_IFDIR | 0755);
 592
 593         btrfs_set_root_flags(root_item, 0);
 594         btrfs_set_root_limit(root_item, 0);
 595         btrfs_set_stack_inode_flags(inode_item, BTRFS_INODE_ROOT_ITEM_INIT);
 596
 597         btrfs_set_root_bytenr(root_item, leaf->start);
 598         btrfs_set_root_generation(root_item, trans->transid);
 599         btrfs_set_root_level(root_item, 0);
 600         btrfs_set_root_refs(root_item, 1);
 601         btrfs_set_root_used(root_item, leaf->len);
 602         btrfs_set_root_last_snapshot(root_item, 0);
 603
 604         btrfs_set_root_generation_v2(root_item,
 605                         btrfs_root_generation(root_item));
 606         generate_random_guid(root_item->uuid);
             /* otime and ctime both start out as the current time of @dir. */
 607         btrfs_set_stack_timespec_sec(&root_item->otime, cur_time.tv_sec);
 608         btrfs_set_stack_timespec_nsec(&root_item->otime, cur_time.tv_nsec);
 609         root_item->ctime = root_item->otime;
 610         btrfs_set_root_ctransid(root_item, trans->transid);
 611         btrfs_set_root_otransid(root_item, trans->transid);
 612
 613         btrfs_tree_unlock(leaf);
 614
 615         btrfs_set_root_dirid(root_item, BTRFS_FIRST_FREE_OBJECTID);
 616
 617         key.objectid = objectid;
 618         key.type = BTRFS_ROOT_ITEM_KEY;
 619         key.offset = 0;
 620         ret = btrfs_insert_root(trans, fs_info->tree_root, &key,
 621                                 root_item);
 622         if (ret) {
 623                 int ret2;
 624
 625                 /*
 626                  * Since we don't abort the transaction in this case, free the
 627                  * tree block so that we don't leak space and leave the
 628                  * filesystem in an inconsistent state (an extent item in the
 629                  * extent tree with a backreference for a root that does not
 630                  * exist).
 631                  */
 632                 btrfs_tree_lock(leaf);
 633                 btrfs_clear_buffer_dirty(trans, leaf);
 634                 btrfs_tree_unlock(leaf);
 635                 ret2 = btrfs_free_tree_block(trans, objectid, leaf, 0, 1);
 636                 if (ret2 < 0)
 637                         btrfs_abort_transaction(trans, ret2);
 638                 free_extent_buffer(leaf);
 639                 goto out;
 640         }
 641
 642         free_extent_buffer(leaf);
 643         leaf = NULL;
 644
 645         new_root = btrfs_get_new_fs_root(fs_info, objectid, &anon_dev);
 646         if (IS_ERR(new_root)) {
 647                 ret = PTR_ERR(new_root);
 648                 btrfs_abort_transaction(trans, ret);
 649                 goto out;
 650         }
 651         /* anon_dev is owned by new_root now. */
 652         anon_dev = 0;
 653         BTRFS_I(new_inode_args.inode)->root = new_root;
 654         /* ... and new_root is owned by new_inode_args.inode now. */
 655
 656         ret = btrfs_record_root_in_trans(trans, new_root);
 657         if (ret) {
 658                 btrfs_abort_transaction(trans, ret);
 659                 goto out;
 660         }
 661
 662         ret = btrfs_uuid_tree_add(trans, root_item->uuid,
 663                                   BTRFS_UUID_KEY_SUBVOL, objectid);
 664         if (ret) {
 665                 btrfs_abort_transaction(trans, ret);
 666                 goto out;
 667         }
 668
 669         btrfs_record_new_subvolume(trans, BTRFS_I(dir));
 670
 671         ret = btrfs_create_new_inode(trans, &new_inode_args);
 672         if (ret) {
 673                 btrfs_abort_transaction(trans, ret);
 674                 goto out;
 675         }
 676
             /*
              * The inode has been handed to the dentry; clear our pointer so the
              * iput() at out_inode is called with NULL instead.
              */
 677         d_instantiate_new(dentry, new_inode_args.inode);
 678         new_inode_args.inode = NULL;
 679
             /* Success falls through the whole cleanup chain as well. */
 680 out:
             /* Detach the on-stack reserve before the handle is ended. */
 681         trans->block_rsv = NULL;
 682         trans->bytes_reserved = 0;
 683         btrfs_end_transaction(trans);
 684 out_release_rsv:
 685         btrfs_block_rsv_release(fs_info, &block_rsv, (u64)-1, NULL);
 686         if (qgroup_reserved)
 687                 btrfs_qgroup_free_meta_prealloc(root, qgroup_reserved);
 688 out_new_inode_args:
 689         btrfs_new_inode_args_destroy(&new_inode_args);
 690 out_inode:
 691         iput(new_inode_args.inode);
 692 out_anon_dev:
             /* anon_dev is zeroed once new_root has taken ownership of it. */
 693         if (anon_dev)
 694                 free_anon_bdev(anon_dev);
 695 out_root_item:
 696         kfree(root_item);
 697         return ret;
 698 }
 699
     /*
      * Create a snapshot of @root named by @dentry inside directory @dir.
      *
      * @readonly and @inherit are stored in the pending snapshot descriptor.
      *
      * This function fills in a struct btrfs_pending_snapshot, attaches it to a
      * transaction handle and commits that transaction. Afterwards it reads the
      * outcome from pending_snapshot->error and pending_snapshot->snap, so
      * presumably the commit path does the actual creation (not visible here).
      *
      * Returns 0 on success or a negative errno: -EOPNOTSUPP with the extent
      * tree v2 feature, -ENOENT if @root has no references, -EINVAL if @root is
      * not shareable, -ETXTBSY if @root has active swapfiles, -ENOMEM on
      * allocation failure, or an error from one of the later steps.
      */
 700 static int create_snapshot(struct btrfs_root *root, struct inode *dir,
 701                            struct dentry *dentry, bool readonly,
 702                            struct btrfs_qgroup_inherit *inherit)
 703 {
 704         struct btrfs_fs_info *fs_info = inode_to_fs_info(dir);
 705         struct inode *inode;
 706         struct btrfs_pending_snapshot *pending_snapshot;
 707         unsigned int trans_num_items;
 708         struct btrfs_trans_handle *trans;
 709         struct btrfs_block_rsv *block_rsv;
 710         u64 qgroup_reserved = 0;
 711         int ret;
 712
 713         /* We do not support snapshotting right now. */
 714         if (btrfs_fs_incompat(fs_info, EXTENT_TREE_V2)) {
 715                 btrfs_warn(fs_info,
 716                            "extent tree v2 doesn't support snapshotting yet");
 717                 return -EOPNOTSUPP;
 718         }
 719
 720         if (btrfs_root_refs(&root->root_item) == 0)
 721                 return -ENOENT;
 722
 723         if (!test_bit(BTRFS_ROOT_SHAREABLE, &root->state))
 724                 return -EINVAL;
 725
 726         if (atomic_read(&root->nr_swapfiles)) {
 727                 btrfs_warn(fs_info,
 728                            "cannot snapshot subvolume with active swapfile");
 729                 return -ETXTBSY;
 730         }
 731
 732         pending_snapshot = kzalloc(sizeof(*pending_snapshot), GFP_KERNEL);
 733         if (!pending_snapshot)
 734                 return -ENOMEM;
 735
 736         ret = get_anon_bdev(&pending_snapshot->anon_dev);
 737         if (ret < 0)
 738                 goto free_pending;
             /* Both allocations are checked together right below. */
 739         pending_snapshot->root_item = kzalloc(sizeof(struct btrfs_root_item),
 740                         GFP_KERNEL);
 741         pending_snapshot->path = btrfs_alloc_path();
 742         if (!pending_snapshot->root_item || !pending_snapshot->path) {
 743                 ret = -ENOMEM;
 744                 goto free_pending;
 745         }
 746
 747         block_rsv = &pending_snapshot->block_rsv;
 748         btrfs_init_block_rsv(block_rsv, BTRFS_BLOCK_RSV_TEMP);
 749         /*
 750          * 1 to add dir item
 751          * 1 to add dir index
 752          * 1 to update parent inode item
 753          */
 754         trans_num_items = create_subvol_num_items(inherit) + 3;
             /* The reservation is made against the root that contains @dir. */
 755         ret = btrfs_subvolume_reserve_metadata(BTRFS_I(dir)->root, block_rsv,
 756                                                trans_num_items, false);
 757         if (ret)
 758                 goto free_pending;
 759         qgroup_reserved = block_rsv->qgroup_rsv_reserved;
 760
 761         pending_snapshot->dentry = dentry;
 762         pending_snapshot->root = root;
 763         pending_snapshot->readonly = readonly;
 764         pending_snapshot->dir = BTRFS_I(dir);
 765         pending_snapshot->inherit = inherit;
 766
 767         trans = btrfs_start_transaction(root, 0);
 768         if (IS_ERR(trans)) {
 769                 ret = PTR_ERR(trans);
 770                 goto fail;
 771         }
 772         ret = btrfs_record_root_in_trans(trans, BTRFS_I(dir)->root);
 773         if (ret) {
 774                 btrfs_end_transaction(trans);
 775                 goto fail;
 776         }
             /*
              * The qgroup reservation has been converted; zero the local count so
              * the fail path does not free it as prealloc again.
              */
 777         btrfs_qgroup_convert_reserved_meta(root, qgroup_reserved);
 778         qgroup_reserved = 0;
 779
 780         trans->pending_snapshot = pending_snapshot;
 781
 782         ret = btrfs_commit_transaction(trans);
 783         if (ret)
 784                 goto fail;
 785
             /* The commit can succeed while the snapshot itself reports an error. */
 786         ret = pending_snapshot->error;
 787         if (ret)
 788                 goto fail;
 789
 790         ret = btrfs_orphan_cleanup(pending_snapshot->snap);
 791         if (ret)
 792                 goto fail;
 793
 794         inode = btrfs_lookup_dentry(d_inode(dentry->d_parent), dentry);
 795         if (IS_ERR(inode)) {
 796                 ret = PTR_ERR(inode);
 797                 goto fail;
 798         }
 799
 800         d_instantiate(dentry, inode);
 801         ret = 0;
             /* Success: zero anon_dev so free_pending skips free_anon_bdev(). */
 802         pending_snapshot->anon_dev = 0;
             /* Reached on success too; ret tells the two cases apart. */
 803 fail:
 804         /* Prevent double freeing of anon_dev */
 805         if (ret && pending_snapshot->snap)
 806                 pending_snapshot->snap->anon_dev = 0;
 807         btrfs_put_root(pending_snapshot->snap);
 808         btrfs_block_rsv_release(fs_info, block_rsv, (u64)-1, NULL);
 809         if (qgroup_reserved)
 810                 btrfs_qgroup_free_meta_prealloc(root, qgroup_reserved);
 811 free_pending:
 812         if (pending_snapshot->anon_dev)
 813                 free_anon_bdev(pending_snapshot->anon_dev);
 814         kfree(pending_snapshot->root_item);
 815         btrfs_free_path(pending_snapshot->path);
 816         kfree(pending_snapshot);
 817
 818         return ret;
 819 }
 820
 821 /*  copy of may_delete in fs/namei.c()
 822  *      Check whether we can remove a link victim from directory dir, check
 823  *  whether the type of victim is right.
 824  *  1. We can't do it if dir is read-only (done in permission())
 825  *  2. We should have write and exec permissions on dir
 826  *  3. We can't remove anything from append-only dir
 827  *  4. We can't do anything with immutable dir (done in permission())
 828  *  5. If the sticky bit on dir is set we should either
 829  *      a. be owner of dir, or
 830  *      b. be owner of victim, or
 831  *      c. have CAP_FOWNER capability
 832  *  6. If the victim is append-only or immutable we can't do anything with
 833  *     links pointing to it.
 834  *  7. If we were asked to remove a directory and victim isn't one - ENOTDIR.
 835  *  8. If we were asked to remove a non-directory and victim isn't one - EISDIR.
 836  *  9. We can't remove a root or mountpoint.
 837  * 10. We don't allow removal of NFS sillyrenamed files; it's handled by
 838  *     nfs_async_unlink().
 839  */
 840
 841 static int btrfs_may_delete(struct mnt_idmap *idmap,
 842                             struct inode *dir, struct dentry *victim, int isdir)
 843 {
 844         int error;
 845
 846         if (d_really_is_negative(victim))
 847                 return -ENOENT;
 848
 849         /* The @victim is not inside @dir. */
 850         if (d_inode(victim->d_parent) != dir)
 851                 return -EINVAL;
 852         audit_inode_child(dir, victim, AUDIT_TYPE_CHILD_DELETE);
 853
 854         error = inode_permission(idmap, dir, MAY_WRITE | MAY_EXEC);
 855         if (error)
 856                 return error;
 857         if (IS_APPEND(dir))
 858                 return -EPERM;
 859         if (check_sticky(idmap, dir, d_inode(victim)) ||
 860             IS_APPEND(d_inode(victim)) || IS_IMMUTABLE(d_inode(victim)) ||
 861             IS_SWAPFILE(d_inode(victim)))
 862                 return -EPERM;
 863         if (isdir) {
 864                 if (!d_is_dir(victim))
 865                         return -ENOTDIR;
 866                 if (IS_ROOT(victim))
 867                         return -EBUSY;
 868         } else if (d_is_dir(victim))
 869                 return -EISDIR;
 870         if (IS_DEADDIR(dir))
 871                 return -ENOENT;
 872         if (victim->d_flags & DCACHE_NFSFS_RENAMED)
 873                 return -EBUSY;
 874         return 0;
 875 }
 876
 877 /* copy of may_create in fs/namei.c() */
 878 static inline int btrfs_may_create(struct mnt_idmap *idmap,
 879                                    struct inode *dir, const struct dentry *child)
 880 {
 881         if (d_really_is_positive(child))
 882                 return -EEXIST;
 883         if (IS_DEADDIR(dir))
 884                 return -ENOENT;
 885         if (!fsuidgid_has_mapping(dir->i_sb, idmap))
 886                 return -EOVERFLOW;
 887         return inode_permission(idmap, dir, MAY_WRITE | MAY_EXEC);
 888 }
 889
 890 /*
 891  * Create a new subvolume below @parent.  This is largely modeled after
 892  * sys_mkdirat and vfs_mkdir, but we only do a single component lookup
 893  * inside this filesystem so it's quite a bit simpler.
 894  */
 895 static noinline int btrfs_mksubvol(const struct path *parent,
 896                                    struct mnt_idmap *idmap,
 897                                    const char *name, int namelen,
 898                                    struct btrfs_root *snap_src,
 899                                    bool readonly,
 900                                    struct btrfs_qgroup_inherit *inherit)
 901 {
 902         struct inode *dir = d_inode(parent->dentry);
 903         struct btrfs_fs_info *fs_info = inode_to_fs_info(dir);
 904         struct dentry *dentry;
 905         struct fscrypt_str name_str = FSTR_INIT((char *)name, namelen);
 906         int error;
 907
 908         error = down_write_killable_nested(&dir->i_rwsem, I_MUTEX_PARENT);
 909         if (error == -EINTR)
 910                 return error;
 911
 912         dentry = lookup_one(idmap, &QSTR_LEN(name, namelen), parent->dentry);
 913         error = PTR_ERR(dentry);
 914         if (IS_ERR(dentry))
 915                 goto out_unlock;
 916
 917         error = btrfs_may_create(idmap, dir, dentry);
 918         if (error)
 919                 goto out_dput;
 920
 921         /*
 922          * even if this name doesn't exist, we may get hash collisions.
 923          * check for them now when we can safely fail
 924          */
 925         error = btrfs_check_dir_item_collision(BTRFS_I(dir)->root,
 926                                                dir->i_ino, &name_str);
 927         if (error)
 928                 goto out_dput;
 929
 930         down_read(&fs_info->subvol_sem);
 931
 932         if (btrfs_root_refs(&BTRFS_I(dir)->root->root_item) == 0)
 933                 goto out_up_read;
 934
 935         if (snap_src)
 936                 error = create_snapshot(snap_src, dir, dentry, readonly, inherit);
 937         else
 938                 error = create_subvol(idmap, dir, dentry, inherit);
 939
 940         if (!error)
 941                 fsnotify_mkdir(dir, dentry);
 942 out_up_read:
 943         up_read(&fs_info->subvol_sem);
 944 out_dput:
 945         dput(dentry);
 946 out_unlock:
 947         btrfs_inode_unlock(BTRFS_I(dir), 0);
 948         return error;
 949 }
 950
 951 static noinline int btrfs_mksnapshot(const struct path *parent,
 952                                    struct mnt_idmap *idmap,
 953                                    const char *name, int namelen,
 954                                    struct btrfs_root *root,
 955                                    bool readonly,
 956                                    struct btrfs_qgroup_inherit *inherit)
 957 {
 958         int ret;
 959
 960         /*
 961          * Force new buffered writes to reserve space even when NOCOW is
 962          * possible. This is to avoid later writeback (running dealloc) to
 963          * fallback to COW mode and unexpectedly fail with ENOSPC.
 964          */
 965         btrfs_drew_read_lock(&root->snapshot_lock);
 966
 967         ret = btrfs_start_delalloc_snapshot(root, false);
 968         if (ret)
 969                 goto out;
 970
 971         /*
 972          * All previous writes have started writeback in NOCOW mode, so now
 973          * we force future writes to fallback to COW mode during snapshot
 974          * creation.
 975          */
 976         atomic_inc(&root->snapshot_force_cow);
 977
 978         btrfs_wait_ordered_extents(root, U64_MAX, NULL);
 979
 980         ret = btrfs_mksubvol(parent, idmap, name, namelen,
 981                              root, readonly, inherit);
 982         atomic_dec(&root->snapshot_force_cow);
 983 out:
 984         btrfs_drew_read_unlock(&root->snapshot_lock);
 985         return ret;
 986 }
 987
 988 /*
 989  * Try to start exclusive operation @type or cancel it if it's running.
 990  *
 991  * Return:
 992  *   0        - normal mode, newly claimed op started
 993  *  >0        - normal mode, something else is running,
 994  *              return BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS to user space
 995  * ECANCELED  - cancel mode, successful cancel
 996  * ENOTCONN   - cancel mode, operation not running anymore
 997  */
 998 static int exclop_start_or_cancel_reloc(struct btrfs_fs_info *fs_info,
 999                         enum btrfs_exclusive_operation type, bool cancel)
1000 {
1001         if (!cancel) {
1002                 /* Start normal op */
1003                 if (!btrfs_exclop_start(fs_info, type))
1004                         return BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS;
1005                 /* Exclusive operation is now claimed */
1006                 return 0;
1007         }
1008
1009         /* Cancel running op */
1010         if (btrfs_exclop_start_try_lock(fs_info, type)) {
1011                 /*
1012                  * This blocks any exclop finish from setting it to NONE, so we
1013                  * request cancellation. Either it runs and we will wait for it,
1014                  * or it has finished and no waiting will happen.
1015                  */
1016                 atomic_inc(&fs_info->reloc_cancel_req);
1017                 btrfs_exclop_start_unlock(fs_info);
1018
1019                 if (test_bit(BTRFS_FS_RELOC_RUNNING, &fs_info->flags))
1020                         wait_on_bit(&fs_info->flags, BTRFS_FS_RELOC_RUNNING,
1021                                     TASK_INTERRUPTIBLE);
1022
1023                 return -ECANCELED;
1024         }
1025
1026         /* Something else is running or none */
1027         return -ENOTCONN;
1028 }
1029
1030 static noinline int btrfs_ioctl_resize(struct file *file,
1031                                         void __user *arg)
1032 {
1033         BTRFS_DEV_LOOKUP_ARGS(args);
1034         struct btrfs_root *root = BTRFS_I(file_inode(file))->root;
1035         struct btrfs_fs_info *fs_info = root->fs_info;
1036         u64 new_size;
1037         u64 old_size;
1038         u64 devid = 1;
1039         struct btrfs_ioctl_vol_args *vol_args;
1040         struct btrfs_device *device = NULL;
1041         char *sizestr;
1042         char *devstr = NULL;
1043         int ret = 0;
1044         int mod = 0;
1045         bool cancel;
1046
1047         if (!capable(CAP_SYS_ADMIN))
1048                 return -EPERM;
1049
1050         ret = mnt_want_write_file(file);
1051         if (ret)
1052                 return ret;
1053
1054         /*
1055          * Read the arguments before checking exclusivity to be able to
1056          * distinguish regular resize and cancel
1057          */
1058         vol_args = memdup_user(arg, sizeof(*vol_args));
1059         if (IS_ERR(vol_args)) {
1060                 ret = PTR_ERR(vol_args);
1061                 goto out_drop;
1062         }
1063         ret = btrfs_check_ioctl_vol_args_path(vol_args);
1064         if (ret < 0)
1065                 goto out_free;
1066
1067         sizestr = vol_args->name;
1068         cancel = (strcmp("cancel", sizestr) == 0);
1069         ret = exclop_start_or_cancel_reloc(fs_info, BTRFS_EXCLOP_RESIZE, cancel);
1070         if (ret)
1071                 goto out_free;
1072         /* Exclusive operation is now claimed */
1073
1074         devstr = strchr(sizestr, ':');
1075         if (devstr) {
1076                 sizestr = devstr + 1;
1077                 *devstr = '\0';
1078                 devstr = vol_args->name;
1079                 ret = kstrtoull(devstr, 10, &devid);
1080                 if (ret)
1081                         goto out_finish;
1082                 if (!devid) {
1083                         ret = -EINVAL;
1084                         goto out_finish;
1085                 }
1086                 btrfs_info(fs_info, "resizing devid %llu", devid);
1087         }
1088
1089         args.devid = devid;
1090         device = btrfs_find_device(fs_info->fs_devices, &args);
1091         if (!device) {
1092                 btrfs_info(fs_info, "resizer unable to find device %llu",
1093                            devid);
1094                 ret = -ENODEV;
1095                 goto out_finish;
1096         }
1097
1098         if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
1099                 btrfs_info(fs_info,
1100                            "resizer unable to apply on readonly device %llu",
1101                        devid);
1102                 ret = -EPERM;
1103                 goto out_finish;
1104         }
1105
1106         if (!strcmp(sizestr, "max"))
1107                 new_size = bdev_nr_bytes(device->bdev);
1108         else {
1109                 char *retptr;
1110
1111                 if (sizestr[0] == '-') {
1112                         mod = -1;
1113                         sizestr++;
1114                 } else if (sizestr[0] == '+') {
1115                         mod = 1;
1116                         sizestr++;
1117                 }
1118                 new_size = memparse(sizestr, &retptr);
1119                 if (*retptr != '\0' || new_size == 0) {
1120                         ret = -EINVAL;
1121                         goto out_finish;
1122                 }
1123         }
1124
1125         if (test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) {
1126                 ret = -EPERM;
1127                 goto out_finish;
1128         }
1129
1130         old_size = btrfs_device_get_total_bytes(device);
1131
1132         if (mod < 0) {
1133                 if (new_size > old_size) {
1134                         ret = -EINVAL;
1135                         goto out_finish;
1136                 }
1137                 new_size = old_size - new_size;
1138         } else if (mod > 0) {
1139                 if (new_size > ULLONG_MAX - old_size) {
1140                         ret = -ERANGE;
1141                         goto out_finish;
1142                 }
1143                 new_size = old_size + new_size;
1144         }
1145
1146         if (new_size < SZ_256M) {
1147                 ret = -EINVAL;
1148                 goto out_finish;
1149         }
1150         if (new_size > bdev_nr_bytes(device->bdev)) {
1151                 ret = -EFBIG;
1152                 goto out_finish;
1153         }
1154
1155         new_size = round_down(new_size, fs_info->sectorsize);
1156
1157         if (new_size > old_size) {
1158                 struct btrfs_trans_handle *trans;
1159
1160                 trans = btrfs_start_transaction(root, 0);
1161                 if (IS_ERR(trans)) {
1162                         ret = PTR_ERR(trans);
1163                         goto out_finish;
1164                 }
1165                 ret = btrfs_grow_device(trans, device, new_size);
1166                 btrfs_commit_transaction(trans);
1167         } else if (new_size < old_size) {
1168                 ret = btrfs_shrink_device(device, new_size);
1169         } /* equal, nothing need to do */
1170
1171         if (ret == 0 && new_size != old_size)
1172                 btrfs_info_in_rcu(fs_info,
1173                         "resize device %s (devid %llu) from %llu to %llu",
1174                         btrfs_dev_name(device), device->devid,
1175                         old_size, new_size);
1176 out_finish:
1177         btrfs_exclop_finish(fs_info);
1178 out_free:
1179         kfree(vol_args);
1180 out_drop:
1181         mnt_drop_write_file(file);
1182         return ret;
1183 }
1184
1185 static noinline int __btrfs_ioctl_snap_create(struct file *file,
1186                                 struct mnt_idmap *idmap,
1187                                 const char *name, unsigned long fd, int subvol,
1188                                 bool readonly,
1189                                 struct btrfs_qgroup_inherit *inherit)
1190 {
1191         int namelen;
1192         int ret = 0;
1193
1194         if (!S_ISDIR(file_inode(file)->i_mode))
1195                 return -ENOTDIR;
1196
1197         ret = mnt_want_write_file(file);
1198         if (ret)
1199                 goto out;
1200
1201         namelen = strlen(name);
1202         if (strchr(name, '/')) {
1203                 ret = -EINVAL;
1204                 goto out_drop_write;
1205         }
1206
1207         if (name[0] == '.' &&
1208            (namelen == 1 || (name[1] == '.' && namelen == 2))) {
1209                 ret = -EEXIST;
1210                 goto out_drop_write;
1211         }
1212
1213         if (subvol) {
1214                 ret = btrfs_mksubvol(&file->f_path, idmap, name,
1215                                      namelen, NULL, readonly, inherit);
1216         } else {
1217                 CLASS(fd, src)(fd);
1218                 struct inode *src_inode;
1219                 if (fd_empty(src)) {
1220                         ret = -EINVAL;
1221                         goto out_drop_write;
1222                 }
1223
1224                 src_inode = file_inode(fd_file(src));
1225                 if (src_inode->i_sb != file_inode(file)->i_sb) {
1226                         btrfs_info(BTRFS_I(file_inode(file))->root->fs_info,
1227                                    "Snapshot src from another FS");
1228                         ret = -EXDEV;
1229                 } else if (!inode_owner_or_capable(idmap, src_inode)) {
1230                         /*
1231                          * Subvolume creation is not restricted, but snapshots
1232                          * are limited to own subvolumes only
1233                          */
1234                         ret = -EPERM;
1235                 } else if (btrfs_ino(BTRFS_I(src_inode)) != BTRFS_FIRST_FREE_OBJECTID) {
1236                         /*
1237                          * Snapshots must be made with the src_inode referring
1238                          * to the subvolume inode, otherwise the permission
1239                          * checking above is useless because we may have
1240                          * permission on a lower directory but not the subvol
1241                          * itself.
1242                          */
1243                         ret = -EINVAL;
1244                 } else {
1245                         ret = btrfs_mksnapshot(&file->f_path, idmap,
1246                                                name, namelen,
1247                                                BTRFS_I(src_inode)->root,
1248                                                readonly, inherit);
1249                 }
1250         }
1251 out_drop_write:
1252         mnt_drop_write_file(file);
1253 out:
1254         return ret;
1255 }
1256
1257 static noinline int btrfs_ioctl_snap_create(struct file *file,
1258                                             void __user *arg, int subvol)
1259 {
1260         struct btrfs_ioctl_vol_args *vol_args;
1261         int ret;
1262
1263         if (!S_ISDIR(file_inode(file)->i_mode))
1264                 return -ENOTDIR;
1265
1266         vol_args = memdup_user(arg, sizeof(*vol_args));
1267         if (IS_ERR(vol_args))
1268                 return PTR_ERR(vol_args);
1269         ret = btrfs_check_ioctl_vol_args_path(vol_args);
1270         if (ret < 0)
1271                 goto out;
1272
1273         ret = __btrfs_ioctl_snap_create(file, file_mnt_idmap(file),
1274                                         vol_args->name, vol_args->fd, subvol,
1275                                         false, NULL);
1276
1277 out:
1278         kfree(vol_args);
1279         return ret;
1280 }
1281
1282 static noinline int btrfs_ioctl_snap_create_v2(struct file *file,
1283                                                void __user *arg, int subvol)
1284 {
1285         struct btrfs_ioctl_vol_args_v2 *vol_args;
1286         int ret;
1287         bool readonly = false;
1288         struct btrfs_qgroup_inherit *inherit = NULL;
1289
1290         if (!S_ISDIR(file_inode(file)->i_mode))
1291                 return -ENOTDIR;
1292
1293         vol_args = memdup_user(arg, sizeof(*vol_args));
1294         if (IS_ERR(vol_args))
1295                 return PTR_ERR(vol_args);
1296         ret = btrfs_check_ioctl_vol_args2_subvol_name(vol_args);
1297         if (ret < 0)
1298                 goto free_args;
1299
1300         if (vol_args->flags & ~BTRFS_SUBVOL_CREATE_ARGS_MASK) {
1301                 ret = -EOPNOTSUPP;
1302                 goto free_args;
1303         }
1304
1305         if (vol_args->flags & BTRFS_SUBVOL_RDONLY)
1306                 readonly = true;
1307         if (vol_args->flags & BTRFS_SUBVOL_QGROUP_INHERIT) {
1308                 struct btrfs_fs_info *fs_info = inode_to_fs_info(file_inode(file));
1309
1310                 if (vol_args->size < sizeof(*inherit) ||
1311                     vol_args->size > PAGE_SIZE) {
1312                         ret = -EINVAL;
1313                         goto free_args;
1314                 }
1315                 inherit = memdup_user(vol_args->qgroup_inherit, vol_args->size);
1316                 if (IS_ERR(inherit)) {
1317                         ret = PTR_ERR(inherit);
1318                         goto free_args;
1319                 }
1320
1321                 ret = btrfs_qgroup_check_inherit(fs_info, inherit, vol_args->size);
1322                 if (ret < 0)
1323                         goto free_inherit;
1324         }
1325
1326         ret = __btrfs_ioctl_snap_create(file, file_mnt_idmap(file),
1327                                         vol_args->name, vol_args->fd, subvol,
1328                                         readonly, inherit);
1329         if (ret)
1330                 goto free_inherit;
1331 free_inherit:
1332         kfree(inherit);
1333 free_args:
1334         kfree(vol_args);
1335         return ret;
1336 }
1337
1338 static noinline int btrfs_ioctl_subvol_getflags(struct btrfs_inode *inode,
1339                                                 void __user *arg)
1340 {
1341         struct btrfs_root *root = inode->root;
1342         struct btrfs_fs_info *fs_info = root->fs_info;
1343         int ret = 0;
1344         u64 flags = 0;
1345
1346         if (btrfs_ino(inode) != BTRFS_FIRST_FREE_OBJECTID)
1347                 return -EINVAL;
1348
1349         down_read(&fs_info->subvol_sem);
1350         if (btrfs_root_readonly(root))
1351                 flags |= BTRFS_SUBVOL_RDONLY;
1352         up_read(&fs_info->subvol_sem);
1353
1354         if (copy_to_user(arg, &flags, sizeof(flags)))
1355                 ret = -EFAULT;
1356
1357         return ret;
1358 }
1359
1360 static noinline int btrfs_ioctl_subvol_setflags(struct file *file,
1361                                               void __user *arg)
1362 {
1363         struct inode *inode = file_inode(file);
1364         struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
1365         struct btrfs_root *root = BTRFS_I(inode)->root;
1366         struct btrfs_trans_handle *trans;
1367         u64 root_flags;
1368         u64 flags;
1369         int ret = 0;
1370
1371         if (!inode_owner_or_capable(file_mnt_idmap(file), inode))
1372                 return -EPERM;
1373
1374         ret = mnt_want_write_file(file);
1375         if (ret)
1376                 goto out;
1377
1378         if (btrfs_ino(BTRFS_I(inode)) != BTRFS_FIRST_FREE_OBJECTID) {
1379                 ret = -EINVAL;
1380                 goto out_drop_write;
1381         }
1382
1383         if (copy_from_user(&flags, arg, sizeof(flags))) {
1384                 ret = -EFAULT;
1385                 goto out_drop_write;
1386         }
1387
1388         if (flags & ~BTRFS_SUBVOL_RDONLY) {
1389                 ret = -EOPNOTSUPP;
1390                 goto out_drop_write;
1391         }
1392
1393         down_write(&fs_info->subvol_sem);
1394
1395         /* nothing to do */
1396         if (!!(flags & BTRFS_SUBVOL_RDONLY) == btrfs_root_readonly(root))
1397                 goto out_drop_sem;
1398
1399         root_flags = btrfs_root_flags(&root->root_item);
1400         if (flags & BTRFS_SUBVOL_RDONLY) {
1401                 btrfs_set_root_flags(&root->root_item,
1402                                      root_flags | BTRFS_ROOT_SUBVOL_RDONLY);
1403         } else {
1404                 /*
1405                  * Block RO -> RW transition if this subvolume is involved in
1406                  * send
1407                  */
1408                 spin_lock(&root->root_item_lock);
1409                 if (root->send_in_progress == 0) {
1410                         btrfs_set_root_flags(&root->root_item,
1411                                      root_flags & ~BTRFS_ROOT_SUBVOL_RDONLY);
1412                         spin_unlock(&root->root_item_lock);
1413                 } else {
1414                         spin_unlock(&root->root_item_lock);
1415                         btrfs_warn(fs_info,
1416                                    "Attempt to set subvolume %llu read-write during send",
1417                                    btrfs_root_id(root));
1418                         ret = -EPERM;
1419                         goto out_drop_sem;
1420                 }
1421         }
1422
1423         trans = btrfs_start_transaction(root, 1);
1424         if (IS_ERR(trans)) {
1425                 ret = PTR_ERR(trans);
1426                 goto out_reset;
1427         }
1428
1429         ret = btrfs_update_root(trans, fs_info->tree_root,
1430                                 &root->root_key, &root->root_item);
1431         if (ret < 0) {
1432                 btrfs_end_transaction(trans);
1433                 goto out_reset;
1434         }
1435
1436         ret = btrfs_commit_transaction(trans);
1437
1438 out_reset:
1439         if (ret)
1440                 btrfs_set_root_flags(&root->root_item, root_flags);
1441 out_drop_sem:
1442         up_write(&fs_info->subvol_sem);
1443 out_drop_write:
1444         mnt_drop_write_file(file);
1445 out:
1446         return ret;
1447 }
1448
1449 static noinline bool key_in_sk(const struct btrfs_key *key,
1450                                const struct btrfs_ioctl_search_key *sk)
1451 {
1452         struct btrfs_key test;
1453         int ret;
1454
1455         test.objectid = sk->min_objectid;
1456         test.type = sk->min_type;
1457         test.offset = sk->min_offset;
1458
1459         ret = btrfs_comp_cpu_keys(key, &test);
1460         if (ret < 0)
1461                 return false;
1462
1463         test.objectid = sk->max_objectid;
1464         test.type = sk->max_type;
1465         test.offset = sk->max_offset;
1466
1467         ret = btrfs_comp_cpu_keys(key, &test);
1468         if (ret > 0)
1469                 return false;
1470         return true;
1471 }
1472
1473 static noinline int copy_to_sk(struct btrfs_path *path,
1474                                struct btrfs_key *key,
1475                                const struct btrfs_ioctl_search_key *sk,
1476                                u64 *buf_size,
1477                                char __user *ubuf,
1478                                unsigned long *sk_offset,
1479                                int *num_found)
1480 {
1481         u64 found_transid;
1482         struct extent_buffer *leaf;
1483         struct btrfs_ioctl_search_header sh;
1484         struct btrfs_key test;
1485         unsigned long item_off;
1486         unsigned long item_len;
1487         int nritems;
1488         int i;
1489         int slot;
1490         int ret = 0;
1491
1492         leaf = path->nodes[0];
1493         slot = path->slots[0];
1494         nritems = btrfs_header_nritems(leaf);
1495
1496         if (btrfs_header_generation(leaf) > sk->max_transid) {
1497                 i = nritems;
1498                 goto advance_key;
1499         }
1500         found_transid = btrfs_header_generation(leaf);
1501
1502         for (i = slot; i < nritems; i++) {
1503                 item_off = btrfs_item_ptr_offset(leaf, i);
1504                 item_len = btrfs_item_size(leaf, i);
1505
1506                 btrfs_item_key_to_cpu(leaf, key, i);
1507                 if (!key_in_sk(key, sk))
1508                         continue;
1509
1510                 if (sizeof(sh) + item_len > *buf_size) {
1511                         if (*num_found) {
1512                                 ret = 1;
1513                                 goto out;
1514                         }
1515
1516                         /*
1517                          * return one empty item back for v1, which does not
1518                          * handle -EOVERFLOW
1519                          */
1520
1521                         *buf_size = sizeof(sh) + item_len;
1522                         item_len = 0;
1523                         ret = -EOVERFLOW;
1524                 }
1525
1526                 if (sizeof(sh) + item_len + *sk_offset > *buf_size) {
1527                         ret = 1;
1528                         goto out;
1529                 }
1530
1531                 sh.objectid = key->objectid;
1532                 sh.type = key->type;
1533                 sh.offset = key->offset;
1534                 sh.len = item_len;
1535                 sh.transid = found_transid;
1536
1537                 /*
1538                  * Copy search result header. If we fault then loop again so we
1539                  * can fault in the pages and -EFAULT there if there's a
1540                  * problem. Otherwise we'll fault and then copy the buffer in
1541                  * properly this next time through
1542                  */
1543                 if (copy_to_user_nofault(ubuf + *sk_offset, &sh, sizeof(sh))) {
1544                         ret = 0;
1545                         goto out;
1546                 }
1547
1548                 *sk_offset += sizeof(sh);
1549
1550                 if (item_len) {
1551                         char __user *up = ubuf + *sk_offset;
1552                         /*
1553                          * Copy the item, same behavior as above, but reset the
1554                          * * sk_offset so we copy the full thing again.
1555                          */
1556                         if (read_extent_buffer_to_user_nofault(leaf, up,
1557                                                 item_off, item_len)) {
1558                                 ret = 0;
1559                                 *sk_offset -= sizeof(sh);
1560                                 goto out;
1561                         }
1562
1563                         *sk_offset += item_len;
1564                 }
1565                 (*num_found)++;
1566
1567                 if (ret) /* -EOVERFLOW from above */
1568                         goto out;
1569
1570                 if (*num_found >= sk->nr_items) {
1571                         ret = 1;
1572                         goto out;
1573                 }
1574         }
1575 advance_key:
1576         ret = 0;
1577         test.objectid = sk->max_objectid;
1578         test.type = sk->max_type;
1579         test.offset = sk->max_offset;
1580         if (btrfs_comp_cpu_keys(key, &test) >= 0)
1581                 ret = 1;
1582         else if (key->offset < (u64)-1)
1583                 key->offset++;
1584         else if (key->type < (u8)-1) {
1585                 key->offset = 0;
1586                 key->type++;
1587         } else if (key->objectid < (u64)-1) {
1588                 key->offset = 0;
1589                 key->type = 0;
1590                 key->objectid++;
1591         } else
1592                 ret = 1;
1593 out:
1594         /*
1595          *  0: all items from this leaf copied, continue with next
1596          *  1: * more items can be copied, but unused buffer is too small
1597          *     * all items were found
1598          *     Either way, it will stops the loop which iterates to the next
1599          *     leaf
1600          *  -EOVERFLOW: item was to large for buffer
1601          *  -EFAULT: could not copy extent buffer back to userspace
1602          */
1603         return ret;
1604 }
1605
/*
 * Shared worker for the tree search ioctls: walk a tree forward from the
 * minimum key in @sk and let copy_to_sk() copy the items found to @ubuf.
 *
 * @root:     root searched when sk->tree_id is 0; for any other tree_id the
 *            root is looked up with btrfs_get_fs_root()
 * @sk:       search key; min_objectid/min_type/min_offset seed the start key,
 *            min_transid is handed to btrfs_search_forward(), and nr_items is
 *            overwritten with the number of items found
 * @buf_size: size of @ubuf in bytes. If it cannot hold even one
 *            struct btrfs_ioctl_search_header it is set to that size and
 *            -EOVERFLOW is returned.
 * @ubuf:     user space buffer that receives the results
 *
 * Return: 0 on success (positive results from btrfs_search_forward() and
 * copy_to_sk() are normalized to 0), -EOVERFLOW for a too small buffer,
 * -ENOMEM if no path could be allocated, -EFAULT if the user buffer cannot be
 * faulted in, or a negative errno from the root lookup, the search or the
 * copy.
 */
1606 static noinline int search_ioctl(struct btrfs_root *root,
1607                                  struct btrfs_ioctl_search_key *sk,
1608                                  u64 *buf_size,
1609                                  char __user *ubuf)
1610 {
1611         struct btrfs_fs_info *info = root->fs_info;
1612         struct btrfs_key key;
1613         struct btrfs_path *path;
1614         int ret;
1615         int num_found = 0;
1616         unsigned long sk_offset = 0;
1617
             /* The buffer must be able to hold at least one search header. */
1618         if (*buf_size < sizeof(struct btrfs_ioctl_search_header)) {
1619                 *buf_size = sizeof(struct btrfs_ioctl_search_header);
1620                 return -EOVERFLOW;
1621         }
1622
1623         path = btrfs_alloc_path();
1624         if (!path)
1625                 return -ENOMEM;
1626
             /* Both branches take a root reference that is put before returning. */
1627         if (sk->tree_id == 0) {
1628                 /* Search the root that we got passed. */
1629                 root = btrfs_grab_root(root);
1630         } else {
1631                 /* Look up the root from the arguments. */
1632                 root = btrfs_get_fs_root(info, sk->tree_id, true);
1633                 if (IS_ERR(root)) {
1634                         btrfs_free_path(path);
1635                         return PTR_ERR(root);
1636                 }
1637         }
1638
             /* Start the search at the minimum key supplied by the caller. */
1639         key.objectid = sk->min_objectid;
1640         key.type = sk->min_type;
1641         key.offset = sk->min_offset;
1642
1643         while (1) {
1644                 /*
1645                  * Ensure that the whole user buffer is faulted in at sub-page
1646                  * granularity, otherwise the loop may live-lock.
1647                  */
1648                 if (fault_in_subpage_writeable(ubuf + sk_offset, *buf_size - sk_offset)) {
1649                         ret = -EFAULT;
1650                         break;
1651                 }
1652
1653                 ret = btrfs_search_forward(root, &key, path, sk->min_transid);
                     /* Any non-zero result, error or positive, ends the loop. */
1654                 if (ret)
1655                         break;
1656
1657                 ret = copy_to_sk(path, &key, sk, buf_size, ubuf,
1658                                  &sk_offset, &num_found);
                     /* The path is released before deciding whether to go on. */
1659                 btrfs_release_path(path);
1660                 if (ret)
1661                         break;
1662
1663         }
1664         /* Normalize return values from btrfs_search_forward() and copy_to_sk(). */
1665         if (ret > 0)
1666                 ret = 0;
1667
             /* The item count is stored even when an error is returned. */
1668         sk->nr_items = num_found;
1669         btrfs_put_root(root);
1670         btrfs_free_path(path);
1671         return ret;
1672 }
1673
/*
 * Tree search using the fixed-size buffer embedded in
 * struct btrfs_ioctl_search_args.
 *
 * @root: root handed on to search_ioctl()
 * @argp: user pointer to a struct btrfs_ioctl_search_args
 *
 * Requires CAP_SYS_ADMIN. The search key is copied in from user space,
 * search_ioctl() fills uargs->buf, and on success the key (with nr_items set
 * by search_ioctl()) is copied back.
 *
 * Return: 0 on success, also when search_ioctl() reported -EOVERFLOW;
 * -EPERM without CAP_SYS_ADMIN; -EFAULT if the key cannot be copied from or
 * to user space; otherwise the error from search_ioctl().
 */
1674 static noinline int btrfs_ioctl_tree_search(struct btrfs_root *root,
1675                                             void __user *argp)
1676 {
1677         struct btrfs_ioctl_search_args __user *uargs = argp;
1678         struct btrfs_ioctl_search_key sk;
1679         int ret;
1680         u64 buf_size;
1681
1682         if (!capable(CAP_SYS_ADMIN))
1683                 return -EPERM;
1684
1685         if (copy_from_user(&sk, &uargs->key, sizeof(sk)))
1686                 return -EFAULT;
1687
             /* The result buffer is the array embedded in the user structure. */
1688         buf_size = sizeof(uargs->buf);
1689
1690         ret = search_ioctl(root, &sk, &buf_size, uargs->buf);
1691
1692         /*
1693          * In the original implementation an overflow is handled by returning a
1694          * search header with a len of zero, so reset ret.
1695          */
1696         if (ret == -EOVERFLOW)
1697                 ret = 0;
1698
             /* Only a successful search writes the key back to user space. */
1699         if (ret == 0 && copy_to_user(&uargs->key, &sk, sizeof(sk)))
1700                 ret = -EFAULT;
1701         return ret;
1702 }
1703
/*
 * Tree search with a caller-sized result buffer
 * (struct btrfs_ioctl_search_args_v2).
 *
 * @root: root handed on to search_ioctl()
 * @argp: user pointer to a struct btrfs_ioctl_search_args_v2
 *
 * Requires CAP_SYS_ADMIN. The buffer size comes from args.buf_size and is
 * capped at SZ_16M. On success the search key is copied back to user space;
 * on -EOVERFLOW the buf_size value left by search_ioctl() is copied back to
 * uarg->buf_size instead and -EOVERFLOW is still returned.
 *
 * Return: 0 on success, -EPERM without CAP_SYS_ADMIN, -EFAULT if a copy from
 * or to user space fails, otherwise the error from search_ioctl().
 */
1704 static noinline int btrfs_ioctl_tree_search_v2(struct btrfs_root *root,
1705                                                void __user *argp)
1706 {
1707         struct btrfs_ioctl_search_args_v2 __user *uarg = argp;
1708         struct btrfs_ioctl_search_args_v2 args;
1709         int ret;
1710         u64 buf_size;
1711         const u64 buf_limit = SZ_16M;
1712
1713         if (!capable(CAP_SYS_ADMIN))
1714                 return -EPERM;
1715
1716         /* copy search header and buffer size */
1717         if (copy_from_user(&args, uarg, sizeof(args)))
1718                 return -EFAULT;
1719
1720         buf_size = args.buf_size;
1721
1722         /* limit result size to 16MB */
1723         if (buf_size > buf_limit)
1724                 buf_size = buf_limit;
1725
             /* Results are written to the buffer that follows the header in user memory. */
1726         ret = search_ioctl(root, &args.key, &buf_size,
1727                            (char __user *)(&uarg->buf[0]));
             /*
              * Success: hand the key back. Overflow: hand the buffer size back;
              * ret stays -EOVERFLOW unless that copy faults.
              */
1728         if (ret == 0 && copy_to_user(&uarg->key, &args.key, sizeof(args.key)))
1729                 ret = -EFAULT;
1730         else if (ret == -EOVERFLOW &&
1731                 copy_to_user(&uarg->buf_size, &buf_size, sizeof(buf_size)))
1732                 ret = -EFAULT;
1733
1734         return ret;
1735 }
1736
1737 /*
1738  * Search INODE_REFs to identify the path name of the 'dirid' directory
1739  * in the 'tree_id' tree, and store that path name in 'name'.
      *
      * The path is assembled backwards from the end of 'name' (the last index
      * used is BTRFS_INO_LOOKUP_PATH_MAX - 1): each round looks up an INODE_REF
      * for the current directory, prepends its name followed by '/', and
      * continues with the directory given by key.offset until that value is
      * BTRFS_FIRST_FREE_OBJECTID. The result is then moved to the start of
      * 'name' and NUL terminated. If 'dirid' is BTRFS_FIRST_FREE_OBJECTID the
      * result is the empty string.
      *
      * Returns 0 on success, -ENOMEM if no path could be allocated, -ENOENT if
      * btrfs_search_backwards() returns a positive value, -ENAMETOOLONG if the
      * components do not fit in 'name', or a negative errno from the root
      * lookup or the search.
1740  */
1741 static noinline int btrfs_search_path_in_tree(struct btrfs_fs_info *info,
1742                                 u64 tree_id, u64 dirid, char *name)
1743 {
1744         struct btrfs_root *root;
1745         struct btrfs_key key;
1746         char *ptr;
1747         int ret = -1;
1748         int slot;
1749         int len;
1750         int total_len = 0;
1751         struct btrfs_inode_ref *iref;
1752         struct extent_buffer *l;
1753         struct btrfs_path *path;
1754
1755         if (dirid == BTRFS_FIRST_FREE_OBJECTID) {
1756                 name[0]='\0';
1757                 return 0;
1758         }
1759
1760         path = btrfs_alloc_path();
1761         if (!path)
1762                 return -ENOMEM;
1763
             /* Write cursor: starts at the last byte and moves towards name[0]. */
1764         ptr = &name[BTRFS_INO_LOOKUP_PATH_MAX - 1];
1765
1766         root = btrfs_get_fs_root(info, tree_id, true);
1767         if (IS_ERR(root)) {
1768                 ret = PTR_ERR(root);
                     /* Clear the error pointer so the put under 'out' gets NULL. */
1769                 root = NULL;
1770                 goto out;
1771         }
1772
1773         key.objectid = dirid;
1774         key.type = BTRFS_INODE_REF_KEY;
1775         key.offset = (u64)-1;
1776
1777         while (1) {
1778                 ret = btrfs_search_backwards(root, &key, path);
1779                 if (ret < 0)
1780                         goto out;
1781                 else if (ret > 0) {
1782                         ret = -ENOENT;
1783                         goto out;
1784                 }
1785
1786                 l = path->nodes[0];
1787                 slot = path->slots[0];
1788
1789                 iref = btrfs_item_ptr(l, slot, struct btrfs_inode_ref);
1790                 len = btrfs_inode_ref_name_len(l, iref);
                     /* Make room for the name plus the '/' that follows it. */
1791                 ptr -= len + 1;
1792                 total_len += len + 1;
1793                 if (ptr < name) {
1794                         ret = -ENAMETOOLONG;
1795                         goto out;
1796                 }
1797
1798                 *(ptr + len) = '/';
                     /* The name bytes are read from right after the inode ref struct. */
1799                 read_extent_buffer(l, ptr, (unsigned long)(iref + 1), len);
1800
1801                 if (key.offset == BTRFS_FIRST_FREE_OBJECTID)
1802                         break;
1803
1804                 btrfs_release_path(path);
                     /*
                      * Continue one level up. Presumably btrfs_search_backwards()
                      * has updated key to the item it found, so key.offset is the
                      * parent directory's inode number - TODO confirm.
                      */
1805                 key.objectid = key.offset;
1806                 key.offset = (u64)-1;
1807                 dirid = key.objectid;
1808         }
             /* Shift the assembled path to the front of the buffer and terminate it. */
1809         memmove(name, ptr, total_len);
1810         name[total_len] = '\0';
1811         ret = 0;
1812 out:
1813         btrfs_put_root(root);
1814         btrfs_free_path(path);
1815         return ret;
1816 }
1817
/*
 * Build the directory path and the subvolume name for the unprivileged ino
 * lookup.
 *
 * @idmap: mount idmapping passed to inode_permission()
 * @inode: inode the lookup is relative to; its inode number is the upper
 *         limit of the walk and its root id selects the tree to search
 * @args:  dirid and treeid are read; path and name are filled in
 *
 * Unless args->dirid already equals the upper limit, the path is assembled
 * backwards in args->path by following INODE_REF items upwards, each
 * component followed by '/', and every directory on the way must grant
 * MAY_READ | MAY_EXEC. Afterwards the ROOT_REF item keyed by the root id of
 * @inode and args->treeid is read from the tree root, its dirid is compared
 * with args->dirid, and its name is copied to args->name.
 *
 * Return: 0 on success, -ENOMEM if no path could be allocated, -ENOENT if an
 * expected item is not found, -ENAMETOOLONG if the path does not fit,
 * -EACCES if a permission check fails or BTRFS_FIRST_FREE_OBJECTID is reached
 * before the upper limit, -EINVAL if the ROOT_REF dirid does not match, or
 * another negative errno from the helpers.
 */
1818 static int btrfs_search_path_in_tree_user(struct mnt_idmap *idmap,
1819                                 struct inode *inode,
1820                                 struct btrfs_ioctl_ino_lookup_user_args *args)
1821 {
1822         struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
1823         u64 upper_limit = btrfs_ino(BTRFS_I(inode));
1824         u64 treeid = btrfs_root_id(BTRFS_I(inode)->root);
1825         u64 dirid = args->dirid;
1826         unsigned long item_off;
1827         unsigned long item_len;
1828         struct btrfs_inode_ref *iref;
1829         struct btrfs_root_ref *rref;
1830         struct btrfs_root *root = NULL;
1831         struct btrfs_path *path;
1832         struct btrfs_key key, key2;
1833         struct extent_buffer *leaf;
1834         char *ptr;
1835         int slot;
1836         int len;
1837         int total_len = 0;
1838         int ret;
1839
1840         path = btrfs_alloc_path();
1841         if (!path)
1842                 return -ENOMEM;
1843
1844         /*
1845          * If the bottom subvolume does not exist directly under upper_limit,
1846          * construct the path in from the bottom up.
1847          */
1848         if (dirid != upper_limit) {
                     /* Write cursor: starts at the last byte and moves towards path[0]. */
1849                 ptr = &args->path[BTRFS_INO_LOOKUP_USER_PATH_MAX - 1];
1850
1851                 root = btrfs_get_fs_root(fs_info, treeid, true);
1852                 if (IS_ERR(root)) {
1853                         ret = PTR_ERR(root);
                             /* 'out' skips the root put, so the error pointer is never put. */
1854                         goto out;
1855                 }
1856
1857                 key.objectid = dirid;
1858                 key.type = BTRFS_INODE_REF_KEY;
1859                 key.offset = (u64)-1;
1860                 while (1) {
1861                         struct btrfs_inode *temp_inode;
1862
1863                         ret = btrfs_search_backwards(root, &key, path);
1864                         if (ret < 0)
1865                                 goto out_put;
1866                         else if (ret > 0) {
1867                                 ret = -ENOENT;
1868                                 goto out_put;
1869                         }
1870
1871                         leaf = path->nodes[0];
1872                         slot = path->slots[0];
1873
1874                         iref = btrfs_item_ptr(leaf, slot, struct btrfs_inode_ref);
1875                         len = btrfs_inode_ref_name_len(leaf, iref);
                             /* Make room for the name plus the '/' that follows it. */
1876                         ptr -= len + 1;
1877                         total_len += len + 1;
1878                         if (ptr < args->path) {
1879                                 ret = -ENAMETOOLONG;
1880                                 goto out_put;
1881                         }
1882
1883                         *(ptr + len) = '/';
1884                         read_extent_buffer(leaf, ptr,
1885                                         (unsigned long)(iref + 1), len);
1886
1887                         /* Check the read+exec permission of this directory */
1888                         ret = btrfs_previous_item(root, path, dirid,
1889                                                   BTRFS_INODE_ITEM_KEY);
1890                         if (ret < 0) {
1891                                 goto out_put;
1892                         } else if (ret > 0) {
1893                                 ret = -ENOENT;
1894                                 goto out_put;
1895                         }
1896
1897                         leaf = path->nodes[0];
1898                         slot = path->slots[0];
1899                         btrfs_item_key_to_cpu(leaf, &key2, slot);
                             /* The item found must belong to the directory being checked. */
1900                         if (key2.objectid != dirid) {
1901                                 ret = -ENOENT;
1902                                 goto out_put;
1903                         }
1904
1905                         /*
1906                          * We don't need the path anymore, so release it and
1907                          * avoid deadlocks and lockdep warnings in case
1908                          * btrfs_iget() needs to lookup the inode from its root
1909                          * btree and lock the same leaf.
1910                          */
1911                         btrfs_release_path(path);
1912                         temp_inode = btrfs_iget(key2.objectid, root);
1913                         if (IS_ERR(temp_inode)) {
1914                                 ret = PTR_ERR(temp_inode);
1915                                 goto out_put;
1916                         }
1917                         ret = inode_permission(idmap, &temp_inode->vfs_inode,
1918                                                MAY_READ | MAY_EXEC);
1919                         iput(&temp_inode->vfs_inode);
                             /* Any permission failure is reported as -EACCES. */
1920                         if (ret) {
1921                                 ret = -EACCES;
1922                                 goto out_put;
1923                         }
1924
                             /*
                              * Presumably btrfs_search_backwards() has updated key to
                              * the INODE_REF it found, so key.offset is the parent
                              * directory's inode number - TODO confirm.
                              */
1925                         if (key.offset == upper_limit)
1926                                 break;
1927                         if (key.objectid == BTRFS_FIRST_FREE_OBJECTID) {
1928                                 ret = -EACCES;
1929                                 goto out_put;
1930                         }
1931
1932                         key.objectid = key.offset;
1933                         key.offset = (u64)-1;
1934                         dirid = key.objectid;
1935                 }
1936
                     /* Shift the assembled path to the front of the buffer and terminate it. */
1937                 memmove(args->path, ptr, total_len);
1938                 args->path[total_len] = '\0';
1939                 btrfs_put_root(root);
                     /* Cleared so the put under 'out_put' does not drop the root twice. */
1940                 root = NULL;
1941                 btrfs_release_path(path);
1942         }
1943
1944         /* Get the bottom subvolume's name from ROOT_REF */
1945         key.objectid = treeid;
1946         key.type = BTRFS_ROOT_REF_KEY;
1947         key.offset = args->treeid;
1948         ret = btrfs_search_slot(NULL, fs_info->tree_root, &key, path, 0, 0);
1949         if (ret < 0) {
1950                 goto out;
1951         } else if (ret > 0) {
1952                 ret = -ENOENT;
1953                 goto out;
1954         }
1955
1956         leaf = path->nodes[0];
1957         slot = path->slots[0];
1958         btrfs_item_key_to_cpu(leaf, &key, slot);
1959
1960         item_off = btrfs_item_ptr_offset(leaf, slot);
1961         item_len = btrfs_item_size(leaf, slot);
1962         /* Check if dirid in ROOT_REF corresponds to passed dirid */
1963         rref = btrfs_item_ptr(leaf, slot, struct btrfs_root_ref);
1964         if (args->dirid != btrfs_root_ref_dirid(leaf, rref)) {
1965                 ret = -EINVAL;
1966                 goto out;
1967         }
1968
1969         /* Copy subvolume's name */
             /* Skip the struct btrfs_root_ref; the rest of the item is copied as the name. */
1970         item_off += sizeof(struct btrfs_root_ref);
1971         item_len -= sizeof(struct btrfs_root_ref);
1972         read_extent_buffer(leaf, args->name, item_off, item_len);
1973         args->name[item_len] = 0;
1974
             /* On success ret is still 0 from btrfs_search_slot() and falls through. */
1975 out_put:
1976         btrfs_put_root(root);
1977 out:
1978         btrfs_free_path(path);
1979         return ret;
1980 }
1981
/*
 * Inode lookup: resolve the path of directory args->objectid inside tree
 * args->treeid into args->name.
 *
 * @root: root whose id replaces a zero args->treeid; also provides fs_info
 * @argp: user pointer to a struct btrfs_ioctl_ino_lookup_args, read and, on
 *        success, written back
 *
 * When args->objectid is BTRFS_FIRST_FREE_OBJECTID the name is set to the
 * empty string and no capability is needed. Any other objectid requires
 * CAP_SYS_ADMIN and is resolved by btrfs_search_path_in_tree().
 *
 * Return: 0 on success, -EPERM without CAP_SYS_ADMIN, -EFAULT if the result
 * cannot be copied to user space, the error from memdup_user(), or the error
 * from btrfs_search_path_in_tree().
 */
1982 static noinline int btrfs_ioctl_ino_lookup(struct btrfs_root *root,
1983                                            void __user *argp)
1984 {
1985         struct btrfs_ioctl_ino_lookup_args *args;
1986         int ret = 0;
1987
1988         args = memdup_user(argp, sizeof(*args));
1989         if (IS_ERR(args))
1990                 return PTR_ERR(args);
1991
1992         /*
1993          * Unprivileged query to obtain the containing subvolume root id. The
1994          * path is reset so it's consistent with btrfs_search_path_in_tree.
1995          */
1996         if (args->treeid == 0)
1997                 args->treeid = btrfs_root_id(root);
1998
1999         if (args->objectid == BTRFS_FIRST_FREE_OBJECTID) {
2000                 args->name[0] = 0;
2001                 goto out;
2002         }
2003
             /* Everything beyond the query above needs CAP_SYS_ADMIN. */
2004         if (!capable(CAP_SYS_ADMIN)) {
2005                 ret = -EPERM;
2006                 goto out;
2007         }
2008
2009         ret = btrfs_search_path_in_tree(root->fs_info,
2010                                         args->treeid, args->objectid,
2011                                         args->name);
2012
2013 out:
             /* Results are only copied back when the lookup succeeded. */
2014         if (ret == 0 && copy_to_user(argp, args, sizeof(*args)))
2015                 ret = -EFAULT;
2016
2017         kfree(args);
2018         return ret;
2019 }
2020
2021 /*
2022  * Version of ino_lookup ioctl (unprivileged)
2023  *
2024  * The main differences from ino_lookup ioctl are:
2025  *
2026  *   1. Read + Exec permission will be checked using inode_permission() during
2027  *      path construction. -EACCES will be returned in case of failure.
2028  *   2. Path construction will be stopped at the inode number which corresponds
2029  *      to the fd with which this ioctl is called. If constructed path does not
2030  *      exist under fd's inode, -EACCES will be returned.
2031  *   3. The name of bottom subvolume is also searched and filled.
      *
      * The arguments are copied in from 'argp' and, when
      * btrfs_search_path_in_tree_user() succeeds, copied back to it.
      *
      * Returns 0 on success, -EACCES as described above, -EFAULT if the result
      * cannot be copied to user space, the error from memdup_user(), or the
      * error from btrfs_search_path_in_tree_user().
2032  */
2033 static int btrfs_ioctl_ino_lookup_user(struct file *file, void __user *argp)
2034 {
2035         struct btrfs_ioctl_ino_lookup_user_args *args;
2036         struct inode *inode;
2037         int ret;
2038
2039         args = memdup_user(argp, sizeof(*args));
2040         if (IS_ERR(args))
2041                 return PTR_ERR(args);
2042
2043         inode = file_inode(file);
2044
             /*
              * A dirid of BTRFS_FIRST_FREE_OBJECTID is only accepted when the
              * fd's own inode number is BTRFS_FIRST_FREE_OBJECTID as well.
              */
2045         if (args->dirid == BTRFS_FIRST_FREE_OBJECTID &&
2046             btrfs_ino(BTRFS_I(inode)) != BTRFS_FIRST_FREE_OBJECTID) {
2047                 /*
2048                  * The subvolume does not exist under fd with which this is
2049                  * called
2050                  */
2051                 kfree(args);
2052                 return -EACCES;
2053         }
2054
2055         ret = btrfs_search_path_in_tree_user(file_mnt_idmap(file), inode, args);
2056
2057         if (ret == 0 && copy_to_user(argp, args, sizeof(*args)))
2058                 ret = -EFAULT;
2059
2060         kfree(args);
2061         return ret;
2062 }
2063
2064 /* Get the subvolume information in BTRFS_ROOT_ITEM and BTRFS_ROOT_BACKREF */
/*
 * @inode: inode whose subvolume (the root it belongs to) is described
 * @argp:  user pointer that receives a
 *         struct btrfs_ioctl_get_subvol_info_args
 *
 * The generation, flags, UUIDs and the c/o/s/r transaction ids and times are
 * taken from the in-memory root item. For any root other than
 * BTRFS_FS_TREE_OBJECTID the ROOT_BACKREF item is looked up in the tree root
 * to fill parent_id, dirid and name; for BTRFS_FS_TREE_OBJECTID those fields
 * keep the zeroes from kzalloc().
 *
 * Return: 0 on success, -ENOMEM on allocation failure, -ENOENT if no matching
 * ROOT_BACKREF item is found, -EUCLEAN if btrfs_next_leaf() finds no further
 * leaf, -EFAULT if the result cannot be copied to user space, or another
 * negative errno from the lookups.
 */
2065 static int btrfs_ioctl_get_subvol_info(struct inode *inode, void __user *argp)
2066 {
2067         struct btrfs_ioctl_get_subvol_info_args *subvol_info;
2068         struct btrfs_fs_info *fs_info;
2069         struct btrfs_root *root;
2070         struct btrfs_path *path;
2071         struct btrfs_key key;
2072         struct btrfs_root_item *root_item;
2073         struct btrfs_root_ref *rref;
2074         struct extent_buffer *leaf;
2075         unsigned long item_off;
2076         unsigned long item_len;
2077         int slot;
2078         int ret = 0;
2079
2080         path = btrfs_alloc_path();
2081         if (!path)
2082                 return -ENOMEM;
2083
             /* Zeroed, so every field that is not filled in below reads as 0. */
2084         subvol_info = kzalloc(sizeof(*subvol_info), GFP_KERNEL);
2085         if (!subvol_info) {
2086                 btrfs_free_path(path);
2087                 return -ENOMEM;
2088         }
2089
2090         fs_info = BTRFS_I(inode)->root->fs_info;
2091
2092         /* Get root_item of inode's subvolume */
2093         key.objectid = btrfs_root_id(BTRFS_I(inode)->root);
2094         root = btrfs_get_fs_root(fs_info, key.objectid, true);
2095         if (IS_ERR(root)) {
2096                 ret = PTR_ERR(root);
                     /* 'out_free' skips the root put, so the error pointer is never put. */
2097                 goto out_free;
2098         }
2099         root_item = &root->root_item;
2100
2101         subvol_info->treeid = key.objectid;
2102
2103         subvol_info->generation = btrfs_root_generation(root_item);
2104         subvol_info->flags = btrfs_root_flags(root_item);
2105
2106         memcpy(subvol_info->uuid, root_item->uuid, BTRFS_UUID_SIZE);
2107         memcpy(subvol_info->parent_uuid, root_item->parent_uuid,
2108                                                     BTRFS_UUID_SIZE);
2109         memcpy(subvol_info->received_uuid, root_item->received_uuid,
2110                                                     BTRFS_UUID_SIZE);
2111
2112         subvol_info->ctransid = btrfs_root_ctransid(root_item);
2113         subvol_info->ctime.sec = btrfs_stack_timespec_sec(&root_item->ctime);
2114         subvol_info->ctime.nsec = btrfs_stack_timespec_nsec(&root_item->ctime);
2115
2116         subvol_info->otransid = btrfs_root_otransid(root_item);
2117         subvol_info->otime.sec = btrfs_stack_timespec_sec(&root_item->otime);
2118         subvol_info->otime.nsec = btrfs_stack_timespec_nsec(&root_item->otime);
2119
2120         subvol_info->stransid = btrfs_root_stransid(root_item);
2121         subvol_info->stime.sec = btrfs_stack_timespec_sec(&root_item->stime);
2122         subvol_info->stime.nsec = btrfs_stack_timespec_nsec(&root_item->stime);
2123
2124         subvol_info->rtransid = btrfs_root_rtransid(root_item);
2125         subvol_info->rtime.sec = btrfs_stack_timespec_sec(&root_item->rtime);
2126         subvol_info->rtime.nsec = btrfs_stack_timespec_nsec(&root_item->rtime);
2127
2128         if (key.objectid != BTRFS_FS_TREE_OBJECTID) {
2129                 /* Search root tree for ROOT_BACKREF of this subvolume */
2130                 key.type = BTRFS_ROOT_BACKREF_KEY;
2131                 key.offset = 0;
2132                 ret = btrfs_search_slot(NULL, fs_info->tree_root, &key, path, 0, 0);
2133                 if (ret < 0) {
2134                         goto out;
2135                 } else if (path->slots[0] >=
2136                            btrfs_header_nritems(path->nodes[0])) {
                             /* The slot is past the last item of this leaf: move on. */
2137                         ret = btrfs_next_leaf(fs_info->tree_root, path);
2138                         if (ret < 0) {
2139                                 goto out;
2140                         } else if (ret > 0) {
2141                                 ret = -EUCLEAN;
2142                                 goto out;
2143                         }
2144                 }
2145
2146                 leaf = path->nodes[0];
2147                 slot = path->slots[0];
2148                 btrfs_item_key_to_cpu(leaf, &key, slot);
                     /* Accept the item only if it is a ROOT_BACKREF of this subvolume. */
2149                 if (key.objectid == subvol_info->treeid &&
2150                     key.type == BTRFS_ROOT_BACKREF_KEY) {
2151                         subvol_info->parent_id = key.offset;
2152
2153                         rref = btrfs_item_ptr(leaf, slot, struct btrfs_root_ref);
2154                         subvol_info->dirid = btrfs_root_ref_dirid(leaf, rref);
2155
                             /* The name is the part of the item after struct btrfs_root_ref. */
2156                         item_off = btrfs_item_ptr_offset(leaf, slot)
2157                                         + sizeof(struct btrfs_root_ref);
2158                         item_len = btrfs_item_size(leaf, slot)
2159                                         - sizeof(struct btrfs_root_ref);
2160                         read_extent_buffer(leaf, subvol_info->name,
2161                                            item_off, item_len);
2162                 } else {
2163                         ret = -ENOENT;
2164                         goto out;
2165                 }
2166         }
2167
             /*
              * The path is freed before copy_to_user() and the pointer cleared so
              * the btrfs_free_path() under 'out_free' receives NULL. Presumably
              * this keeps tree locks from being held while user memory may
              * fault - TODO confirm.
              */
2168         btrfs_free_path(path);
2169         path = NULL;
2170         if (copy_to_user(argp, subvol_info, sizeof(*subvol_info)))
2171                 ret = -EFAULT;
2172
2173 out:
2174         btrfs_put_root(root);
2175 out_free:
2176         btrfs_free_path(path);
2177         kfree(subvol_info);
2178         return ret;
2179 }
2180
2181 /*
2182  * Return ROOT_REF information of the subvolume containing this inode
2183  * except the subvolume name.
      *
      * The arguments are copied in from 'argp'. ROOT_REF items of the root's id
      * are read from the tree root starting at offset rootrefs->min_treeid, and
      * for each one the treeid (key offset) and dirid are stored in
      * rootrefs->rootref[]. At most BTRFS_MAX_ROOTREF_BUFFER_NUM entries are
      * stored; if another matching item exists after that, -EOVERFLOW is
      * returned.
      *
      * For both 0 and -EOVERFLOW, num_items is set, min_treeid is advanced past
      * the last entry stored (when there is one), and the structure is copied
      * back to 'argp'.
      *
      * Returns 0 when an item with a different objectid or type ends the walk,
      * -EOVERFLOW as described above, -ENOMEM if no path could be allocated,
      * -EUCLEAN if btrfs_next_leaf() or btrfs_next_item() return a positive
      * value, -EFAULT if the copy back fails, the error from memdup_user(), or
      * another negative errno from the lookups.
2184  */
2185 static int btrfs_ioctl_get_subvol_rootref(struct btrfs_root *root,
2186                                           void __user *argp)
2187 {
2188         struct btrfs_ioctl_get_subvol_rootref_args *rootrefs;
2189         struct btrfs_root_ref *rref;
2190         struct btrfs_path *path;
2191         struct btrfs_key key;
2192         struct extent_buffer *leaf;
2193         u64 objectid;
2194         int slot;
2195         int ret;
2196         u8 found;
2197
2198         path = btrfs_alloc_path();
2199         if (!path)
2200                 return -ENOMEM;
2201
2202         rootrefs = memdup_user(argp, sizeof(*rootrefs));
2203         if (IS_ERR(rootrefs)) {
2204                 btrfs_free_path(path);
2205                 return PTR_ERR(rootrefs);
2206         }
2207
2208         objectid = btrfs_root_id(root);
2209         key.objectid = objectid;
2210         key.type = BTRFS_ROOT_REF_KEY;
2211         key.offset = rootrefs->min_treeid;
2212         found = 0;
2213
             /* From here on 'root' is the tree root, where the ROOT_REF items are searched. */
2214         root = root->fs_info->tree_root;
2215         ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
2216         if (ret < 0) {
2217                 goto out;
2218         } else if (path->slots[0] >=
2219                    btrfs_header_nritems(path->nodes[0])) {
                     /* The slot is past the last item of this leaf: move on. */
2220                 ret = btrfs_next_leaf(root, path);
2221                 if (ret < 0) {
2222                         goto out;
2223                 } else if (ret > 0) {
2224                         ret = -EUCLEAN;
2225                         goto out;
2226                 }
2227         }
2228         while (1) {
2229                 leaf = path->nodes[0];
2230                 slot = path->slots[0];
2231
2232                 btrfs_item_key_to_cpu(leaf, &key, slot);
                     /* The first item that is not a ROOT_REF of this root ends the walk. */
2233                 if (key.objectid != objectid || key.type != BTRFS_ROOT_REF_KEY) {
2234                         ret = 0;
2235                         goto out;
2236                 }
2237
                     /* Another matching item exists but the result array is full. */
2238                 if (found == BTRFS_MAX_ROOTREF_BUFFER_NUM) {
2239                         ret = -EOVERFLOW;
2240                         goto out;
2241                 }
2242
2243                 rref = btrfs_item_ptr(leaf, slot, struct btrfs_root_ref);
2244                 rootrefs->rootref[found].treeid = key.offset;
2245                 rootrefs->rootref[found].dirid =
2246                                   btrfs_root_ref_dirid(leaf, rref);
2247                 found++;
2248
2249                 ret = btrfs_next_item(root, path);
2250                 if (ret < 0) {
2251                         goto out;
2252                 } else if (ret > 0) {
2253                         ret = -EUCLEAN;
2254                         goto out;
2255                 }
2256         }
2257
2258 out:
2259         btrfs_free_path(path);
2260
             /* Results, complete or partial, are only reported for 0 and -EOVERFLOW. */
2261         if (!ret || ret == -EOVERFLOW) {
2262                 rootrefs->num_items = found;
2263                 /* update min_treeid for next search */
2264                 if (found)
2265                         rootrefs->min_treeid =
2266                                 rootrefs->rootref[found - 1].treeid + 1;
2267                 if (copy_to_user(argp, rootrefs, sizeof(*rootrefs)))
2268                         ret = -EFAULT;
2269         }
2270
2271         kfree(rootrefs);
2272
2273         return ret;
2274 }
2275
2276 static noinline int btrfs_ioctl_snap_destroy(struct file *file,
2277                                              void __user *arg,
2278                                              bool destroy_v2)
2279 {
2280         struct dentry *parent = file->f_path.dentry;
2281         struct dentry *dentry;
2282         struct inode *dir = d_inode(parent);
2283         struct btrfs_fs_info *fs_info = inode_to_fs_info(dir);
2284         struct inode *inode;
2285         struct btrfs_root *root = BTRFS_I(dir)->root;
2286         struct btrfs_root *dest = NULL;
2287         struct btrfs_ioctl_vol_args *vol_args = NULL;
2288         struct btrfs_ioctl_vol_args_v2 *vol_args2 = NULL;
2289         struct mnt_idmap *idmap = file_mnt_idmap(file);
2290         char *subvol_name, *subvol_name_ptr = NULL;
2291         int ret = 0;
2292         bool destroy_parent = false;
2293
2294         /* We don't support snapshots with extent tree v2 yet. */
2295         if (btrfs_fs_incompat(fs_info, EXTENT_TREE_V2)) {
2296                 btrfs_err(fs_info,
2297                           "extent tree v2 doesn't support snapshot deletion yet");
2298                 return -EOPNOTSUPP;
2299         }
2300
2301         if (destroy_v2) {
2302                 vol_args2 = memdup_user(arg, sizeof(*vol_args2));
2303                 if (IS_ERR(vol_args2))
2304                         return PTR_ERR(vol_args2);
2305
2306                 if (vol_args2->flags & ~BTRFS_SUBVOL_DELETE_ARGS_MASK) {
2307                         ret = -EOPNOTSUPP;
2308                         goto out;
2309                 }
2310
2311                 /*
2312                  * If SPEC_BY_ID is not set, we are looking for the subvolume by
2313                  * name, same as v1 currently does.
2314                  */
2315                 if (!(vol_args2->flags & BTRFS_SUBVOL_SPEC_BY_ID)) {
2316                         ret = btrfs_check_ioctl_vol_args2_subvol_name(vol_args2);
2317                         if (ret < 0)
2318                                 goto out;
2319                         subvol_name = vol_args2->name;
2320
2321                         ret = mnt_want_write_file(file);
2322                         if (ret)
2323                                 goto out;
2324                 } else {
2325                         struct inode *old_dir;
2326
2327                         if (vol_args2->subvolid < BTRFS_FIRST_FREE_OBJECTID) {
2328                                 ret = -EINVAL;
2329                                 goto out;
2330                         }
2331
2332                         ret = mnt_want_write_file(file);
2333                         if (ret)
2334                                 goto out;
2335
2336                         dentry = btrfs_get_dentry(fs_info->sb,
2337                                         BTRFS_FIRST_FREE_OBJECTID,
2338                                         vol_args2->subvolid, 0);
2339                         if (IS_ERR(dentry)) {
2340                                 ret = PTR_ERR(dentry);
2341                                 goto out_drop_write;
2342                         }
2343
2344                         /*
2345                          * Change the default parent since the subvolume being
2346                          * deleted can be outside of the current mount point.
2347                          */
2348                         parent = btrfs_get_parent(dentry);
2349
2350                         /*
2351                          * At this point dentry->d_name can point to '/' if the
2352                          * subvolume we want to destroy is outside of the
2353                          * current mount point, so we need to release the
2354                          * current dentry and execute the lookup to return a new
2355                          * one with ->d_name pointing to the
2356                          * <mount point>/subvol_name.
2357                          */
2358                         dput(dentry);
2359                         if (IS_ERR(parent)) {
2360                                 ret = PTR_ERR(parent);
2361                                 goto out_drop_write;
2362                         }
2363                         old_dir = dir;
2364                         dir = d_inode(parent);
2365
2366                         /*
2367                          * If v2 was used with SPEC_BY_ID, a new parent was
2368                          * allocated since the subvolume can be outside of the
2369                          * current mount point. Later on we need to release this
2370                          * new parent dentry.
2371                          */
2372                         destroy_parent = true;
2373
2374                         /*
2375                          * On idmapped mounts, deletion via subvolid is
2376                          * restricted to subvolumes that are immediate
2377                          * ancestors of the inode referenced by the file
2378                          * descriptor in the ioctl. Otherwise the idmapping
2379                          * could potentially be abused to delete subvolumes
2380                          * anywhere in the filesystem the user wouldn't be able
2381                          * to delete without an idmapped mount.
2382                          */
2383                         if (old_dir != dir && idmap != &nop_mnt_idmap) {
2384                                 ret = -EOPNOTSUPP;
2385                                 goto free_parent;
2386                         }
2387
2388                         subvol_name_ptr = btrfs_get_subvol_name_from_objectid(
2389                                                 fs_info, vol_args2->subvolid);
2390                         if (IS_ERR(subvol_name_ptr)) {
2391                                 ret = PTR_ERR(subvol_name_ptr);
2392                                 goto free_parent;
2393                         }
2394                         /* subvol_name_ptr is already nul terminated */
2395                         subvol_name = (char *)kbasename(subvol_name_ptr);
2396                 }
2397         } else {
2398                 vol_args = memdup_user(arg, sizeof(*vol_args));
2399                 if (IS_ERR(vol_args))
2400                         return PTR_ERR(vol_args);
2401
2402                 ret = btrfs_check_ioctl_vol_args_path(vol_args);
2403                 if (ret < 0)
2404                         goto out;
2405
2406                 subvol_name = vol_args->name;
2407
2408                 ret = mnt_want_write_file(file);
2409                 if (ret)
2410                         goto out;
2411         }
2412
2413         if (strchr(subvol_name, '/') ||
2414             strcmp(subvol_name, "..") == 0) {
2415                 ret = -EINVAL;
2416                 goto free_subvol_name;
2417         }
2418
2419         if (!S_ISDIR(dir->i_mode)) {
2420                 ret = -ENOTDIR;
2421                 goto free_subvol_name;
2422         }
2423
2424         ret = down_write_killable_nested(&dir->i_rwsem, I_MUTEX_PARENT);
2425         if (ret == -EINTR)
2426                 goto free_subvol_name;
2427         dentry = lookup_one(idmap, &QSTR(subvol_name), parent);
2428         if (IS_ERR(dentry)) {
2429                 ret = PTR_ERR(dentry);
2430                 goto out_unlock_dir;
2431         }
2432
2433         if (d_really_is_negative(dentry)) {
2434                 ret = -ENOENT;
2435                 goto out_dput;
2436         }
2437
2438         inode = d_inode(dentry);
2439         dest = BTRFS_I(inode)->root;
2440         if (!capable(CAP_SYS_ADMIN)) {
2441                 /*
2442                  * Regular user.  Only allow this with a special mount
2443                  * option, when the user has write+exec access to the
2444                  * subvol root, and when rmdir(2) would have been
2445                  * allowed.
2446                  *
2447                  * Note that this is _not_ check that the subvol is
2448                  * empty or doesn't contain data that we wouldn't
2449                  * otherwise be able to delete.
2450                  *
2451                  * Users who want to delete empty subvols should try
2452                  * rmdir(2).
2453                  */
2454                 ret = -EPERM;
2455                 if (!btrfs_test_opt(fs_info, USER_SUBVOL_RM_ALLOWED))
2456                         goto out_dput;
2457
2458                 /*
2459                  * Do not allow deletion if the parent dir is the same
2460                  * as the dir to be deleted.  That means the ioctl
2461                  * must be called on the dentry referencing the root
2462                  * of the subvol, not a random directory contained
2463                  * within it.
2464                  */
2465                 ret = -EINVAL;
2466                 if (root == dest)
2467                         goto out_dput;
2468
2469                 ret = inode_permission(idmap, inode, MAY_WRITE | MAY_EXEC);
2470                 if (ret)
2471                         goto out_dput;
2472         }
2473
2474         /* check if subvolume may be deleted by a user */
2475         ret = btrfs_may_delete(idmap, dir, dentry, 1);
2476         if (ret)
2477                 goto out_dput;
2478
2479         if (btrfs_ino(BTRFS_I(inode)) != BTRFS_FIRST_FREE_OBJECTID) {
2480                 ret = -EINVAL;
2481                 goto out_dput;
2482         }
2483
2484         btrfs_inode_lock(BTRFS_I(inode), 0);
2485         ret = btrfs_delete_subvolume(BTRFS_I(dir), dentry);
2486         btrfs_inode_unlock(BTRFS_I(inode), 0);
2487         if (!ret)
2488                 d_delete_notify(dir, dentry);
2489
2490 out_dput:
2491         dput(dentry);
2492 out_unlock_dir:
2493         btrfs_inode_unlock(BTRFS_I(dir), 0);
2494 free_subvol_name:
2495         kfree(subvol_name_ptr);
2496 free_parent:
2497         if (destroy_parent)
2498                 dput(parent);
2499 out_drop_write:
2500         mnt_drop_write_file(file);
2501 out:
2502         kfree(vol_args2);
2503         kfree(vol_args);
2504         return ret;
2505 }
2506
2507 static int btrfs_ioctl_defrag(struct file *file, void __user *argp)
2508 {
2509         struct inode *inode = file_inode(file);
2510         struct btrfs_root *root = BTRFS_I(inode)->root;
2511         struct btrfs_ioctl_defrag_range_args range = {0};
2512         int ret;
2513
2514         ret = mnt_want_write_file(file);
2515         if (ret)
2516                 return ret;
2517
2518         if (btrfs_root_readonly(root)) {
2519                 ret = -EROFS;
2520                 goto out;
2521         }
2522
2523         switch (inode->i_mode & S_IFMT) {
2524         case S_IFDIR:
2525                 if (!capable(CAP_SYS_ADMIN)) {
2526                         ret = -EPERM;
2527                         goto out;
2528                 }
2529                 ret = btrfs_defrag_root(root);
2530                 break;
2531         case S_IFREG:
2532                 /*
2533                  * Note that this does not check the file descriptor for write
2534                  * access. This prevents defragmenting executables that are
2535                  * running and allows defrag on files open in read-only mode.
2536                  */
2537                 if (!capable(CAP_SYS_ADMIN) &&
2538                     inode_permission(&nop_mnt_idmap, inode, MAY_WRITE)) {
2539                         ret = -EPERM;
2540                         goto out;
2541                 }
2542
2543                 /*
2544                  * Don't allow defrag on pre-content watched files, as it could
2545                  * populate the page cache with 0's via readahead.
2546                  */
2547                 if (unlikely(FMODE_FSNOTIFY_HSM(file->f_mode))) {
2548                         ret = -EINVAL;
2549                         goto out;
2550                 }
2551
2552                 if (argp) {
2553                         if (copy_from_user(&range, argp, sizeof(range))) {
2554                                 ret = -EFAULT;
2555                                 goto out;
2556                         }
2557                         if (range.flags & ~BTRFS_DEFRAG_RANGE_FLAGS_SUPP) {
2558                                 ret = -EOPNOTSUPP;
2559                                 goto out;
2560                         }
2561                         /* compression requires us to start the IO */
2562                         if ((range.flags & BTRFS_DEFRAG_RANGE_COMPRESS)) {
2563                                 range.flags |= BTRFS_DEFRAG_RANGE_START_IO;
2564                                 range.extent_thresh = (u32)-1;
2565                         }
2566                 } else {
2567                         /* the rest are all set to zero by kzalloc */
2568                         range.len = (u64)-1;
2569                 }
2570                 ret = btrfs_defrag_file(BTRFS_I(file_inode(file)), &file->f_ra,
2571                                         &range, BTRFS_OLDEST_GENERATION, 0);
2572                 if (ret > 0)
2573                         ret = 0;
2574                 break;
2575         default:
2576                 ret = -EINVAL;
2577         }
2578 out:
2579         mnt_drop_write_file(file);
2580         return ret;
2581 }
2582
2583 static long btrfs_ioctl_add_dev(struct btrfs_fs_info *fs_info, void __user *arg)
2584 {
2585         struct btrfs_ioctl_vol_args *vol_args;
2586         bool restore_op = false;
2587         int ret;
2588
2589         if (!capable(CAP_SYS_ADMIN))
2590                 return -EPERM;
2591
2592         if (btrfs_fs_incompat(fs_info, EXTENT_TREE_V2)) {
2593                 btrfs_err(fs_info, "device add not supported on extent tree v2 yet");
2594                 return -EINVAL;
2595         }
2596
2597         if (fs_info->fs_devices->temp_fsid) {
2598                 btrfs_err(fs_info,
2599                           "device add not supported on cloned temp-fsid mount");
2600                 return -EINVAL;
2601         }
2602
2603         if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_DEV_ADD)) {
2604                 if (!btrfs_exclop_start_try_lock(fs_info, BTRFS_EXCLOP_DEV_ADD))
2605                         return BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS;
2606
2607                 /*
2608                  * We can do the device add because we have a paused balanced,
2609                  * change the exclusive op type and remember we should bring
2610                  * back the paused balance
2611                  */
2612                 fs_info->exclusive_operation = BTRFS_EXCLOP_DEV_ADD;
2613                 btrfs_exclop_start_unlock(fs_info);
2614                 restore_op = true;
2615         }
2616
2617         vol_args = memdup_user(arg, sizeof(*vol_args));
2618         if (IS_ERR(vol_args)) {
2619                 ret = PTR_ERR(vol_args);
2620                 goto out;
2621         }
2622
2623         ret = btrfs_check_ioctl_vol_args_path(vol_args);
2624         if (ret < 0)
2625                 goto out_free;
2626
2627         ret = btrfs_init_new_device(fs_info, vol_args->name);
2628
2629         if (!ret)
2630                 btrfs_info(fs_info, "disk added %s", vol_args->name);
2631
2632 out_free:
2633         kfree(vol_args);
2634 out:
2635         if (restore_op)
2636                 btrfs_exclop_balance(fs_info, BTRFS_EXCLOP_BALANCE_PAUSED);
2637         else
2638                 btrfs_exclop_finish(fs_info);
2639         return ret;
2640 }
2641
2642 static long btrfs_ioctl_rm_dev_v2(struct file *file, void __user *arg)
2643 {
2644         BTRFS_DEV_LOOKUP_ARGS(args);
2645         struct inode *inode = file_inode(file);
2646         struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
2647         struct btrfs_ioctl_vol_args_v2 *vol_args;
2648         struct file *bdev_file = NULL;
2649         int ret;
2650         bool cancel = false;
2651
2652         if (!capable(CAP_SYS_ADMIN))
2653                 return -EPERM;
2654
2655         vol_args = memdup_user(arg, sizeof(*vol_args));
2656         if (IS_ERR(vol_args))
2657                 return PTR_ERR(vol_args);
2658
2659         if (vol_args->flags & ~BTRFS_DEVICE_REMOVE_ARGS_MASK) {
2660                 ret = -EOPNOTSUPP;
2661                 goto out;
2662         }
2663
2664         ret = btrfs_check_ioctl_vol_args2_subvol_name(vol_args);
2665         if (ret < 0)
2666                 goto out;
2667
2668         if (vol_args->flags & BTRFS_DEVICE_SPEC_BY_ID) {
2669                 args.devid = vol_args->devid;
2670         } else if (!strcmp("cancel", vol_args->name)) {
2671                 cancel = true;
2672         } else {
2673                 ret = btrfs_get_dev_args_from_path(fs_info, &args, vol_args->name);
2674                 if (ret)
2675                         goto out;
2676         }
2677
2678         ret = mnt_want_write_file(file);
2679         if (ret)
2680                 goto out;
2681
2682         ret = exclop_start_or_cancel_reloc(fs_info, BTRFS_EXCLOP_DEV_REMOVE,
2683                                            cancel);
2684         if (ret)
2685                 goto err_drop;
2686
2687         /* Exclusive operation is now claimed */
2688         ret = btrfs_rm_device(fs_info, &args, &bdev_file);
2689
2690         btrfs_exclop_finish(fs_info);
2691
2692         if (!ret) {
2693                 if (vol_args->flags & BTRFS_DEVICE_SPEC_BY_ID)
2694                         btrfs_info(fs_info, "device deleted: id %llu",
2695                                         vol_args->devid);
2696                 else
2697                         btrfs_info(fs_info, "device deleted: %s",
2698                                         vol_args->name);
2699         }
2700 err_drop:
2701         mnt_drop_write_file(file);
2702         if (bdev_file)
2703                 fput(bdev_file);
2704 out:
2705         btrfs_put_dev_args_from_path(&args);
2706         kfree(vol_args);
2707         return ret;
2708 }
2709
2710 static long btrfs_ioctl_rm_dev(struct file *file, void __user *arg)
2711 {
2712         BTRFS_DEV_LOOKUP_ARGS(args);
2713         struct inode *inode = file_inode(file);
2714         struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
2715         struct btrfs_ioctl_vol_args *vol_args;
2716         struct file *bdev_file = NULL;
2717         int ret;
2718         bool cancel = false;
2719
2720         if (!capable(CAP_SYS_ADMIN))
2721                 return -EPERM;
2722
2723         vol_args = memdup_user(arg, sizeof(*vol_args));
2724         if (IS_ERR(vol_args))
2725                 return PTR_ERR(vol_args);
2726
2727         ret = btrfs_check_ioctl_vol_args_path(vol_args);
2728         if (ret < 0)
2729                 goto out_free;
2730
2731         if (!strcmp("cancel", vol_args->name)) {
2732                 cancel = true;
2733         } else {
2734                 ret = btrfs_get_dev_args_from_path(fs_info, &args, vol_args->name);
2735                 if (ret)
2736                         goto out;
2737         }
2738
2739         ret = mnt_want_write_file(file);
2740         if (ret)
2741                 goto out;
2742
2743         ret = exclop_start_or_cancel_reloc(fs_info, BTRFS_EXCLOP_DEV_REMOVE,
2744                                            cancel);
2745         if (ret == 0) {
2746                 ret = btrfs_rm_device(fs_info, &args, &bdev_file);
2747                 if (!ret)
2748                         btrfs_info(fs_info, "disk deleted %s", vol_args->name);
2749                 btrfs_exclop_finish(fs_info);
2750         }
2751
2752         mnt_drop_write_file(file);
2753         if (bdev_file)
2754                 fput(bdev_file);
2755 out:
2756         btrfs_put_dev_args_from_path(&args);
2757 out_free:
2758         kfree(vol_args);
2759         return ret;
2760 }
2761
2762 static long btrfs_ioctl_fs_info(const struct btrfs_fs_info *fs_info,
2763                                 void __user *arg)
2764 {
2765         struct btrfs_ioctl_fs_info_args *fi_args;
2766         struct btrfs_device *device;
2767         struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
2768         u64 flags_in;
2769         int ret = 0;
2770
2771         fi_args = memdup_user(arg, sizeof(*fi_args));
2772         if (IS_ERR(fi_args))
2773                 return PTR_ERR(fi_args);
2774
2775         flags_in = fi_args->flags;
2776         memset(fi_args, 0, sizeof(*fi_args));
2777
2778         rcu_read_lock();
2779         fi_args->num_devices = fs_devices->num_devices;
2780
2781         list_for_each_entry_rcu(device, &fs_devices->devices, dev_list) {
2782                 if (device->devid > fi_args->max_id)
2783                         fi_args->max_id = device->devid;
2784         }
2785         rcu_read_unlock();
2786
2787         memcpy(&fi_args->fsid, fs_devices->fsid, sizeof(fi_args->fsid));
2788         fi_args->nodesize = fs_info->nodesize;
2789         fi_args->sectorsize = fs_info->sectorsize;
2790         fi_args->clone_alignment = fs_info->sectorsize;
2791
2792         if (flags_in & BTRFS_FS_INFO_FLAG_CSUM_INFO) {
2793                 fi_args->csum_type = btrfs_super_csum_type(fs_info->super_copy);
2794                 fi_args->csum_size = btrfs_super_csum_size(fs_info->super_copy);
2795                 fi_args->flags |= BTRFS_FS_INFO_FLAG_CSUM_INFO;
2796         }
2797
2798         if (flags_in & BTRFS_FS_INFO_FLAG_GENERATION) {
2799                 fi_args->generation = btrfs_get_fs_generation(fs_info);
2800                 fi_args->flags |= BTRFS_FS_INFO_FLAG_GENERATION;
2801         }
2802
2803         if (flags_in & BTRFS_FS_INFO_FLAG_METADATA_UUID) {
2804                 memcpy(&fi_args->metadata_uuid, fs_devices->metadata_uuid,
2805                        sizeof(fi_args->metadata_uuid));
2806                 fi_args->flags |= BTRFS_FS_INFO_FLAG_METADATA_UUID;
2807         }
2808
2809         if (copy_to_user(arg, fi_args, sizeof(*fi_args)))
2810                 ret = -EFAULT;
2811
2812         kfree(fi_args);
2813         return ret;
2814 }
2815
2816 static long btrfs_ioctl_dev_info(const struct btrfs_fs_info *fs_info,
2817                                  void __user *arg)
2818 {
2819         BTRFS_DEV_LOOKUP_ARGS(args);
2820         struct btrfs_ioctl_dev_info_args *di_args;
2821         struct btrfs_device *dev;
2822         int ret = 0;
2823
2824         di_args = memdup_user(arg, sizeof(*di_args));
2825         if (IS_ERR(di_args))
2826                 return PTR_ERR(di_args);
2827
2828         args.devid = di_args->devid;
2829         if (!btrfs_is_empty_uuid(di_args->uuid))
2830                 args.uuid = di_args->uuid;
2831
2832         rcu_read_lock();
2833         dev = btrfs_find_device(fs_info->fs_devices, &args);
2834         if (!dev) {
2835                 ret = -ENODEV;
2836                 goto out;
2837         }
2838
2839         di_args->devid = dev->devid;
2840         di_args->bytes_used = btrfs_device_get_bytes_used(dev);
2841         di_args->total_bytes = btrfs_device_get_total_bytes(dev);
2842         memcpy(di_args->uuid, dev->uuid, sizeof(di_args->uuid));
2843         memcpy(di_args->fsid, dev->fs_devices->fsid, BTRFS_UUID_SIZE);
2844         if (dev->name)
2845                 strscpy(di_args->path, btrfs_dev_name(dev), sizeof(di_args->path));
2846         else
2847                 di_args->path[0] = '\0';
2848
2849 out:
2850         rcu_read_unlock();
2851         if (ret == 0 && copy_to_user(arg, di_args, sizeof(*di_args)))
2852                 ret = -EFAULT;
2853
2854         kfree(di_args);
2855         return ret;
2856 }
2857
2858 static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp)
2859 {
2860         struct inode *inode = file_inode(file);
2861         struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
2862         struct btrfs_root *root = BTRFS_I(inode)->root;
2863         struct btrfs_root *new_root;
2864         struct btrfs_dir_item *di;
2865         struct btrfs_trans_handle *trans;
2866         struct btrfs_path *path = NULL;
2867         struct btrfs_disk_key disk_key;
2868         struct fscrypt_str name = FSTR_INIT("default", 7);
2869         u64 objectid = 0;
2870         u64 dir_id;
2871         int ret;
2872
2873         if (!capable(CAP_SYS_ADMIN))
2874                 return -EPERM;
2875
2876         ret = mnt_want_write_file(file);
2877         if (ret)
2878                 return ret;
2879
2880         if (copy_from_user(&objectid, argp, sizeof(objectid))) {
2881                 ret = -EFAULT;
2882                 goto out;
2883         }
2884
2885         if (!objectid)
2886                 objectid = BTRFS_FS_TREE_OBJECTID;
2887
2888         new_root = btrfs_get_fs_root(fs_info, objectid, true);
2889         if (IS_ERR(new_root)) {
2890                 ret = PTR_ERR(new_root);
2891                 goto out;
2892         }
2893         if (!is_fstree(btrfs_root_id(new_root))) {
2894                 ret = -ENOENT;
2895                 goto out_free;
2896         }
2897
2898         path = btrfs_alloc_path();
2899         if (!path) {
2900                 ret = -ENOMEM;
2901                 goto out_free;
2902         }
2903
2904         trans = btrfs_start_transaction(root, 1);
2905         if (IS_ERR(trans)) {
2906                 ret = PTR_ERR(trans);
2907                 goto out_free;
2908         }
2909
2910         dir_id = btrfs_super_root_dir(fs_info->super_copy);
2911         di = btrfs_lookup_dir_item(trans, fs_info->tree_root, path,
2912                                    dir_id, &name, 1);
2913         if (IS_ERR_OR_NULL(di)) {
2914                 btrfs_release_path(path);
2915                 btrfs_end_transaction(trans);
2916                 btrfs_err(fs_info,
2917                           "Umm, you don't have the default diritem, this isn't going to work");
2918                 ret = -ENOENT;
2919                 goto out_free;
2920         }
2921
2922         btrfs_cpu_key_to_disk(&disk_key, &new_root->root_key);
2923         btrfs_set_dir_item_key(path->nodes[0], di, &disk_key);
2924         btrfs_release_path(path);
2925
2926         btrfs_set_fs_incompat(fs_info, DEFAULT_SUBVOL);
2927         btrfs_end_transaction(trans);
2928 out_free:
2929         btrfs_put_root(new_root);
2930         btrfs_free_path(path);
2931 out:
2932         mnt_drop_write_file(file);
2933         return ret;
2934 }
2935
2936 static void get_block_group_info(struct list_head *groups_list,
2937                                  struct btrfs_ioctl_space_info *space)
2938 {
2939         struct btrfs_block_group *block_group;
2940
2941         space->total_bytes = 0;
2942         space->used_bytes = 0;
2943         space->flags = 0;
2944         list_for_each_entry(block_group, groups_list, list) {
2945                 space->flags = block_group->flags;
2946                 space->total_bytes += block_group->length;
2947                 space->used_bytes += block_group->used;
2948         }
2949 }
2950
2951 static long btrfs_ioctl_space_info(struct btrfs_fs_info *fs_info,
2952                                    void __user *arg)
2953 {
2954         struct btrfs_ioctl_space_args space_args = { 0 };
2955         struct btrfs_ioctl_space_info space;
2956         struct btrfs_ioctl_space_info *dest;
2957         struct btrfs_ioctl_space_info *dest_orig;
2958         struct btrfs_ioctl_space_info __user *user_dest;
2959         struct btrfs_space_info *info;
2960         static const u64 types[] = {
2961                 BTRFS_BLOCK_GROUP_DATA,
2962                 BTRFS_BLOCK_GROUP_SYSTEM,
2963                 BTRFS_BLOCK_GROUP_METADATA,
2964                 BTRFS_BLOCK_GROUP_DATA | BTRFS_BLOCK_GROUP_METADATA
2965         };
2966         int num_types = 4;
2967         int alloc_size;
2968         int ret = 0;
2969         u64 slot_count = 0;
2970         int i, c;
2971
2972         if (copy_from_user(&space_args,
2973                            (struct btrfs_ioctl_space_args __user *)arg,
2974                            sizeof(space_args)))
2975                 return -EFAULT;
2976
2977         for (i = 0; i < num_types; i++) {
2978                 struct btrfs_space_info *tmp;
2979
2980                 info = NULL;
2981                 list_for_each_entry(tmp, &fs_info->space_info, list) {
2982                         if (tmp->flags == types[i]) {
2983                                 info = tmp;
2984                                 break;
2985                         }
2986                 }
2987
2988                 if (!info)
2989                         continue;
2990
2991                 down_read(&info->groups_sem);
2992                 for (c = 0; c < BTRFS_NR_RAID_TYPES; c++) {
2993                         if (!list_empty(&info->block_groups[c]))
2994                                 slot_count++;
2995                 }
2996                 up_read(&info->groups_sem);
2997         }
2998
2999         /*
3000          * Global block reserve, exported as a space_info
3001          */
3002         slot_count++;
3003
3004         /* space_slots == 0 means they are asking for a count */
3005         if (space_args.space_slots == 0) {
3006                 space_args.total_spaces = slot_count;
3007                 goto out;
3008         }
3009
3010         slot_count = min_t(u64, space_args.space_slots, slot_count);
3011
3012         alloc_size = sizeof(*dest) * slot_count;
3013
3014         /* we generally have at most 6 or so space infos, one for each raid
3015          * level.  So, a whole page should be more than enough for everyone
3016          */
3017         if (alloc_size > PAGE_SIZE)
3018                 return -ENOMEM;
3019
3020         space_args.total_spaces = 0;
3021         dest = kmalloc(alloc_size, GFP_KERNEL);
3022         if (!dest)
3023                 return -ENOMEM;
3024         dest_orig = dest;
3025
3026         /* now we have a buffer to copy into */
3027         for (i = 0; i < num_types; i++) {
3028                 struct btrfs_space_info *tmp;
3029
3030                 if (!slot_count)
3031                         break;
3032
3033                 info = NULL;
3034                 list_for_each_entry(tmp, &fs_info->space_info, list) {
3035                         if (tmp->flags == types[i]) {
3036                                 info = tmp;
3037                                 break;
3038                         }
3039                 }
3040
3041                 if (!info)
3042                         continue;
3043                 down_read(&info->groups_sem);
3044                 for (c = 0; c < BTRFS_NR_RAID_TYPES; c++) {
3045                         if (!list_empty(&info->block_groups[c])) {
3046                                 get_block_group_info(&info->block_groups[c],
3047                                                      &space);
3048                                 memcpy(dest, &space, sizeof(space));
3049                                 dest++;
3050                                 space_args.total_spaces++;
3051                                 slot_count--;
3052                         }
3053                         if (!slot_count)
3054                                 break;
3055                 }
3056                 up_read(&info->groups_sem);
3057         }
3058
3059         /*
3060          * Add global block reserve
3061          */
3062         if (slot_count) {
3063                 struct btrfs_block_rsv *block_rsv = &fs_info->global_block_rsv;
3064
3065                 spin_lock(&block_rsv->lock);
3066                 space.total_bytes = block_rsv->size;
3067                 space.used_bytes = block_rsv->size - block_rsv->reserved;
3068                 spin_unlock(&block_rsv->lock);
3069                 space.flags = BTRFS_SPACE_INFO_GLOBAL_RSV;
3070                 memcpy(dest, &space, sizeof(space));
3071                 space_args.total_spaces++;
3072         }
3073
3074         user_dest = (struct btrfs_ioctl_space_info __user *)
3075                 (arg + sizeof(struct btrfs_ioctl_space_args));
3076
3077         if (copy_to_user(user_dest, dest_orig, alloc_size))
3078                 ret = -EFAULT;
3079
3080         kfree(dest_orig);
3081 out:
3082         if (ret == 0 && copy_to_user(arg, &space_args, sizeof(space_args)))
3083                 ret = -EFAULT;
3084
3085         return ret;
3086 }
3087
3088 static noinline long btrfs_ioctl_start_sync(struct btrfs_root *root,
3089                                             void __user *argp)
3090 {
3091         struct btrfs_trans_handle *trans;
3092         u64 transid;
3093
3094         /*
3095          * Start orphan cleanup here for the given root in case it hasn't been
3096          * started already by other means. Errors are handled in the other
3097          * functions during transaction commit.
3098          */
3099         btrfs_orphan_cleanup(root);
3100
3101         trans = btrfs_attach_transaction_barrier(root);
3102         if (IS_ERR(trans)) {
3103                 if (PTR_ERR(trans) != -ENOENT)
3104                         return PTR_ERR(trans);
3105
3106                 /* No running transaction, don't bother */
3107                 transid = btrfs_get_last_trans_committed(root->fs_info);
3108                 goto out;
3109         }
3110         transid = trans->transid;
3111         btrfs_commit_transaction_async(trans);
3112 out:
3113         if (argp)
3114                 if (copy_to_user(argp, &transid, sizeof(transid)))
3115                         return -EFAULT;
3116         return 0;
3117 }
3118
3119 static noinline long btrfs_ioctl_wait_sync(struct btrfs_fs_info *fs_info,
3120                                            void __user *argp)
3121 {
3122         /* By default wait for the current transaction. */
3123         u64 transid = 0;
3124
3125         if (argp)
3126                 if (copy_from_user(&transid, argp, sizeof(transid)))
3127                         return -EFAULT;
3128
3129         return btrfs_wait_for_commit(fs_info, transid);
3130 }
3131
3132 static long btrfs_ioctl_scrub(struct file *file, void __user *arg)
3133 {
3134         struct btrfs_fs_info *fs_info = inode_to_fs_info(file_inode(file));
3135         struct btrfs_ioctl_scrub_args *sa;
3136         int ret;
3137
3138         if (!capable(CAP_SYS_ADMIN))
3139                 return -EPERM;
3140
3141         if (btrfs_fs_incompat(fs_info, EXTENT_TREE_V2)) {
3142                 btrfs_err(fs_info, "scrub: extent tree v2 not yet supported");
3143                 return -EINVAL;
3144         }
3145
3146         sa = memdup_user(arg, sizeof(*sa));
3147         if (IS_ERR(sa))
3148                 return PTR_ERR(sa);
3149
3150         if (sa->flags & ~BTRFS_SCRUB_SUPPORTED_FLAGS) {
3151                 ret = -EOPNOTSUPP;
3152                 goto out;
3153         }
3154
3155         if (!(sa->flags & BTRFS_SCRUB_READONLY)) {
3156                 ret = mnt_want_write_file(file);
3157                 if (ret)
3158                         goto out;
3159         }
3160
3161         ret = btrfs_scrub_dev(fs_info, sa->devid, sa->start, sa->end,
3162                               &sa->progress, sa->flags & BTRFS_SCRUB_READONLY,
3163                               0);
3164
3165         /*
3166          * Copy scrub args to user space even if btrfs_scrub_dev() returned an
3167          * error. This is important as it allows user space to know how much
3168          * progress scrub has done. For example, if scrub is canceled we get
3169          * -ECANCELED from btrfs_scrub_dev() and return that error back to user
3170          * space. Later user space can inspect the progress from the structure
3171          * btrfs_ioctl_scrub_args and resume scrub from where it left off
3172          * previously (btrfs-progs does this).
3173          * If we fail to copy the btrfs_ioctl_scrub_args structure to user space
3174          * then return -EFAULT to signal the structure was not copied or it may
3175          * be corrupt and unreliable due to a partial copy.
3176          */
3177         if (copy_to_user(arg, sa, sizeof(*sa)))
3178                 ret = -EFAULT;
3179
3180         if (!(sa->flags & BTRFS_SCRUB_READONLY))
3181                 mnt_drop_write_file(file);
3182 out:
3183         kfree(sa);
3184         return ret;
3185 }
3186
3187 static long btrfs_ioctl_scrub_cancel(struct btrfs_fs_info *fs_info)
3188 {
3189         if (!capable(CAP_SYS_ADMIN))
3190                 return -EPERM;
3191
3192         return btrfs_scrub_cancel(fs_info);
3193 }
3194
3195 static long btrfs_ioctl_scrub_progress(struct btrfs_fs_info *fs_info,
3196                                        void __user *arg)
3197 {
3198         struct btrfs_ioctl_scrub_args *sa;
3199         int ret;
3200
3201         if (!capable(CAP_SYS_ADMIN))
3202                 return -EPERM;
3203
3204         sa = memdup_user(arg, sizeof(*sa));
3205         if (IS_ERR(sa))
3206                 return PTR_ERR(sa);
3207
3208         ret = btrfs_scrub_progress(fs_info, sa->devid, &sa->progress);
3209
3210         if (ret == 0 && copy_to_user(arg, sa, sizeof(*sa)))
3211                 ret = -EFAULT;
3212
3213         kfree(sa);
3214         return ret;
3215 }
3216
3217 static long btrfs_ioctl_get_dev_stats(struct btrfs_fs_info *fs_info,
3218                                       void __user *arg)
3219 {
3220         struct btrfs_ioctl_get_dev_stats *sa;
3221         int ret;
3222
3223         sa = memdup_user(arg, sizeof(*sa));
3224         if (IS_ERR(sa))
3225                 return PTR_ERR(sa);
3226
3227         if ((sa->flags & BTRFS_DEV_STATS_RESET) && !capable(CAP_SYS_ADMIN)) {
3228                 kfree(sa);
3229                 return -EPERM;
3230         }
3231
3232         ret = btrfs_get_dev_stats(fs_info, sa);
3233
3234         if (ret == 0 && copy_to_user(arg, sa, sizeof(*sa)))
3235                 ret = -EFAULT;
3236
3237         kfree(sa);
3238         return ret;
3239 }
3240
3241 static long btrfs_ioctl_dev_replace(struct btrfs_fs_info *fs_info,
3242                                     void __user *arg)
3243 {
3244         struct btrfs_ioctl_dev_replace_args *p;
3245         int ret;
3246
3247         if (!capable(CAP_SYS_ADMIN))
3248                 return -EPERM;
3249
3250         if (btrfs_fs_incompat(fs_info, EXTENT_TREE_V2)) {
3251                 btrfs_err(fs_info, "device replace not supported on extent tree v2 yet");
3252                 return -EINVAL;
3253         }
3254
3255         p = memdup_user(arg, sizeof(*p));
3256         if (IS_ERR(p))
3257                 return PTR_ERR(p);
3258
3259         switch (p->cmd) {
3260         case BTRFS_IOCTL_DEV_REPLACE_CMD_START:
3261                 if (sb_rdonly(fs_info->sb)) {
3262                         ret = -EROFS;
3263                         goto out;
3264                 }
3265                 if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_DEV_REPLACE)) {
3266                         ret = BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS;
3267                 } else {
3268                         ret = btrfs_dev_replace_by_ioctl(fs_info, p);
3269                         btrfs_exclop_finish(fs_info);
3270                 }
3271                 break;
3272         case BTRFS_IOCTL_DEV_REPLACE_CMD_STATUS:
3273                 btrfs_dev_replace_status(fs_info, p);
3274                 ret = 0;
3275                 break;
3276         case BTRFS_IOCTL_DEV_REPLACE_CMD_CANCEL:
3277                 p->result = btrfs_dev_replace_cancel(fs_info);
3278                 ret = 0;
3279                 break;
3280         default:
3281                 ret = -EINVAL;
3282                 break;
3283         }
3284
3285         if ((ret == 0 || ret == -ECANCELED) && copy_to_user(arg, p, sizeof(*p)))
3286                 ret = -EFAULT;
3287 out:
3288         kfree(p);
3289         return ret;
3290 }
3291
3292 static long btrfs_ioctl_ino_to_path(struct btrfs_root *root, void __user *arg)
3293 {
3294         int ret = 0;
3295         int i;
3296         u64 rel_ptr;
3297         int size;
3298         struct btrfs_ioctl_ino_path_args *ipa = NULL;
3299         struct inode_fs_paths *ipath = NULL;
3300         struct btrfs_path *path;
3301
3302         if (!capable(CAP_DAC_READ_SEARCH))
3303                 return -EPERM;
3304
3305         path = btrfs_alloc_path();
3306         if (!path) {
3307                 ret = -ENOMEM;
3308                 goto out;
3309         }
3310
3311         ipa = memdup_user(arg, sizeof(*ipa));
3312         if (IS_ERR(ipa)) {
3313                 ret = PTR_ERR(ipa);
3314                 ipa = NULL;
3315                 goto out;
3316         }
3317
3318         size = min_t(u32, ipa->size, 4096);
3319         ipath = init_ipath(size, root, path);
3320         if (IS_ERR(ipath)) {
3321                 ret = PTR_ERR(ipath);
3322                 ipath = NULL;
3323                 goto out;
3324         }
3325
3326         ret = paths_from_inode(ipa->inum, ipath);
3327         if (ret < 0)
3328                 goto out;
3329
3330         for (i = 0; i < ipath->fspath->elem_cnt; ++i) {
3331                 rel_ptr = ipath->fspath->val[i] -
3332                           (u64)(unsigned long)ipath->fspath->val;
3333                 ipath->fspath->val[i] = rel_ptr;
3334         }
3335
3336         btrfs_free_path(path);
3337         path = NULL;
3338         ret = copy_to_user((void __user *)(unsigned long)ipa->fspath,
3339                            ipath->fspath, size);
3340         if (ret) {
3341                 ret = -EFAULT;
3342                 goto out;
3343         }
3344
3345 out:
3346         btrfs_free_path(path);
3347         free_ipath(ipath);
3348         kfree(ipa);
3349
3350         return ret;
3351 }
3352
3353 static long btrfs_ioctl_logical_to_ino(struct btrfs_fs_info *fs_info,
3354                                         void __user *arg, int version)
3355 {
3356         int ret = 0;
3357         int size;
3358         struct btrfs_ioctl_logical_ino_args *loi;
3359         struct btrfs_data_container *inodes = NULL;
3360         struct btrfs_path *path = NULL;
3361         bool ignore_offset;
3362
3363         if (!capable(CAP_SYS_ADMIN))
3364                 return -EPERM;
3365
3366         loi = memdup_user(arg, sizeof(*loi));
3367         if (IS_ERR(loi))
3368                 return PTR_ERR(loi);
3369
3370         if (version == 1) {
3371                 ignore_offset = false;
3372                 size = min_t(u32, loi->size, SZ_64K);
3373         } else {
3374                 /* All reserved bits must be 0 for now */
3375                 if (memchr_inv(loi->reserved, 0, sizeof(loi->reserved))) {
3376                         ret = -EINVAL;
3377                         goto out_loi;
3378                 }
3379                 /* Only accept flags we have defined so far */
3380                 if (loi->flags & ~(BTRFS_LOGICAL_INO_ARGS_IGNORE_OFFSET)) {
3381                         ret = -EINVAL;
3382                         goto out_loi;
3383                 }
3384                 ignore_offset = loi->flags & BTRFS_LOGICAL_INO_ARGS_IGNORE_OFFSET;
3385                 size = min_t(u32, loi->size, SZ_16M);
3386         }
3387
3388         inodes = init_data_container(size);
3389         if (IS_ERR(inodes)) {
3390                 ret = PTR_ERR(inodes);
3391                 goto out_loi;
3392         }
3393
3394         path = btrfs_alloc_path();
3395         if (!path) {
3396                 ret = -ENOMEM;
3397                 goto out;
3398         }
3399         ret = iterate_inodes_from_logical(loi->logical, fs_info, path,
3400                                           inodes, ignore_offset);
3401         btrfs_free_path(path);
3402         if (ret == -EINVAL)
3403                 ret = -ENOENT;
3404         if (ret < 0)
3405                 goto out;
3406
3407         ret = copy_to_user((void __user *)(unsigned long)loi->inodes, inodes,
3408                            size);
3409         if (ret)
3410                 ret = -EFAULT;
3411
3412 out:
3413         kvfree(inodes);
3414 out_loi:
3415         kfree(loi);
3416
3417         return ret;
3418 }
3419
3420 void btrfs_update_ioctl_balance_args(struct btrfs_fs_info *fs_info,
3421                                struct btrfs_ioctl_balance_args *bargs)
3422 {
3423         struct btrfs_balance_control *bctl = fs_info->balance_ctl;
3424
3425         bargs->flags = bctl->flags;
3426
3427         if (test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags))
3428                 bargs->state |= BTRFS_BALANCE_STATE_RUNNING;
3429         if (atomic_read(&fs_info->balance_pause_req))
3430                 bargs->state |= BTRFS_BALANCE_STATE_PAUSE_REQ;
3431         if (atomic_read(&fs_info->balance_cancel_req))
3432                 bargs->state |= BTRFS_BALANCE_STATE_CANCEL_REQ;
3433
3434         memcpy(&bargs->data, &bctl->data, sizeof(bargs->data));
3435         memcpy(&bargs->meta, &bctl->meta, sizeof(bargs->meta));
3436         memcpy(&bargs->sys, &bctl->sys, sizeof(bargs->sys));
3437
3438         spin_lock(&fs_info->balance_lock);
3439         memcpy(&bargs->stat, &bctl->stat, sizeof(bargs->stat));
3440         spin_unlock(&fs_info->balance_lock);
3441 }
3442
3443 /*
3444  * Try to acquire fs_info::balance_mutex as well as set BTRFS_EXLCOP_BALANCE as
3445  * required.
3446  *
3447  * @fs_info:       the filesystem
3448  * @excl_acquired: ptr to boolean value which is set to false in case balance
3449  *                 is being resumed
3450  *
3451  * Return 0 on success in which case both fs_info::balance is acquired as well
3452  * as exclusive ops are blocked. In case of failure return an error code.
3453  */
3454 static int btrfs_try_lock_balance(struct btrfs_fs_info *fs_info, bool *excl_acquired)
3455 {
3456         int ret;
3457
3458         /*
3459          * Exclusive operation is locked. Three possibilities:
3460          *   (1) some other op is running
3461          *   (2) balance is running
3462          *   (3) balance is paused -- special case (think resume)
3463          */
3464         while (1) {
3465                 if (btrfs_exclop_start(fs_info, BTRFS_EXCLOP_BALANCE)) {
3466                         *excl_acquired = true;
3467                         mutex_lock(&fs_info->balance_mutex);
3468                         return 0;
3469                 }
3470
3471                 mutex_lock(&fs_info->balance_mutex);
3472                 if (fs_info->balance_ctl) {
3473                         /* This is either (2) or (3) */
3474                         if (test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)) {
3475                                 /* This is (2) */
3476                                 ret = -EINPROGRESS;
3477                                 goto out_failure;
3478
3479                         } else {
3480                                 mutex_unlock(&fs_info->balance_mutex);
3481                                 /*
3482                                  * Lock released to allow other waiters to
3483                                  * continue, we'll reexamine the status again.
3484                                  */
3485                                 mutex_lock(&fs_info->balance_mutex);
3486
3487                                 if (fs_info->balance_ctl &&
3488                                     !test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)) {
3489                                         /* This is (3) */
3490                                         *excl_acquired = false;
3491                                         return 0;
3492                                 }
3493                         }
3494                 } else {
3495                         /* This is (1) */
3496                         ret = BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS;
3497                         goto out_failure;
3498                 }
3499
3500                 mutex_unlock(&fs_info->balance_mutex);
3501         }
3502
3503 out_failure:
3504         mutex_unlock(&fs_info->balance_mutex);
3505         *excl_acquired = false;
3506         return ret;
3507 }
3508
3509 static long btrfs_ioctl_balance(struct file *file, void __user *arg)
3510 {
3511         struct btrfs_root *root = BTRFS_I(file_inode(file))->root;
3512         struct btrfs_fs_info *fs_info = root->fs_info;
3513         struct btrfs_ioctl_balance_args *bargs;
3514         struct btrfs_balance_control *bctl;
3515         bool need_unlock = true;
3516         int ret;
3517
3518         if (!capable(CAP_SYS_ADMIN))
3519                 return -EPERM;
3520
3521         ret = mnt_want_write_file(file);
3522         if (ret)
3523                 return ret;
3524
3525         bargs = memdup_user(arg, sizeof(*bargs));
3526         if (IS_ERR(bargs)) {
3527                 ret = PTR_ERR(bargs);
3528                 bargs = NULL;
3529                 goto out;
3530         }
3531
3532         ret = btrfs_try_lock_balance(fs_info, &need_unlock);
3533         if (ret)
3534                 goto out;
3535
3536         lockdep_assert_held(&fs_info->balance_mutex);
3537
3538         if (bargs->flags & BTRFS_BALANCE_RESUME) {
3539                 if (!fs_info->balance_ctl) {
3540                         ret = -ENOTCONN;
3541                         goto out_unlock;
3542                 }
3543
3544                 bctl = fs_info->balance_ctl;
3545                 spin_lock(&fs_info->balance_lock);
3546                 bctl->flags |= BTRFS_BALANCE_RESUME;
3547                 spin_unlock(&fs_info->balance_lock);
3548                 btrfs_exclop_balance(fs_info, BTRFS_EXCLOP_BALANCE);
3549
3550                 goto do_balance;
3551         }
3552
3553         if (bargs->flags & ~(BTRFS_BALANCE_ARGS_MASK | BTRFS_BALANCE_TYPE_MASK)) {
3554                 ret = -EINVAL;
3555                 goto out_unlock;
3556         }
3557
3558         if (fs_info->balance_ctl) {
3559                 ret = -EINPROGRESS;
3560                 goto out_unlock;
3561         }
3562
3563         bctl = kzalloc(sizeof(*bctl), GFP_KERNEL);
3564         if (!bctl) {
3565                 ret = -ENOMEM;
3566                 goto out_unlock;
3567         }
3568
3569         memcpy(&bctl->data, &bargs->data, sizeof(bctl->data));
3570         memcpy(&bctl->meta, &bargs->meta, sizeof(bctl->meta));
3571         memcpy(&bctl->sys, &bargs->sys, sizeof(bctl->sys));
3572
3573         bctl->flags = bargs->flags;
3574 do_balance:
3575         /*
3576          * Ownership of bctl and exclusive operation goes to btrfs_balance.
3577          * bctl is freed in reset_balance_state, or, if restriper was paused
3578          * all the way until unmount, in free_fs_info.  The flag should be
3579          * cleared after reset_balance_state.
3580          */
3581         need_unlock = false;
3582
3583         ret = btrfs_balance(fs_info, bctl, bargs);
3584         bctl = NULL;
3585
3586         if (ret == 0 || ret == -ECANCELED) {
3587                 if (copy_to_user(arg, bargs, sizeof(*bargs)))
3588                         ret = -EFAULT;
3589         }
3590
3591         kfree(bctl);
3592 out_unlock:
3593         mutex_unlock(&fs_info->balance_mutex);
3594         if (need_unlock)
3595                 btrfs_exclop_finish(fs_info);
3596 out:
3597         mnt_drop_write_file(file);
3598         kfree(bargs);
3599         return ret;
3600 }
3601
3602 static long btrfs_ioctl_balance_ctl(struct btrfs_fs_info *fs_info, int cmd)
3603 {
3604         if (!capable(CAP_SYS_ADMIN))
3605                 return -EPERM;
3606
3607         switch (cmd) {
3608         case BTRFS_BALANCE_CTL_PAUSE:
3609                 return btrfs_pause_balance(fs_info);
3610         case BTRFS_BALANCE_CTL_CANCEL:
3611                 return btrfs_cancel_balance(fs_info);
3612         }
3613
3614         return -EINVAL;
3615 }
3616
3617 static long btrfs_ioctl_balance_progress(struct btrfs_fs_info *fs_info,
3618                                          void __user *arg)
3619 {
3620         struct btrfs_ioctl_balance_args *bargs;
3621         int ret = 0;
3622
3623         if (!capable(CAP_SYS_ADMIN))
3624                 return -EPERM;
3625
3626         mutex_lock(&fs_info->balance_mutex);
3627         if (!fs_info->balance_ctl) {
3628                 ret = -ENOTCONN;
3629                 goto out;
3630         }
3631
3632         bargs = kzalloc(sizeof(*bargs), GFP_KERNEL);
3633         if (!bargs) {
3634                 ret = -ENOMEM;
3635                 goto out;
3636         }
3637
3638         btrfs_update_ioctl_balance_args(fs_info, bargs);
3639
3640         if (copy_to_user(arg, bargs, sizeof(*bargs)))
3641                 ret = -EFAULT;
3642
3643         kfree(bargs);
3644 out:
3645         mutex_unlock(&fs_info->balance_mutex);
3646         return ret;
3647 }
3648
3649 static long btrfs_ioctl_quota_ctl(struct file *file, void __user *arg)
3650 {
3651         struct inode *inode = file_inode(file);
3652         struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
3653         struct btrfs_ioctl_quota_ctl_args *sa;
3654         int ret;
3655
3656         if (!capable(CAP_SYS_ADMIN))
3657                 return -EPERM;
3658
3659         ret = mnt_want_write_file(file);
3660         if (ret)
3661                 return ret;
3662
3663         sa = memdup_user(arg, sizeof(*sa));
3664         if (IS_ERR(sa)) {
3665                 ret = PTR_ERR(sa);
3666                 goto drop_write;
3667         }
3668
3669         switch (sa->cmd) {
3670         case BTRFS_QUOTA_CTL_ENABLE:
3671         case BTRFS_QUOTA_CTL_ENABLE_SIMPLE_QUOTA:
3672                 down_write(&fs_info->subvol_sem);
3673                 ret = btrfs_quota_enable(fs_info, sa);
3674                 up_write(&fs_info->subvol_sem);
3675                 break;
3676         case BTRFS_QUOTA_CTL_DISABLE:
3677                 /*
3678                  * Lock the cleaner mutex to prevent races with concurrent
3679                  * relocation, because relocation may be building backrefs for
3680                  * blocks of the quota root while we are deleting the root. This
3681                  * is like dropping fs roots of deleted snapshots/subvolumes, we
3682                  * need the same protection.
3683                  *
3684                  * This also prevents races between concurrent tasks trying to
3685                  * disable quotas, because we will unlock and relock
3686                  * qgroup_ioctl_lock across BTRFS_FS_QUOTA_ENABLED changes.
3687                  *
3688                  * We take this here because we have the dependency of
3689                  *
3690                  * inode_lock -> subvol_sem
3691                  *
3692                  * because of rename.  With relocation we can prealloc extents,
3693                  * so that makes the dependency chain
3694                  *
3695                  * cleaner_mutex -> inode_lock -> subvol_sem
3696                  *
3697                  * so we must take the cleaner_mutex here before we take the
3698                  * subvol_sem.  The deadlock can't actually happen, but this
3699                  * quiets lockdep.
3700                  */
3701                 mutex_lock(&fs_info->cleaner_mutex);
3702                 down_write(&fs_info->subvol_sem);
3703                 ret = btrfs_quota_disable(fs_info);
3704                 up_write(&fs_info->subvol_sem);
3705                 mutex_unlock(&fs_info->cleaner_mutex);
3706                 break;
3707         default:
3708                 ret = -EINVAL;
3709                 break;
3710         }
3711
3712         kfree(sa);
3713 drop_write:
3714         mnt_drop_write_file(file);
3715         return ret;
3716 }
3717
3718 /*
3719  * Quick check for ioctl handlers if quotas are enabled. Proper locking must be
3720  * done before any operations.
3721  */
3722 static bool qgroup_enabled(struct btrfs_fs_info *fs_info)
3723 {
3724         bool ret = true;
3725
3726         mutex_lock(&fs_info->qgroup_ioctl_lock);
3727         if (!fs_info->quota_root)
3728                 ret = false;
3729         mutex_unlock(&fs_info->qgroup_ioctl_lock);
3730
3731         return ret;
3732 }
3733
3734 static long btrfs_ioctl_qgroup_assign(struct file *file, void __user *arg)
3735 {
3736         struct inode *inode = file_inode(file);
3737         struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
3738         struct btrfs_root *root = BTRFS_I(inode)->root;
3739         struct btrfs_ioctl_qgroup_assign_args *sa;
3740         struct btrfs_qgroup_list *prealloc = NULL;
3741         struct btrfs_trans_handle *trans;
3742         int ret;
3743         int err;
3744
3745         if (!capable(CAP_SYS_ADMIN))
3746                 return -EPERM;
3747
3748         if (!qgroup_enabled(root->fs_info))
3749                 return -ENOTCONN;
3750
3751         ret = mnt_want_write_file(file);
3752         if (ret)
3753                 return ret;
3754
3755         sa = memdup_user(arg, sizeof(*sa));
3756         if (IS_ERR(sa)) {
3757                 ret = PTR_ERR(sa);
3758                 goto drop_write;
3759         }
3760
3761         if (sa->assign) {
3762                 prealloc = kzalloc(sizeof(*prealloc), GFP_KERNEL);
3763                 if (!prealloc) {
3764                         ret = -ENOMEM;
3765                         goto drop_write;
3766                 }
3767         }
3768
3769         trans = btrfs_join_transaction(root);
3770         if (IS_ERR(trans)) {
3771                 ret = PTR_ERR(trans);
3772                 goto out;
3773         }
3774
3775         /*
3776          * Prealloc ownership is moved to the relation handler, there it's used
3777          * or freed on error.
3778          */
3779         if (sa->assign) {
3780                 ret = btrfs_add_qgroup_relation(trans, sa->src, sa->dst, prealloc);
3781                 prealloc = NULL;
3782         } else {
3783                 ret = btrfs_del_qgroup_relation(trans, sa->src, sa->dst);
3784         }
3785
3786         /* update qgroup status and info */
3787         mutex_lock(&fs_info->qgroup_ioctl_lock);
3788         err = btrfs_run_qgroups(trans);
3789         mutex_unlock(&fs_info->qgroup_ioctl_lock);
3790         if (err < 0)
3791                 btrfs_warn(fs_info,
3792                            "qgroup status update failed after %s relation, marked as inconsistent",
3793                            sa->assign ? "adding" : "deleting");
3794         err = btrfs_end_transaction(trans);
3795         if (err && !ret)
3796                 ret = err;
3797
3798 out:
3799         kfree(prealloc);
3800         kfree(sa);
3801 drop_write:
3802         mnt_drop_write_file(file);
3803         return ret;
3804 }
3805
3806 static long btrfs_ioctl_qgroup_create(struct file *file, void __user *arg)
3807 {
3808         struct inode *inode = file_inode(file);
3809         struct btrfs_root *root = BTRFS_I(inode)->root;
3810         struct btrfs_ioctl_qgroup_create_args *sa;
3811         struct btrfs_trans_handle *trans;
3812         int ret;
3813         int err;
3814
3815         if (!capable(CAP_SYS_ADMIN))
3816                 return -EPERM;
3817
3818         if (!qgroup_enabled(root->fs_info))
3819                 return -ENOTCONN;
3820
3821         ret = mnt_want_write_file(file);
3822         if (ret)
3823                 return ret;
3824
3825         sa = memdup_user(arg, sizeof(*sa));
3826         if (IS_ERR(sa)) {
3827                 ret = PTR_ERR(sa);
3828                 goto drop_write;
3829         }
3830
3831         if (!sa->qgroupid) {
3832                 ret = -EINVAL;
3833                 goto out;
3834         }
3835
3836         if (sa->create && is_fstree(sa->qgroupid)) {
3837                 ret = -EINVAL;
3838                 goto out;
3839         }
3840
3841         trans = btrfs_join_transaction(root);
3842         if (IS_ERR(trans)) {
3843                 ret = PTR_ERR(trans);
3844                 goto out;
3845         }
3846
3847         if (sa->create) {
3848                 ret = btrfs_create_qgroup(trans, sa->qgroupid);
3849         } else {
3850                 ret = btrfs_remove_qgroup(trans, sa->qgroupid);
3851         }
3852
3853         err = btrfs_end_transaction(trans);
3854         if (err && !ret)
3855                 ret = err;
3856
3857 out:
3858         kfree(sa);
3859 drop_write:
3860         mnt_drop_write_file(file);
3861         return ret;
3862 }
3863
3864 static long btrfs_ioctl_qgroup_limit(struct file *file, void __user *arg)
3865 {
3866         struct inode *inode = file_inode(file);
3867         struct btrfs_root *root = BTRFS_I(inode)->root;
3868         struct btrfs_ioctl_qgroup_limit_args *sa;
3869         struct btrfs_trans_handle *trans;
3870         int ret;
3871         int err;
3872         u64 qgroupid;
3873
3874         if (!capable(CAP_SYS_ADMIN))
3875                 return -EPERM;
3876
3877         if (!qgroup_enabled(root->fs_info))
3878                 return -ENOTCONN;
3879
3880         ret = mnt_want_write_file(file);
3881         if (ret)
3882                 return ret;
3883
3884         sa = memdup_user(arg, sizeof(*sa));
3885         if (IS_ERR(sa)) {
3886                 ret = PTR_ERR(sa);
3887                 goto drop_write;
3888         }
3889
3890         trans = btrfs_join_transaction(root);
3891         if (IS_ERR(trans)) {
3892                 ret = PTR_ERR(trans);
3893                 goto out;
3894         }
3895
3896         qgroupid = sa->qgroupid;
3897         if (!qgroupid) {
3898                 /* take the current subvol as qgroup */
3899                 qgroupid = btrfs_root_id(root);
3900         }
3901
3902         ret = btrfs_limit_qgroup(trans, qgroupid, &sa->lim);
3903
3904         err = btrfs_end_transaction(trans);
3905         if (err && !ret)
3906                 ret = err;
3907
3908 out:
3909         kfree(sa);
3910 drop_write:
3911         mnt_drop_write_file(file);
3912         return ret;
3913 }
3914
3915 static long btrfs_ioctl_quota_rescan(struct file *file, void __user *arg)
3916 {
3917         struct inode *inode = file_inode(file);
3918         struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
3919         struct btrfs_ioctl_quota_rescan_args *qsa;
3920         int ret;
3921
3922         if (!capable(CAP_SYS_ADMIN))
3923                 return -EPERM;
3924
3925         if (!qgroup_enabled(fs_info))
3926                 return -ENOTCONN;
3927
3928         ret = mnt_want_write_file(file);
3929         if (ret)
3930                 return ret;
3931
3932         qsa = memdup_user(arg, sizeof(*qsa));
3933         if (IS_ERR(qsa)) {
3934                 ret = PTR_ERR(qsa);
3935                 goto drop_write;
3936         }
3937
3938         if (qsa->flags) {
3939                 ret = -EINVAL;
3940                 goto out;
3941         }
3942
3943         ret = btrfs_qgroup_rescan(fs_info);
3944
3945 out:
3946         kfree(qsa);
3947 drop_write:
3948         mnt_drop_write_file(file);
3949         return ret;
3950 }
3951
3952 static long btrfs_ioctl_quota_rescan_status(struct btrfs_fs_info *fs_info,
3953                                                 void __user *arg)
3954 {
3955         struct btrfs_ioctl_quota_rescan_args qsa = {0};
3956
3957         if (!capable(CAP_SYS_ADMIN))
3958                 return -EPERM;
3959
3960         if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN) {
3961                 qsa.flags = 1;
3962                 qsa.progress = fs_info->qgroup_rescan_progress.objectid;
3963         }
3964
3965         if (copy_to_user(arg, &qsa, sizeof(qsa)))
3966                 return -EFAULT;
3967
3968         return 0;
3969 }
3970
3971 static long btrfs_ioctl_quota_rescan_wait(struct btrfs_fs_info *fs_info)
3972 {
3973         if (!capable(CAP_SYS_ADMIN))
3974                 return -EPERM;
3975
3976         return btrfs_qgroup_wait_for_completion(fs_info, true);
3977 }
3978
3979 static long _btrfs_ioctl_set_received_subvol(struct file *file,
3980                                             struct mnt_idmap *idmap,
3981                                             struct btrfs_ioctl_received_subvol_args *sa)
3982 {
3983         struct inode *inode = file_inode(file);
3984         struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
3985         struct btrfs_root *root = BTRFS_I(inode)->root;
3986         struct btrfs_root_item *root_item = &root->root_item;
3987         struct btrfs_trans_handle *trans;
3988         struct timespec64 ct = current_time(inode);
3989         int ret = 0;
3990         int received_uuid_changed;
3991
3992         if (!inode_owner_or_capable(idmap, inode))
3993                 return -EPERM;
3994
3995         ret = mnt_want_write_file(file);
3996         if (ret < 0)
3997                 return ret;
3998
3999         down_write(&fs_info->subvol_sem);
4000
4001         if (btrfs_ino(BTRFS_I(inode)) != BTRFS_FIRST_FREE_OBJECTID) {
4002                 ret = -EINVAL;
4003                 goto out;
4004         }
4005
4006         if (btrfs_root_readonly(root)) {
4007                 ret = -EROFS;
4008                 goto out;
4009         }
4010
4011         /*
4012          * 1 - root item
4013          * 2 - uuid items (received uuid + subvol uuid)
4014          */
4015         trans = btrfs_start_transaction(root, 3);
4016         if (IS_ERR(trans)) {
4017                 ret = PTR_ERR(trans);
4018                 trans = NULL;
4019                 goto out;
4020         }
4021
4022         sa->rtransid = trans->transid;
4023         sa->rtime.sec = ct.tv_sec;
4024         sa->rtime.nsec = ct.tv_nsec;
4025
4026         received_uuid_changed = memcmp(root_item->received_uuid, sa->uuid,
4027                                        BTRFS_UUID_SIZE);
4028         if (received_uuid_changed &&
4029             !btrfs_is_empty_uuid(root_item->received_uuid)) {
4030                 ret = btrfs_uuid_tree_remove(trans, root_item->received_uuid,
4031                                           BTRFS_UUID_KEY_RECEIVED_SUBVOL,
4032                                           btrfs_root_id(root));
4033                 if (ret && ret != -ENOENT) {
4034                         btrfs_abort_transaction(trans, ret);
4035                         btrfs_end_transaction(trans);
4036                         goto out;
4037                 }
4038         }
4039         memcpy(root_item->received_uuid, sa->uuid, BTRFS_UUID_SIZE);
4040         btrfs_set_root_stransid(root_item, sa->stransid);
4041         btrfs_set_root_rtransid(root_item, sa->rtransid);
4042         btrfs_set_stack_timespec_sec(&root_item->stime, sa->stime.sec);
4043         btrfs_set_stack_timespec_nsec(&root_item->stime, sa->stime.nsec);
4044         btrfs_set_stack_timespec_sec(&root_item->rtime, sa->rtime.sec);
4045         btrfs_set_stack_timespec_nsec(&root_item->rtime, sa->rtime.nsec);
4046
4047         ret = btrfs_update_root(trans, fs_info->tree_root,
4048                                 &root->root_key, &root->root_item);
4049         if (ret < 0) {
4050                 btrfs_end_transaction(trans);
4051                 goto out;
4052         }
4053         if (received_uuid_changed && !btrfs_is_empty_uuid(sa->uuid)) {
4054                 ret = btrfs_uuid_tree_add(trans, sa->uuid,
4055                                           BTRFS_UUID_KEY_RECEIVED_SUBVOL,
4056                                           btrfs_root_id(root));
4057                 if (ret < 0 && ret != -EEXIST) {
4058                         btrfs_abort_transaction(trans, ret);
4059                         btrfs_end_transaction(trans);
4060                         goto out;
4061                 }
4062         }
4063         ret = btrfs_commit_transaction(trans);
4064 out:
4065         up_write(&fs_info->subvol_sem);
4066         mnt_drop_write_file(file);
4067         return ret;
4068 }
4069
#ifdef CONFIG_64BIT
/*
 * Variant of the set-received-subvol ioctl for the 32-bit argument layout.
 *
 * The 32-bit structure is copied in and converted field by field into the
 * native structure, the common handler is run on it, and on success the
 * result is converted back and copied out to user space.
 */
static long btrfs_ioctl_set_received_subvol_32(struct file *file,
                                                void __user *arg)
{
        struct btrfs_ioctl_received_subvol_args_32 *compat;
        struct btrfs_ioctl_received_subvol_args *native = NULL;
        int ret;

        compat = memdup_user(arg, sizeof(*compat));
        if (IS_ERR(compat))
                return PTR_ERR(compat);

        native = kmalloc(sizeof(*native), GFP_KERNEL);
        if (!native) {
                ret = -ENOMEM;
                goto out;
        }

        /* 32-bit layout -> native layout */
        memcpy(native->uuid, compat->uuid, BTRFS_UUID_SIZE);
        native->stransid = compat->stransid;
        native->rtransid = compat->rtransid;
        native->stime.sec = compat->stime.sec;
        native->stime.nsec = compat->stime.nsec;
        native->rtime.sec = compat->rtime.sec;
        native->rtime.nsec = compat->rtime.nsec;
        native->flags = compat->flags;

        ret = _btrfs_ioctl_set_received_subvol(file, file_mnt_idmap(file), native);
        if (ret)
                goto out;

        /* native layout -> 32-bit layout */
        memcpy(compat->uuid, native->uuid, BTRFS_UUID_SIZE);
        compat->stransid = native->stransid;
        compat->rtransid = native->rtransid;
        compat->stime.sec = native->stime.sec;
        compat->stime.nsec = native->stime.nsec;
        compat->rtime.sec = native->rtime.sec;
        compat->rtime.nsec = native->rtime.nsec;
        compat->flags = native->flags;

        if (copy_to_user(arg, compat, sizeof(*compat)))
                ret = -EFAULT;

out:
        kfree(compat);
        kfree(native);
        return ret;
}
#endif
4120
4121 static long btrfs_ioctl_set_received_subvol(struct file *file,
4122                                             void __user *arg)
4123 {
4124         struct btrfs_ioctl_received_subvol_args *sa = NULL;
4125         int ret = 0;
4126
4127         sa = memdup_user(arg, sizeof(*sa));
4128         if (IS_ERR(sa))
4129                 return PTR_ERR(sa);
4130
4131         ret = _btrfs_ioctl_set_received_subvol(file, file_mnt_idmap(file), sa);
4132
4133         if (ret)
4134                 goto out;
4135
4136         ret = copy_to_user(arg, sa, sizeof(*sa));
4137         if (ret)
4138                 ret = -EFAULT;
4139
4140 out:
4141         kfree(sa);
4142         return ret;
4143 }
4144
4145 static int btrfs_ioctl_get_fslabel(struct btrfs_fs_info *fs_info,
4146                                         void __user *arg)
4147 {
4148         size_t len;
4149         int ret;
4150         char label[BTRFS_LABEL_SIZE];
4151
4152         spin_lock(&fs_info->super_lock);
4153         memcpy(label, fs_info->super_copy->label, BTRFS_LABEL_SIZE);
4154         spin_unlock(&fs_info->super_lock);
4155
4156         len = strnlen(label, BTRFS_LABEL_SIZE);
4157
4158         if (len == BTRFS_LABEL_SIZE) {
4159                 btrfs_warn(fs_info,
4160                            "label is too long, return the first %zu bytes",
4161                            --len);
4162         }
4163
4164         ret = copy_to_user(arg, label, len);
4165
4166         return ret ? -EFAULT : 0;
4167 }
4168
4169 static int btrfs_ioctl_set_fslabel(struct file *file, void __user *arg)
4170 {
4171         struct inode *inode = file_inode(file);
4172         struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
4173         struct btrfs_root *root = BTRFS_I(inode)->root;
4174         struct btrfs_super_block *super_block = fs_info->super_copy;
4175         struct btrfs_trans_handle *trans;
4176         char label[BTRFS_LABEL_SIZE];
4177         int ret;
4178
4179         if (!capable(CAP_SYS_ADMIN))
4180                 return -EPERM;
4181
4182         if (copy_from_user(label, arg, sizeof(label)))
4183                 return -EFAULT;
4184
4185         if (strnlen(label, BTRFS_LABEL_SIZE) == BTRFS_LABEL_SIZE) {
4186                 btrfs_err(fs_info,
4187                           "unable to set label with more than %d bytes",
4188                           BTRFS_LABEL_SIZE - 1);
4189                 return -EINVAL;
4190         }
4191
4192         ret = mnt_want_write_file(file);
4193         if (ret)
4194                 return ret;
4195
4196         trans = btrfs_start_transaction(root, 0);
4197         if (IS_ERR(trans)) {
4198                 ret = PTR_ERR(trans);
4199                 goto out_unlock;
4200         }
4201
4202         spin_lock(&fs_info->super_lock);
4203         strcpy(super_block->label, label);
4204         spin_unlock(&fs_info->super_lock);
4205         ret = btrfs_commit_transaction(trans);
4206
4207 out_unlock:
4208         mnt_drop_write_file(file);
4209         return ret;
4210 }
4211
/*
 * Expand to a struct btrfs_ioctl_feature_flags initializer whose three
 * members are the BTRFS_FEATURE_COMPAT_<suffix>,
 * BTRFS_FEATURE_COMPAT_RO_<suffix> and BTRFS_FEATURE_INCOMPAT_<suffix>
 * masks.
 */
#define INIT_FEATURE_FLAGS(suffix) \
        { .compat_flags = BTRFS_FEATURE_COMPAT_##suffix, \
          .compat_ro_flags = BTRFS_FEATURE_COMPAT_RO_##suffix, \
          .incompat_flags = BTRFS_FEATURE_INCOMPAT_##suffix }
4216
4217 int btrfs_ioctl_get_supported_features(void __user *arg)
4218 {
4219         static const struct btrfs_ioctl_feature_flags features[3] = {
4220                 INIT_FEATURE_FLAGS(SUPP),
4221                 INIT_FEATURE_FLAGS(SAFE_SET),
4222                 INIT_FEATURE_FLAGS(SAFE_CLEAR)
4223         };
4224
4225         if (copy_to_user(arg, &features, sizeof(features)))
4226                 return -EFAULT;
4227
4228         return 0;
4229 }
4230
4231 static int btrfs_ioctl_get_features(struct btrfs_fs_info *fs_info,
4232                                         void __user *arg)
4233 {
4234         struct btrfs_super_block *super_block = fs_info->super_copy;
4235         struct btrfs_ioctl_feature_flags features;
4236
4237         features.compat_flags = btrfs_super_compat_flags(super_block);
4238         features.compat_ro_flags = btrfs_super_compat_ro_flags(super_block);
4239         features.incompat_flags = btrfs_super_incompat_flags(super_block);
4240
4241         if (copy_to_user(arg, &features, sizeof(features)))
4242                 return -EFAULT;
4243
4244         return 0;
4245 }
4246
4247 static int check_feature_bits(const struct btrfs_fs_info *fs_info,
4248                               enum btrfs_feature_set set,
4249                               u64 change_mask, u64 flags, u64 supported_flags,
4250                               u64 safe_set, u64 safe_clear)
4251 {
4252         const char *type = btrfs_feature_set_name(set);
4253         char *names;
4254         u64 disallowed, unsupported;
4255         u64 set_mask = flags & change_mask;
4256         u64 clear_mask = ~flags & change_mask;
4257
4258         unsupported = set_mask & ~supported_flags;
4259         if (unsupported) {
4260                 names = btrfs_printable_features(set, unsupported);
4261                 if (names) {
4262                         btrfs_warn(fs_info,
4263                                    "this kernel does not support the %s feature bit%s",
4264                                    names, strchr(names, ',') ? "s" : "");
4265                         kfree(names);
4266                 } else
4267                         btrfs_warn(fs_info,
4268                                    "this kernel does not support %s bits 0x%llx",
4269                                    type, unsupported);
4270                 return -EOPNOTSUPP;
4271         }
4272
4273         disallowed = set_mask & ~safe_set;
4274         if (disallowed) {
4275                 names = btrfs_printable_features(set, disallowed);
4276                 if (names) {
4277                         btrfs_warn(fs_info,
4278                                    "can't set the %s feature bit%s while mounted",
4279                                    names, strchr(names, ',') ? "s" : "");
4280                         kfree(names);
4281                 } else
4282                         btrfs_warn(fs_info,
4283                                    "can't set %s bits 0x%llx while mounted",
4284                                    type, disallowed);
4285                 return -EPERM;
4286         }
4287
4288         disallowed = clear_mask & ~safe_clear;
4289         if (disallowed) {
4290                 names = btrfs_printable_features(set, disallowed);
4291                 if (names) {
4292                         btrfs_warn(fs_info,
4293                                    "can't clear the %s feature bit%s while mounted",
4294                                    names, strchr(names, ',') ? "s" : "");
4295                         kfree(names);
4296                 } else
4297                         btrfs_warn(fs_info,
4298                                    "can't clear %s bits 0x%llx while mounted",
4299                                    type, disallowed);
4300                 return -EPERM;
4301         }
4302
4303         return 0;
4304 }
4305
/*
 * Convenience wrapper around check_feature_bits(): @mask_base (COMPAT,
 * COMPAT_RO or INCOMPAT at the call sites in this file) is pasted into the
 * FEAT_<mask_base> set identifier and into the
 * BTRFS_FEATURE_<mask_base>_{SUPP,SAFE_SET,SAFE_CLEAR} masks.
 */
#define check_feature(fs_info, change_mask, flags, mask_base)   \
check_feature_bits(fs_info, FEAT_##mask_base, change_mask, flags,       \
                   BTRFS_FEATURE_ ## mask_base ## _SUPP,        \
                   BTRFS_FEATURE_ ## mask_base ## _SAFE_SET,    \
                   BTRFS_FEATURE_ ## mask_base ## _SAFE_CLEAR)
4311
4312 static int btrfs_ioctl_set_features(struct file *file, void __user *arg)
4313 {
4314         struct inode *inode = file_inode(file);
4315         struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
4316         struct btrfs_root *root = BTRFS_I(inode)->root;
4317         struct btrfs_super_block *super_block = fs_info->super_copy;
4318         struct btrfs_ioctl_feature_flags flags[2];
4319         struct btrfs_trans_handle *trans;
4320         u64 newflags;
4321         int ret;
4322
4323         if (!capable(CAP_SYS_ADMIN))
4324                 return -EPERM;
4325
4326         if (copy_from_user(flags, arg, sizeof(flags)))
4327                 return -EFAULT;
4328
4329         /* Nothing to do */
4330         if (!flags[0].compat_flags && !flags[0].compat_ro_flags &&
4331             !flags[0].incompat_flags)
4332                 return 0;
4333
4334         ret = check_feature(fs_info, flags[0].compat_flags,
4335                             flags[1].compat_flags, COMPAT);
4336         if (ret)
4337                 return ret;
4338
4339         ret = check_feature(fs_info, flags[0].compat_ro_flags,
4340                             flags[1].compat_ro_flags, COMPAT_RO);
4341         if (ret)
4342                 return ret;
4343
4344         ret = check_feature(fs_info, flags[0].incompat_flags,
4345                             flags[1].incompat_flags, INCOMPAT);
4346         if (ret)
4347                 return ret;
4348
4349         ret = mnt_want_write_file(file);
4350         if (ret)
4351                 return ret;
4352
4353         trans = btrfs_start_transaction(root, 0);
4354         if (IS_ERR(trans)) {
4355                 ret = PTR_ERR(trans);
4356                 goto out_drop_write;
4357         }
4358
4359         spin_lock(&fs_info->super_lock);
4360         newflags = btrfs_super_compat_flags(super_block);
4361         newflags |= flags[0].compat_flags & flags[1].compat_flags;
4362         newflags &= ~(flags[0].compat_flags & ~flags[1].compat_flags);
4363         btrfs_set_super_compat_flags(super_block, newflags);
4364
4365         newflags = btrfs_super_compat_ro_flags(super_block);
4366         newflags |= flags[0].compat_ro_flags & flags[1].compat_ro_flags;
4367         newflags &= ~(flags[0].compat_ro_flags & ~flags[1].compat_ro_flags);
4368         btrfs_set_super_compat_ro_flags(super_block, newflags);
4369
4370         newflags = btrfs_super_incompat_flags(super_block);
4371         newflags |= flags[0].incompat_flags & flags[1].incompat_flags;
4372         newflags &= ~(flags[0].incompat_flags & ~flags[1].incompat_flags);
4373         btrfs_set_super_incompat_flags(super_block, newflags);
4374         spin_unlock(&fs_info->super_lock);
4375
4376         ret = btrfs_commit_transaction(trans);
4377 out_drop_write:
4378         mnt_drop_write_file(file);
4379
4380         return ret;
4381 }
4382
4383 static int _btrfs_ioctl_send(struct btrfs_root *root, void __user *argp, bool compat)
4384 {
4385         struct btrfs_ioctl_send_args *arg;
4386         int ret;
4387
4388         if (compat) {
4389 #if defined(CONFIG_64BIT) && defined(CONFIG_COMPAT)
4390                 struct btrfs_ioctl_send_args_32 args32 = { 0 };
4391
4392                 ret = copy_from_user(&args32, argp, sizeof(args32));
4393                 if (ret)
4394                         return -EFAULT;
4395                 arg = kzalloc(sizeof(*arg), GFP_KERNEL);
4396                 if (!arg)
4397                         return -ENOMEM;
4398                 arg->send_fd = args32.send_fd;
4399                 arg->clone_sources_count = args32.clone_sources_count;
4400                 arg->clone_sources = compat_ptr(args32.clone_sources);
4401                 arg->parent_root = args32.parent_root;
4402                 arg->flags = args32.flags;
4403                 arg->version = args32.version;
4404                 memcpy(arg->reserved, args32.reserved,
4405                        sizeof(args32.reserved));
4406 #else
4407                 return -ENOTTY;
4408 #endif
4409         } else {
4410                 arg = memdup_user(argp, sizeof(*arg));
4411                 if (IS_ERR(arg))
4412                         return PTR_ERR(arg);
4413         }
4414         ret = btrfs_ioctl_send(root, arg);
4415         kfree(arg);
4416         return ret;
4417 }
4418
/*
 * ioctl handler for encoded reads.
 *
 * @file:   file to read from
 * @argp:   user pointer to struct btrfs_ioctl_encoded_io_args, or to
 *          struct btrfs_ioctl_encoded_io_args_32 when @compat is true
 * @compat: true when the caller uses the 32-bit argument layout
 *
 * Only the leading part of the argument structure, up to and including
 * 'flags', is read from userspace.  After a successful read everything that
 * follows 'flags' in the kernel structure is copied back to the caller.
 *
 * Requires CAP_SYS_ADMIN and args.flags == 0.
 *
 * Return: the non-negative result of the read on success, or a negative
 * errno (-EPERM, -EFAULT, -EINVAL, -ENOTTY, or an error from
 * import_iovec(), rw_verify_area() or the read itself).
 */
static int btrfs_ioctl_encoded_read(struct file *file, void __user *argp,
                                    bool compat)
{
        struct btrfs_ioctl_encoded_io_args args = { 0 };
        /* End of the input portion in the native (kernel) layout. */
        size_t copy_end_kernel = offsetofend(struct btrfs_ioctl_encoded_io_args,
                                             flags);
        /* End of the input portion in the layout the caller actually uses. */
        size_t copy_end;
        struct btrfs_inode *inode = BTRFS_I(file_inode(file));
        struct btrfs_fs_info *fs_info = inode->root->fs_info;
        struct extent_io_tree *io_tree = &inode->io_tree;
        struct iovec iovstack[UIO_FASTIOV];
        struct iovec *iov = iovstack;
        struct iov_iter iter;
        loff_t pos;
        struct kiocb kiocb;
        ssize_t ret;
        u64 disk_bytenr, disk_io_size;
        struct extent_state *cached_state = NULL;

        if (!capable(CAP_SYS_ADMIN)) {
                ret = -EPERM;
                goto out_acct;
        }

        if (compat) {
#if defined(CONFIG_64BIT) && defined(CONFIG_COMPAT)
                struct btrfs_ioctl_encoded_io_args_32 args32;

                copy_end = offsetofend(struct btrfs_ioctl_encoded_io_args_32,
                                       flags);
                if (copy_from_user(&args32, argp, copy_end)) {
                        ret = -EFAULT;
                        goto out_acct;
                }
                /* Convert the input fields of the 32-bit layout. */
                args.iov = compat_ptr(args32.iov);
                args.iovcnt = args32.iovcnt;
                args.offset = args32.offset;
                args.flags = args32.flags;
#else
                /* Note: this returns directly and bypasses out_acct. */
                return -ENOTTY;
#endif
        } else {
                copy_end = copy_end_kernel;
                if (copy_from_user(&args, argp, copy_end)) {
                        ret = -EFAULT;
                        goto out_acct;
                }
        }
        /* No flags are accepted. */
        if (args.flags != 0) {
                ret = -EINVAL;
                goto out_acct;
        }

        ret = import_iovec(ITER_DEST, args.iov, args.iovcnt, ARRAY_SIZE(iovstack),
                           &iov, &iter);
        if (ret < 0)
                goto out_acct;

        /* Nothing to read into: succeed with 0 without touching the file. */
        if (iov_iter_count(&iter) == 0) {
                ret = 0;
                goto out_iov;
        }
        pos = args.offset;
        ret = rw_verify_area(READ, file, &pos, args.len);
        if (ret < 0)
                goto out_iov;

        init_sync_kiocb(&kiocb, file);
        kiocb.ki_pos = pos;

        ret = btrfs_encoded_read(&kiocb, &iter, &args, &cached_state,
                                 &disk_bytenr, &disk_io_size);

        /*
         * -EIOCBQUEUED here means the data still has to be fetched with
         * btrfs_encoded_read_regular().  The extent range and the shared
         * inode lock are released below unless that helper reports, via
         * 'unlocked', that it already did so.
         * NOTE(review): presumably btrfs_encoded_read() returns with both
         * locks held in this case - confirm against its implementation.
         */
        if (ret == -EIOCBQUEUED) {
                bool unlocked = false;
                u64 start, lockend, count;

                start = ALIGN_DOWN(kiocb.ki_pos, fs_info->sectorsize);
                lockend = start + BTRFS_MAX_UNCOMPRESSED - 1;

                /* Compressed extents are returned in their on-disk size. */
                if (args.compression)
                        count = disk_io_size;
                else
                        count = args.len;

                ret = btrfs_encoded_read_regular(&kiocb, &iter, start, lockend,
                                                 &cached_state, disk_bytenr,
                                                 disk_io_size, count,
                                                 args.compression, &unlocked);

                if (!unlocked) {
                        btrfs_unlock_extent(io_tree, start, lockend, &cached_state);
                        btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED);
                }
        }

        if (ret >= 0) {
                fsnotify_access(file);
                /*
                 * Copy back everything after 'flags'.  The user-side offset
                 * uses the caller's layout (copy_end), the kernel-side
                 * offset uses the native layout (copy_end_kernel).
                 */
                if (copy_to_user(argp + copy_end,
                                 (char *)&args + copy_end_kernel,
                                 sizeof(args) - copy_end_kernel))
                        ret = -EFAULT;
        }

out_iov:
        kfree(iov);
out_acct:
        /* Account the bytes read (if any) and one read syscall. */
        if (ret > 0)
                add_rchar(current, ret);
        inc_syscr(current);
        return ret;
}
4531
/*
 * ioctl handler for encoded writes.
 *
 * @file:   file to write to; must be open for writing
 * @argp:   user pointer to struct btrfs_ioctl_encoded_io_args, or to
 *          struct btrfs_ioctl_encoded_io_args_32 when @compat is true
 * @compat: true when the caller uses the 32-bit argument layout
 *
 * Requires CAP_SYS_ADMIN.  The whole argument structure is read from
 * userspace and validated before the iovec is imported and the write is
 * issued through btrfs_do_write_iter().
 *
 * Return: the result of btrfs_do_write_iter() on success, 0 when the iovec
 * is empty, or a negative errno (-EPERM, -EBADF, -EFAULT, -EINVAL, -ENOTTY,
 * or an error from import_iovec(), rw_verify_area() or
 * kiocb_set_rw_flags()).
 */
static int btrfs_ioctl_encoded_write(struct file *file, void __user *argp, bool compat)
{
        struct btrfs_ioctl_encoded_io_args args;
        struct iovec iovstack[UIO_FASTIOV];
        struct iovec *iov = iovstack;
        struct iov_iter iter;
        loff_t pos;
        struct kiocb kiocb;
        ssize_t ret;

        if (!capable(CAP_SYS_ADMIN)) {
                ret = -EPERM;
                goto out_acct;
        }

        if (!(file->f_mode & FMODE_WRITE)) {
                ret = -EBADF;
                goto out_acct;
        }

        if (compat) {
#if defined(CONFIG_64BIT) && defined(CONFIG_COMPAT)
                struct btrfs_ioctl_encoded_io_args_32 args32;

                if (copy_from_user(&args32, argp, sizeof(args32))) {
                        ret = -EFAULT;
                        goto out_acct;
                }
                /* Convert every field of the 32-bit layout. */
                args.iov = compat_ptr(args32.iov);
                args.iovcnt = args32.iovcnt;
                args.offset = args32.offset;
                args.flags = args32.flags;
                args.len = args32.len;
                args.unencoded_len = args32.unencoded_len;
                args.unencoded_offset = args32.unencoded_offset;
                args.compression = args32.compression;
                args.encryption = args32.encryption;
                memcpy(args.reserved, args32.reserved, sizeof(args.reserved));
#else
                /* Note: this returns directly and bypasses out_acct. */
                return -ENOTTY;
#endif
        } else {
                if (copy_from_user(&args, argp, sizeof(args))) {
                        ret = -EFAULT;
                        goto out_acct;
                }
        }

        /* Every validation failure below reports -EINVAL. */
        ret = -EINVAL;
        if (args.flags != 0)
                goto out_acct;
        /* Reserved bytes must all be zero. */
        if (memchr_inv(args.reserved, 0, sizeof(args.reserved)))
                goto out_acct;
        /* Data that is neither compressed nor encrypted is rejected here. */
        if (args.compression == BTRFS_ENCODED_IO_COMPRESSION_NONE &&
            args.encryption == BTRFS_ENCODED_IO_ENCRYPTION_NONE)
                goto out_acct;
        if (args.compression >= BTRFS_ENCODED_IO_COMPRESSION_TYPES ||
            args.encryption >= BTRFS_ENCODED_IO_ENCRYPTION_TYPES)
                goto out_acct;
        /*
         * The next two checks together ensure that
         * unencoded_offset + len <= unencoded_len; the first one keeps the
         * subtraction in the second from wrapping.
         */
        if (args.unencoded_offset > args.unencoded_len)
                goto out_acct;
        if (args.len > args.unencoded_len - args.unencoded_offset)
                goto out_acct;

        ret = import_iovec(ITER_SOURCE, args.iov, args.iovcnt, ARRAY_SIZE(iovstack),
                           &iov, &iter);
        if (ret < 0)
                goto out_acct;

        /* Nothing to write: succeed with 0 without touching the file. */
        if (iov_iter_count(&iter) == 0) {
                ret = 0;
                goto out_iov;
        }
        pos = args.offset;
        ret = rw_verify_area(WRITE, file, &pos, args.len);
        if (ret < 0)
                goto out_iov;

        init_sync_kiocb(&kiocb, file);
        ret = kiocb_set_rw_flags(&kiocb, 0, WRITE);
        if (ret)
                goto out_iov;
        kiocb.ki_pos = pos;

        /* The write is bracketed by file_start_write()/file_end_write(). */
        file_start_write(file);

        ret = btrfs_do_write_iter(&kiocb, &iter, &args);
        if (ret > 0)
                fsnotify_modify(file);

        file_end_write(file);
out_iov:
        kfree(iov);
out_acct:
        /* Account the bytes written (if any) and one write syscall. */
        if (ret > 0)
                add_wchar(current, ret);
        inc_syscw(current);
        return ret;
}
4631
/*
 * Context that's attached to an encoded read io_uring command, in cmd->pdu. It
 * contains the fields in btrfs_uring_read_extent that are necessary to finish
 * off and cleanup the I/O in btrfs_uring_read_finished.
 */
struct btrfs_uring_priv {
        /* Command that is completed once the read has finished. */
        struct io_uring_cmd *cmd;
        /* Pages the extent is read into; freed in btrfs_uring_read_finished. */
        struct page **pages;
        /* Number of entries in @pages. */
        unsigned long nr_pages;
        /* Copy of the caller's kiocb; ki_filp and ki_pos are used on completion. */
        struct kiocb iocb;
        /* iovec passed in by the submitter; kfree()d on completion. */
        struct iovec *iov;
        /* Copy of the destination iterator the page contents are copied to. */
        struct iov_iter iter;
        /* Extent state handed to btrfs_unlock_extent() on completion. */
        struct extent_state *cached_state;
        /* Number of bytes to copy from @pages into @iter. */
        u64 count;
        /* Start of the locked extent range. */
        u64 start;
        /* End of the locked extent range. */
        u64 lockend;
        /* Status stored by btrfs_uring_read_extent_endio(); non-zero aborts the copy. */
        int err;
        /*
         * When set, copying starts at the beginning of @pages; otherwise it
         * starts at the offset of iocb.ki_pos relative to @start.
         */
        bool compressed;
};
4651
/*
 * Layout of the io_uring command pdu area as used by the encoded read path:
 * it only carries a pointer to the btrfs_uring_priv context, stored by
 * btrfs_uring_read_extent_endio() and read by btrfs_uring_read_finished().
 */
struct io_btrfs_cmd {
        struct btrfs_uring_priv *priv;
};
4655
4656 static void btrfs_uring_read_finished(struct io_uring_cmd *cmd, unsigned int issue_flags)
4657 {
4658         struct io_btrfs_cmd *bc = io_uring_cmd_to_pdu(cmd, struct io_btrfs_cmd);
4659         struct btrfs_uring_priv *priv = bc->priv;
4660         struct btrfs_inode *inode = BTRFS_I(file_inode(priv->iocb.ki_filp));
4661         struct extent_io_tree *io_tree = &inode->io_tree;
4662         unsigned long index;
4663         u64 cur;
4664         size_t page_offset;
4665         ssize_t ret;
4666
4667         /* The inode lock has already been acquired in btrfs_uring_read_extent.  */
4668         btrfs_lockdep_inode_acquire(inode, i_rwsem);
4669
4670         if (priv->err) {
4671                 ret = priv->err;
4672                 goto out;
4673         }
4674
4675         if (priv->compressed) {
4676                 index = 0;
4677                 page_offset = 0;
4678         } else {
4679                 index = (priv->iocb.ki_pos - priv->start) >> PAGE_SHIFT;
4680                 page_offset = offset_in_page(priv->iocb.ki_pos - priv->start);
4681         }
4682         cur = 0;
4683         while (cur < priv->count) {
4684                 size_t bytes = min_t(size_t, priv->count - cur, PAGE_SIZE - page_offset);
4685
4686                 if (copy_page_to_iter(priv->pages[index], page_offset, bytes,
4687                                       &priv->iter) != bytes) {
4688                         ret = -EFAULT;
4689                         goto out;
4690                 }
4691
4692                 index++;
4693                 cur += bytes;
4694                 page_offset = 0;
4695         }
4696         ret = priv->count;
4697
4698 out:
4699         btrfs_unlock_extent(io_tree, priv->start, priv->lockend, &priv->cached_state);
4700         btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED);
4701
4702         io_uring_cmd_done(cmd, ret, 0, issue_flags);
4703         add_rchar(current, ret);
4704
4705         for (index = 0; index < priv->nr_pages; index++)
4706                 __free_page(priv->pages[index]);
4707
4708         kfree(priv->pages);
4709         kfree(priv->iov);
4710         kfree(priv);
4711 }
4712
4713 void btrfs_uring_read_extent_endio(void *ctx, int err)
4714 {
4715         struct btrfs_uring_priv *priv = ctx;
4716         struct io_btrfs_cmd *bc = io_uring_cmd_to_pdu(priv->cmd, struct io_btrfs_cmd);
4717
4718         priv->err = err;
4719         bc->priv = priv;
4720
4721         io_uring_cmd_complete_in_task(priv->cmd, btrfs_uring_read_finished);
4722 }
4723
4724 static int btrfs_uring_read_extent(struct kiocb *iocb, struct iov_iter *iter,
4725                                    u64 start, u64 lockend,
4726                                    struct extent_state *cached_state,
4727                                    u64 disk_bytenr, u64 disk_io_size,
4728                                    size_t count, bool compressed,
4729                                    struct iovec *iov, struct io_uring_cmd *cmd)
4730 {
4731         struct btrfs_inode *inode = BTRFS_I(file_inode(iocb->ki_filp));
4732         struct extent_io_tree *io_tree = &inode->io_tree;
4733         struct page **pages;
4734         struct btrfs_uring_priv *priv = NULL;
4735         unsigned long nr_pages;
4736         int ret;
4737
4738         nr_pages = DIV_ROUND_UP(disk_io_size, PAGE_SIZE);
4739         pages = kcalloc(nr_pages, sizeof(struct page *), GFP_NOFS);
4740         if (!pages)
4741                 return -ENOMEM;
4742         ret = btrfs_alloc_page_array(nr_pages, pages, 0);
4743         if (ret) {
4744                 ret = -ENOMEM;
4745                 goto out_fail;
4746         }
4747
4748         priv = kmalloc(sizeof(*priv), GFP_NOFS);
4749         if (!priv) {
4750                 ret = -ENOMEM;
4751                 goto out_fail;
4752         }
4753
4754         priv->iocb = *iocb;
4755         priv->iov = iov;
4756         priv->iter = *iter;
4757         priv->count = count;
4758         priv->cmd = cmd;
4759         priv->cached_state = cached_state;
4760         priv->compressed = compressed;
4761         priv->nr_pages = nr_pages;
4762         priv->pages = pages;
4763         priv->start = start;
4764         priv->lockend = lockend;
4765         priv->err = 0;
4766
4767         ret = btrfs_encoded_read_regular_fill_pages(inode, disk_bytenr,
4768                                                     disk_io_size, pages, priv);
4769         if (ret && ret != -EIOCBQUEUED)
4770                 goto out_fail;
4771
4772         /*
4773          * If we return -EIOCBQUEUED, we're deferring the cleanup to
4774          * btrfs_uring_read_finished(), which will handle unlocking the extent
4775          * and inode and freeing the allocations.
4776          */
4777
4778         /*
4779          * We're returning to userspace with the inode lock held, and that's
4780          * okay - it'll get unlocked in a worker thread.  Call
4781          * btrfs_lockdep_inode_release() to avoid confusing lockdep.
4782          */
4783         btrfs_lockdep_inode_release(inode, i_rwsem);
4784
4785         return -EIOCBQUEUED;
4786
4787 out_fail:
4788         btrfs_unlock_extent(io_tree, start, lockend, &cached_state);
4789         btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED);
4790         kfree(priv);
4791         return ret;
4792 }
4793
/*
 * Per-command state for io_uring encoded I/O.  It is allocated on first
 * issue and stored in the command's async data (op_data), so a later call
 * for the same command finds the already parsed arguments.
 */
struct btrfs_uring_encoded_data {
        /* Arguments copied in (and converted, for compat callers) from userspace. */
        struct btrfs_ioctl_encoded_io_args args;
        /* Inline iovec storage offered to import_iovec(). */
        struct iovec iovstack[UIO_FASTIOV];
        /* iovec pointer passed to import_iovec(); kfree()d when done. */
        struct iovec *iov;
        /* Iterator over the user buffers. */
        struct iov_iter iter;
};
4800
/*
 * io_uring command handler for encoded reads.
 *
 * @cmd:         the io_uring command; cmd->sqe->addr holds the user pointer
 *               to the argument structure
 * @issue_flags: io_uring issue flags; IO_URING_F_COMPAT selects the 32-bit
 *               argument layout, IO_URING_F_NONBLOCK requests IOCB_NOWAIT
 *
 * On the first call for a command the input part of the arguments (up to
 * and including 'flags') is copied from userspace and the iovec imported;
 * the result is kept in the command's async data for later calls.  After
 * btrfs_encoded_read() the fields following 'flags' are copied back to
 * userspace.  If btrfs_encoded_read() returns -EIOCBQUEUED the read is
 * handed over to btrfs_uring_read_extent().
 *
 * Requires CAP_SYS_ADMIN and args.flags == 0.
 *
 * Return: the result of btrfs_encoded_read() or btrfs_uring_read_extent(),
 * 0 for an empty iovec, or a negative errno.
 */
static int btrfs_uring_encoded_read(struct io_uring_cmd *cmd, unsigned int issue_flags)
{
        /* End of the input portion in the native (kernel) layout. */
        size_t copy_end_kernel = offsetofend(struct btrfs_ioctl_encoded_io_args, flags);
        /* End of the input portion in the layout the caller actually uses. */
        size_t copy_end;
        int ret;
        u64 disk_bytenr, disk_io_size;
        struct file *file;
        struct btrfs_inode *inode;
        struct btrfs_fs_info *fs_info;
        struct extent_io_tree *io_tree;
        loff_t pos;
        struct kiocb kiocb;
        struct extent_state *cached_state = NULL;
        u64 start, lockend;
        void __user *sqe_addr;
        struct btrfs_uring_encoded_data *data = io_uring_cmd_get_async_data(cmd)->op_data;

        if (!capable(CAP_SYS_ADMIN)) {
                ret = -EPERM;
                goto out_acct;
        }
        file = cmd->file;
        inode = BTRFS_I(file->f_inode);
        fs_info = inode->root->fs_info;
        io_tree = &inode->io_tree;
        sqe_addr = u64_to_user_ptr(READ_ONCE(cmd->sqe->addr));

        if (issue_flags & IO_URING_F_COMPAT) {
#if defined(CONFIG_64BIT) && defined(CONFIG_COMPAT)
                copy_end = offsetofend(struct btrfs_ioctl_encoded_io_args_32, flags);
#else
                /* Note: this returns directly and bypasses out_acct. */
                return -ENOTTY;
#endif
        } else {
                copy_end = copy_end_kernel;
        }

        /* First issue of this command: fetch and parse the arguments. */
        if (!data) {
                data = kzalloc(sizeof(*data), GFP_NOFS);
                if (!data) {
                        ret = -ENOMEM;
                        goto out_acct;
                }

                /*
                 * Attach the state to the command before anything can fail;
                 * the error paths below do not free 'data' themselves.
                 * NOTE(review): presumably io_uring releases op_data when
                 * the command is torn down - confirm.
                 */
                io_uring_cmd_get_async_data(cmd)->op_data = data;

                if (issue_flags & IO_URING_F_COMPAT) {
#if defined(CONFIG_64BIT) && defined(CONFIG_COMPAT)
                        struct btrfs_ioctl_encoded_io_args_32 args32;

                        if (copy_from_user(&args32, sqe_addr, copy_end)) {
                                ret = -EFAULT;
                                goto out_acct;
                        }

                        /* Convert the input fields of the 32-bit layout. */
                        data->args.iov = compat_ptr(args32.iov);
                        data->args.iovcnt = args32.iovcnt;
                        data->args.offset = args32.offset;
                        data->args.flags = args32.flags;
#endif
                } else {
                        if (copy_from_user(&data->args, sqe_addr, copy_end)) {
                                ret = -EFAULT;
                                goto out_acct;
                        }
                }

                /* No flags are accepted. */
                if (data->args.flags != 0) {
                        ret = -EINVAL;
                        goto out_acct;
                }

                data->iov = data->iovstack;
                ret = import_iovec(ITER_DEST, data->args.iov, data->args.iovcnt,
                                   ARRAY_SIZE(data->iovstack), &data->iov,
                                   &data->iter);
                if (ret < 0)
                        goto out_acct;

                /* Nothing to read into: succeed with 0. */
                if (iov_iter_count(&data->iter) == 0) {
                        ret = 0;
                        goto out_free;
                }
        }

        pos = data->args.offset;
        ret = rw_verify_area(READ, file, &pos, data->args.len);
        if (ret < 0)
                goto out_free;

        init_sync_kiocb(&kiocb, file);
        kiocb.ki_pos = pos;

        if (issue_flags & IO_URING_F_NONBLOCK)
                kiocb.ki_flags |= IOCB_NOWAIT;

        start = ALIGN_DOWN(pos, fs_info->sectorsize);
        lockend = start + BTRFS_MAX_UNCOMPRESSED - 1;

        ret = btrfs_encoded_read(&kiocb, &data->iter, &data->args, &cached_state,
                                 &disk_bytenr, &disk_io_size);
        /*
         * -EAGAIN skips out_free, so data->iov (and the parsed arguments)
         * stay intact.
         * NOTE(review): presumably so that a re-issue of the command can
         * reuse them - confirm.
         */
        if (ret == -EAGAIN)
                goto out_acct;
        if (ret < 0 && ret != -EIOCBQUEUED)
                goto out_free;

        file_accessed(file);

        /*
         * Copy back everything after 'flags'.  The user-side offset uses the
         * caller's layout (copy_end), the kernel-side offset the native
         * layout (copy_end_kernel).
         */
        if (copy_to_user(sqe_addr + copy_end,
                         (const char *)&data->args + copy_end_kernel,
                         sizeof(data->args) - copy_end_kernel)) {
                /*
                 * In the -EIOCBQUEUED case the extent range and the inode
                 * lock are released here before bailing out.
                 */
                if (ret == -EIOCBQUEUED) {
                        btrfs_unlock_extent(io_tree, start, lockend, &cached_state);
                        btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED);
                }
                ret = -EFAULT;
                goto out_free;
        }

        if (ret == -EIOCBQUEUED) {
                u64 count = min_t(u64, iov_iter_count(&data->iter), disk_io_size);

                /* Match ioctl by not returning past EOF if uncompressed. */
                if (!data->args.compression)
                        count = min_t(u64, count, data->args.len);

                ret = btrfs_uring_read_extent(&kiocb, &data->iter, start, lockend,
                                              cached_state, disk_bytenr, disk_io_size,
                                              count, data->args.compression,
                                              data->iov, cmd);

                /*
                 * data->iov was handed to btrfs_uring_read_extent(), so it is
                 * not freed here.
                 * NOTE(review): if that call fails (returns something other
                 * than -EIOCBQUEUED) this jump still skips kfree(data->iov);
                 * confirm who frees the iovec in that case.
                 */
                goto out_acct;
        }

out_free:
        kfree(data->iov);

out_acct:
        /* Account the bytes read (if any) and one read syscall. */
        if (ret > 0)
                add_rchar(current, ret);
        inc_syscr(current);

        return ret;
}
4945
4946 static int btrfs_uring_encoded_write(struct io_uring_cmd *cmd, unsigned int issue_flags)
4947 {
4948         loff_t pos;
4949         struct kiocb kiocb;
4950         struct file *file;
4951         ssize_t ret;
4952         void __user *sqe_addr;
4953         struct btrfs_uring_encoded_data *data = io_uring_cmd_get_async_data(cmd)->op_data;
4954
4955         if (!capable(CAP_SYS_ADMIN)) {
4956                 ret = -EPERM;
4957                 goto out_acct;
4958         }
4959
4960         file = cmd->file;
4961         sqe_addr = u64_to_user_ptr(READ_ONCE(cmd->sqe->addr));
4962
4963         if (!(file->f_mode & FMODE_WRITE)) {
4964                 ret = -EBADF;
4965                 goto out_acct;
4966         }
4967
4968         if (!data) {
4969                 data = kzalloc(sizeof(*data), GFP_NOFS);
4970                 if (!data) {
4971                         ret = -ENOMEM;
4972                         goto out_acct;
4973                 }
4974
4975                 io_uring_cmd_get_async_data(cmd)->op_data = data;
4976
4977                 if (issue_flags & IO_URING_F_COMPAT) {
4978 #if defined(CONFIG_64BIT) && defined(CONFIG_COMPAT)
4979                         struct btrfs_ioctl_encoded_io_args_32 args32;
4980
4981                         if (copy_from_user(&args32, sqe_addr, sizeof(args32))) {
4982                                 ret = -EFAULT;
4983                                 goto out_acct;
4984                         }
4985                         data->args.iov = compat_ptr(args32.iov);
4986                         data->args.iovcnt = args32.iovcnt;
4987                         data->args.offset = args32.offset;
4988                         data->args.flags = args32.flags;
4989                         data->args.len = args32.len;
4990                         data->args.unencoded_len = args32.unencoded_len;
4991                         data->args.unencoded_offset = args32.unencoded_offset;
4992                         data->args.compression = args32.compression;
4993                         data->args.encryption = args32.encryption;
4994                         memcpy(data->args.reserved, args32.reserved,
4995                                sizeof(data->args.reserved));
4996 #else
4997                         ret = -ENOTTY;
4998                         goto out_acct;
4999 #endif
5000                 } else {
5001                         if (copy_from_user(&data->args, sqe_addr, sizeof(data->args))) {
5002                                 ret = -EFAULT;
5003                                 goto out_acct;
5004                         }
5005                 }
5006
5007                 ret = -EINVAL;
5008                 if (data->args.flags != 0)
5009                         goto out_acct;
5010                 if (memchr_inv(data->args.reserved, 0, sizeof(data->args.reserved)))
5011                         goto out_acct;
5012                 if (data->args.compression == BTRFS_ENCODED_IO_COMPRESSION_NONE &&
5013                     data->args.encryption == BTRFS_ENCODED_IO_ENCRYPTION_NONE)
5014                         goto out_acct;
5015                 if (data->args.compression >= BTRFS_ENCODED_IO_COMPRESSION_TYPES ||
5016                     data->args.encryption >= BTRFS_ENCODED_IO_ENCRYPTION_TYPES)
5017                         goto out_acct;
5018                 if (data->args.unencoded_offset > data->args.unencoded_len)
5019                         goto out_acct;
5020                 if (data->args.len > data->args.unencoded_len - data->args.unencoded_offset)
5021                         goto out_acct;
5022
5023                 data->iov = data->iovstack;
5024                 ret = import_iovec(ITER_SOURCE, data->args.iov, data->args.iovcnt,
5025                                    ARRAY_SIZE(data->iovstack), &data->iov,
5026                                    &data->iter);
5027                 if (ret < 0)
5028                         goto out_acct;
5029
5030                 if (iov_iter_count(&data->iter) == 0) {
5031                         ret = 0;
5032                         goto out_iov;
5033                 }
5034         }
5035
5036         if (issue_flags & IO_URING_F_NONBLOCK) {
5037                 ret = -EAGAIN;
5038                 goto out_acct;
5039         }
5040
5041         pos = data->args.offset;
5042         ret = rw_verify_area(WRITE, file, &pos, data->args.len);
5043         if (ret < 0)
5044                 goto out_iov;
5045
5046         init_sync_kiocb(&kiocb, file);
5047         ret = kiocb_set_rw_flags(&kiocb, 0, WRITE);
5048         if (ret)
5049                 goto out_iov;
5050         kiocb.ki_pos = pos;
5051
5052         file_start_write(file);
5053
5054         ret = btrfs_do_write_iter(&kiocb, &data->iter, &data->args);
5055         if (ret > 0)
5056                 fsnotify_modify(file);
5057
5058         file_end_write(file);
5059 out_iov:
5060         kfree(data->iov);
5061 out_acct:
5062         if (ret > 0)
5063                 add_wchar(current, ret);
5064         inc_syscw(current);
5065         return ret;
5066 }
5067
5068 int btrfs_uring_cmd(struct io_uring_cmd *cmd, unsigned int issue_flags)
5069 {
5070         switch (cmd->cmd_op) {
5071         case BTRFS_IOC_ENCODED_READ:
5072 #if defined(CONFIG_64BIT) && defined(CONFIG_COMPAT)
5073         case BTRFS_IOC_ENCODED_READ_32:
5074 #endif
5075                 return btrfs_uring_encoded_read(cmd, issue_flags);
5076
5077         case BTRFS_IOC_ENCODED_WRITE:
5078 #if defined(CONFIG_64BIT) && defined(CONFIG_COMPAT)
5079         case BTRFS_IOC_ENCODED_WRITE_32:
5080 #endif
5081                 return btrfs_uring_encoded_write(cmd, issue_flags);
5082         }
5083
5084         return -EINVAL;
5085 }
5086
5087 static int btrfs_ioctl_subvol_sync(struct btrfs_fs_info *fs_info, void __user *argp)
5088 {
5089         struct btrfs_root *root;
5090         struct btrfs_ioctl_subvol_wait args = { 0 };
5091         signed long sched_ret;
5092         int refs;
5093         u64 root_flags;
5094         bool wait_for_deletion = false;
5095         bool found = false;
5096
5097         if (copy_from_user(&args, argp, sizeof(args)))
5098                 return -EFAULT;
5099
5100         switch (args.mode) {
5101         case BTRFS_SUBVOL_SYNC_WAIT_FOR_QUEUED:
5102                 /*
5103                  * Wait for the first one deleted that waits until all previous
5104                  * are cleaned.
5105                  */
5106                 spin_lock(&fs_info->trans_lock);
5107                 if (!list_empty(&fs_info->dead_roots)) {
5108                         root = list_last_entry(&fs_info->dead_roots,
5109                                                struct btrfs_root, root_list);
5110                         args.subvolid = btrfs_root_id(root);
5111                         found = true;
5112                 }
5113                 spin_unlock(&fs_info->trans_lock);
5114                 if (!found)
5115                         return -ENOENT;
5116
5117                 fallthrough;
5118         case BTRFS_SUBVOL_SYNC_WAIT_FOR_ONE:
5119                 if ((0 < args.subvolid && args.subvolid < BTRFS_FIRST_FREE_OBJECTID) ||
5120                     BTRFS_LAST_FREE_OBJECTID < args.subvolid)
5121                         return -EINVAL;
5122                 break;
5123         case BTRFS_SUBVOL_SYNC_COUNT:
5124                 spin_lock(&fs_info->trans_lock);
5125                 args.count = list_count_nodes(&fs_info->dead_roots);
5126                 spin_unlock(&fs_info->trans_lock);
5127                 if (copy_to_user(argp, &args, sizeof(args)))
5128                         return -EFAULT;
5129                 return 0;
5130         case BTRFS_SUBVOL_SYNC_PEEK_FIRST:
5131                 spin_lock(&fs_info->trans_lock);
5132                 /* Last in the list was deleted first. */
5133                 if (!list_empty(&fs_info->dead_roots)) {
5134                         root = list_last_entry(&fs_info->dead_roots,
5135                                                struct btrfs_root, root_list);
5136                         args.subvolid = btrfs_root_id(root);
5137                 } else {
5138                         args.subvolid = 0;
5139                 }
5140                 spin_unlock(&fs_info->trans_lock);
5141                 if (copy_to_user(argp, &args, sizeof(args)))
5142                         return -EFAULT;
5143                 return 0;
5144         case BTRFS_SUBVOL_SYNC_PEEK_LAST:
5145                 spin_lock(&fs_info->trans_lock);
5146                 /* First in the list was deleted last. */
5147                 if (!list_empty(&fs_info->dead_roots)) {
5148                         root = list_first_entry(&fs_info->dead_roots,
5149                                                 struct btrfs_root, root_list);
5150                         args.subvolid = btrfs_root_id(root);
5151                 } else {
5152                         args.subvolid = 0;
5153                 }
5154                 spin_unlock(&fs_info->trans_lock);
5155                 if (copy_to_user(argp, &args, sizeof(args)))
5156                         return -EFAULT;
5157                 return 0;
5158         default:
5159                 return -EINVAL;
5160         }
5161
5162         /* 32bit limitation: fs_roots_radix key is not wide enough. */
5163         if (sizeof(unsigned long) != sizeof(u64) && args.subvolid > U32_MAX)
5164                 return -EOVERFLOW;
5165
5166         while (1) {
5167                 /* Wait for the specific one. */
5168                 if (down_read_interruptible(&fs_info->subvol_sem) == -EINTR)
5169                         return -EINTR;
5170                 refs = -1;
5171                 spin_lock(&fs_info->fs_roots_radix_lock);
5172                 root = radix_tree_lookup(&fs_info->fs_roots_radix,
5173                                          (unsigned long)args.subvolid);
5174                 if (root) {
5175                         spin_lock(&root->root_item_lock);
5176                         refs = btrfs_root_refs(&root->root_item);
5177                         root_flags = btrfs_root_flags(&root->root_item);
5178                         spin_unlock(&root->root_item_lock);
5179                 }
5180                 spin_unlock(&fs_info->fs_roots_radix_lock);
5181                 up_read(&fs_info->subvol_sem);
5182
5183                 /* Subvolume does not exist. */
5184                 if (!root)
5185                         return -ENOENT;
5186
5187                 /* Subvolume not deleted at all. */
5188                 if (refs > 0)
5189                         return -EEXIST;
5190                 /* We've waited and now the subvolume is gone. */
5191                 if (wait_for_deletion && refs == -1) {
5192                         /* Return the one we waited for as the last one. */
5193                         if (copy_to_user(argp, &args, sizeof(args)))
5194                                 return -EFAULT;
5195                         return 0;
5196                 }
5197
5198                 /* Subvolume not found on the first try (deleted or never existed). */
5199                 if (refs == -1)
5200                         return -ENOENT;
5201
5202                 wait_for_deletion = true;
5203                 ASSERT(root_flags & BTRFS_ROOT_SUBVOL_DEAD);
5204                 sched_ret = schedule_timeout_interruptible(HZ);
5205                 /* Early wake up or error. */
5206                 if (sched_ret != 0)
5207                         return -EINTR;
5208         }
5209
5210         return 0;
5211 }
5212
5213 long btrfs_ioctl(struct file *file, unsigned int
5214                 cmd, unsigned long arg)
5215 {
5216         struct inode *inode = file_inode(file);
5217         struct btrfs_fs_info *fs_info = inode_to_fs_info(inode);
5218         struct btrfs_root *root = BTRFS_I(inode)->root;
5219         void __user *argp = (void __user *)arg;
5220
5221         switch (cmd) {
5222         case FS_IOC_GETVERSION:
5223                 return btrfs_ioctl_getversion(inode, argp);
5224         case FS_IOC_GETFSLABEL:
5225                 return btrfs_ioctl_get_fslabel(fs_info, argp);
5226         case FS_IOC_SETFSLABEL:
5227                 return btrfs_ioctl_set_fslabel(file, argp);
5228         case FITRIM:
5229                 return btrfs_ioctl_fitrim(fs_info, argp);
5230         case BTRFS_IOC_SNAP_CREATE:
5231                 return btrfs_ioctl_snap_create(file, argp, 0);
5232         case BTRFS_IOC_SNAP_CREATE_V2:
5233                 return btrfs_ioctl_snap_create_v2(file, argp, 0);
5234         case BTRFS_IOC_SUBVOL_CREATE:
5235                 return btrfs_ioctl_snap_create(file, argp, 1);
5236         case BTRFS_IOC_SUBVOL_CREATE_V2:
5237                 return btrfs_ioctl_snap_create_v2(file, argp, 1);
5238         case BTRFS_IOC_SNAP_DESTROY:
5239                 return btrfs_ioctl_snap_destroy(file, argp, false);
5240         case BTRFS_IOC_SNAP_DESTROY_V2:
5241                 return btrfs_ioctl_snap_destroy(file, argp, true);
5242         case BTRFS_IOC_SUBVOL_GETFLAGS:
5243                 return btrfs_ioctl_subvol_getflags(BTRFS_I(inode), argp);
5244         case BTRFS_IOC_SUBVOL_SETFLAGS:
5245                 return btrfs_ioctl_subvol_setflags(file, argp);
5246         case BTRFS_IOC_DEFAULT_SUBVOL:
5247                 return btrfs_ioctl_default_subvol(file, argp);
5248         case BTRFS_IOC_DEFRAG:
5249                 return btrfs_ioctl_defrag(file, NULL);
5250         case BTRFS_IOC_DEFRAG_RANGE:
5251                 return btrfs_ioctl_defrag(file, argp);
5252         case BTRFS_IOC_RESIZE:
5253                 return btrfs_ioctl_resize(file, argp);
5254         case BTRFS_IOC_ADD_DEV:
5255                 return btrfs_ioctl_add_dev(fs_info, argp);
5256         case BTRFS_IOC_RM_DEV:
5257                 return btrfs_ioctl_rm_dev(file, argp);
5258         case BTRFS_IOC_RM_DEV_V2:
5259                 return btrfs_ioctl_rm_dev_v2(file, argp);
5260         case BTRFS_IOC_FS_INFO:
5261                 return btrfs_ioctl_fs_info(fs_info, argp);
5262         case BTRFS_IOC_DEV_INFO:
5263                 return btrfs_ioctl_dev_info(fs_info, argp);
5264         case BTRFS_IOC_TREE_SEARCH:
5265                 return btrfs_ioctl_tree_search(root, argp);
5266         case BTRFS_IOC_TREE_SEARCH_V2:
5267                 return btrfs_ioctl_tree_search_v2(root, argp);
5268         case BTRFS_IOC_INO_LOOKUP:
5269                 return btrfs_ioctl_ino_lookup(root, argp);
5270         case BTRFS_IOC_INO_PATHS:
5271                 return btrfs_ioctl_ino_to_path(root, argp);
5272         case BTRFS_IOC_LOGICAL_INO:
5273                 return btrfs_ioctl_logical_to_ino(fs_info, argp, 1);
5274         case BTRFS_IOC_LOGICAL_INO_V2:
5275                 return btrfs_ioctl_logical_to_ino(fs_info, argp, 2);
5276         case BTRFS_IOC_SPACE_INFO:
5277                 return btrfs_ioctl_space_info(fs_info, argp);
5278         case BTRFS_IOC_SYNC: {
5279                 int ret;
5280
5281                 ret = btrfs_start_delalloc_roots(fs_info, LONG_MAX, false);
5282                 if (ret)
5283                         return ret;
5284                 ret = btrfs_sync_fs(inode->i_sb, 1);
5285                 /*
5286                  * There may be work for the cleaner kthread to do (subvolume
5287                  * deletion, delayed iputs, defrag inodes, etc), so wake it up.
5288                  */
5289                 wake_up_process(fs_info->cleaner_kthread);
5290                 return ret;
5291         }
5292         case BTRFS_IOC_START_SYNC:
5293                 return btrfs_ioctl_start_sync(root, argp);
5294         case BTRFS_IOC_WAIT_SYNC:
5295                 return btrfs_ioctl_wait_sync(fs_info, argp);
5296         case BTRFS_IOC_SCRUB:
5297                 return btrfs_ioctl_scrub(file, argp);
5298         case BTRFS_IOC_SCRUB_CANCEL:
5299                 return btrfs_ioctl_scrub_cancel(fs_info);
5300         case BTRFS_IOC_SCRUB_PROGRESS:
5301                 return btrfs_ioctl_scrub_progress(fs_info, argp);
5302         case BTRFS_IOC_BALANCE_V2:
5303                 return btrfs_ioctl_balance(file, argp);
5304         case BTRFS_IOC_BALANCE_CTL:
5305                 return btrfs_ioctl_balance_ctl(fs_info, arg);
5306         case BTRFS_IOC_BALANCE_PROGRESS:
5307                 return btrfs_ioctl_balance_progress(fs_info, argp);
5308         case BTRFS_IOC_SET_RECEIVED_SUBVOL:
5309                 return btrfs_ioctl_set_received_subvol(file, argp);
5310 #ifdef CONFIG_64BIT
5311         case BTRFS_IOC_SET_RECEIVED_SUBVOL_32:
5312                 return btrfs_ioctl_set_received_subvol_32(file, argp);
5313 #endif
5314         case BTRFS_IOC_SEND:
5315                 return _btrfs_ioctl_send(root, argp, false);
5316 #if defined(CONFIG_64BIT) && defined(CONFIG_COMPAT)
5317         case BTRFS_IOC_SEND_32:
5318                 return _btrfs_ioctl_send(root, argp, true);
5319 #endif
5320         case BTRFS_IOC_GET_DEV_STATS:
5321                 return btrfs_ioctl_get_dev_stats(fs_info, argp);
5322         case BTRFS_IOC_QUOTA_CTL:
5323                 return btrfs_ioctl_quota_ctl(file, argp);
5324         case BTRFS_IOC_QGROUP_ASSIGN:
5325                 return btrfs_ioctl_qgroup_assign(file, argp);
5326         case BTRFS_IOC_QGROUP_CREATE:
5327                 return btrfs_ioctl_qgroup_create(file, argp);
5328         case BTRFS_IOC_QGROUP_LIMIT:
5329                 return btrfs_ioctl_qgroup_limit(file, argp);
5330         case BTRFS_IOC_QUOTA_RESCAN:
5331                 return btrfs_ioctl_quota_rescan(file, argp);
5332         case BTRFS_IOC_QUOTA_RESCAN_STATUS:
5333                 return btrfs_ioctl_quota_rescan_status(fs_info, argp);
5334         case BTRFS_IOC_QUOTA_RESCAN_WAIT:
5335                 return btrfs_ioctl_quota_rescan_wait(fs_info);
5336         case BTRFS_IOC_DEV_REPLACE:
5337                 return btrfs_ioctl_dev_replace(fs_info, argp);
5338         case BTRFS_IOC_GET_SUPPORTED_FEATURES:
5339                 return btrfs_ioctl_get_supported_features(argp);
5340         case BTRFS_IOC_GET_FEATURES:
5341                 return btrfs_ioctl_get_features(fs_info, argp);
5342         case BTRFS_IOC_SET_FEATURES:
5343                 return btrfs_ioctl_set_features(file, argp);
5344         case BTRFS_IOC_GET_SUBVOL_INFO:
5345                 return btrfs_ioctl_get_subvol_info(inode, argp);
5346         case BTRFS_IOC_GET_SUBVOL_ROOTREF:
5347                 return btrfs_ioctl_get_subvol_rootref(root, argp);
5348         case BTRFS_IOC_INO_LOOKUP_USER:
5349                 return btrfs_ioctl_ino_lookup_user(file, argp);
5350         case FS_IOC_ENABLE_VERITY:
5351                 return fsverity_ioctl_enable(file, (const void __user *)argp);
5352         case FS_IOC_MEASURE_VERITY:
5353                 return fsverity_ioctl_measure(file, argp);
5354         case FS_IOC_READ_VERITY_METADATA:
5355                 return fsverity_ioctl_read_metadata(file, argp);
5356         case BTRFS_IOC_ENCODED_READ:
5357                 return btrfs_ioctl_encoded_read(file, argp, false);
5358         case BTRFS_IOC_ENCODED_WRITE:
5359                 return btrfs_ioctl_encoded_write(file, argp, false);
5360 #if defined(CONFIG_64BIT) && defined(CONFIG_COMPAT)
5361         case BTRFS_IOC_ENCODED_READ_32:
5362                 return btrfs_ioctl_encoded_read(file, argp, true);
5363         case BTRFS_IOC_ENCODED_WRITE_32:
5364                 return btrfs_ioctl_encoded_write(file, argp, true);
5365 #endif
5366         case BTRFS_IOC_SUBVOL_SYNC_WAIT:
5367                 return btrfs_ioctl_subvol_sync(fs_info, argp);
5368         }
5369
5370         return -ENOTTY;
5371 }
5372
#ifdef CONFIG_COMPAT
/*
 * Compat ioctl entry point: translate the command number where needed, turn
 * the argument into a native pointer with compat_ptr() and hand everything to
 * btrfs_ioctl().
 *
 * Return: the result of btrfs_ioctl().
 */
long btrfs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
{
        /*
         * The translated command accesses 32-bit values anyway, so mapping
         * the command number is all the handling that is necessary.
         */
        if (cmd == FS_IOC32_GETVERSION)
                cmd = FS_IOC_GETVERSION;

        return btrfs_ioctl(file, cmd, (unsigned long) compat_ptr(arg));
}
#endif