1 // SPDX-License-Identifier: GPL-2.0
2 /*
3 * Copyright (C) 2007 Oracle. All rights reserved.
4 */
5
6 #include <linux/kernel.h>
7 #include <linux/bio.h>
8 #include <linux/file.h>
9 #include <linux/fs.h>
10 #include <linux/fsnotify.h>
11 #include <linux/pagemap.h>
12 #include <linux/highmem.h>
13 #include <linux/time.h>
14 #include <linux/string.h>
15 #include <linux/backing-dev.h>
16 #include <linux/mount.h>
17 #include <linux/namei.h>
18 #include <linux/writeback.h>
19 #include <linux/compat.h>
20 #include <linux/security.h>
21 #include <linux/xattr.h>
22 #include <linux/mm.h>
23 #include <linux/slab.h>
24 #include <linux/blkdev.h>
25 #include <linux/uuid.h>
26 #include <linux/btrfs.h>
27 #include <linux/uaccess.h>
28 #include <linux/iversion.h>
29 #include <linux/fileattr.h>
30 #include <linux/fsverity.h>
31 #include "ctree.h"
32 #include "disk-io.h"
33 #include "export.h"
34 #include "transaction.h"
35 #include "btrfs_inode.h"
36 #include "print-tree.h"
37 #include "volumes.h"
38 #include "locking.h"
39 #include "backref.h"
40 #include "rcu-string.h"
41 #include "send.h"
42 #include "dev-replace.h"
43 #include "props.h"
44 #include "sysfs.h"
45 #include "qgroup.h"
46 #include "tree-log.h"
47 #include "compression.h"
48 #include "space-info.h"
49 #include "delalloc-space.h"
50 #include "block-group.h"
51 #include "subpage.h"
52
53 #ifdef CONFIG_64BIT
54 /* If we have a 32-bit userspace and 64-bit kernel, then the UAPI
55 * structures are incorrect, as the timespec structure from userspace
56 * is 4 bytes too small. We define these alternatives here to teach
57 * the kernel about the 32-bit struct packing.
58 */
59 struct btrfs_ioctl_timespec_32 {
60 __u64 sec;
61 __u32 nsec;
62 } __attribute__ ((__packed__));
63
64 struct btrfs_ioctl_received_subvol_args_32 {
65 char uuid[BTRFS_UUID_SIZE]; /* in */
66 __u64 stransid; /* in */
67 __u64 rtransid; /* out */
68 struct btrfs_ioctl_timespec_32 stime; /* in */
69 struct btrfs_ioctl_timespec_32 rtime; /* out */
70 __u64 flags; /* in */
71 __u64 reserved[16]; /* in */
72 } __attribute__ ((__packed__));
73
74 #define BTRFS_IOC_SET_RECEIVED_SUBVOL_32 _IOWR(BTRFS_IOCTL_MAGIC, 37, \
75 struct btrfs_ioctl_received_subvol_args_32)
76 #endif
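/*
 * Illustrative note (not part of the original source): the packed 32-bit
 * layout above is why a separate ioctl number is needed at all. The ioctl
 * number encodes sizeof() of the argument struct, and the two timespec
 * layouts differ in size on a 64-bit kernel:
 *
 *	struct btrfs_ioctl_timespec_32 { __u64 sec; __u32 nsec; } __packed;
 *		// sizeof() == 12, no tail padding
 *	struct btrfs_ioctl_timespec { __u64 sec; __u32 nsec; };
 *		// sizeof() == 16, 4 bytes of tail padding for __u64 alignment
 */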
77
78 #if defined(CONFIG_64BIT) && defined(CONFIG_COMPAT)
79 struct btrfs_ioctl_send_args_32 {
80 __s64 send_fd; /* in */
81 __u64 clone_sources_count; /* in */
82 compat_uptr_t clone_sources; /* in */
83 __u64 parent_root; /* in */
84 __u64 flags; /* in */
85 __u32 version; /* in */
86 __u8 reserved[28]; /* in */
87 } __attribute__ ((__packed__));
88
89 #define BTRFS_IOC_SEND_32 _IOW(BTRFS_IOCTL_MAGIC, 38, \
90 struct btrfs_ioctl_send_args_32)
91 #endif
92
93 /* Mask out flags that are inappropriate for the given type of inode. */
94 static unsigned int btrfs_mask_fsflags_for_type(struct inode *inode,
95 unsigned int flags)
96 {
97 if (S_ISDIR(inode->i_mode))
98 return flags;
99 else if (S_ISREG(inode->i_mode))
100 return flags & ~FS_DIRSYNC_FL;
101 else
102 return flags & (FS_NODUMP_FL | FS_NOATIME_FL);
103 }
104
105 /*
106 * Export internal inode flags to the format expected by the FS_IOC_GETFLAGS
107 * ioctl.
108 */
109 static unsigned int btrfs_inode_flags_to_fsflags(struct btrfs_inode *binode)
110 {
111 unsigned int iflags = 0;
112 u32 flags = binode->flags;
113 u32 ro_flags = binode->ro_flags;
114
115 if (flags & BTRFS_INODE_SYNC)
116 iflags |= FS_SYNC_FL;
117 if (flags & BTRFS_INODE_IMMUTABLE)
118 iflags |= FS_IMMUTABLE_FL;
119 if (flags & BTRFS_INODE_APPEND)
120 iflags |= FS_APPEND_FL;
121 if (flags & BTRFS_INODE_NODUMP)
122 iflags |= FS_NODUMP_FL;
123 if (flags & BTRFS_INODE_NOATIME)
124 iflags |= FS_NOATIME_FL;
125 if (flags & BTRFS_INODE_DIRSYNC)
126 iflags |= FS_DIRSYNC_FL;
127 if (flags & BTRFS_INODE_NODATACOW)
128 iflags |= FS_NOCOW_FL;
129 if (ro_flags & BTRFS_INODE_RO_VERITY)
130 iflags |= FS_VERITY_FL;
131
132 if (flags & BTRFS_INODE_NOCOMPRESS)
133 iflags |= FS_NOCOMP_FL;
134 else if (flags & BTRFS_INODE_COMPRESS)
135 iflags |= FS_COMPR_FL;
136
137 return iflags;
138 }
139
140 /*
141 * Update inode->i_flags based on the btrfs internal flags.
142 */
143 void btrfs_sync_inode_flags_to_i_flags(struct inode *inode)
144 {
145 struct btrfs_inode *binode = BTRFS_I(inode);
146 unsigned int new_fl = 0;
147
148 if (binode->flags & BTRFS_INODE_SYNC)
149 new_fl |= S_SYNC;
150 if (binode->flags & BTRFS_INODE_IMMUTABLE)
151 new_fl |= S_IMMUTABLE;
152 if (binode->flags & BTRFS_INODE_APPEND)
153 new_fl |= S_APPEND;
154 if (binode->flags & BTRFS_INODE_NOATIME)
155 new_fl |= S_NOATIME;
156 if (binode->flags & BTRFS_INODE_DIRSYNC)
157 new_fl |= S_DIRSYNC;
158 if (binode->ro_flags & BTRFS_INODE_RO_VERITY)
159 new_fl |= S_VERITY;
160
161 set_mask_bits(&inode->i_flags,
162 S_SYNC | S_APPEND | S_IMMUTABLE | S_NOATIME | S_DIRSYNC |
163 S_VERITY, new_fl);
164 }
165
166 /*
167 * Check if @flags are a supported and valid set of FS_*_FL flags and that
168 * the old and new flags are not conflicting
169 */
170 static int check_fsflags(unsigned int old_flags, unsigned int flags)
171 {
172 if (flags & ~(FS_IMMUTABLE_FL | FS_APPEND_FL |
173 FS_NOATIME_FL | FS_NODUMP_FL |
174 FS_SYNC_FL | FS_DIRSYNC_FL |
175 FS_NOCOMP_FL | FS_COMPR_FL |
176 FS_NOCOW_FL))
177 return -EOPNOTSUPP;
178
179 /* COMPR and NOCOMP are each valid alone, but cannot both be set */
180 if ((flags & FS_NOCOMP_FL) && (flags & FS_COMPR_FL))
181 return -EINVAL;
182
183 if ((flags & FS_COMPR_FL) && (flags & FS_NOCOW_FL))
184 return -EINVAL;
185
186 /* NOCOW and compression options are mutually exclusive */
187 if ((old_flags & FS_NOCOW_FL) && (flags & (FS_COMPR_FL | FS_NOCOMP_FL)))
188 return -EINVAL;
189 if ((flags & FS_NOCOW_FL) && (old_flags & (FS_COMPR_FL | FS_NOCOMP_FL)))
190 return -EINVAL;
191
192 return 0;
193 }
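/*
 * Worked examples for check_fsflags() (illustrative, not part of the
 * original source):
 *
 *	check_fsflags(0, FS_COMPR_FL)                 -> 0 (enable compression)
 *	check_fsflags(0, FS_COMPR_FL | FS_NOCOMP_FL)  -> -EINVAL (contradictory)
 *	check_fsflags(FS_NOCOW_FL, FS_COMPR_FL)       -> -EINVAL (NOCOW set,
 *	                                                 compression requested)
 *	check_fsflags(0, FS_SECRM_FL)                 -> -EOPNOTSUPP (flag not
 *	                                                 supported by btrfs)
 */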
194
195 static int check_fsflags_compatible(struct btrfs_fs_info *fs_info,
196 unsigned int flags)
197 {
198 if (btrfs_is_zoned(fs_info) && (flags & FS_NOCOW_FL))
199 return -EPERM;
200
201 return 0;
202 }
203
204 /*
205 * Set flags/xflags from the internal inode flags. The remaining items of
206 * fsxattr are zeroed.
207 */
208 int btrfs_fileattr_get(struct dentry *dentry, struct fileattr *fa)
209 {
210 struct btrfs_inode *binode = BTRFS_I(d_inode(dentry));
211
212 fileattr_fill_flags(fa, btrfs_inode_flags_to_fsflags(binode));
213 return 0;
214 }
215
216 int btrfs_fileattr_set(struct user_namespace *mnt_userns,
217 struct dentry *dentry, struct fileattr *fa)
218 {
219 struct inode *inode = d_inode(dentry);
220 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
221 struct btrfs_inode *binode = BTRFS_I(inode);
222 struct btrfs_root *root = binode->root;
223 struct btrfs_trans_handle *trans;
224 unsigned int fsflags, old_fsflags;
225 int ret;
226 const char *comp = NULL;
227 u32 binode_flags;
228
229 if (btrfs_root_readonly(root))
230 return -EROFS;
231
232 if (fileattr_has_fsx(fa))
233 return -EOPNOTSUPP;
234
235 fsflags = btrfs_mask_fsflags_for_type(inode, fa->flags);
236 old_fsflags = btrfs_inode_flags_to_fsflags(binode);
237 ret = check_fsflags(old_fsflags, fsflags);
238 if (ret)
239 return ret;
240
241 ret = check_fsflags_compatible(fs_info, fsflags);
242 if (ret)
243 return ret;
244
245 binode_flags = binode->flags;
246 if (fsflags & FS_SYNC_FL)
247 binode_flags |= BTRFS_INODE_SYNC;
248 else
249 binode_flags &= ~BTRFS_INODE_SYNC;
250 if (fsflags & FS_IMMUTABLE_FL)
251 binode_flags |= BTRFS_INODE_IMMUTABLE;
252 else
253 binode_flags &= ~BTRFS_INODE_IMMUTABLE;
254 if (fsflags & FS_APPEND_FL)
255 binode_flags |= BTRFS_INODE_APPEND;
256 else
257 binode_flags &= ~BTRFS_INODE_APPEND;
258 if (fsflags & FS_NODUMP_FL)
259 binode_flags |= BTRFS_INODE_NODUMP;
260 else
261 binode_flags &= ~BTRFS_INODE_NODUMP;
262 if (fsflags & FS_NOATIME_FL)
263 binode_flags |= BTRFS_INODE_NOATIME;
264 else
265 binode_flags &= ~BTRFS_INODE_NOATIME;
266
267 /* If coming from FS_IOC_FSSETXATTR then skip unconverted flags */
268 if (!fa->flags_valid) {
269 /* 1 item for the inode */
270 trans = btrfs_start_transaction(root, 1);
271 if (IS_ERR(trans))
272 return PTR_ERR(trans);
273 goto update_flags;
274 }
275
276 if (fsflags & FS_DIRSYNC_FL)
277 binode_flags |= BTRFS_INODE_DIRSYNC;
278 else
279 binode_flags &= ~BTRFS_INODE_DIRSYNC;
280 if (fsflags & FS_NOCOW_FL) {
281 if (S_ISREG(inode->i_mode)) {
282 /*
283 * It's safe to turn csums off here, no extents exist.
284 * Otherwise we want the flag to reflect the real COW
285 * status of the file and will not set it.
286 */
287 if (inode->i_size == 0)
288 binode_flags |= BTRFS_INODE_NODATACOW |
289 BTRFS_INODE_NODATASUM;
290 } else {
291 binode_flags |= BTRFS_INODE_NODATACOW;
292 }
293 } else {
294 /*
295 * Revert under the same assumptions as above
296 */
297 if (S_ISREG(inode->i_mode)) {
298 if (inode->i_size == 0)
299 binode_flags &= ~(BTRFS_INODE_NODATACOW |
300 BTRFS_INODE_NODATASUM);
301 } else {
302 binode_flags &= ~BTRFS_INODE_NODATACOW;
303 }
304 }
305
306 /*
307 * The COMPRESS flag can only be changed by users, while the NOCOMPRESS
308 * flag may be changed automatically if the compression code won't make
309 * things smaller.
310 */
311 if (fsflags & FS_NOCOMP_FL) {
312 binode_flags &= ~BTRFS_INODE_COMPRESS;
313 binode_flags |= BTRFS_INODE_NOCOMPRESS;
314 } else if (fsflags & FS_COMPR_FL) {
315
316 if (IS_SWAPFILE(inode))
317 return -ETXTBSY;
318
319 binode_flags |= BTRFS_INODE_COMPRESS;
320 binode_flags &= ~BTRFS_INODE_NOCOMPRESS;
321
322 comp = btrfs_compress_type2str(fs_info->compress_type);
323 if (!comp || comp[0] == 0)
324 comp = btrfs_compress_type2str(BTRFS_COMPRESS_ZLIB);
325 } else {
326 binode_flags &= ~(BTRFS_INODE_COMPRESS | BTRFS_INODE_NOCOMPRESS);
327 }
328
329 /*
330 * 1 for inode item
331 * 2 for properties
332 */
333 trans = btrfs_start_transaction(root, 3);
334 if (IS_ERR(trans))
335 return PTR_ERR(trans);
336
337 if (comp) {
338 ret = btrfs_set_prop(trans, inode, "btrfs.compression", comp,
339 strlen(comp), 0);
340 if (ret) {
341 btrfs_abort_transaction(trans, ret);
342 goto out_end_trans;
343 }
344 } else {
345 ret = btrfs_set_prop(trans, inode, "btrfs.compression", NULL,
346 0, 0);
347 if (ret && ret != -ENODATA) {
348 btrfs_abort_transaction(trans, ret);
349 goto out_end_trans;
350 }
351 }
352
353 update_flags:
354 binode->flags = binode_flags;
355 btrfs_sync_inode_flags_to_i_flags(inode);
356 inode_inc_iversion(inode);
357 inode->i_ctime = current_time(inode);
358 ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
359
360 out_end_trans:
361 btrfs_end_transaction(trans);
362 return ret;
363 }
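/*
 * Userspace sketch (illustrative only, not kernel code): the flags handled
 * above are the ones chattr(1) toggles via FS_IOC_GETFLAGS/FS_IOC_SETFLAGS,
 * which reach btrfs_fileattr_get()/btrfs_fileattr_set() through the VFS
 * fileattr interface. Error handling omitted for brevity.
 *
 *	#include <sys/ioctl.h>
 *	#include <linux/fs.h>
 *
 *	static int set_nocow(int fd)
 *	{
 *		int flags;
 *
 *		if (ioctl(fd, FS_IOC_GETFLAGS, &flags) < 0)
 *			return -1;
 *		flags |= FS_NOCOW_FL;	// only fully honored on empty regular files
 *		return ioctl(fd, FS_IOC_SETFLAGS, &flags);
 *	}
 */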
364
365 /*
366 * Start exclusive operation @type, return true on success
367 */
368 bool btrfs_exclop_start(struct btrfs_fs_info *fs_info,
369 enum btrfs_exclusive_operation type)
370 {
371 bool ret = false;
372
373 spin_lock(&fs_info->super_lock);
374 if (fs_info->exclusive_operation == BTRFS_EXCLOP_NONE) {
375 fs_info->exclusive_operation = type;
376 ret = true;
377 }
378 spin_unlock(&fs_info->super_lock);
379
380 return ret;
381 }
382
383 /*
384 * Conditionally allow entering the exclusive operation if it's compatible
385 * with the running one. This must be paired with btrfs_exclop_start_unlock and
386 * btrfs_exclop_finish.
387 *
388 * Compatibility:
389 * - the same type is already running
390 * - when trying to add a device and balance has been paused
391 * - not BTRFS_EXCLOP_NONE - this is intentionally incompatible and the caller
392 * must check the condition first that would allow none -> @type
393 */
394 bool btrfs_exclop_start_try_lock(struct btrfs_fs_info *fs_info,
395 enum btrfs_exclusive_operation type)
396 {
397 spin_lock(&fs_info->super_lock);
398 if (fs_info->exclusive_operation == type ||
399 (fs_info->exclusive_operation == BTRFS_EXCLOP_BALANCE_PAUSED &&
400 type == BTRFS_EXCLOP_DEV_ADD))
401 return true;
402
403 spin_unlock(&fs_info->super_lock);
404 return false;
405 }
406
407 void btrfs_exclop_start_unlock(struct btrfs_fs_info *fs_info)
408 {
409 spin_unlock(&fs_info->super_lock);
410 }
411
412 void btrfs_exclop_finish(struct btrfs_fs_info *fs_info)
413 {
414 spin_lock(&fs_info->super_lock);
415 WRITE_ONCE(fs_info->exclusive_operation, BTRFS_EXCLOP_NONE);
416 spin_unlock(&fs_info->super_lock);
417 sysfs_notify(&fs_info->fs_devices->fsid_kobj, NULL, "exclusive_operation");
418 }
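/*
 * Typical usage of the exclop helpers above (illustrative sketch, not part
 * of the original source), as seen in the device and balance ioctls;
 * do_the_exclusive_work() is a hypothetical placeholder:
 *
 *	if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_DEV_REMOVE))
 *		return BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS;
 *	ret = do_the_exclusive_work();
 *	btrfs_exclop_finish(fs_info);
 */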
419
420 void btrfs_exclop_balance(struct btrfs_fs_info *fs_info,
421 enum btrfs_exclusive_operation op)
422 {
423 switch (op) {
424 case BTRFS_EXCLOP_BALANCE_PAUSED:
425 spin_lock(&fs_info->super_lock);
426 ASSERT(fs_info->exclusive_operation == BTRFS_EXCLOP_BALANCE ||
427 fs_info->exclusive_operation == BTRFS_EXCLOP_DEV_ADD);
428 fs_info->exclusive_operation = BTRFS_EXCLOP_BALANCE_PAUSED;
429 spin_unlock(&fs_info->super_lock);
430 break;
431 case BTRFS_EXCLOP_BALANCE:
432 spin_lock(&fs_info->super_lock);
433 ASSERT(fs_info->exclusive_operation == BTRFS_EXCLOP_BALANCE_PAUSED);
434 fs_info->exclusive_operation = BTRFS_EXCLOP_BALANCE;
435 spin_unlock(&fs_info->super_lock);
436 break;
437 default:
438 btrfs_warn(fs_info,
439 "invalid exclop balance operation %d requested", op);
440 }
441 }
442
443 static int btrfs_ioctl_getversion(struct file *file, int __user *arg)
444 {
445 struct inode *inode = file_inode(file);
446
447 return put_user(inode->i_generation, arg);
448 }
449
450 static noinline int btrfs_ioctl_fitrim(struct btrfs_fs_info *fs_info,
451 void __user *arg)
452 {
453 struct btrfs_device *device;
454 struct request_queue *q;
455 struct fstrim_range range;
456 u64 minlen = ULLONG_MAX;
457 u64 num_devices = 0;
458 int ret;
459
460 if (!capable(CAP_SYS_ADMIN))
461 return -EPERM;
462
463 /*
464 * btrfs_trim_block_group() depends on space cache, which is not
465 * available in a zoned filesystem. So, disallow fitrim on a zoned
466 * filesystem for now.
467 */
468 if (btrfs_is_zoned(fs_info))
469 return -EOPNOTSUPP;
470
471 /*
472 * If the fs is mounted with nologreplay, which requires it to be
473 * mounted in RO mode as well, we cannot allow discard on free space
474 * inside block groups, because log trees refer to extents that are not
475 * pinned in a block group's free space cache (pinning the extents is
476 * precisely the first phase of replaying a log tree).
477 */
478 if (btrfs_test_opt(fs_info, NOLOGREPLAY))
479 return -EROFS;
480
481 rcu_read_lock();
482 list_for_each_entry_rcu(device, &fs_info->fs_devices->devices,
483 dev_list) {
484 if (!device->bdev)
485 continue;
486 q = bdev_get_queue(device->bdev);
487 if (blk_queue_discard(q)) {
488 num_devices++;
489 minlen = min_t(u64, q->limits.discard_granularity,
490 minlen);
491 }
492 }
493 rcu_read_unlock();
494
495 if (!num_devices)
496 return -EOPNOTSUPP;
497 if (copy_from_user(&range, arg, sizeof(range)))
498 return -EFAULT;
499
500 /*
501 * NOTE: Don't truncate the range using super->total_bytes. Bytenr of
502 * block group is in the logical address space, which can be any
503 * sectorsize aligned bytenr in the range [0, U64_MAX].
504 */
505 if (range.len < fs_info->sb->s_blocksize)
506 return -EINVAL;
507
508 range.minlen = max(range.minlen, minlen);
509 ret = btrfs_trim_fs(fs_info, &range);
510 if (ret < 0)
511 return ret;
512
513 if (copy_to_user(arg, &range, sizeof(range)))
514 return -EFAULT;
515
516 return 0;
517 }
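/*
 * Userspace sketch (illustrative only, not kernel code): FITRIM is the
 * generic VFS trim ioctl that lands in btrfs_ioctl_fitrim() above. On
 * success, range.len is updated with the number of bytes trimmed. Error
 * handling mostly omitted for brevity.
 *
 *	#include <fcntl.h>
 *	#include <stdio.h>
 *	#include <limits.h>
 *	#include <sys/ioctl.h>
 *	#include <linux/fs.h>	// FITRIM, struct fstrim_range
 *
 *	int main(void)
 *	{
 *		struct fstrim_range range = {
 *			.start = 0,
 *			.len = ULLONG_MAX,	// whole filesystem
 *			.minlen = 0,		// clamped to discard granularity
 *		};
 *		int fd = open("/mnt/btrfs", O_RDONLY);
 *
 *		if (fd >= 0 && ioctl(fd, FITRIM, &range) == 0)
 *			printf("trimmed %llu bytes\n",
 *			       (unsigned long long)range.len);
 *		return 0;
 *	}
 */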
518
519 int __pure btrfs_is_empty_uuid(u8 *uuid)
520 {
521 int i;
522
523 for (i = 0; i < BTRFS_UUID_SIZE; i++) {
524 if (uuid[i])
525 return 0;
526 }
527 return 1;
528 }
529
530 static noinline int create_subvol(struct user_namespace *mnt_userns,
531 struct inode *dir, struct dentry *dentry,
532 const char *name, int namelen,
533 struct btrfs_qgroup_inherit *inherit)
534 {
535 struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb);
536 struct btrfs_trans_handle *trans;
537 struct btrfs_key key;
538 struct btrfs_root_item *root_item;
539 struct btrfs_inode_item *inode_item;
540 struct extent_buffer *leaf;
541 struct btrfs_root *root = BTRFS_I(dir)->root;
542 struct btrfs_root *new_root;
543 struct btrfs_block_rsv block_rsv;
544 struct timespec64 cur_time = current_time(dir);
545 struct inode *inode;
546 int ret;
547 dev_t anon_dev = 0;
548 u64 objectid;
549 u64 index = 0;
550
551 root_item = kzalloc(sizeof(*root_item), GFP_KERNEL);
552 if (!root_item)
553 return -ENOMEM;
554
555 ret = btrfs_get_free_objectid(fs_info->tree_root, &objectid);
556 if (ret)
557 goto fail_free;
558
559 ret = get_anon_bdev(&anon_dev);
560 if (ret < 0)
561 goto fail_free;
562
563 /*
564 * Don't create a subvolume whose level is not zero, or qgroup will be
565 * screwed up since it assumes a subvolume qgroup's level to be 0.
566 */
567 if (btrfs_qgroup_level(objectid)) {
568 ret = -ENOSPC;
569 goto fail_free;
570 }
571
572 btrfs_init_block_rsv(&block_rsv, BTRFS_BLOCK_RSV_TEMP);
573 /*
574 * This is the same as for snapshot creation; see the comment
575 * in create_snapshot().
576 */
577 ret = btrfs_subvolume_reserve_metadata(root, &block_rsv, 8, false);
578 if (ret)
579 goto fail_free;
580
581 trans = btrfs_start_transaction(root, 0);
582 if (IS_ERR(trans)) {
583 ret = PTR_ERR(trans);
584 btrfs_subvolume_release_metadata(root, &block_rsv);
585 goto fail_free;
586 }
587 trans->block_rsv = &block_rsv;
588 trans->bytes_reserved = block_rsv.size;
589
590 ret = btrfs_qgroup_inherit(trans, 0, objectid, inherit);
591 if (ret)
592 goto fail;
593
594 leaf = btrfs_alloc_tree_block(trans, root, 0, objectid, NULL, 0, 0, 0,
595 BTRFS_NESTING_NORMAL);
596 if (IS_ERR(leaf)) {
597 ret = PTR_ERR(leaf);
598 goto fail;
599 }
600
601 btrfs_mark_buffer_dirty(leaf);
602
603 inode_item = &root_item->inode;
604 btrfs_set_stack_inode_generation(inode_item, 1);
605 btrfs_set_stack_inode_size(inode_item, 3);
606 btrfs_set_stack_inode_nlink(inode_item, 1);
607 btrfs_set_stack_inode_nbytes(inode_item,
608 fs_info->nodesize);
609 btrfs_set_stack_inode_mode(inode_item, S_IFDIR | 0755);
610
611 btrfs_set_root_flags(root_item, 0);
612 btrfs_set_root_limit(root_item, 0);
613 btrfs_set_stack_inode_flags(inode_item, BTRFS_INODE_ROOT_ITEM_INIT);
614
615 btrfs_set_root_bytenr(root_item, leaf->start);
616 btrfs_set_root_generation(root_item, trans->transid);
617 btrfs_set_root_level(root_item, 0);
618 btrfs_set_root_refs(root_item, 1);
619 btrfs_set_root_used(root_item, leaf->len);
620 btrfs_set_root_last_snapshot(root_item, 0);
621
622 btrfs_set_root_generation_v2(root_item,
623 btrfs_root_generation(root_item));
624 generate_random_guid(root_item->uuid);
625 btrfs_set_stack_timespec_sec(&root_item->otime, cur_time.tv_sec);
626 btrfs_set_stack_timespec_nsec(&root_item->otime, cur_time.tv_nsec);
627 root_item->ctime = root_item->otime;
628 btrfs_set_root_ctransid(root_item, trans->transid);
629 btrfs_set_root_otransid(root_item, trans->transid);
630
631 btrfs_tree_unlock(leaf);
632
633 btrfs_set_root_dirid(root_item, BTRFS_FIRST_FREE_OBJECTID);
634
635 key.objectid = objectid;
636 key.offset = 0;
637 key.type = BTRFS_ROOT_ITEM_KEY;
638 ret = btrfs_insert_root(trans, fs_info->tree_root, &key,
639 root_item);
640 if (ret) {
641 /*
642 * Since we don't abort the transaction in this case, free the
643 * tree block so that we don't leak space and leave the
644 * filesystem in an inconsistent state (an extent item in the
645 * extent tree with a backreference for a root that does not
646 * exist).
647 */
648 btrfs_tree_lock(leaf);
649 btrfs_clean_tree_block(leaf);
650 btrfs_tree_unlock(leaf);
651 btrfs_free_tree_block(trans, objectid, leaf, 0, 1);
652 free_extent_buffer(leaf);
653 goto fail;
654 }
655
656 free_extent_buffer(leaf);
657 leaf = NULL;
658
659 key.offset = (u64)-1;
660 new_root = btrfs_get_new_fs_root(fs_info, objectid, anon_dev);
661 if (IS_ERR(new_root)) {
662 free_anon_bdev(anon_dev);
663 ret = PTR_ERR(new_root);
664 btrfs_abort_transaction(trans, ret);
665 goto fail;
666 }
667 /* Freeing will be done in btrfs_put_root() of new_root */
668 anon_dev = 0;
669
670 ret = btrfs_record_root_in_trans(trans, new_root);
671 if (ret) {
672 btrfs_put_root(new_root);
673 btrfs_abort_transaction(trans, ret);
674 goto fail;
675 }
676
677 ret = btrfs_create_subvol_root(trans, new_root, root, mnt_userns);
678 btrfs_put_root(new_root);
679 if (ret) {
680 /* We potentially lose an unused inode item here */
681 btrfs_abort_transaction(trans, ret);
682 goto fail;
683 }
684
685 /*
686 * insert the directory item
687 */
688 ret = btrfs_set_inode_index(BTRFS_I(dir), &index);
689 if (ret) {
690 btrfs_abort_transaction(trans, ret);
691 goto fail;
692 }
693
694 ret = btrfs_insert_dir_item(trans, name, namelen, BTRFS_I(dir), &key,
695 BTRFS_FT_DIR, index);
696 if (ret) {
697 btrfs_abort_transaction(trans, ret);
698 goto fail;
699 }
700
701 btrfs_i_size_write(BTRFS_I(dir), dir->i_size + namelen * 2);
702 ret = btrfs_update_inode(trans, root, BTRFS_I(dir));
703 if (ret) {
704 btrfs_abort_transaction(trans, ret);
705 goto fail;
706 }
707
708 ret = btrfs_add_root_ref(trans, objectid, root->root_key.objectid,
709 btrfs_ino(BTRFS_I(dir)), index, name, namelen);
710 if (ret) {
711 btrfs_abort_transaction(trans, ret);
712 goto fail;
713 }
714
715 ret = btrfs_uuid_tree_add(trans, root_item->uuid,
716 BTRFS_UUID_KEY_SUBVOL, objectid);
717 if (ret)
718 btrfs_abort_transaction(trans, ret);
719
720 fail:
721 kfree(root_item);
722 trans->block_rsv = NULL;
723 trans->bytes_reserved = 0;
724 btrfs_subvolume_release_metadata(root, &block_rsv);
725
726 if (ret)
727 btrfs_end_transaction(trans);
728 else
729 ret = btrfs_commit_transaction(trans);
730
731 if (!ret) {
732 inode = btrfs_lookup_dentry(dir, dentry);
733 if (IS_ERR(inode))
734 return PTR_ERR(inode);
735 d_instantiate(dentry, inode);
736 }
737 return ret;
738
739 fail_free:
740 if (anon_dev)
741 free_anon_bdev(anon_dev);
742 kfree(root_item);
743 return ret;
744 }
745
746 static int create_snapshot(struct btrfs_root *root, struct inode *dir,
747 struct dentry *dentry, bool readonly,
748 struct btrfs_qgroup_inherit *inherit)
749 {
750 struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb);
751 struct inode *inode;
752 struct btrfs_pending_snapshot *pending_snapshot;
753 struct btrfs_trans_handle *trans;
754 int ret;
755
756 if (!test_bit(BTRFS_ROOT_SHAREABLE, &root->state))
757 return -EINVAL;
758
759 if (atomic_read(&root->nr_swapfiles)) {
760 btrfs_warn(fs_info,
761 "cannot snapshot subvolume with active swapfile");
762 return -ETXTBSY;
763 }
764
765 pending_snapshot = kzalloc(sizeof(*pending_snapshot), GFP_KERNEL);
766 if (!pending_snapshot)
767 return -ENOMEM;
768
769 ret = get_anon_bdev(&pending_snapshot->anon_dev);
770 if (ret < 0)
771 goto free_pending;
772 pending_snapshot->root_item = kzalloc(sizeof(struct btrfs_root_item),
773 GFP_KERNEL);
774 pending_snapshot->path = btrfs_alloc_path();
775 if (!pending_snapshot->root_item || !pending_snapshot->path) {
776 ret = -ENOMEM;
777 goto free_pending;
778 }
779
780 btrfs_init_block_rsv(&pending_snapshot->block_rsv,
781 BTRFS_BLOCK_RSV_TEMP);
782 /*
783 * 1 - parent dir inode
784 * 2 - dir entries
785 * 1 - root item
786 * 2 - root ref/backref
787 * 1 - root of snapshot
788 * 1 - UUID item
789 */
790 ret = btrfs_subvolume_reserve_metadata(BTRFS_I(dir)->root,
791 &pending_snapshot->block_rsv, 8,
792 false);
793 if (ret)
794 goto free_pending;
795
796 pending_snapshot->dentry = dentry;
797 pending_snapshot->root = root;
798 pending_snapshot->readonly = readonly;
799 pending_snapshot->dir = dir;
800 pending_snapshot->inherit = inherit;
801
802 trans = btrfs_start_transaction(root, 0);
803 if (IS_ERR(trans)) {
804 ret = PTR_ERR(trans);
805 goto fail;
806 }
807
808 trans->pending_snapshot = pending_snapshot;
809
810 ret = btrfs_commit_transaction(trans);
811 if (ret)
812 goto fail;
813
814 ret = pending_snapshot->error;
815 if (ret)
816 goto fail;
817
818 ret = btrfs_orphan_cleanup(pending_snapshot->snap);
819 if (ret)
820 goto fail;
821
822 inode = btrfs_lookup_dentry(d_inode(dentry->d_parent), dentry);
823 if (IS_ERR(inode)) {
824 ret = PTR_ERR(inode);
825 goto fail;
826 }
827
828 d_instantiate(dentry, inode);
829 ret = 0;
830 pending_snapshot->anon_dev = 0;
831 fail:
832 /* Prevent double freeing of anon_dev */
833 if (ret && pending_snapshot->snap)
834 pending_snapshot->snap->anon_dev = 0;
835 btrfs_put_root(pending_snapshot->snap);
836 btrfs_subvolume_release_metadata(root, &pending_snapshot->block_rsv);
837 free_pending:
838 if (pending_snapshot->anon_dev)
839 free_anon_bdev(pending_snapshot->anon_dev);
840 kfree(pending_snapshot->root_item);
841 btrfs_free_path(pending_snapshot->path);
842 kfree(pending_snapshot);
843
844 return ret;
845 }
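/*
 * Note on the anon_dev handling above (explanatory note, not part of the
 * original source): on success, pending_snapshot->anon_dev is zeroed
 * because the new snapshot root now owns the anonymous bdev and releases
 * it in btrfs_put_root(). On failure after the snapshot root was set up,
 * both the pending snapshot and the root refer to the same dev number, so
 * the root's copy is zeroed and only the free_anon_bdev() call under
 * free_pending releases it, preventing a double free (and a use-after-free
 * of a reused dev number).
 */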
846
847 /* Copy of may_delete() in fs/namei.c.
848 * Check whether we can remove a link victim from directory dir, check
849 * whether the type of victim is right.
850 * 1. We can't do it if dir is read-only (done in permission())
851 * 2. We should have write and exec permissions on dir
852 * 3. We can't remove anything from append-only dir
853 * 4. We can't do anything with immutable dir (done in permission())
854 * 5. If the sticky bit on dir is set we should either
855 * a. be owner of dir, or
856 * b. be owner of victim, or
857 * c. have CAP_FOWNER capability
858 * 6. If the victim is append-only or immutable we can't do anything with
859 * links pointing to it.
860 * 7. If we were asked to remove a directory and victim isn't one - ENOTDIR.
861 * 8. If we were asked to remove a non-directory and victim isn't one - EISDIR.
862 * 9. We can't remove a root or mountpoint.
863 * 10. We don't allow removal of NFS sillyrenamed files; it's handled by
864 * nfs_async_unlink().
865 */
866
867 static int btrfs_may_delete(struct user_namespace *mnt_userns,
868 struct inode *dir, struct dentry *victim, int isdir)
869 {
870 int error;
871
872 if (d_really_is_negative(victim))
873 return -ENOENT;
874
875 BUG_ON(d_inode(victim->d_parent) != dir);
876 audit_inode_child(dir, victim, AUDIT_TYPE_CHILD_DELETE);
877
878 error = inode_permission(mnt_userns, dir, MAY_WRITE | MAY_EXEC);
879 if (error)
880 return error;
881 if (IS_APPEND(dir))
882 return -EPERM;
883 if (check_sticky(mnt_userns, dir, d_inode(victim)) ||
884 IS_APPEND(d_inode(victim)) || IS_IMMUTABLE(d_inode(victim)) ||
885 IS_SWAPFILE(d_inode(victim)))
886 return -EPERM;
887 if (isdir) {
888 if (!d_is_dir(victim))
889 return -ENOTDIR;
890 if (IS_ROOT(victim))
891 return -EBUSY;
892 } else if (d_is_dir(victim))
893 return -EISDIR;
894 if (IS_DEADDIR(dir))
895 return -ENOENT;
896 if (victim->d_flags & DCACHE_NFSFS_RENAMED)
897 return -EBUSY;
898 return 0;
899 }
900
901 /* Copy of may_create() in fs/namei.c. */
902 static inline int btrfs_may_create(struct user_namespace *mnt_userns,
903 struct inode *dir, struct dentry *child)
904 {
905 if (d_really_is_positive(child))
906 return -EEXIST;
907 if (IS_DEADDIR(dir))
908 return -ENOENT;
909 if (!fsuidgid_has_mapping(dir->i_sb, mnt_userns))
910 return -EOVERFLOW;
911 return inode_permission(mnt_userns, dir, MAY_WRITE | MAY_EXEC);
912 }
913
914 /*
915 * Create a new subvolume below @parent. This is largely modeled after
916 * sys_mkdirat and vfs_mkdir, but we only do a single component lookup
917 * inside this filesystem so it's quite a bit simpler.
918 */
919 static noinline int btrfs_mksubvol(const struct path *parent,
920 struct user_namespace *mnt_userns,
921 const char *name, int namelen,
922 struct btrfs_root *snap_src,
923 bool readonly,
924 struct btrfs_qgroup_inherit *inherit)
925 {
926 struct inode *dir = d_inode(parent->dentry);
927 struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb);
928 struct dentry *dentry;
929 int error;
930
931 error = down_write_killable_nested(&dir->i_rwsem, I_MUTEX_PARENT);
932 if (error == -EINTR)
933 return error;
934
935 dentry = lookup_one(mnt_userns, name, parent->dentry, namelen);
936 error = PTR_ERR(dentry);
937 if (IS_ERR(dentry))
938 goto out_unlock;
939
940 error = btrfs_may_create(mnt_userns, dir, dentry);
941 if (error)
942 goto out_dput;
943
944 /*
945 * Even if this name doesn't exist, we may get hash collisions.
946 * Check for them now when we can safely fail.
947 */
948 error = btrfs_check_dir_item_collision(BTRFS_I(dir)->root,
949 dir->i_ino, name,
950 namelen);
951 if (error)
952 goto out_dput;
953
954 down_read(&fs_info->subvol_sem);
955
956 if (btrfs_root_refs(&BTRFS_I(dir)->root->root_item) == 0)
957 goto out_up_read;
958
959 if (snap_src)
960 error = create_snapshot(snap_src, dir, dentry, readonly, inherit);
961 else
962 error = create_subvol(mnt_userns, dir, dentry, name, namelen, inherit);
963
964 if (!error)
965 fsnotify_mkdir(dir, dentry);
966 out_up_read:
967 up_read(&fs_info->subvol_sem);
968 out_dput:
969 dput(dentry);
970 out_unlock:
971 btrfs_inode_unlock(dir, 0);
972 return error;
973 }
974
975 static noinline int btrfs_mksnapshot(const struct path *parent,
976 struct user_namespace *mnt_userns,
977 const char *name, int namelen,
978 struct btrfs_root *root,
979 bool readonly,
980 struct btrfs_qgroup_inherit *inherit)
981 {
982 int ret;
983 bool snapshot_force_cow = false;
984
985 /*
986 * Force new buffered writes to reserve space even when NOCOW is
987 * possible. This is to avoid later writeback (running delalloc) falling
988 * back to COW mode and unexpectedly failing with ENOSPC.
989 */
990 btrfs_drew_read_lock(&root->snapshot_lock);
991
992 ret = btrfs_start_delalloc_snapshot(root, false);
993 if (ret)
994 goto out;
995
996 /*
997 * All previous writes have started writeback in NOCOW mode, so now
998 * we force future writes to fall back to COW mode during snapshot
999 * creation.
1000 */
1001 atomic_inc(&root->snapshot_force_cow);
1002 snapshot_force_cow = true;
1003
1004 btrfs_wait_ordered_extents(root, U64_MAX, 0, (u64)-1);
1005
1006 ret = btrfs_mksubvol(parent, mnt_userns, name, namelen,
1007 root, readonly, inherit);
1008 out:
1009 if (snapshot_force_cow)
1010 atomic_dec(&root->snapshot_force_cow);
1011 btrfs_drew_read_unlock(&root->snapshot_lock);
1012 return ret;
1013 }
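/*
 * Recap of the ordering above (explanatory note, not part of the original
 * source): holding the drew read lock forces new buffered writes to
 * reserve space as if they were COW; btrfs_start_delalloc_snapshot()
 * flushes what was already buffered; snapshot_force_cow then makes
 * subsequent writes fall back to COW during the snapshot; and
 * btrfs_wait_ordered_extents() ensures all of it reached disk before
 * btrfs_mksubvol() commits the snapshot.
 */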
1014
1015 static struct extent_map *defrag_lookup_extent(struct inode *inode, u64 start,
1016 bool locked)
1017 {
1018 struct extent_map_tree *em_tree = &BTRFS_I(inode)->extent_tree;
1019 struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
1020 struct extent_map *em;
1021 const u32 sectorsize = BTRFS_I(inode)->root->fs_info->sectorsize;
1022
1023 /*
1024 * hopefully we have this extent in the tree already, try without
1025 * the full extent lock
1026 */
1027 read_lock(&em_tree->lock);
1028 em = lookup_extent_mapping(em_tree, start, sectorsize);
1029 read_unlock(&em_tree->lock);
1030
1031 if (!em) {
1032 struct extent_state *cached = NULL;
1033 u64 end = start + sectorsize - 1;
1034
1035 /* get the big lock and read metadata off disk */
1036 if (!locked)
1037 lock_extent_bits(io_tree, start, end, &cached);
1038 em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, start, sectorsize);
1039 if (!locked)
1040 unlock_extent_cached(io_tree, start, end, &cached);
1041
1042 if (IS_ERR(em))
1043 return NULL;
1044 }
1045
1046 return em;
1047 }
1048
1049 static bool defrag_check_next_extent(struct inode *inode, struct extent_map *em,
1050 bool locked)
1051 {
1052 struct extent_map *next;
1053 bool ret = true;
1054
1055 /* this is the last extent */
1056 if (em->start + em->len >= i_size_read(inode))
1057 return false;
1058
1059 next = defrag_lookup_extent(inode, em->start + em->len, locked);
1060 if (!next || next->block_start >= EXTENT_MAP_LAST_BYTE)
1061 ret = false;
1062 else if ((em->block_start + em->block_len == next->block_start) &&
1063 (em->block_len > SZ_128K && next->block_len > SZ_128K))
1064 ret = false;
1065
1066 free_extent_map(next);
1067 return ret;
1068 }
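/*
 * Worked example for defrag_check_next_extent() (illustrative, not part of
 * the original source): two 64K extents that happen to be physically
 * contiguous are still reported as mergeable, since rewriting them can
 * produce a single larger extent. Only when the current and next extents
 * are contiguous *and* both already exceed 128K is merging considered
 * pointless, and false is returned.
 */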
1069
1070 /*
1071 * Prepare one page to be defragged.
1072 *
1073 * This will ensure:
1074 *
1075 * - Returned page is locked and has been set up properly.
1076 * - No ordered extent exists in the page.
1077 * - The page is uptodate.
1078 *
1079 * NOTE: Caller should also wait for page writeback after the cluster is
1080 * prepared, here we don't do writeback wait for each page.
1081 */
1082 static struct page *defrag_prepare_one_page(struct btrfs_inode *inode,
1083 pgoff_t index)
1084 {
1085 struct address_space *mapping = inode->vfs_inode.i_mapping;
1086 gfp_t mask = btrfs_alloc_write_mask(mapping);
1087 u64 page_start = (u64)index << PAGE_SHIFT;
1088 u64 page_end = page_start + PAGE_SIZE - 1;
1089 struct extent_state *cached_state = NULL;
1090 struct page *page;
1091 int ret;
1092
1093 again:
1094 page = find_or_create_page(mapping, index, mask);
1095 if (!page)
1096 return ERR_PTR(-ENOMEM);
1097
1098 /*
1099 * Since we can defragment files opened read-only, we can encounter
1100 * transparent huge pages here (see CONFIG_READ_ONLY_THP_FOR_FS). We
1101 * can't do I/O using huge pages yet, so return an error for now.
1102 * Filesystem transparent huge pages are typically only used for
1103 * executables that explicitly enable them, so this isn't very
1104 * restrictive.
1105 */
1106 if (PageCompound(page)) {
1107 unlock_page(page);
1108 put_page(page);
1109 return ERR_PTR(-ETXTBSY);
1110 }
1111
1112 ret = set_page_extent_mapped(page);
1113 if (ret < 0) {
1114 unlock_page(page);
1115 put_page(page);
1116 return ERR_PTR(ret);
1117 }
1118
1119 /* Wait for any existing ordered extent in the range */
1120 while (1) {
1121 struct btrfs_ordered_extent *ordered;
1122
1123 lock_extent_bits(&inode->io_tree, page_start, page_end, &cached_state);
1124 ordered = btrfs_lookup_ordered_range(inode, page_start, PAGE_SIZE);
1125 unlock_extent_cached(&inode->io_tree, page_start, page_end,
1126 &cached_state);
1127 if (!ordered)
1128 break;
1129
1130 unlock_page(page);
1131 btrfs_start_ordered_extent(ordered, 1);
1132 btrfs_put_ordered_extent(ordered);
1133 lock_page(page);
1134 /*
1135 * We unlocked the page above, so we need to check if it was
1136 * released or not.
1137 */
1138 if (page->mapping != mapping || !PagePrivate(page)) {
1139 unlock_page(page);
1140 put_page(page);
1141 goto again;
1142 }
1143 }
1144
1145 /*
1146 * Now the page range has no ordered extent any more. Read the page to
1147 * make it uptodate.
1148 */
1149 if (!PageUptodate(page)) {
1150 btrfs_readpage(NULL, page);
1151 lock_page(page);
1152 if (page->mapping != mapping || !PagePrivate(page)) {
1153 unlock_page(page);
1154 put_page(page);
1155 goto again;
1156 }
1157 if (!PageUptodate(page)) {
1158 unlock_page(page);
1159 put_page(page);
1160 return ERR_PTR(-EIO);
1161 }
1162 }
1163 return page;
1164 }
1165
1166 struct defrag_target_range {
1167 struct list_head list;
1168 u64 start;
1169 u64 len;
1170 };
1171
1172 /*
1173 * Collect all valid target extents.
1174 *
1175 * @start: file offset to lookup
1176 * @len: length to lookup
1177 * @extent_thresh: file extent size threshold, any extent size >= this value
1178 * will be ignored
1179 * @newer_than: only defrag extents newer than this value
1180 * @do_compress: whether the defrag is doing compression
1181 * if true, @extent_thresh will be ignored and all regular
1182 * file extents meeting @newer_than will be targets.
1183 * @locked: if the range has already held extent lock
1184 * @target_list: list of targets file extents
1185 */
1186 static int defrag_collect_targets(struct btrfs_inode *inode,
1187 u64 start, u64 len, u32 extent_thresh,
1188 u64 newer_than, bool do_compress,
1189 bool locked, struct list_head *target_list)
1190 {
1191 u64 cur = start;
1192 int ret = 0;
1193
1194 while (cur < start + len) {
1195 struct extent_map *em;
1196 struct defrag_target_range *new;
1197 bool next_mergeable = true;
1198 u64 range_len;
1199
1200 em = defrag_lookup_extent(&inode->vfs_inode, cur, locked);
1201 if (!em)
1202 break;
1203
1204 /* Skip hole/inline/preallocated extents */
1205 if (em->block_start >= EXTENT_MAP_LAST_BYTE ||
1206 test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
1207 goto next;
1208
1209 /* Skip older extent */
1210 if (em->generation < newer_than)
1211 goto next;
1212
1213 /*
1214 * Our start offset might be in the middle of an existing extent
1215 * map, so take that into account.
1216 */
1217 range_len = em->len - (cur - em->start);
1218 /*
1219 * If this range of the extent map is already flagged for delalloc,
1220 * skip it, because:
1221 *
1222 * 1) We could deadlock later, when trying to reserve space for
1223 * delalloc, because in case we can't immediately reserve space
1224 * the flusher can start delalloc and wait for the respective
1225 * ordered extents to complete. The deadlock would happen
1226 * because we do the space reservation while holding the range
1227 * locked, and starting writeback, or finishing an ordered
1228 * extent, requires locking the range;
1229 *
1230 * 2) If there's delalloc there, it means there's dirty pages for
1231 * which writeback has not started yet (we clean the delalloc
1232 * flag when starting writeback and after creating an ordered
1233 * extent). If we mark pages in an adjacent range for defrag,
1234 * then we will have a larger contiguous range for delalloc,
1235 * very likely resulting in a larger extent after writeback is
1236 * triggered (except in a case of free space fragmentation).
1237 */
1238 if (test_range_bit(&inode->io_tree, cur, cur + range_len - 1,
1239 EXTENT_DELALLOC, 0, NULL))
1240 goto next;
1241
1242 /*
1243 * For do_compress case, we want to compress all valid file
1244 * extents, thus no @extent_thresh or mergeable check.
1245 */
1246 if (do_compress)
1247 goto add;
1248
1249 /* Skip too large extent */
1250 if (range_len >= extent_thresh)
1251 goto next;
1252
1253 next_mergeable = defrag_check_next_extent(&inode->vfs_inode, em,
1254 locked);
1255 if (!next_mergeable) {
1256 struct defrag_target_range *last;
1257
1258 /* Empty target list, no way to merge with last entry */
1259 if (list_empty(target_list))
1260 goto next;
1261 last = list_entry(target_list->prev,
1262 struct defrag_target_range, list);
1263 /* Not mergeable with last entry */
1264 if (last->start + last->len != cur)
1265 goto next;
1266
1267 /* Mergeable, fall through to add it to @target_list. */
1268 }
1269
1270 add:
1271 range_len = min(extent_map_end(em), start + len) - cur;
1272 /*
1273 * This one is a good target, check if it can be merged into
1274 * last range of the target list.
1275 */
1276 if (!list_empty(target_list)) {
1277 struct defrag_target_range *last;
1278
1279 last = list_entry(target_list->prev,
1280 struct defrag_target_range, list);
1281 ASSERT(last->start + last->len <= cur);
1282 if (last->start + last->len == cur) {
1283 /* Mergeable, enlarge the last entry */
1284 last->len += range_len;
1285 goto next;
1286 }
1287 /* Fall through to allocate a new entry */
1288 }
1289
1290 /* Allocate new defrag_target_range */
1291 new = kmalloc(sizeof(*new), GFP_NOFS);
1292 if (!new) {
1293 free_extent_map(em);
1294 ret = -ENOMEM;
1295 break;
1296 }
1297 new->start = cur;
1298 new->len = range_len;
1299 list_add_tail(&new->list, target_list);
1300
1301 next:
1302 cur = extent_map_end(em);
1303 free_extent_map(em);
1304 }
1305 if (ret < 0) {
1306 struct defrag_target_range *entry;
1307 struct defrag_target_range *tmp;
1308
1309 list_for_each_entry_safe(entry, tmp, target_list, list) {
1310 list_del_init(&entry->list);
1311 kfree(entry);
1312 }
1313 }
1314 return ret;
1315 }
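/*
 * Example outcome for defrag_collect_targets() (illustrative, not part of
 * the original source): with extent_thresh=256K and a file mapped as
 *
 *	[ 0..64K: extent A ][ 64K..128K: extent B ][ 128K..1M: extent C ]
 *
 * where A and B are small and not physically contiguous, A is added as a
 * target, B is merged into the same entry (one range spanning 0..128K),
 * and C is skipped because its size is at or above the threshold.
 */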
1316
1317 #define CLUSTER_SIZE (SZ_256K)
1318
1319 /*
1320 * Defrag one contiguous target range.
1321 *
1322 * @inode: target inode
1323 * @target: target range to defrag
1324 * @pages: locked pages covering the defrag range
1325 * @nr_pages: number of locked pages
1326 *
1327 * Caller should ensure:
1328 *
1329 * - Pages are prepared
1330 * Pages should be locked, no ordered extent in the pages range,
1331 * no writeback.
1332 *
1333 * - Extent bits are locked
1334 */
1335 static int defrag_one_locked_target(struct btrfs_inode *inode,
1336 struct defrag_target_range *target,
1337 struct page **pages, int nr_pages,
1338 struct extent_state **cached_state)
1339 {
1340 struct btrfs_fs_info *fs_info = inode->root->fs_info;
1341 struct extent_changeset *data_reserved = NULL;
1342 const u64 start = target->start;
1343 const u64 len = target->len;
1344 unsigned long last_index = (start + len - 1) >> PAGE_SHIFT;
1345 unsigned long start_index = start >> PAGE_SHIFT;
1346 unsigned long first_index = page_index(pages[0]);
1347 int ret = 0;
1348 int i;
1349
1350 ASSERT(last_index - first_index + 1 <= nr_pages);
1351
1352 ret = btrfs_delalloc_reserve_space(inode, &data_reserved, start, len);
1353 if (ret < 0)
1354 return ret;
1355 clear_extent_bit(&inode->io_tree, start, start + len - 1,
1356 EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING |
1357 EXTENT_DEFRAG, 0, 0, cached_state);
1358 set_extent_defrag(&inode->io_tree, start, start + len - 1, cached_state);
1359
1360 /* Update the page status */
1361 for (i = start_index - first_index; i <= last_index - first_index; i++) {
1362 ClearPageChecked(pages[i]);
1363 btrfs_page_clamp_set_dirty(fs_info, pages[i], start, len);
1364 }
1365 btrfs_delalloc_release_extents(inode, len);
1366 extent_changeset_free(data_reserved);
1367
1368 return ret;
1369 }
1370
1371 static int defrag_one_range(struct btrfs_inode *inode, u64 start, u32 len,
1372 u32 extent_thresh, u64 newer_than, bool do_compress)
1373 {
1374 struct extent_state *cached_state = NULL;
1375 struct defrag_target_range *entry;
1376 struct defrag_target_range *tmp;
1377 LIST_HEAD(target_list);
1378 struct page **pages;
1379 const u32 sectorsize = inode->root->fs_info->sectorsize;
1380 u64 last_index = (start + len - 1) >> PAGE_SHIFT;
1381 u64 start_index = start >> PAGE_SHIFT;
1382 unsigned int nr_pages = last_index - start_index + 1;
1383 int ret = 0;
1384 int i;
1385
1386 ASSERT(nr_pages <= CLUSTER_SIZE / PAGE_SIZE);
1387 ASSERT(IS_ALIGNED(start, sectorsize) && IS_ALIGNED(len, sectorsize));
1388
1389 pages = kcalloc(nr_pages, sizeof(struct page *), GFP_NOFS);
1390 if (!pages)
1391 return -ENOMEM;
1392
1393 /* Prepare all pages */
1394 for (i = 0; i < nr_pages; i++) {
1395 pages[i] = defrag_prepare_one_page(inode, start_index + i);
1396 if (IS_ERR(pages[i])) {
1397 ret = PTR_ERR(pages[i]);
1398 pages[i] = NULL;
1399 goto free_pages;
1400 }
1401 }
1402 for (i = 0; i < nr_pages; i++)
1403 wait_on_page_writeback(pages[i]);
1404
1405 /* Lock the pages range */
1406 lock_extent_bits(&inode->io_tree, start_index << PAGE_SHIFT,
1407 (last_index << PAGE_SHIFT) + PAGE_SIZE - 1,
1408 &cached_state);
1409 /*
1410 * Now we have a consistent view about the extent map, re-check
1411 * which range really needs to be defragged.
1412 *
1413 * And this time we have extent locked already, pass @locked = true
1414 * so that we won't relock the extent range and cause deadlock.
1415 */
1416 ret = defrag_collect_targets(inode, start, len, extent_thresh,
1417 newer_than, do_compress, true,
1418 &target_list);
1419 if (ret < 0)
1420 goto unlock_extent;
1421
1422 list_for_each_entry(entry, &target_list, list) {
1423 ret = defrag_one_locked_target(inode, entry, pages, nr_pages,
1424 &cached_state);
1425 if (ret < 0)
1426 break;
1427 }
1428
1429 list_for_each_entry_safe(entry, tmp, &target_list, list) {
1430 list_del_init(&entry->list);
1431 kfree(entry);
1432 }
1433 unlock_extent:
1434 unlock_extent_cached(&inode->io_tree, start_index << PAGE_SHIFT,
1435 (last_index << PAGE_SHIFT) + PAGE_SIZE - 1,
1436 &cached_state);
1437 free_pages:
1438 for (i = 0; i < nr_pages; i++) {
1439 if (pages[i]) {
1440 unlock_page(pages[i]);
1441 put_page(pages[i]);
1442 }
1443 }
1444 kfree(pages);
1445 return ret;
1446 }
1447
1448 static int defrag_one_cluster(struct btrfs_inode *inode,
1449 struct file_ra_state *ra,
1450 u64 start, u32 len, u32 extent_thresh,
1451 u64 newer_than, bool do_compress,
1452 unsigned long *sectors_defragged,
1453 unsigned long max_sectors)
1454 {
1455 const u32 sectorsize = inode->root->fs_info->sectorsize;
1456 struct defrag_target_range *entry;
1457 struct defrag_target_range *tmp;
1458 LIST_HEAD(target_list);
1459 int ret;
1460
1461 BUILD_BUG_ON(!IS_ALIGNED(CLUSTER_SIZE, PAGE_SIZE));
1462 ret = defrag_collect_targets(inode, start, len, extent_thresh,
1463 newer_than, do_compress, false,
1464 &target_list);
1465 if (ret < 0)
1466 goto out;
1467
1468 list_for_each_entry(entry, &target_list, list) {
1469 u32 range_len = entry->len;
1470
1471 /* Reached or beyond the limit */
1472 if (max_sectors && *sectors_defragged >= max_sectors) {
1473 ret = 1;
1474 break;
1475 }
1476
1477 if (max_sectors)
1478 range_len = min_t(u32, range_len,
1479 (max_sectors - *sectors_defragged) * sectorsize);
1480
1481 if (ra)
1482 page_cache_sync_readahead(inode->vfs_inode.i_mapping,
1483 ra, NULL, entry->start >> PAGE_SHIFT,
1484 ((entry->start + range_len - 1) >> PAGE_SHIFT) -
1485 (entry->start >> PAGE_SHIFT) + 1);
1486 /*
1487 * Here we may end up not defragging any range if holes are punched
1488 * before we locked the pages.
1489 * But that's fine, it only affects the @sectors_defragged
1490 * accounting.
1491 */
1492 ret = defrag_one_range(inode, entry->start, range_len,
1493 extent_thresh, newer_than, do_compress);
1494 if (ret < 0)
1495 break;
1496 *sectors_defragged += range_len >>
1497 inode->root->fs_info->sectorsize_bits;
1498 }
1499 out:
1500 list_for_each_entry_safe(entry, tmp, &target_list, list) {
1501 list_del_init(&entry->list);
1502 kfree(entry);
1503 }
1504 return ret;
1505 }
1506
1507 /*
1508 * Entry point to file defragmentation.
1509 *
1510 * @inode: inode to be defragged
1511 * @ra: readahead state (can be NULL)
1512 * @range: defrag options including range and flags
1513 * @newer_than: minimum transid to defrag
1514 * @max_to_defrag: max number of sectors to be defragged, if 0, the whole inode
1515 * will be defragged.
1516 *
1517 * Return <0 for error.
1518 * Return >=0 for the number of sectors defragged, and range->start will be updated
1519 * to indicate the file offset where the next defrag should start.
1520 * (Mostly for autodefrag, which sets @max_to_defrag thus we may exit early without
1521 * defragging all the range).
1522 */
1523 int btrfs_defrag_file(struct inode *inode, struct file_ra_state *ra,
1524 struct btrfs_ioctl_defrag_range_args *range,
1525 u64 newer_than, unsigned long max_to_defrag)
1526 {
1527 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
1528 unsigned long sectors_defragged = 0;
1529 u64 isize = i_size_read(inode);
1530 u64 cur;
1531 u64 last_byte;
1532 bool do_compress = range->flags & BTRFS_DEFRAG_RANGE_COMPRESS;
1533 bool ra_allocated = false;
1534 int compress_type = BTRFS_COMPRESS_ZLIB;
1535 int ret = 0;
1536 u32 extent_thresh = range->extent_thresh;
1537 pgoff_t start_index;
1538
1539 if (isize == 0)
1540 return 0;
1541
1542 if (range->start >= isize)
1543 return -EINVAL;
1544
1545 if (do_compress) {
1546 if (range->compress_type >= BTRFS_NR_COMPRESS_TYPES)
1547 return -EINVAL;
1548 if (range->compress_type)
1549 compress_type = range->compress_type;
1550 }
1551
1552 if (extent_thresh == 0)
1553 extent_thresh = SZ_256K;
1554
1555 if (range->start + range->len > range->start) {
1556 /* Got a specific range */
1557 last_byte = min(isize, range->start + range->len);
1558 } else {
1559 /* Defrag until file end */
1560 last_byte = isize;
1561 }
1562
1563 /* Align the range */
1564 cur = round_down(range->start, fs_info->sectorsize);
1565 last_byte = round_up(last_byte, fs_info->sectorsize) - 1;
1566
1567 /*
1568 * If we were not given a ra, allocate a readahead context. As
1569 * readahead is just an optimization, defrag will work without it so
1570 * we don't error out.
1571 */
1572 if (!ra) {
1573 ra_allocated = true;
1574 ra = kzalloc(sizeof(*ra), GFP_KERNEL);
1575 if (ra)
1576 file_ra_state_init(ra, inode->i_mapping);
1577 }
1578
1579 /*
1580 * Make writeback start from the beginning of the range, so that the
1581 * defrag range can be written sequentially.
1582 */
1583 start_index = cur >> PAGE_SHIFT;
1584 if (start_index < inode->i_mapping->writeback_index)
1585 inode->i_mapping->writeback_index = start_index;
1586
1587 while (cur < last_byte) {
1588 const unsigned long prev_sectors_defragged = sectors_defragged;
1589 u64 cluster_end;
1590
1591 /* The cluster size 256K should always be page aligned */
1592 BUILD_BUG_ON(!IS_ALIGNED(CLUSTER_SIZE, PAGE_SIZE));
1593
1594 if (btrfs_defrag_cancelled(fs_info)) {
1595 ret = -EAGAIN;
1596 break;
1597 }
1598
1599 /* We want the cluster end at page boundary when possible */
1600 cluster_end = (((cur >> PAGE_SHIFT) +
1601 (SZ_256K >> PAGE_SHIFT)) << PAGE_SHIFT) - 1;
1602 cluster_end = min(cluster_end, last_byte);
1603
1604 btrfs_inode_lock(inode, 0);
1605 if (IS_SWAPFILE(inode)) {
1606 ret = -ETXTBSY;
1607 btrfs_inode_unlock(inode, 0);
1608 break;
1609 }
1610 if (!(inode->i_sb->s_flags & SB_ACTIVE)) {
1611 btrfs_inode_unlock(inode, 0);
1612 break;
1613 }
1614 if (do_compress)
1615 BTRFS_I(inode)->defrag_compress = compress_type;
1616 ret = defrag_one_cluster(BTRFS_I(inode), ra, cur,
1617 cluster_end + 1 - cur, extent_thresh,
1618 newer_than, do_compress,
1619 &sectors_defragged, max_to_defrag);
1620
1621 if (sectors_defragged > prev_sectors_defragged)
1622 balance_dirty_pages_ratelimited(inode->i_mapping);
1623
1624 btrfs_inode_unlock(inode, 0);
1625 if (ret < 0)
1626 break;
1627 cur = cluster_end + 1;
1628 if (ret > 0) {
1629 ret = 0;
1630 break;
1631 }
1632 }
1633
1634 if (ra_allocated)
1635 kfree(ra);
1636 /*
1637 * Update range.start for autodefrag, this will indicate where to start
1638 * in next run.
1639 */
1640 range->start = cur;
1641 if (sectors_defragged) {
1642 /*
1643 * We have defragged some sectors, for compression case they
1644 * need to be written back immediately.
1645 */
1646 if (range->flags & BTRFS_DEFRAG_RANGE_START_IO) {
1647 filemap_flush(inode->i_mapping);
1648 if (test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
1649 &BTRFS_I(inode)->runtime_flags))
1650 filemap_flush(inode->i_mapping);
1651 }
1652 if (range->compress_type == BTRFS_COMPRESS_LZO)
1653 btrfs_set_fs_incompat(fs_info, COMPRESS_LZO);
1654 else if (range->compress_type == BTRFS_COMPRESS_ZSTD)
1655 btrfs_set_fs_incompat(fs_info, COMPRESS_ZSTD);
1656 ret = sectors_defragged;
1657 }
1658 if (do_compress) {
1659 btrfs_inode_lock(inode, 0);
1660 BTRFS_I(inode)->defrag_compress = BTRFS_COMPRESS_NONE;
1661 btrfs_inode_unlock(inode, 0);
1662 }
1663 return ret;
1664 }
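/*
 * Userspace sketch (illustrative only, not kernel code): the defrag entry
 * point above is reached via BTRFS_IOC_DEFRAG_RANGE. Error handling
 * omitted for brevity.
 *
 *	#include <sys/ioctl.h>
 *	#include <linux/btrfs.h>
 *
 *	static int defrag_whole_file(int fd)
 *	{
 *		struct btrfs_ioctl_defrag_range_args args = {
 *			.start = 0,
 *			.len = (__u64)-1,		// to EOF
 *			.extent_thresh = 256 * 1024,	// the default above
 *			.flags = BTRFS_DEFRAG_RANGE_START_IO,
 *		};
 *
 *		return ioctl(fd, BTRFS_IOC_DEFRAG_RANGE, &args);
 *	}
 */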
1665
1666 /*
1667 * Try to start exclusive operation @type or cancel it if it's running.
1668 *
1669 * Return:
1670 * 0 - normal mode, newly claimed op started
1671 * >0 - normal mode, something else is running,
1672 * return BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS to user space
1673 * ECANCELED - cancel mode, successful cancel
1674 * ENOTCONN - cancel mode, operation not running anymore
1675 */
1676 static int exclop_start_or_cancel_reloc(struct btrfs_fs_info *fs_info,
1677 enum btrfs_exclusive_operation type, bool cancel)
1678 {
1679 if (!cancel) {
1680 /* Start normal op */
1681 if (!btrfs_exclop_start(fs_info, type))
1682 return BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS;
1683 /* Exclusive operation is now claimed */
1684 return 0;
1685 }
1686
1687 /* Cancel running op */
1688 if (btrfs_exclop_start_try_lock(fs_info, type)) {
1689 /*
1690 * This blocks any exclop finish from setting it to NONE, so we
1691 * request cancellation. Either it runs and we will wait for it,
1692 * or it has finished and no waiting will happen.
1693 */
1694 atomic_inc(&fs_info->reloc_cancel_req);
1695 btrfs_exclop_start_unlock(fs_info);
1696
1697 if (test_bit(BTRFS_FS_RELOC_RUNNING, &fs_info->flags))
1698 wait_on_bit(&fs_info->flags, BTRFS_FS_RELOC_RUNNING,
1699 TASK_INTERRUPTIBLE);
1700
1701 return -ECANCELED;
1702 }
1703
1704 /* Something else is running or none */
1705 return -ENOTCONN;
1706 }
1707
1708 static noinline int btrfs_ioctl_resize(struct file *file,
1709 void __user *arg)
1710 {
1711 BTRFS_DEV_LOOKUP_ARGS(args);
1712 struct inode *inode = file_inode(file);
1713 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
1714 u64 new_size;
1715 u64 old_size;
1716 u64 devid = 1;
1717 struct btrfs_root *root = BTRFS_I(inode)->root;
1718 struct btrfs_ioctl_vol_args *vol_args;
1719 struct btrfs_trans_handle *trans;
1720 struct btrfs_device *device = NULL;
1721 char *sizestr;
1722 char *retptr;
1723 char *devstr = NULL;
1724 int ret = 0;
1725 int mod = 0;
1726 bool cancel;
1727
1728 if (!capable(CAP_SYS_ADMIN))
1729 return -EPERM;
1730
1731 ret = mnt_want_write_file(file);
1732 if (ret)
1733 return ret;
1734
1735 /*
1736 * Read the arguments before checking exclusivity to be able to
1737 * distinguish a regular resize from a cancel request
1738 */
1739 vol_args = memdup_user(arg, sizeof(*vol_args));
1740 if (IS_ERR(vol_args)) {
1741 ret = PTR_ERR(vol_args);
1742 goto out_drop;
1743 }
1744 vol_args->name[BTRFS_PATH_NAME_MAX] = '\0';
1745 sizestr = vol_args->name;
1746 cancel = (strcmp("cancel", sizestr) == 0);
1747 ret = exclop_start_or_cancel_reloc(fs_info, BTRFS_EXCLOP_RESIZE, cancel);
1748 if (ret)
1749 goto out_free;
1750 /* Exclusive operation is now claimed */
1751
1752 devstr = strchr(sizestr, ':');
1753 if (devstr) {
1754 sizestr = devstr + 1;
1755 *devstr = '\0';
1756 devstr = vol_args->name;
1757 ret = kstrtoull(devstr, 10, &devid);
1758 if (ret)
1759 goto out_finish;
1760 if (!devid) {
1761 ret = -EINVAL;
1762 goto out_finish;
1763 }
1764 btrfs_info(fs_info, "resizing devid %llu", devid);
1765 }
1766
1767 args.devid = devid;
1768 device = btrfs_find_device(fs_info->fs_devices, &args);
1769 if (!device) {
1770 btrfs_info(fs_info, "resizer unable to find device %llu",
1771 devid);
1772 ret = -ENODEV;
1773 goto out_finish;
1774 }
1775
1776 if (!test_bit(BTRFS_DEV_STATE_WRITEABLE, &device->dev_state)) {
1777 btrfs_info(fs_info,
1778 "resizer unable to apply on readonly device %llu",
1779 devid);
1780 ret = -EPERM;
1781 goto out_finish;
1782 }
1783
1784 if (!strcmp(sizestr, "max"))
1785 new_size = bdev_nr_bytes(device->bdev);
1786 else {
1787 if (sizestr[0] == '-') {
1788 mod = -1;
1789 sizestr++;
1790 } else if (sizestr[0] == '+') {
1791 mod = 1;
1792 sizestr++;
1793 }
1794 new_size = memparse(sizestr, &retptr);
1795 if (*retptr != '\0' || new_size == 0) {
1796 ret = -EINVAL;
1797 goto out_finish;
1798 }
1799 }
1800
1801 if (test_bit(BTRFS_DEV_STATE_REPLACE_TGT, &device->dev_state)) {
1802 ret = -EPERM;
1803 goto out_finish;
1804 }
1805
1806 old_size = btrfs_device_get_total_bytes(device);
1807
1808 if (mod < 0) {
1809 if (new_size > old_size) {
1810 ret = -EINVAL;
1811 goto out_finish;
1812 }
1813 new_size = old_size - new_size;
1814 } else if (mod > 0) {
1815 if (new_size > ULLONG_MAX - old_size) {
1816 ret = -ERANGE;
1817 goto out_finish;
1818 }
1819 new_size = old_size + new_size;
1820 }
1821
1822 if (new_size < SZ_256M) {
1823 ret = -EINVAL;
1824 goto out_finish;
1825 }
1826 if (new_size > bdev_nr_bytes(device->bdev)) {
1827 ret = -EFBIG;
1828 goto out_finish;
1829 }
1830
1831 new_size = round_down(new_size, fs_info->sectorsize);
1832
1833 if (new_size > old_size) {
1834 trans = btrfs_start_transaction(root, 0);
1835 if (IS_ERR(trans)) {
1836 ret = PTR_ERR(trans);
1837 goto out_finish;
1838 }
1839 ret = btrfs_grow_device(trans, device, new_size);
1840 btrfs_commit_transaction(trans);
1841 } else if (new_size < old_size) {
1842 ret = btrfs_shrink_device(device, new_size);
1843 } /* equal, nothing to do */
1844
1845 if (ret == 0 && new_size != old_size)
1846 btrfs_info_in_rcu(fs_info,
1847 "resize device %s (devid %llu) from %llu to %llu",
1848 rcu_str_deref(device->name), device->devid,
1849 old_size, new_size);
1850 out_finish:
1851 btrfs_exclop_finish(fs_info);
1852 out_free:
1853 kfree(vol_args);
1854 out_drop:
1855 mnt_drop_write_file(file);
1856 return ret;
1857 }
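/*
 * Userspace sketch (illustrative only, not kernel code): the size string
 * parsed above is what "btrfs filesystem resize" passes in, e.g. "max",
 * "-2g", or, with an explicit device, "1:+10g". Error handling omitted.
 *
 *	#include <string.h>
 *	#include <sys/ioctl.h>
 *	#include <linux/btrfs.h>
 *
 *	static int grow_devid1_by_10g(int mnt_fd)
 *	{
 *		struct btrfs_ioctl_vol_args args = { 0 };
 *
 *		strcpy(args.name, "1:+10g");	// devid 1, grow by 10 GiB
 *		return ioctl(mnt_fd, BTRFS_IOC_RESIZE, &args);
 *	}
 */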
1858
1859 static noinline int __btrfs_ioctl_snap_create(struct file *file,
1860 struct user_namespace *mnt_userns,
1861 const char *name, unsigned long fd, int subvol,
1862 bool readonly,
1863 struct btrfs_qgroup_inherit *inherit)
1864 {
1865 int namelen;
1866 int ret = 0;
1867
1868 if (!S_ISDIR(file_inode(file)->i_mode))
1869 return -ENOTDIR;
1870
1871 ret = mnt_want_write_file(file);
1872 if (ret)
1873 goto out;
1874
1875 namelen = strlen(name);
1876 if (strchr(name, '/')) {
1877 ret = -EINVAL;
1878 goto out_drop_write;
1879 }
1880
1881 if (name[0] == '.' &&
1882 (namelen == 1 || (name[1] == '.' && namelen == 2))) {
1883 ret = -EEXIST;
1884 goto out_drop_write;
1885 }
1886
1887 if (subvol) {
1888 ret = btrfs_mksubvol(&file->f_path, mnt_userns, name,
1889 namelen, NULL, readonly, inherit);
1890 } else {
1891 struct fd src = fdget(fd);
1892 struct inode *src_inode;
1893 if (!src.file) {
1894 ret = -EINVAL;
1895 goto out_drop_write;
1896 }
1897
1898 src_inode = file_inode(src.file);
1899 if (src_inode->i_sb != file_inode(file)->i_sb) {
1900 btrfs_info(BTRFS_I(file_inode(file))->root->fs_info,
1901 "Snapshot src from another FS");
1902 ret = -EXDEV;
1903 } else if (!inode_owner_or_capable(mnt_userns, src_inode)) {
1904 /*
1905 * Subvolume creation is not restricted, but snapshots
1906 * are limited to the caller's own subvolumes
1907 */
1908 ret = -EPERM;
1909 } else {
1910 ret = btrfs_mksnapshot(&file->f_path, mnt_userns,
1911 name, namelen,
1912 BTRFS_I(src_inode)->root,
1913 readonly, inherit);
1914 }
1915 fdput(src);
1916 }
1917 out_drop_write:
1918 mnt_drop_write_file(file);
1919 out:
1920 return ret;
1921 }
1922
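/*
 * Entry point for the v1 creation ioctls (BTRFS_IOC_SNAP_CREATE and
 * BTRFS_IOC_SUBVOL_CREATE): only a name and a source fd are passed in,
 * so the result is always writable and no qgroup inheritance is set up.
 */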
1923 static noinline int btrfs_ioctl_snap_create(struct file *file,
1924 void __user *arg, int subvol)
1925 {
1926 struct btrfs_ioctl_vol_args *vol_args;
1927 int ret;
1928
1929 if (!S_ISDIR(file_inode(file)->i_mode))
1930 return -ENOTDIR;
1931
1932 vol_args = memdup_user(arg, sizeof(*vol_args));
1933 if (IS_ERR(vol_args))
1934 return PTR_ERR(vol_args);
1935 vol_args->name[BTRFS_PATH_NAME_MAX] = '\0';
1936
1937 ret = __btrfs_ioctl_snap_create(file, file_mnt_user_ns(file),
1938 vol_args->name, vol_args->fd, subvol,
1939 false, NULL);
1940
1941 kfree(vol_args);
1942 return ret;
1943 }
1944
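/*
 * Entry point for the v2 creation ioctls, which add flags on top of v1:
 * BTRFS_SUBVOL_RDONLY creates the result read-only and
 * BTRFS_SUBVOL_QGROUP_INHERIT passes a variable sized qgroup inheritance
 * description whose declared size must match its counters exactly.
 *
 * A rough userspace sketch (hypothetical fds and name, error handling
 * omitted):
 *
 *	struct btrfs_ioctl_vol_args_v2 args = {
 *		.fd = src_subvol_fd,
 *		.flags = BTRFS_SUBVOL_RDONLY,
 *	};
 *
 *	strcpy(args.name, "snap-of-src");
 *	ioctl(dest_dir_fd, BTRFS_IOC_SNAP_CREATE_V2, &args);
 */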
1945 static noinline int btrfs_ioctl_snap_create_v2(struct file *file,
1946 void __user *arg, int subvol)
1947 {
1948 struct btrfs_ioctl_vol_args_v2 *vol_args;
1949 int ret;
1950 bool readonly = false;
1951 struct btrfs_qgroup_inherit *inherit = NULL;
1952
1953 if (!S_ISDIR(file_inode(file)->i_mode))
1954 return -ENOTDIR;
1955
1956 vol_args = memdup_user(arg, sizeof(*vol_args));
1957 if (IS_ERR(vol_args))
1958 return PTR_ERR(vol_args);
1959 vol_args->name[BTRFS_SUBVOL_NAME_MAX] = '\0';
1960
1961 if (vol_args->flags & ~BTRFS_SUBVOL_CREATE_ARGS_MASK) {
1962 ret = -EOPNOTSUPP;
1963 goto free_args;
1964 }
1965
1966 if (vol_args->flags & BTRFS_SUBVOL_RDONLY)
1967 readonly = true;
1968 if (vol_args->flags & BTRFS_SUBVOL_QGROUP_INHERIT) {
1969 u64 nums;
1970
1971 if (vol_args->size < sizeof(*inherit) ||
1972 vol_args->size > PAGE_SIZE) {
1973 ret = -EINVAL;
1974 goto free_args;
1975 }
1976 inherit = memdup_user(vol_args->qgroup_inherit, vol_args->size);
1977 if (IS_ERR(inherit)) {
1978 ret = PTR_ERR(inherit);
1979 goto free_args;
1980 }
1981
1982 if (inherit->num_qgroups > PAGE_SIZE ||
1983 inherit->num_ref_copies > PAGE_SIZE ||
1984 inherit->num_excl_copies > PAGE_SIZE) {
1985 ret = -EINVAL;
1986 goto free_inherit;
1987 }
1988
1989 nums = inherit->num_qgroups + 2 * inherit->num_ref_copies +
1990 2 * inherit->num_excl_copies;
1991 if (vol_args->size != struct_size(inherit, qgroups, nums)) {
1992 ret = -EINVAL;
1993 goto free_inherit;
1994 }
1995 }
1996
1997 ret = __btrfs_ioctl_snap_create(file, file_mnt_user_ns(file),
1998 vol_args->name, vol_args->fd, subvol,
1999 readonly, inherit);
2000 if (ret)
2001 goto free_inherit;
2002 free_inherit:
2003 kfree(inherit);
2004 free_args:
2005 kfree(vol_args);
2006 return ret;
2007 }
2008
2009 static noinline int btrfs_ioctl_subvol_getflags(struct file *file,
2010 void __user *arg)
2011 {
2012 struct inode *inode = file_inode(file);
2013 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
2014 struct btrfs_root *root = BTRFS_I(inode)->root;
2015 int ret = 0;
2016 u64 flags = 0;
2017
2018 if (btrfs_ino(BTRFS_I(inode)) != BTRFS_FIRST_FREE_OBJECTID)
2019 return -EINVAL;
2020
2021 down_read(&fs_info->subvol_sem);
2022 if (btrfs_root_readonly(root))
2023 flags |= BTRFS_SUBVOL_RDONLY;
2024 up_read(&fs_info->subvol_sem);
2025
2026 if (copy_to_user(arg, &flags, sizeof(flags)))
2027 ret = -EFAULT;
2028
2029 return ret;
2030 }
2031
2032 static noinline int btrfs_ioctl_subvol_setflags(struct file *file,
2033 void __user *arg)
2034 {
2035 struct inode *inode = file_inode(file);
2036 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
2037 struct btrfs_root *root = BTRFS_I(inode)->root;
2038 struct btrfs_trans_handle *trans;
2039 u64 root_flags;
2040 u64 flags;
2041 int ret = 0;
2042
2043 if (!inode_owner_or_capable(file_mnt_user_ns(file), inode))
2044 return -EPERM;
2045
2046 ret = mnt_want_write_file(file);
2047 if (ret)
2048 goto out;
2049
2050 if (btrfs_ino(BTRFS_I(inode)) != BTRFS_FIRST_FREE_OBJECTID) {
2051 ret = -EINVAL;
2052 goto out_drop_write;
2053 }
2054
2055 if (copy_from_user(&flags, arg, sizeof(flags))) {
2056 ret = -EFAULT;
2057 goto out_drop_write;
2058 }
2059
2060 if (flags & ~BTRFS_SUBVOL_RDONLY) {
2061 ret = -EOPNOTSUPP;
2062 goto out_drop_write;
2063 }
2064
2065 down_write(&fs_info->subvol_sem);
2066
2067 /* nothing to do */
2068 if (!!(flags & BTRFS_SUBVOL_RDONLY) == btrfs_root_readonly(root))
2069 goto out_drop_sem;
2070
2071 root_flags = btrfs_root_flags(&root->root_item);
2072 if (flags & BTRFS_SUBVOL_RDONLY) {
2073 btrfs_set_root_flags(&root->root_item,
2074 root_flags | BTRFS_ROOT_SUBVOL_RDONLY);
2075 } else {
2076 /*
2077 * Block RO -> RW transition if this subvolume is involved in
2078 * send
2079 */
2080 spin_lock(&root->root_item_lock);
2081 if (root->send_in_progress == 0) {
2082 btrfs_set_root_flags(&root->root_item,
2083 root_flags & ~BTRFS_ROOT_SUBVOL_RDONLY);
2084 spin_unlock(&root->root_item_lock);
2085 } else {
2086 spin_unlock(&root->root_item_lock);
2087 btrfs_warn(fs_info,
2088 "Attempt to set subvolume %llu read-write during send",
2089 root->root_key.objectid);
2090 ret = -EPERM;
2091 goto out_drop_sem;
2092 }
2093 }
2094
2095 trans = btrfs_start_transaction(root, 1);
2096 if (IS_ERR(trans)) {
2097 ret = PTR_ERR(trans);
2098 goto out_reset;
2099 }
2100
2101 ret = btrfs_update_root(trans, fs_info->tree_root,
2102 &root->root_key, &root->root_item);
2103 if (ret < 0) {
2104 btrfs_end_transaction(trans);
2105 goto out_reset;
2106 }
2107
2108 ret = btrfs_commit_transaction(trans);
2109
2110 out_reset:
2111 if (ret)
2112 btrfs_set_root_flags(&root->root_item, root_flags);
2113 out_drop_sem:
2114 up_write(&fs_info->subvol_sem);
2115 out_drop_write:
2116 mnt_drop_write_file(file);
2117 out:
2118 return ret;
2119 }
2120
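/*
 * Return 1 if @key falls inside the [min, max] range described by the
 * search key @sk, 0 otherwise. This is a full (objectid, type, offset)
 * key comparison, not an independent per-field range check.
 */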
2121 static noinline int key_in_sk(struct btrfs_key *key,
2122 struct btrfs_ioctl_search_key *sk)
2123 {
2124 struct btrfs_key test;
2125 int ret;
2126
2127 test.objectid = sk->min_objectid;
2128 test.type = sk->min_type;
2129 test.offset = sk->min_offset;
2130
2131 ret = btrfs_comp_cpu_keys(key, &test);
2132 if (ret < 0)
2133 return 0;
2134
2135 test.objectid = sk->max_objectid;
2136 test.type = sk->max_type;
2137 test.offset = sk->max_offset;
2138
2139 ret = btrfs_comp_cpu_keys(key, &test);
2140 if (ret > 0)
2141 return 0;
2142 return 1;
2143 }
2144
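/*
 * Copy the items of the current leaf that match the search key into the
 * user buffer at @ubuf + *sk_offset, each one preceded by a
 * struct btrfs_ioctl_search_header. On return @key has been advanced
 * past the last examined item so the caller can continue with the next
 * leaf.
 */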
2145 static noinline int copy_to_sk(struct btrfs_path *path,
2146 struct btrfs_key *key,
2147 struct btrfs_ioctl_search_key *sk,
2148 size_t *buf_size,
2149 char __user *ubuf,
2150 unsigned long *sk_offset,
2151 int *num_found)
2152 {
2153 u64 found_transid;
2154 struct extent_buffer *leaf;
2155 struct btrfs_ioctl_search_header sh;
2156 struct btrfs_key test;
2157 unsigned long item_off;
2158 unsigned long item_len;
2159 int nritems;
2160 int i;
2161 int slot;
2162 int ret = 0;
2163
2164 leaf = path->nodes[0];
2165 slot = path->slots[0];
2166 nritems = btrfs_header_nritems(leaf);
2167
2168 if (btrfs_header_generation(leaf) > sk->max_transid) {
2169 i = nritems;
2170 goto advance_key;
2171 }
2172 found_transid = btrfs_header_generation(leaf);
2173
2174 for (i = slot; i < nritems; i++) {
2175 item_off = btrfs_item_ptr_offset(leaf, i);
2176 item_len = btrfs_item_size(leaf, i);
2177
2178 btrfs_item_key_to_cpu(leaf, key, i);
2179 if (!key_in_sk(key, sk))
2180 continue;
2181
2182 if (sizeof(sh) + item_len > *buf_size) {
2183 if (*num_found) {
2184 ret = 1;
2185 goto out;
2186 }
2187
2188 /*
2189 * return one empty item back for v1, which does not
2190 * handle -EOVERFLOW
2191 */
2192
2193 *buf_size = sizeof(sh) + item_len;
2194 item_len = 0;
2195 ret = -EOVERFLOW;
2196 }
2197
2198 if (sizeof(sh) + item_len + *sk_offset > *buf_size) {
2199 ret = 1;
2200 goto out;
2201 }
2202
2203 sh.objectid = key->objectid;
2204 sh.offset = key->offset;
2205 sh.type = key->type;
2206 sh.len = item_len;
2207 sh.transid = found_transid;
2208
2209 /*
2210 * Copy search result header. If we fault then loop again so we
2211 			 * can fault in the pages and return -EFAULT there if there's a
2212 			 * problem. Otherwise we'll fault and then copy the buffer in
2213 			 * properly this next time through.
2214 */
2215 if (copy_to_user_nofault(ubuf + *sk_offset, &sh, sizeof(sh))) {
2216 ret = 0;
2217 goto out;
2218 }
2219
2220 *sk_offset += sizeof(sh);
2221
2222 if (item_len) {
2223 char __user *up = ubuf + *sk_offset;
2224 /*
2225 * Copy the item, same behavior as above, but reset the
2226 			 * *sk_offset so we copy the full thing again.
2227 */
2228 if (read_extent_buffer_to_user_nofault(leaf, up,
2229 item_off, item_len)) {
2230 ret = 0;
2231 *sk_offset -= sizeof(sh);
2232 goto out;
2233 }
2234
2235 *sk_offset += item_len;
2236 }
2237 (*num_found)++;
2238
2239 if (ret) /* -EOVERFLOW from above */
2240 goto out;
2241
2242 if (*num_found >= sk->nr_items) {
2243 ret = 1;
2244 goto out;
2245 }
2246 }
2247 advance_key:
2248 ret = 0;
2249 test.objectid = sk->max_objectid;
2250 test.type = sk->max_type;
2251 test.offset = sk->max_offset;
2252 if (btrfs_comp_cpu_keys(key, &test) >= 0)
2253 ret = 1;
2254 else if (key->offset < (u64)-1)
2255 key->offset++;
2256 else if (key->type < (u8)-1) {
2257 key->offset = 0;
2258 key->type++;
2259 } else if (key->objectid < (u64)-1) {
2260 key->offset = 0;
2261 key->type = 0;
2262 key->objectid++;
2263 } else
2264 ret = 1;
2265 out:
2266 /*
2267 * 0: all items from this leaf copied, continue with next
2268 * 1: * more items can be copied, but unused buffer is too small
2269 * * all items were found
2270 	 * Either way, it stops the loop which iterates to the next
2271 	 * leaf
2272 	 * -EOVERFLOW: item was too large for the buffer
2273 * -EFAULT: could not copy extent buffer back to userspace
2274 */
2275 return ret;
2276 }
2277
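/*
 * Core of the tree search ioctls: walk the requested tree forward from
 * the minimum key, faulting in the user buffer up front because
 * copy_to_sk() copies with page faults disabled, and stop once the
 * buffer is full, the item limit is reached or the key range is
 * exhausted.
 */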
2278 static noinline int search_ioctl(struct inode *inode,
2279 struct btrfs_ioctl_search_key *sk,
2280 size_t *buf_size,
2281 char __user *ubuf)
2282 {
2283 struct btrfs_fs_info *info = btrfs_sb(inode->i_sb);
2284 struct btrfs_root *root;
2285 struct btrfs_key key;
2286 struct btrfs_path *path;
2287 int ret;
2288 int num_found = 0;
2289 unsigned long sk_offset = 0;
2290
2291 if (*buf_size < sizeof(struct btrfs_ioctl_search_header)) {
2292 *buf_size = sizeof(struct btrfs_ioctl_search_header);
2293 return -EOVERFLOW;
2294 }
2295
2296 path = btrfs_alloc_path();
2297 if (!path)
2298 return -ENOMEM;
2299
2300 if (sk->tree_id == 0) {
2301 /* search the root of the inode that was passed */
2302 root = btrfs_grab_root(BTRFS_I(inode)->root);
2303 } else {
2304 root = btrfs_get_fs_root(info, sk->tree_id, true);
2305 if (IS_ERR(root)) {
2306 btrfs_free_path(path);
2307 return PTR_ERR(root);
2308 }
2309 }
2310
2311 key.objectid = sk->min_objectid;
2312 key.type = sk->min_type;
2313 key.offset = sk->min_offset;
2314
2315 while (1) {
2316 ret = -EFAULT;
2317 if (fault_in_writeable(ubuf + sk_offset, *buf_size - sk_offset))
2318 break;
2319
2320 ret = btrfs_search_forward(root, &key, path, sk->min_transid);
2321 if (ret != 0) {
2322 if (ret > 0)
2323 ret = 0;
2324 goto err;
2325 }
2326 ret = copy_to_sk(path, &key, sk, buf_size, ubuf,
2327 &sk_offset, &num_found);
2328 btrfs_release_path(path);
2329 if (ret)
2330 break;
2331
2332 }
2333 if (ret > 0)
2334 ret = 0;
2335 err:
2336 sk->nr_items = num_found;
2337 btrfs_put_root(root);
2338 btrfs_free_path(path);
2339 return ret;
2340 }
2341
2342 static noinline int btrfs_ioctl_tree_search(struct file *file,
2343 void __user *argp)
2344 {
2345 struct btrfs_ioctl_search_args __user *uargs;
2346 struct btrfs_ioctl_search_key sk;
2347 struct inode *inode;
2348 int ret;
2349 size_t buf_size;
2350
2351 if (!capable(CAP_SYS_ADMIN))
2352 return -EPERM;
2353
2354 uargs = (struct btrfs_ioctl_search_args __user *)argp;
2355
2356 if (copy_from_user(&sk, &uargs->key, sizeof(sk)))
2357 return -EFAULT;
2358
2359 buf_size = sizeof(uargs->buf);
2360
2361 inode = file_inode(file);
2362 ret = search_ioctl(inode, &sk, &buf_size, uargs->buf);
2363
2364 /*
2365 	 * In the original implementation an overflow is handled by returning a
2366 * search header with a len of zero, so reset ret.
2367 */
2368 if (ret == -EOVERFLOW)
2369 ret = 0;
2370
2371 if (ret == 0 && copy_to_user(&uargs->key, &sk, sizeof(sk)))
2372 ret = -EFAULT;
2373 return ret;
2374 }
2375
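/*
 * v2 of the tree search ioctl: same as v1 but with a caller supplied
 * buffer size (capped at 16MiB here) which is written back on
 * -EOVERFLOW so userspace can retry with a buffer large enough for the
 * item that did not fit.
 */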
2376 static noinline int btrfs_ioctl_tree_search_v2(struct file *file,
2377 void __user *argp)
2378 {
2379 struct btrfs_ioctl_search_args_v2 __user *uarg;
2380 struct btrfs_ioctl_search_args_v2 args;
2381 struct inode *inode;
2382 int ret;
2383 size_t buf_size;
2384 const size_t buf_limit = SZ_16M;
2385
2386 if (!capable(CAP_SYS_ADMIN))
2387 return -EPERM;
2388
2389 /* copy search header and buffer size */
2390 uarg = (struct btrfs_ioctl_search_args_v2 __user *)argp;
2391 if (copy_from_user(&args, uarg, sizeof(args)))
2392 return -EFAULT;
2393
2394 buf_size = args.buf_size;
2395
2396 /* limit result size to 16MB */
2397 if (buf_size > buf_limit)
2398 buf_size = buf_limit;
2399
2400 inode = file_inode(file);
2401 ret = search_ioctl(inode, &args.key, &buf_size,
2402 (char __user *)(&uarg->buf[0]));
2403 if (ret == 0 && copy_to_user(&uarg->key, &args.key, sizeof(args.key)))
2404 ret = -EFAULT;
2405 else if (ret == -EOVERFLOW &&
2406 copy_to_user(&uarg->buf_size, &buf_size, sizeof(buf_size)))
2407 ret = -EFAULT;
2408
2409 return ret;
2410 }
2411
2412 /*
2413  * Search INODE_REFs to identify the path name of the 'dirid' directory
2414  * in a 'tree_id' tree and store the resulting path in 'name'.
2415 */
2416 static noinline int btrfs_search_path_in_tree(struct btrfs_fs_info *info,
2417 u64 tree_id, u64 dirid, char *name)
2418 {
2419 struct btrfs_root *root;
2420 struct btrfs_key key;
2421 char *ptr;
2422 int ret = -1;
2423 int slot;
2424 int len;
2425 int total_len = 0;
2426 struct btrfs_inode_ref *iref;
2427 struct extent_buffer *l;
2428 struct btrfs_path *path;
2429
2430 if (dirid == BTRFS_FIRST_FREE_OBJECTID) {
2431 		name[0] = '\0';
2432 return 0;
2433 }
2434
2435 path = btrfs_alloc_path();
2436 if (!path)
2437 return -ENOMEM;
2438
2439 ptr = &name[BTRFS_INO_LOOKUP_PATH_MAX - 1];
2440
2441 root = btrfs_get_fs_root(info, tree_id, true);
2442 if (IS_ERR(root)) {
2443 ret = PTR_ERR(root);
2444 root = NULL;
2445 goto out;
2446 }
2447
2448 key.objectid = dirid;
2449 key.type = BTRFS_INODE_REF_KEY;
2450 key.offset = (u64)-1;
2451
2452 while (1) {
2453 ret = btrfs_search_backwards(root, &key, path);
2454 if (ret < 0)
2455 goto out;
2456 else if (ret > 0) {
2457 ret = -ENOENT;
2458 goto out;
2459 }
2460
2461 l = path->nodes[0];
2462 slot = path->slots[0];
2463
2464 iref = btrfs_item_ptr(l, slot, struct btrfs_inode_ref);
2465 len = btrfs_inode_ref_name_len(l, iref);
2466 ptr -= len + 1;
2467 total_len += len + 1;
2468 if (ptr < name) {
2469 ret = -ENAMETOOLONG;
2470 goto out;
2471 }
2472
2473 *(ptr + len) = '/';
2474 read_extent_buffer(l, ptr, (unsigned long)(iref + 1), len);
2475
2476 if (key.offset == BTRFS_FIRST_FREE_OBJECTID)
2477 break;
2478
2479 btrfs_release_path(path);
2480 key.objectid = key.offset;
2481 key.offset = (u64)-1;
2482 dirid = key.objectid;
2483 }
2484 memmove(name, ptr, total_len);
2485 name[total_len] = '\0';
2486 ret = 0;
2487 out:
2488 btrfs_put_root(root);
2489 btrfs_free_path(path);
2490 return ret;
2491 }
2492
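/*
 * Unprivileged counterpart of btrfs_search_path_in_tree(): the path is
 * built bottom-up the same way, but read+exec permission is checked on
 * every traversed directory and the walk must not escape the inode the
 * ioctl was called on. The bottom subvolume's name is resolved from its
 * ROOT_REF item as well.
 */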
2493 static int btrfs_search_path_in_tree_user(struct user_namespace *mnt_userns,
2494 struct inode *inode,
2495 struct btrfs_ioctl_ino_lookup_user_args *args)
2496 {
2497 struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
2498 struct super_block *sb = inode->i_sb;
2499 struct btrfs_key upper_limit = BTRFS_I(inode)->location;
2500 u64 treeid = BTRFS_I(inode)->root->root_key.objectid;
2501 u64 dirid = args->dirid;
2502 unsigned long item_off;
2503 unsigned long item_len;
2504 struct btrfs_inode_ref *iref;
2505 struct btrfs_root_ref *rref;
2506 struct btrfs_root *root = NULL;
2507 struct btrfs_path *path;
2508 struct btrfs_key key, key2;
2509 struct extent_buffer *leaf;
2510 struct inode *temp_inode;
2511 char *ptr;
2512 int slot;
2513 int len;
2514 int total_len = 0;
2515 int ret;
2516
2517 path = btrfs_alloc_path();
2518 if (!path)
2519 return -ENOMEM;
2520
2521 /*
2522 * If the bottom subvolume does not exist directly under upper_limit,
2523 	 * construct the path from the bottom up.
2524 */
2525 if (dirid != upper_limit.objectid) {
2526 ptr = &args->path[BTRFS_INO_LOOKUP_USER_PATH_MAX - 1];
2527
2528 root = btrfs_get_fs_root(fs_info, treeid, true);
2529 if (IS_ERR(root)) {
2530 ret = PTR_ERR(root);
2531 goto out;
2532 }
2533
2534 key.objectid = dirid;
2535 key.type = BTRFS_INODE_REF_KEY;
2536 key.offset = (u64)-1;
2537 while (1) {
2538 ret = btrfs_search_backwards(root, &key, path);
2539 if (ret < 0)
2540 goto out_put;
2541 else if (ret > 0) {
2542 ret = -ENOENT;
2543 goto out_put;
2544 }
2545
2546 leaf = path->nodes[0];
2547 slot = path->slots[0];
2548
2549 iref = btrfs_item_ptr(leaf, slot, struct btrfs_inode_ref);
2550 len = btrfs_inode_ref_name_len(leaf, iref);
2551 ptr -= len + 1;
2552 total_len += len + 1;
2553 if (ptr < args->path) {
2554 ret = -ENAMETOOLONG;
2555 goto out_put;
2556 }
2557
2558 *(ptr + len) = '/';
2559 read_extent_buffer(leaf, ptr,
2560 (unsigned long)(iref + 1), len);
2561
2562 /* Check the read+exec permission of this directory */
2563 ret = btrfs_previous_item(root, path, dirid,
2564 BTRFS_INODE_ITEM_KEY);
2565 if (ret < 0) {
2566 goto out_put;
2567 } else if (ret > 0) {
2568 ret = -ENOENT;
2569 goto out_put;
2570 }
2571
2572 leaf = path->nodes[0];
2573 slot = path->slots[0];
2574 btrfs_item_key_to_cpu(leaf, &key2, slot);
2575 if (key2.objectid != dirid) {
2576 ret = -ENOENT;
2577 goto out_put;
2578 }
2579
2580 temp_inode = btrfs_iget(sb, key2.objectid, root);
2581 if (IS_ERR(temp_inode)) {
2582 ret = PTR_ERR(temp_inode);
2583 goto out_put;
2584 }
2585 ret = inode_permission(mnt_userns, temp_inode,
2586 MAY_READ | MAY_EXEC);
2587 iput(temp_inode);
2588 if (ret) {
2589 ret = -EACCES;
2590 goto out_put;
2591 }
2592
2593 if (key.offset == upper_limit.objectid)
2594 break;
2595 if (key.objectid == BTRFS_FIRST_FREE_OBJECTID) {
2596 ret = -EACCES;
2597 goto out_put;
2598 }
2599
2600 btrfs_release_path(path);
2601 key.objectid = key.offset;
2602 key.offset = (u64)-1;
2603 dirid = key.objectid;
2604 }
2605
2606 memmove(args->path, ptr, total_len);
2607 args->path[total_len] = '\0';
2608 btrfs_put_root(root);
2609 root = NULL;
2610 btrfs_release_path(path);
2611 }
2612
2613 /* Get the bottom subvolume's name from ROOT_REF */
2614 key.objectid = treeid;
2615 key.type = BTRFS_ROOT_REF_KEY;
2616 key.offset = args->treeid;
2617 ret = btrfs_search_slot(NULL, fs_info->tree_root, &key, path, 0, 0);
2618 if (ret < 0) {
2619 goto out;
2620 } else if (ret > 0) {
2621 ret = -ENOENT;
2622 goto out;
2623 }
2624
2625 leaf = path->nodes[0];
2626 slot = path->slots[0];
2627 btrfs_item_key_to_cpu(leaf, &key, slot);
2628
2629 item_off = btrfs_item_ptr_offset(leaf, slot);
2630 item_len = btrfs_item_size(leaf, slot);
2631 /* Check if dirid in ROOT_REF corresponds to passed dirid */
2632 rref = btrfs_item_ptr(leaf, slot, struct btrfs_root_ref);
2633 if (args->dirid != btrfs_root_ref_dirid(leaf, rref)) {
2634 ret = -EINVAL;
2635 goto out;
2636 }
2637
2638 /* Copy subvolume's name */
2639 item_off += sizeof(struct btrfs_root_ref);
2640 item_len -= sizeof(struct btrfs_root_ref);
2641 read_extent_buffer(leaf, args->name, item_off, item_len);
2642 args->name[item_len] = 0;
2643
2644 out_put:
2645 btrfs_put_root(root);
2646 out:
2647 btrfs_free_path(path);
2648 return ret;
2649 }
2650
2651 static noinline int btrfs_ioctl_ino_lookup(struct file *file,
2652 void __user *argp)
2653 {
2654 struct btrfs_ioctl_ino_lookup_args *args;
2655 struct inode *inode;
2656 int ret = 0;
2657
2658 args = memdup_user(argp, sizeof(*args));
2659 if (IS_ERR(args))
2660 return PTR_ERR(args);
2661
2662 inode = file_inode(file);
2663
2664 /*
2665 * Unprivileged query to obtain the containing subvolume root id. The
2666 * path is reset so it's consistent with btrfs_search_path_in_tree.
2667 */
2668 if (args->treeid == 0)
2669 args->treeid = BTRFS_I(inode)->root->root_key.objectid;
2670
2671 if (args->objectid == BTRFS_FIRST_FREE_OBJECTID) {
2672 args->name[0] = 0;
2673 goto out;
2674 }
2675
2676 if (!capable(CAP_SYS_ADMIN)) {
2677 ret = -EPERM;
2678 goto out;
2679 }
2680
2681 ret = btrfs_search_path_in_tree(BTRFS_I(inode)->root->fs_info,
2682 args->treeid, args->objectid,
2683 args->name);
2684
2685 out:
2686 if (ret == 0 && copy_to_user(argp, args, sizeof(*args)))
2687 ret = -EFAULT;
2688
2689 kfree(args);
2690 return ret;
2691 }
2692
2693 /*
2694  * Unprivileged version of the ino_lookup ioctl.
2695  *
2696  * The main differences from the ino_lookup ioctl are:
2697  *
2698  * 1. Read + Exec permission will be checked using inode_permission() during
2699  *    path construction. -EACCES will be returned in case of failure.
2700  * 2. Path construction will be stopped at the inode number which corresponds
2701  *    to the fd with which this ioctl is called. If the constructed path does
2702  *    not exist under the fd's inode, -EACCES will be returned.
2703  * 3. The name of the bottom subvolume is also searched and filled in.
2704 */
2705 static int btrfs_ioctl_ino_lookup_user(struct file *file, void __user *argp)
2706 {
2707 struct btrfs_ioctl_ino_lookup_user_args *args;
2708 struct inode *inode;
2709 int ret;
2710
2711 args = memdup_user(argp, sizeof(*args));
2712 if (IS_ERR(args))
2713 return PTR_ERR(args);
2714
2715 inode = file_inode(file);
2716
2717 if (args->dirid == BTRFS_FIRST_FREE_OBJECTID &&
2718 BTRFS_I(inode)->location.objectid != BTRFS_FIRST_FREE_OBJECTID) {
2719 /*
2720 		 * The subvolume does not exist under the fd with which this
2721 		 * ioctl is called
2722 */
2723 kfree(args);
2724 return -EACCES;
2725 }
2726
2727 ret = btrfs_search_path_in_tree_user(file_mnt_user_ns(file), inode, args);
2728
2729 if (ret == 0 && copy_to_user(argp, args, sizeof(*args)))
2730 ret = -EFAULT;
2731
2732 kfree(args);
2733 return ret;
2734 }
2735
2736 /* Get the subvolume information in BTRFS_ROOT_ITEM and BTRFS_ROOT_BACKREF */
2737 static int btrfs_ioctl_get_subvol_info(struct file *file, void __user *argp)
2738 {
2739 struct btrfs_ioctl_get_subvol_info_args *subvol_info;
2740 struct btrfs_fs_info *fs_info;
2741 struct btrfs_root *root;
2742 struct btrfs_path *path;
2743 struct btrfs_key key;
2744 struct btrfs_root_item *root_item;
2745 struct btrfs_root_ref *rref;
2746 struct extent_buffer *leaf;
2747 unsigned long item_off;
2748 unsigned long item_len;
2749 struct inode *inode;
2750 int slot;
2751 int ret = 0;
2752
2753 path = btrfs_alloc_path();
2754 if (!path)
2755 return -ENOMEM;
2756
2757 subvol_info = kzalloc(sizeof(*subvol_info), GFP_KERNEL);
2758 if (!subvol_info) {
2759 btrfs_free_path(path);
2760 return -ENOMEM;
2761 }
2762
2763 inode = file_inode(file);
2764 fs_info = BTRFS_I(inode)->root->fs_info;
2765
2766 /* Get root_item of inode's subvolume */
2767 key.objectid = BTRFS_I(inode)->root->root_key.objectid;
2768 root = btrfs_get_fs_root(fs_info, key.objectid, true);
2769 if (IS_ERR(root)) {
2770 ret = PTR_ERR(root);
2771 goto out_free;
2772 }
2773 root_item = &root->root_item;
2774
2775 subvol_info->treeid = key.objectid;
2776
2777 subvol_info->generation = btrfs_root_generation(root_item);
2778 subvol_info->flags = btrfs_root_flags(root_item);
2779
2780 memcpy(subvol_info->uuid, root_item->uuid, BTRFS_UUID_SIZE);
2781 memcpy(subvol_info->parent_uuid, root_item->parent_uuid,
2782 BTRFS_UUID_SIZE);
2783 memcpy(subvol_info->received_uuid, root_item->received_uuid,
2784 BTRFS_UUID_SIZE);
2785
2786 subvol_info->ctransid = btrfs_root_ctransid(root_item);
2787 subvol_info->ctime.sec = btrfs_stack_timespec_sec(&root_item->ctime);
2788 subvol_info->ctime.nsec = btrfs_stack_timespec_nsec(&root_item->ctime);
2789
2790 subvol_info->otransid = btrfs_root_otransid(root_item);
2791 subvol_info->otime.sec = btrfs_stack_timespec_sec(&root_item->otime);
2792 subvol_info->otime.nsec = btrfs_stack_timespec_nsec(&root_item->otime);
2793
2794 subvol_info->stransid = btrfs_root_stransid(root_item);
2795 subvol_info->stime.sec = btrfs_stack_timespec_sec(&root_item->stime);
2796 subvol_info->stime.nsec = btrfs_stack_timespec_nsec(&root_item->stime);
2797
2798 subvol_info->rtransid = btrfs_root_rtransid(root_item);
2799 subvol_info->rtime.sec = btrfs_stack_timespec_sec(&root_item->rtime);
2800 subvol_info->rtime.nsec = btrfs_stack_timespec_nsec(&root_item->rtime);
2801
2802 if (key.objectid != BTRFS_FS_TREE_OBJECTID) {
2803 /* Search root tree for ROOT_BACKREF of this subvolume */
2804 key.type = BTRFS_ROOT_BACKREF_KEY;
2805 key.offset = 0;
2806 ret = btrfs_search_slot(NULL, fs_info->tree_root, &key, path, 0, 0);
2807 if (ret < 0) {
2808 goto out;
2809 } else if (path->slots[0] >=
2810 btrfs_header_nritems(path->nodes[0])) {
2811 ret = btrfs_next_leaf(fs_info->tree_root, path);
2812 if (ret < 0) {
2813 goto out;
2814 } else if (ret > 0) {
2815 ret = -EUCLEAN;
2816 goto out;
2817 }
2818 }
2819
2820 leaf = path->nodes[0];
2821 slot = path->slots[0];
2822 btrfs_item_key_to_cpu(leaf, &key, slot);
2823 if (key.objectid == subvol_info->treeid &&
2824 key.type == BTRFS_ROOT_BACKREF_KEY) {
2825 subvol_info->parent_id = key.offset;
2826
2827 rref = btrfs_item_ptr(leaf, slot, struct btrfs_root_ref);
2828 subvol_info->dirid = btrfs_root_ref_dirid(leaf, rref);
2829
2830 item_off = btrfs_item_ptr_offset(leaf, slot)
2831 + sizeof(struct btrfs_root_ref);
2832 item_len = btrfs_item_size(leaf, slot)
2833 - sizeof(struct btrfs_root_ref);
2834 read_extent_buffer(leaf, subvol_info->name,
2835 item_off, item_len);
2836 } else {
2837 ret = -ENOENT;
2838 goto out;
2839 }
2840 }
2841
2842 if (copy_to_user(argp, subvol_info, sizeof(*subvol_info)))
2843 ret = -EFAULT;
2844
2845 out:
2846 btrfs_put_root(root);
2847 out_free:
2848 btrfs_free_path(path);
2849 kfree(subvol_info);
2850 return ret;
2851 }
2852
2853 /*
2854  * Return the ROOT_REF information of the subvolume containing this inode,
2855  * except for the subvolume name.
2856 */
2857 static int btrfs_ioctl_get_subvol_rootref(struct file *file, void __user *argp)
2858 {
2859 struct btrfs_ioctl_get_subvol_rootref_args *rootrefs;
2860 struct btrfs_root_ref *rref;
2861 struct btrfs_root *root;
2862 struct btrfs_path *path;
2863 struct btrfs_key key;
2864 struct extent_buffer *leaf;
2865 struct inode *inode;
2866 u64 objectid;
2867 int slot;
2868 int ret;
2869 u8 found;
2870
2871 path = btrfs_alloc_path();
2872 if (!path)
2873 return -ENOMEM;
2874
2875 rootrefs = memdup_user(argp, sizeof(*rootrefs));
2876 if (IS_ERR(rootrefs)) {
2877 btrfs_free_path(path);
2878 return PTR_ERR(rootrefs);
2879 }
2880
2881 inode = file_inode(file);
2882 root = BTRFS_I(inode)->root->fs_info->tree_root;
2883 objectid = BTRFS_I(inode)->root->root_key.objectid;
2884
2885 key.objectid = objectid;
2886 key.type = BTRFS_ROOT_REF_KEY;
2887 key.offset = rootrefs->min_treeid;
2888 found = 0;
2889
2890 ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
2891 if (ret < 0) {
2892 goto out;
2893 } else if (path->slots[0] >=
2894 btrfs_header_nritems(path->nodes[0])) {
2895 ret = btrfs_next_leaf(root, path);
2896 if (ret < 0) {
2897 goto out;
2898 } else if (ret > 0) {
2899 ret = -EUCLEAN;
2900 goto out;
2901 }
2902 }
2903 while (1) {
2904 leaf = path->nodes[0];
2905 slot = path->slots[0];
2906
2907 btrfs_item_key_to_cpu(leaf, &key, slot);
2908 if (key.objectid != objectid || key.type != BTRFS_ROOT_REF_KEY) {
2909 ret = 0;
2910 goto out;
2911 }
2912
2913 if (found == BTRFS_MAX_ROOTREF_BUFFER_NUM) {
2914 ret = -EOVERFLOW;
2915 goto out;
2916 }
2917
2918 rref = btrfs_item_ptr(leaf, slot, struct btrfs_root_ref);
2919 rootrefs->rootref[found].treeid = key.offset;
2920 rootrefs->rootref[found].dirid =
2921 btrfs_root_ref_dirid(leaf, rref);
2922 found++;
2923
2924 ret = btrfs_next_item(root, path);
2925 if (ret < 0) {
2926 goto out;
2927 } else if (ret > 0) {
2928 ret = -EUCLEAN;
2929 goto out;
2930 }
2931 }
2932
2933 out:
2934 if (!ret || ret == -EOVERFLOW) {
2935 rootrefs->num_items = found;
2936 /* update min_treeid for next search */
2937 if (found)
2938 rootrefs->min_treeid =
2939 rootrefs->rootref[found - 1].treeid + 1;
2940 if (copy_to_user(argp, rootrefs, sizeof(*rootrefs)))
2941 ret = -EFAULT;
2942 }
2943
2944 kfree(rootrefs);
2945 btrfs_free_path(path);
2946
2947 return ret;
2948 }
2949
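/*
 * Delete a subvolume specified either by name (v1, and v2 without
 * BTRFS_SUBVOL_SPEC_BY_ID) or by subvolume id (v2 with
 * BTRFS_SUBVOL_SPEC_BY_ID). Unprivileged callers need the
 * user_subvol_rm_allowed mount option and must pass the same permission
 * checks that rmdir(2) would apply.
 */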
2950 static noinline int btrfs_ioctl_snap_destroy(struct file *file,
2951 void __user *arg,
2952 bool destroy_v2)
2953 {
2954 struct dentry *parent = file->f_path.dentry;
2955 struct btrfs_fs_info *fs_info = btrfs_sb(parent->d_sb);
2956 struct dentry *dentry;
2957 struct inode *dir = d_inode(parent);
2958 struct inode *inode;
2959 struct btrfs_root *root = BTRFS_I(dir)->root;
2960 struct btrfs_root *dest = NULL;
2961 struct btrfs_ioctl_vol_args *vol_args = NULL;
2962 struct btrfs_ioctl_vol_args_v2 *vol_args2 = NULL;
2963 struct user_namespace *mnt_userns = file_mnt_user_ns(file);
2964 char *subvol_name, *subvol_name_ptr = NULL;
2965 int subvol_namelen;
2966 int err = 0;
2967 bool destroy_parent = false;
2968
2969 if (destroy_v2) {
2970 vol_args2 = memdup_user(arg, sizeof(*vol_args2));
2971 if (IS_ERR(vol_args2))
2972 return PTR_ERR(vol_args2);
2973
2974 if (vol_args2->flags & ~BTRFS_SUBVOL_DELETE_ARGS_MASK) {
2975 err = -EOPNOTSUPP;
2976 goto out;
2977 }
2978
2979 /*
2980 * If SPEC_BY_ID is not set, we are looking for the subvolume by
2981 * name, same as v1 currently does.
2982 */
2983 if (!(vol_args2->flags & BTRFS_SUBVOL_SPEC_BY_ID)) {
2984 vol_args2->name[BTRFS_SUBVOL_NAME_MAX] = 0;
2985 subvol_name = vol_args2->name;
2986
2987 err = mnt_want_write_file(file);
2988 if (err)
2989 goto out;
2990 } else {
2991 struct inode *old_dir;
2992
2993 if (vol_args2->subvolid < BTRFS_FIRST_FREE_OBJECTID) {
2994 err = -EINVAL;
2995 goto out;
2996 }
2997
2998 err = mnt_want_write_file(file);
2999 if (err)
3000 goto out;
3001
3002 dentry = btrfs_get_dentry(fs_info->sb,
3003 BTRFS_FIRST_FREE_OBJECTID,
3004 vol_args2->subvolid, 0, 0);
3005 if (IS_ERR(dentry)) {
3006 err = PTR_ERR(dentry);
3007 goto out_drop_write;
3008 }
3009
3010 /*
3011 * Change the default parent since the subvolume being
3012 * deleted can be outside of the current mount point.
3013 */
3014 parent = btrfs_get_parent(dentry);
3015
3016 /*
3017 * At this point dentry->d_name can point to '/' if the
3018 			 * subvolume we want to destroy is outside of the
3019 * current mount point, so we need to release the
3020 * current dentry and execute the lookup to return a new
3021 * one with ->d_name pointing to the
3022 * <mount point>/subvol_name.
3023 */
3024 dput(dentry);
3025 if (IS_ERR(parent)) {
3026 err = PTR_ERR(parent);
3027 goto out_drop_write;
3028 }
3029 old_dir = dir;
3030 dir = d_inode(parent);
3031
3032 /*
3033 * If v2 was used with SPEC_BY_ID, a new parent was
3034 * allocated since the subvolume can be outside of the
3035 * current mount point. Later on we need to release this
3036 * new parent dentry.
3037 */
3038 destroy_parent = true;
3039
3040 /*
3041 * On idmapped mounts, deletion via subvolid is
3042 * restricted to subvolumes that are immediate
3043 * ancestors of the inode referenced by the file
3044 * descriptor in the ioctl. Otherwise the idmapping
3045 * could potentially be abused to delete subvolumes
3046 * anywhere in the filesystem the user wouldn't be able
3047 * to delete without an idmapped mount.
3048 */
3049 if (old_dir != dir && mnt_userns != &init_user_ns) {
3050 err = -EOPNOTSUPP;
3051 goto free_parent;
3052 }
3053
3054 subvol_name_ptr = btrfs_get_subvol_name_from_objectid(
3055 fs_info, vol_args2->subvolid);
3056 if (IS_ERR(subvol_name_ptr)) {
3057 err = PTR_ERR(subvol_name_ptr);
3058 goto free_parent;
3059 }
3060 /* subvol_name_ptr is already nul terminated */
3061 subvol_name = (char *)kbasename(subvol_name_ptr);
3062 }
3063 } else {
3064 vol_args = memdup_user(arg, sizeof(*vol_args));
3065 if (IS_ERR(vol_args))
3066 return PTR_ERR(vol_args);
3067
3068 vol_args->name[BTRFS_PATH_NAME_MAX] = 0;
3069 subvol_name = vol_args->name;
3070
3071 err = mnt_want_write_file(file);
3072 if (err)
3073 goto out;
3074 }
3075
3076 subvol_namelen = strlen(subvol_name);
3077
3078 if (strchr(subvol_name, '/') ||
3079 strncmp(subvol_name, "..", subvol_namelen) == 0) {
3080 err = -EINVAL;
3081 goto free_subvol_name;
3082 }
3083
3084 if (!S_ISDIR(dir->i_mode)) {
3085 err = -ENOTDIR;
3086 goto free_subvol_name;
3087 }
3088
3089 err = down_write_killable_nested(&dir->i_rwsem, I_MUTEX_PARENT);
3090 if (err == -EINTR)
3091 goto free_subvol_name;
3092 dentry = lookup_one(mnt_userns, subvol_name, parent, subvol_namelen);
3093 if (IS_ERR(dentry)) {
3094 err = PTR_ERR(dentry);
3095 goto out_unlock_dir;
3096 }
3097
3098 if (d_really_is_negative(dentry)) {
3099 err = -ENOENT;
3100 goto out_dput;
3101 }
3102
3103 inode = d_inode(dentry);
3104 dest = BTRFS_I(inode)->root;
3105 if (!capable(CAP_SYS_ADMIN)) {
3106 /*
3107 * Regular user. Only allow this with a special mount
3108 * option, when the user has write+exec access to the
3109 * subvol root, and when rmdir(2) would have been
3110 * allowed.
3111 *
3112 		 * Note that this is _not_ a check that the subvol is
3113 * empty or doesn't contain data that we wouldn't
3114 * otherwise be able to delete.
3115 *
3116 * Users who want to delete empty subvols should try
3117 * rmdir(2).
3118 */
3119 err = -EPERM;
3120 if (!btrfs_test_opt(fs_info, USER_SUBVOL_RM_ALLOWED))
3121 goto out_dput;
3122
3123 /*
3124 * Do not allow deletion if the parent dir is the same
3125 * as the dir to be deleted. That means the ioctl
3126 * must be called on the dentry referencing the root
3127 * of the subvol, not a random directory contained
3128 * within it.
3129 */
3130 err = -EINVAL;
3131 if (root == dest)
3132 goto out_dput;
3133
3134 err = inode_permission(mnt_userns, inode, MAY_WRITE | MAY_EXEC);
3135 if (err)
3136 goto out_dput;
3137 }
3138
3139 /* check if subvolume may be deleted by a user */
3140 err = btrfs_may_delete(mnt_userns, dir, dentry, 1);
3141 if (err)
3142 goto out_dput;
3143
3144 if (btrfs_ino(BTRFS_I(inode)) != BTRFS_FIRST_FREE_OBJECTID) {
3145 err = -EINVAL;
3146 goto out_dput;
3147 }
3148
3149 btrfs_inode_lock(inode, 0);
3150 err = btrfs_delete_subvolume(dir, dentry);
3151 btrfs_inode_unlock(inode, 0);
3152 if (!err) {
3153 fsnotify_rmdir(dir, dentry);
3154 d_delete(dentry);
3155 }
3156
3157 out_dput:
3158 dput(dentry);
3159 out_unlock_dir:
3160 btrfs_inode_unlock(dir, 0);
3161 free_subvol_name:
3162 kfree(subvol_name_ptr);
3163 free_parent:
3164 if (destroy_parent)
3165 dput(parent);
3166 out_drop_write:
3167 mnt_drop_write_file(file);
3168 out:
3169 kfree(vol_args2);
3170 kfree(vol_args);
3171 return err;
3172 }
3173
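/*
 * Defragment a single regular file, or, for directories and admins only,
 * the metadata of the subvolume tree itself. The optional range argument
 * restricts the offsets to process and may request compression, which
 * forces the IO to be started.
 */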
3174 static int btrfs_ioctl_defrag(struct file *file, void __user *argp)
3175 {
3176 struct inode *inode = file_inode(file);
3177 struct btrfs_root *root = BTRFS_I(inode)->root;
3178 struct btrfs_ioctl_defrag_range_args range = {0};
3179 int ret;
3180
3181 ret = mnt_want_write_file(file);
3182 if (ret)
3183 return ret;
3184
3185 if (btrfs_root_readonly(root)) {
3186 ret = -EROFS;
3187 goto out;
3188 }
3189
3190 switch (inode->i_mode & S_IFMT) {
3191 case S_IFDIR:
3192 if (!capable(CAP_SYS_ADMIN)) {
3193 ret = -EPERM;
3194 goto out;
3195 }
3196 ret = btrfs_defrag_root(root);
3197 break;
3198 case S_IFREG:
3199 /*
3200 * Note that this does not check the file descriptor for write
3201 * access. This prevents defragmenting executables that are
3202 * running and allows defrag on files open in read-only mode.
3203 */
3204 if (!capable(CAP_SYS_ADMIN) &&
3205 inode_permission(&init_user_ns, inode, MAY_WRITE)) {
3206 ret = -EPERM;
3207 goto out;
3208 }
3209
3210 if (argp) {
3211 if (copy_from_user(&range, argp, sizeof(range))) {
3212 ret = -EFAULT;
3213 goto out;
3214 }
3215 /* compression requires us to start the IO */
3216 if ((range.flags & BTRFS_DEFRAG_RANGE_COMPRESS)) {
3217 range.flags |= BTRFS_DEFRAG_RANGE_START_IO;
3218 range.extent_thresh = (u32)-1;
3219 }
3220 } else {
3221 			/* the rest are all set to zero by the {0} initializer */
3222 range.len = (u64)-1;
3223 }
3224 ret = btrfs_defrag_file(file_inode(file), &file->f_ra,
3225 &range, BTRFS_OLDEST_GENERATION, 0);
3226 if (ret > 0)
3227 ret = 0;
3228 break;
3229 default:
3230 ret = -EINVAL;
3231 }
3232 out:
3233 mnt_drop_write_file(file);
3234 return ret;
3235 }
3236
3237 static long btrfs_ioctl_add_dev(struct btrfs_fs_info *fs_info, void __user *arg)
3238 {
3239 struct btrfs_ioctl_vol_args *vol_args;
3240 bool restore_op = false;
3241 int ret;
3242
3243 if (!capable(CAP_SYS_ADMIN))
3244 return -EPERM;
3245
3246 if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_DEV_ADD)) {
3247 if (!btrfs_exclop_start_try_lock(fs_info, BTRFS_EXCLOP_DEV_ADD))
3248 return BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS;
3249
3250 /*
3251 		 * We can do the device add because we have a paused balance:
3252 		 * change the exclusive op type and remember we should bring
3253 		 * back the paused balance.
3254 */
3255 fs_info->exclusive_operation = BTRFS_EXCLOP_DEV_ADD;
3256 btrfs_exclop_start_unlock(fs_info);
3257 restore_op = true;
3258 }
3259
3260 vol_args = memdup_user(arg, sizeof(*vol_args));
3261 if (IS_ERR(vol_args)) {
3262 ret = PTR_ERR(vol_args);
3263 goto out;
3264 }
3265
3266 vol_args->name[BTRFS_PATH_NAME_MAX] = '\0';
3267 ret = btrfs_init_new_device(fs_info, vol_args->name);
3268
3269 if (!ret)
3270 btrfs_info(fs_info, "disk added %s", vol_args->name);
3271
3272 kfree(vol_args);
3273 out:
3274 if (restore_op)
3275 btrfs_exclop_balance(fs_info, BTRFS_EXCLOP_BALANCE_PAUSED);
3276 else
3277 btrfs_exclop_finish(fs_info);
3278 return ret;
3279 }
3280
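/*
 * Remove a device either by path or, with BTRFS_DEVICE_SPEC_BY_ID, by
 * devid. The special name "cancel" requests cancellation of a device
 * removal that is already running instead.
 */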
3281 static long btrfs_ioctl_rm_dev_v2(struct file *file, void __user *arg)
3282 {
3283 BTRFS_DEV_LOOKUP_ARGS(args);
3284 struct inode *inode = file_inode(file);
3285 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
3286 struct btrfs_ioctl_vol_args_v2 *vol_args;
3287 struct block_device *bdev = NULL;
3288 fmode_t mode;
3289 int ret;
3290 bool cancel = false;
3291
3292 if (!capable(CAP_SYS_ADMIN))
3293 return -EPERM;
3294
3295 vol_args = memdup_user(arg, sizeof(*vol_args));
3296 if (IS_ERR(vol_args))
3297 return PTR_ERR(vol_args);
3298
3299 if (vol_args->flags & ~BTRFS_DEVICE_REMOVE_ARGS_MASK) {
3300 ret = -EOPNOTSUPP;
3301 goto out;
3302 }
3303
3304 vol_args->name[BTRFS_SUBVOL_NAME_MAX] = '\0';
3305 if (vol_args->flags & BTRFS_DEVICE_SPEC_BY_ID) {
3306 args.devid = vol_args->devid;
3307 } else if (!strcmp("cancel", vol_args->name)) {
3308 cancel = true;
3309 } else {
3310 ret = btrfs_get_dev_args_from_path(fs_info, &args, vol_args->name);
3311 if (ret)
3312 goto out;
3313 }
3314
3315 ret = mnt_want_write_file(file);
3316 if (ret)
3317 goto out;
3318
3319 ret = exclop_start_or_cancel_reloc(fs_info, BTRFS_EXCLOP_DEV_REMOVE,
3320 cancel);
3321 if (ret)
3322 goto err_drop;
3323
3324 /* Exclusive operation is now claimed */
3325 ret = btrfs_rm_device(fs_info, &args, &bdev, &mode);
3326
3327 btrfs_exclop_finish(fs_info);
3328
3329 if (!ret) {
3330 if (vol_args->flags & BTRFS_DEVICE_SPEC_BY_ID)
3331 btrfs_info(fs_info, "device deleted: id %llu",
3332 vol_args->devid);
3333 else
3334 btrfs_info(fs_info, "device deleted: %s",
3335 vol_args->name);
3336 }
3337 err_drop:
3338 mnt_drop_write_file(file);
3339 if (bdev)
3340 blkdev_put(bdev, mode);
3341 out:
3342 btrfs_put_dev_args_from_path(&args);
3343 kfree(vol_args);
3344 return ret;
3345 }
3346
3347 static long btrfs_ioctl_rm_dev(struct file *file, void __user *arg)
3348 {
3349 BTRFS_DEV_LOOKUP_ARGS(args);
3350 struct inode *inode = file_inode(file);
3351 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
3352 struct btrfs_ioctl_vol_args *vol_args;
3353 struct block_device *bdev = NULL;
3354 fmode_t mode;
3355 int ret;
3356 bool cancel;
3357
3358 if (!capable(CAP_SYS_ADMIN))
3359 return -EPERM;
3360
3361 vol_args = memdup_user(arg, sizeof(*vol_args));
3362 if (IS_ERR(vol_args))
3363 return PTR_ERR(vol_args);
3364
3365 vol_args->name[BTRFS_PATH_NAME_MAX] = '\0';
3366 if (!strcmp("cancel", vol_args->name)) {
3367 cancel = true;
3368 } else {
3369 ret = btrfs_get_dev_args_from_path(fs_info, &args, vol_args->name);
3370 if (ret)
3371 goto out;
3372 }
3373
3374 ret = mnt_want_write_file(file);
3375 if (ret)
3376 goto out;
3377
3378 ret = exclop_start_or_cancel_reloc(fs_info, BTRFS_EXCLOP_DEV_REMOVE,
3379 cancel);
3380 if (ret == 0) {
3381 ret = btrfs_rm_device(fs_info, &args, &bdev, &mode);
3382 if (!ret)
3383 btrfs_info(fs_info, "disk deleted %s", vol_args->name);
3384 btrfs_exclop_finish(fs_info);
3385 }
3386
3387 mnt_drop_write_file(file);
3388 if (bdev)
3389 blkdev_put(bdev, mode);
3390 out:
3391 btrfs_put_dev_args_from_path(&args);
3392 kfree(vol_args);
3393 return ret;
3394 }
3395
3396 static long btrfs_ioctl_fs_info(struct btrfs_fs_info *fs_info,
3397 void __user *arg)
3398 {
3399 struct btrfs_ioctl_fs_info_args *fi_args;
3400 struct btrfs_device *device;
3401 struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
3402 u64 flags_in;
3403 int ret = 0;
3404
3405 fi_args = memdup_user(arg, sizeof(*fi_args));
3406 if (IS_ERR(fi_args))
3407 return PTR_ERR(fi_args);
3408
3409 flags_in = fi_args->flags;
3410 memset(fi_args, 0, sizeof(*fi_args));
3411
3412 rcu_read_lock();
3413 fi_args->num_devices = fs_devices->num_devices;
3414
3415 list_for_each_entry_rcu(device, &fs_devices->devices, dev_list) {
3416 if (device->devid > fi_args->max_id)
3417 fi_args->max_id = device->devid;
3418 }
3419 rcu_read_unlock();
3420
3421 memcpy(&fi_args->fsid, fs_devices->fsid, sizeof(fi_args->fsid));
3422 fi_args->nodesize = fs_info->nodesize;
3423 fi_args->sectorsize = fs_info->sectorsize;
3424 fi_args->clone_alignment = fs_info->sectorsize;
3425
3426 if (flags_in & BTRFS_FS_INFO_FLAG_CSUM_INFO) {
3427 fi_args->csum_type = btrfs_super_csum_type(fs_info->super_copy);
3428 fi_args->csum_size = btrfs_super_csum_size(fs_info->super_copy);
3429 fi_args->flags |= BTRFS_FS_INFO_FLAG_CSUM_INFO;
3430 }
3431
3432 if (flags_in & BTRFS_FS_INFO_FLAG_GENERATION) {
3433 fi_args->generation = fs_info->generation;
3434 fi_args->flags |= BTRFS_FS_INFO_FLAG_GENERATION;
3435 }
3436
3437 if (flags_in & BTRFS_FS_INFO_FLAG_METADATA_UUID) {
3438 memcpy(&fi_args->metadata_uuid, fs_devices->metadata_uuid,
3439 sizeof(fi_args->metadata_uuid));
3440 fi_args->flags |= BTRFS_FS_INFO_FLAG_METADATA_UUID;
3441 }
3442
3443 if (copy_to_user(arg, fi_args, sizeof(*fi_args)))
3444 ret = -EFAULT;
3445
3446 kfree(fi_args);
3447 return ret;
3448 }
3449
3450 static long btrfs_ioctl_dev_info(struct btrfs_fs_info *fs_info,
3451 void __user *arg)
3452 {
3453 BTRFS_DEV_LOOKUP_ARGS(args);
3454 struct btrfs_ioctl_dev_info_args *di_args;
3455 struct btrfs_device *dev;
3456 int ret = 0;
3457
3458 di_args = memdup_user(arg, sizeof(*di_args));
3459 if (IS_ERR(di_args))
3460 return PTR_ERR(di_args);
3461
3462 args.devid = di_args->devid;
3463 if (!btrfs_is_empty_uuid(di_args->uuid))
3464 args.uuid = di_args->uuid;
3465
3466 rcu_read_lock();
3467 dev = btrfs_find_device(fs_info->fs_devices, &args);
3468 if (!dev) {
3469 ret = -ENODEV;
3470 goto out;
3471 }
3472
3473 di_args->devid = dev->devid;
3474 di_args->bytes_used = btrfs_device_get_bytes_used(dev);
3475 di_args->total_bytes = btrfs_device_get_total_bytes(dev);
3476 memcpy(di_args->uuid, dev->uuid, sizeof(di_args->uuid));
3477 if (dev->name) {
3478 strncpy(di_args->path, rcu_str_deref(dev->name),
3479 sizeof(di_args->path) - 1);
3480 di_args->path[sizeof(di_args->path) - 1] = 0;
3481 } else {
3482 di_args->path[0] = '\0';
3483 }
3484
3485 out:
3486 rcu_read_unlock();
3487 if (ret == 0 && copy_to_user(arg, di_args, sizeof(*di_args)))
3488 ret = -EFAULT;
3489
3490 kfree(di_args);
3491 return ret;
3492 }
3493
3494 static long btrfs_ioctl_default_subvol(struct file *file, void __user *argp)
3495 {
3496 struct inode *inode = file_inode(file);
3497 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
3498 struct btrfs_root *root = BTRFS_I(inode)->root;
3499 struct btrfs_root *new_root;
3500 struct btrfs_dir_item *di;
3501 struct btrfs_trans_handle *trans;
3502 struct btrfs_path *path = NULL;
3503 struct btrfs_disk_key disk_key;
3504 u64 objectid = 0;
3505 u64 dir_id;
3506 int ret;
3507
3508 if (!capable(CAP_SYS_ADMIN))
3509 return -EPERM;
3510
3511 ret = mnt_want_write_file(file);
3512 if (ret)
3513 return ret;
3514
3515 if (copy_from_user(&objectid, argp, sizeof(objectid))) {
3516 ret = -EFAULT;
3517 goto out;
3518 }
3519
3520 if (!objectid)
3521 objectid = BTRFS_FS_TREE_OBJECTID;
3522
3523 new_root = btrfs_get_fs_root(fs_info, objectid, true);
3524 if (IS_ERR(new_root)) {
3525 ret = PTR_ERR(new_root);
3526 goto out;
3527 }
3528 if (!is_fstree(new_root->root_key.objectid)) {
3529 ret = -ENOENT;
3530 goto out_free;
3531 }
3532
3533 path = btrfs_alloc_path();
3534 if (!path) {
3535 ret = -ENOMEM;
3536 goto out_free;
3537 }
3538
3539 trans = btrfs_start_transaction(root, 1);
3540 if (IS_ERR(trans)) {
3541 ret = PTR_ERR(trans);
3542 goto out_free;
3543 }
3544
3545 dir_id = btrfs_super_root_dir(fs_info->super_copy);
3546 di = btrfs_lookup_dir_item(trans, fs_info->tree_root, path,
3547 dir_id, "default", 7, 1);
3548 if (IS_ERR_OR_NULL(di)) {
3549 btrfs_release_path(path);
3550 btrfs_end_transaction(trans);
3551 btrfs_err(fs_info,
3552 "Umm, you don't have the default diritem, this isn't going to work");
3553 ret = -ENOENT;
3554 goto out_free;
3555 }
3556
3557 btrfs_cpu_key_to_disk(&disk_key, &new_root->root_key);
3558 btrfs_set_dir_item_key(path->nodes[0], di, &disk_key);
3559 btrfs_mark_buffer_dirty(path->nodes[0]);
3560 btrfs_release_path(path);
3561
3562 btrfs_set_fs_incompat(fs_info, DEFAULT_SUBVOL);
3563 btrfs_end_transaction(trans);
3564 out_free:
3565 btrfs_put_root(new_root);
3566 btrfs_free_path(path);
3567 out:
3568 mnt_drop_write_file(file);
3569 return ret;
3570 }
3571
3572 static void get_block_group_info(struct list_head *groups_list,
3573 struct btrfs_ioctl_space_info *space)
3574 {
3575 struct btrfs_block_group *block_group;
3576
3577 space->total_bytes = 0;
3578 space->used_bytes = 0;
3579 space->flags = 0;
3580 list_for_each_entry(block_group, groups_list, list) {
3581 space->flags = block_group->flags;
3582 space->total_bytes += block_group->length;
3583 space->used_bytes += block_group->used;
3584 }
3585 }
3586
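/*
 * Report space usage grouped by (block group type, raid profile). A
 * first call with space_slots == 0 only returns the number of slots
 * needed; the caller then allocates room for that many
 * struct btrfs_ioctl_space_info entries and repeats the call. A rough
 * userspace sketch (hypothetical fd, error handling omitted):
 *
 *	struct btrfs_ioctl_space_args count = { .space_slots = 0 };
 *	struct btrfs_ioctl_space_args *args;
 *
 *	ioctl(fd, BTRFS_IOC_SPACE_INFO, &count);
 *	args = calloc(1, sizeof(*args) + count.total_spaces *
 *			 sizeof(struct btrfs_ioctl_space_info));
 *	args->space_slots = count.total_spaces;
 *	ioctl(fd, BTRFS_IOC_SPACE_INFO, args);
 */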
3587 static long btrfs_ioctl_space_info(struct btrfs_fs_info *fs_info,
3588 void __user *arg)
3589 {
3590 struct btrfs_ioctl_space_args space_args;
3591 struct btrfs_ioctl_space_info space;
3592 struct btrfs_ioctl_space_info *dest;
3593 struct btrfs_ioctl_space_info *dest_orig;
3594 struct btrfs_ioctl_space_info __user *user_dest;
3595 struct btrfs_space_info *info;
3596 static const u64 types[] = {
3597 BTRFS_BLOCK_GROUP_DATA,
3598 BTRFS_BLOCK_GROUP_SYSTEM,
3599 BTRFS_BLOCK_GROUP_METADATA,
3600 BTRFS_BLOCK_GROUP_DATA | BTRFS_BLOCK_GROUP_METADATA
3601 };
3602 int num_types = 4;
3603 int alloc_size;
3604 int ret = 0;
3605 u64 slot_count = 0;
3606 int i, c;
3607
3608 if (copy_from_user(&space_args,
3609 (struct btrfs_ioctl_space_args __user *)arg,
3610 sizeof(space_args)))
3611 return -EFAULT;
3612
3613 for (i = 0; i < num_types; i++) {
3614 struct btrfs_space_info *tmp;
3615
3616 info = NULL;
3617 list_for_each_entry(tmp, &fs_info->space_info, list) {
3618 if (tmp->flags == types[i]) {
3619 info = tmp;
3620 break;
3621 }
3622 }
3623
3624 if (!info)
3625 continue;
3626
3627 down_read(&info->groups_sem);
3628 for (c = 0; c < BTRFS_NR_RAID_TYPES; c++) {
3629 if (!list_empty(&info->block_groups[c]))
3630 slot_count++;
3631 }
3632 up_read(&info->groups_sem);
3633 }
3634
3635 /*
3636 * Global block reserve, exported as a space_info
3637 */
3638 slot_count++;
3639
3640 /* space_slots == 0 means they are asking for a count */
3641 if (space_args.space_slots == 0) {
3642 space_args.total_spaces = slot_count;
3643 goto out;
3644 }
3645
3646 slot_count = min_t(u64, space_args.space_slots, slot_count);
3647
3648 alloc_size = sizeof(*dest) * slot_count;
3649
3650 /* we generally have at most 6 or so space infos, one for each raid
3651 * level. So, a whole page should be more than enough for everyone
3652 */
3653 if (alloc_size > PAGE_SIZE)
3654 return -ENOMEM;
3655
3656 space_args.total_spaces = 0;
3657 dest = kmalloc(alloc_size, GFP_KERNEL);
3658 if (!dest)
3659 return -ENOMEM;
3660 dest_orig = dest;
3661
3662 /* now we have a buffer to copy into */
3663 for (i = 0; i < num_types; i++) {
3664 struct btrfs_space_info *tmp;
3665
3666 if (!slot_count)
3667 break;
3668
3669 info = NULL;
3670 list_for_each_entry(tmp, &fs_info->space_info, list) {
3671 if (tmp->flags == types[i]) {
3672 info = tmp;
3673 break;
3674 }
3675 }
3676
3677 if (!info)
3678 continue;
3679 down_read(&info->groups_sem);
3680 for (c = 0; c < BTRFS_NR_RAID_TYPES; c++) {
3681 if (!list_empty(&info->block_groups[c])) {
3682 get_block_group_info(&info->block_groups[c],
3683 &space);
3684 memcpy(dest, &space, sizeof(space));
3685 dest++;
3686 space_args.total_spaces++;
3687 slot_count--;
3688 }
3689 if (!slot_count)
3690 break;
3691 }
3692 up_read(&info->groups_sem);
3693 }
3694
3695 /*
3696 * Add global block reserve
3697 */
3698 if (slot_count) {
3699 struct btrfs_block_rsv *block_rsv = &fs_info->global_block_rsv;
3700
3701 spin_lock(&block_rsv->lock);
3702 space.total_bytes = block_rsv->size;
3703 space.used_bytes = block_rsv->size - block_rsv->reserved;
3704 spin_unlock(&block_rsv->lock);
3705 space.flags = BTRFS_SPACE_INFO_GLOBAL_RSV;
3706 memcpy(dest, &space, sizeof(space));
3707 space_args.total_spaces++;
3708 }
3709
3710 user_dest = (struct btrfs_ioctl_space_info __user *)
3711 (arg + sizeof(struct btrfs_ioctl_space_args));
3712
3713 if (copy_to_user(user_dest, dest_orig, alloc_size))
3714 ret = -EFAULT;
3715
3716 kfree(dest_orig);
3717 out:
3718 if (ret == 0 && copy_to_user(arg, &space_args, sizeof(space_args)))
3719 ret = -EFAULT;
3720
3721 return ret;
3722 }
3723
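/*
 * Kick off an asynchronous commit of the currently running transaction,
 * if there is one, and report its transid back to userspace so that it
 * can later be waited on with the wait_sync ioctl.
 */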
3724 static noinline long btrfs_ioctl_start_sync(struct btrfs_root *root,
3725 void __user *argp)
3726 {
3727 struct btrfs_trans_handle *trans;
3728 u64 transid;
3729
3730 trans = btrfs_attach_transaction_barrier(root);
3731 if (IS_ERR(trans)) {
3732 if (PTR_ERR(trans) != -ENOENT)
3733 return PTR_ERR(trans);
3734
3735 /* No running transaction, don't bother */
3736 transid = root->fs_info->last_trans_committed;
3737 goto out;
3738 }
3739 transid = trans->transid;
3740 btrfs_commit_transaction_async(trans);
3741 out:
3742 if (argp)
3743 if (copy_to_user(argp, &transid, sizeof(transid)))
3744 return -EFAULT;
3745 return 0;
3746 }
3747
3748 static noinline long btrfs_ioctl_wait_sync(struct btrfs_fs_info *fs_info,
3749 void __user *argp)
3750 {
3751 u64 transid;
3752
3753 if (argp) {
3754 if (copy_from_user(&transid, argp, sizeof(transid)))
3755 return -EFAULT;
3756 } else {
3757 transid = 0; /* current trans */
3758 }
3759 return btrfs_wait_for_commit(fs_info, transid);
3760 }
3761
3762 static long btrfs_ioctl_scrub(struct file *file, void __user *arg)
3763 {
3764 struct btrfs_fs_info *fs_info = btrfs_sb(file_inode(file)->i_sb);
3765 struct btrfs_ioctl_scrub_args *sa;
3766 int ret;
3767
3768 if (!capable(CAP_SYS_ADMIN))
3769 return -EPERM;
3770
3771 sa = memdup_user(arg, sizeof(*sa));
3772 if (IS_ERR(sa))
3773 return PTR_ERR(sa);
3774
3775 if (!(sa->flags & BTRFS_SCRUB_READONLY)) {
3776 ret = mnt_want_write_file(file);
3777 if (ret)
3778 goto out;
3779 }
3780
3781 ret = btrfs_scrub_dev(fs_info, sa->devid, sa->start, sa->end,
3782 &sa->progress, sa->flags & BTRFS_SCRUB_READONLY,
3783 0);
3784
3785 /*
3786 * Copy scrub args to user space even if btrfs_scrub_dev() returned an
3787 * error. This is important as it allows user space to know how much
3788 * progress scrub has done. For example, if scrub is canceled we get
3789 * -ECANCELED from btrfs_scrub_dev() and return that error back to user
3790 * space. Later user space can inspect the progress from the structure
3791 * btrfs_ioctl_scrub_args and resume scrub from where it left off
3792 * previously (btrfs-progs does this).
3793 * If we fail to copy the btrfs_ioctl_scrub_args structure to user space
3794 * then return -EFAULT to signal the structure was not copied or it may
3795 * be corrupt and unreliable due to a partial copy.
3796 */
3797 if (copy_to_user(arg, sa, sizeof(*sa)))
3798 ret = -EFAULT;
3799
3800 if (!(sa->flags & BTRFS_SCRUB_READONLY))
3801 mnt_drop_write_file(file);
3802 out:
3803 kfree(sa);
3804 return ret;
3805 }
3806
3807 static long btrfs_ioctl_scrub_cancel(struct btrfs_fs_info *fs_info)
3808 {
3809 if (!capable(CAP_SYS_ADMIN))
3810 return -EPERM;
3811
3812 return btrfs_scrub_cancel(fs_info);
3813 }
3814
3815 static long btrfs_ioctl_scrub_progress(struct btrfs_fs_info *fs_info,
3816 void __user *arg)
3817 {
3818 struct btrfs_ioctl_scrub_args *sa;
3819 int ret;
3820
3821 if (!capable(CAP_SYS_ADMIN))
3822 return -EPERM;
3823
3824 sa = memdup_user(arg, sizeof(*sa));
3825 if (IS_ERR(sa))
3826 return PTR_ERR(sa);
3827
3828 ret = btrfs_scrub_progress(fs_info, sa->devid, &sa->progress);
3829
3830 if (ret == 0 && copy_to_user(arg, sa, sizeof(*sa)))
3831 ret = -EFAULT;
3832
3833 kfree(sa);
3834 return ret;
3835 }
3836
3837 static long btrfs_ioctl_get_dev_stats(struct btrfs_fs_info *fs_info,
3838 void __user *arg)
3839 {
3840 struct btrfs_ioctl_get_dev_stats *sa;
3841 int ret;
3842
3843 sa = memdup_user(arg, sizeof(*sa));
3844 if (IS_ERR(sa))
3845 return PTR_ERR(sa);
3846
3847 if ((sa->flags & BTRFS_DEV_STATS_RESET) && !capable(CAP_SYS_ADMIN)) {
3848 kfree(sa);
3849 return -EPERM;
3850 }
3851
3852 ret = btrfs_get_dev_stats(fs_info, sa);
3853
3854 if (ret == 0 && copy_to_user(arg, sa, sizeof(*sa)))
3855 ret = -EFAULT;
3856
3857 kfree(sa);
3858 return ret;
3859 }
3860
3861 static long btrfs_ioctl_dev_replace(struct btrfs_fs_info *fs_info,
3862 void __user *arg)
3863 {
3864 struct btrfs_ioctl_dev_replace_args *p;
3865 int ret;
3866
3867 if (!capable(CAP_SYS_ADMIN))
3868 return -EPERM;
3869
3870 p = memdup_user(arg, sizeof(*p));
3871 if (IS_ERR(p))
3872 return PTR_ERR(p);
3873
3874 switch (p->cmd) {
3875 case BTRFS_IOCTL_DEV_REPLACE_CMD_START:
3876 if (sb_rdonly(fs_info->sb)) {
3877 ret = -EROFS;
3878 goto out;
3879 }
3880 if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_DEV_REPLACE)) {
3881 ret = BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS;
3882 } else {
3883 ret = btrfs_dev_replace_by_ioctl(fs_info, p);
3884 btrfs_exclop_finish(fs_info);
3885 }
3886 break;
3887 case BTRFS_IOCTL_DEV_REPLACE_CMD_STATUS:
3888 btrfs_dev_replace_status(fs_info, p);
3889 ret = 0;
3890 break;
3891 case BTRFS_IOCTL_DEV_REPLACE_CMD_CANCEL:
3892 p->result = btrfs_dev_replace_cancel(fs_info);
3893 ret = 0;
3894 break;
3895 default:
3896 ret = -EINVAL;
3897 break;
3898 }
3899
3900 if ((ret == 0 || ret == -ECANCELED) && copy_to_user(arg, p, sizeof(*p)))
3901 ret = -EFAULT;
3902 out:
3903 kfree(p);
3904 return ret;
3905 }
3906
3907 static long btrfs_ioctl_ino_to_path(struct btrfs_root *root, void __user *arg)
3908 {
3909 int ret = 0;
3910 int i;
3911 u64 rel_ptr;
3912 int size;
3913 struct btrfs_ioctl_ino_path_args *ipa = NULL;
3914 struct inode_fs_paths *ipath = NULL;
3915 struct btrfs_path *path;
3916
3917 if (!capable(CAP_DAC_READ_SEARCH))
3918 return -EPERM;
3919
3920 path = btrfs_alloc_path();
3921 if (!path) {
3922 ret = -ENOMEM;
3923 goto out;
3924 }
3925
3926 ipa = memdup_user(arg, sizeof(*ipa));
3927 if (IS_ERR(ipa)) {
3928 ret = PTR_ERR(ipa);
3929 ipa = NULL;
3930 goto out;
3931 }
3932
3933 size = min_t(u32, ipa->size, 4096);
3934 ipath = init_ipath(size, root, path);
3935 if (IS_ERR(ipath)) {
3936 ret = PTR_ERR(ipath);
3937 ipath = NULL;
3938 goto out;
3939 }
3940
3941 ret = paths_from_inode(ipa->inum, ipath);
3942 if (ret < 0)
3943 goto out;
3944
3945 for (i = 0; i < ipath->fspath->elem_cnt; ++i) {
3946 rel_ptr = ipath->fspath->val[i] -
3947 (u64)(unsigned long)ipath->fspath->val;
3948 ipath->fspath->val[i] = rel_ptr;
3949 }
3950
3951 ret = copy_to_user((void __user *)(unsigned long)ipa->fspath,
3952 ipath->fspath, size);
3953 if (ret) {
3954 ret = -EFAULT;
3955 goto out;
3956 }
3957
3958 out:
3959 btrfs_free_path(path);
3960 free_ipath(ipath);
3961 kfree(ipa);
3962
3963 return ret;
3964 }
3965
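/*
 * Callback for iterate_inodes_from_logical(): append one
 * (inum, offset, root) triple to the data container, or account for it
 * as missed when the container is full.
 */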
3966 static int build_ino_list(u64 inum, u64 offset, u64 root, void *ctx)
3967 {
3968 struct btrfs_data_container *inodes = ctx;
3969 const size_t c = 3 * sizeof(u64);
3970
3971 if (inodes->bytes_left >= c) {
3972 inodes->bytes_left -= c;
3973 inodes->val[inodes->elem_cnt] = inum;
3974 inodes->val[inodes->elem_cnt + 1] = offset;
3975 inodes->val[inodes->elem_cnt + 2] = root;
3976 inodes->elem_cnt += 3;
3977 } else {
3978 inodes->bytes_missing += c - inodes->bytes_left;
3979 inodes->bytes_left = 0;
3980 inodes->elem_missed += 3;
3981 }
3982
3983 return 0;
3984 }
3985
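/*
 * Resolve a logical address to the (inode, offset, root) triples
 * referencing it. v1 caps the result buffer at 64KiB; v2 raises the cap
 * to 16MiB and accepts BTRFS_LOGICAL_INO_ARGS_IGNORE_OFFSET to resolve
 * all references to an extent regardless of the file offset.
 */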
3986 static long btrfs_ioctl_logical_to_ino(struct btrfs_fs_info *fs_info,
3987 void __user *arg, int version)
3988 {
3989 int ret = 0;
3990 int size;
3991 struct btrfs_ioctl_logical_ino_args *loi;
3992 struct btrfs_data_container *inodes = NULL;
3993 struct btrfs_path *path = NULL;
3994 bool ignore_offset;
3995
3996 if (!capable(CAP_SYS_ADMIN))
3997 return -EPERM;
3998
3999 loi = memdup_user(arg, sizeof(*loi));
4000 if (IS_ERR(loi))
4001 return PTR_ERR(loi);
4002
4003 if (version == 1) {
4004 ignore_offset = false;
4005 size = min_t(u32, loi->size, SZ_64K);
4006 } else {
4007 /* All reserved bits must be 0 for now */
4008 if (memchr_inv(loi->reserved, 0, sizeof(loi->reserved))) {
4009 ret = -EINVAL;
4010 goto out_loi;
4011 }
4012 /* Only accept flags we have defined so far */
4013 if (loi->flags & ~(BTRFS_LOGICAL_INO_ARGS_IGNORE_OFFSET)) {
4014 ret = -EINVAL;
4015 goto out_loi;
4016 }
4017 ignore_offset = loi->flags & BTRFS_LOGICAL_INO_ARGS_IGNORE_OFFSET;
4018 size = min_t(u32, loi->size, SZ_16M);
4019 }
4020
4021 path = btrfs_alloc_path();
4022 if (!path) {
4023 ret = -ENOMEM;
4024 goto out;
4025 }
4026
4027 inodes = init_data_container(size);
4028 if (IS_ERR(inodes)) {
4029 ret = PTR_ERR(inodes);
4030 inodes = NULL;
4031 goto out;
4032 }
4033
4034 ret = iterate_inodes_from_logical(loi->logical, fs_info, path,
4035 build_ino_list, inodes, ignore_offset);
4036 if (ret == -EINVAL)
4037 ret = -ENOENT;
4038 if (ret < 0)
4039 goto out;
4040
4041 ret = copy_to_user((void __user *)(unsigned long)loi->inodes, inodes,
4042 size);
4043 if (ret)
4044 ret = -EFAULT;
4045
4046 out:
4047 btrfs_free_path(path);
4048 kvfree(inodes);
4049 out_loi:
4050 kfree(loi);
4051
4052 return ret;
4053 }
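
/*
 * Illustrative userspace sketch (editor's addition): mapping a logical
 * address back to the inodes that reference it, using the v2 interface
 * with its larger buffer limit and the ignore-offset flag. Assumes
 * <linux/btrfs.h>; error handling trimmed.
 *
 *	struct btrfs_data_container *inodes = calloc(1, 1024 * 1024);
 *	struct btrfs_ioctl_logical_ino_args loi = {
 *		.logical = logical_addr,
 *		.size = 1024 * 1024,
 *		.flags = BTRFS_LOGICAL_INO_ARGS_IGNORE_OFFSET,
 *		.inodes = (unsigned long)inodes,
 *	};
 *	__u32 i;
 *
 *	if (ioctl(mount_fd, BTRFS_IOC_LOGICAL_INO_V2, &loi) == 0)
 *		for (i = 0; i < inodes->elem_cnt; i += 3)
 *			printf("inum %llu offset %llu root %llu\n",
 *			       (unsigned long long)inodes->val[i],
 *			       (unsigned long long)inodes->val[i + 1],
 *			       (unsigned long long)inodes->val[i + 2]);
 */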
4054
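/*
 * Fill @bargs with the flags, state and per-chunk-type filters of the
 * current balance. Both callers (btrfs_balance() and the progress ioctl
 * below) hold balance_mutex, which keeps fs_info->balance_ctl valid;
 * balance_lock is only needed for the stat counters that a running
 * balance updates concurrently.
 */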
4055 void btrfs_update_ioctl_balance_args(struct btrfs_fs_info *fs_info,
4056 struct btrfs_ioctl_balance_args *bargs)
4057 {
4058 struct btrfs_balance_control *bctl = fs_info->balance_ctl;
4059
4060 bargs->flags = bctl->flags;
4061
4062 if (test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags))
4063 bargs->state |= BTRFS_BALANCE_STATE_RUNNING;
4064 if (atomic_read(&fs_info->balance_pause_req))
4065 bargs->state |= BTRFS_BALANCE_STATE_PAUSE_REQ;
4066 if (atomic_read(&fs_info->balance_cancel_req))
4067 bargs->state |= BTRFS_BALANCE_STATE_CANCEL_REQ;
4068
4069 memcpy(&bargs->data, &bctl->data, sizeof(bargs->data));
4070 memcpy(&bargs->meta, &bctl->meta, sizeof(bargs->meta));
4071 memcpy(&bargs->sys, &bctl->sys, sizeof(bargs->sys));
4072
4073 spin_lock(&fs_info->balance_lock);
4074 memcpy(&bargs->stat, &bctl->stat, sizeof(bargs->stat));
4075 spin_unlock(&fs_info->balance_lock);
4076 }
4077
4078 static long btrfs_ioctl_balance(struct file *file, void __user *arg)
4079 {
4080 struct btrfs_root *root = BTRFS_I(file_inode(file))->root;
4081 struct btrfs_fs_info *fs_info = root->fs_info;
4082 struct btrfs_ioctl_balance_args *bargs;
4083 struct btrfs_balance_control *bctl;
4084 	bool need_unlock; /* for the exclusive operations lock */
4085 int ret;
4086
4087 if (!arg)
4088 btrfs_warn(fs_info,
4089 "IOC_BALANCE ioctl (v1) is deprecated and will be removed in kernel 5.18");
4090
4091 if (!capable(CAP_SYS_ADMIN))
4092 return -EPERM;
4093
4094 ret = mnt_want_write_file(file);
4095 if (ret)
4096 return ret;
4097
4098 again:
4099 if (btrfs_exclop_start(fs_info, BTRFS_EXCLOP_BALANCE)) {
4100 mutex_lock(&fs_info->balance_mutex);
4101 need_unlock = true;
4102 goto locked;
4103 }
4104
4105 /*
4106 	 * The exclusive operations lock is already held. Three possibilities:
4107 * (1) some other op is running
4108 * (2) balance is running
4109 * (3) balance is paused -- special case (think resume)
4110 */
4111 mutex_lock(&fs_info->balance_mutex);
4112 if (fs_info->balance_ctl) {
4113 /* this is either (2) or (3) */
4114 if (!test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)) {
4115 mutex_unlock(&fs_info->balance_mutex);
4116 /*
4117 			 * Lock released to allow other waiters to continue;
4118 			 * we'll reexamine the status once we retake it.
4119 */
4120 mutex_lock(&fs_info->balance_mutex);
4121
4122 if (fs_info->balance_ctl &&
4123 !test_bit(BTRFS_FS_BALANCE_RUNNING, &fs_info->flags)) {
4124 /* this is (3) */
4125 need_unlock = false;
4126 goto locked;
4127 }
4128
4129 mutex_unlock(&fs_info->balance_mutex);
4130 goto again;
4131 } else {
4132 /* this is (2) */
4133 mutex_unlock(&fs_info->balance_mutex);
4134 ret = -EINPROGRESS;
4135 goto out;
4136 }
4137 } else {
4138 /* this is (1) */
4139 mutex_unlock(&fs_info->balance_mutex);
4140 ret = BTRFS_ERROR_DEV_EXCL_RUN_IN_PROGRESS;
4141 goto out;
4142 }
4143
4144 locked:
4145
4146 if (arg) {
4147 bargs = memdup_user(arg, sizeof(*bargs));
4148 if (IS_ERR(bargs)) {
4149 ret = PTR_ERR(bargs);
4150 goto out_unlock;
4151 }
4152
4153 if (bargs->flags & BTRFS_BALANCE_RESUME) {
4154 if (!fs_info->balance_ctl) {
4155 ret = -ENOTCONN;
4156 goto out_bargs;
4157 }
4158
4159 bctl = fs_info->balance_ctl;
4160 spin_lock(&fs_info->balance_lock);
4161 bctl->flags |= BTRFS_BALANCE_RESUME;
4162 spin_unlock(&fs_info->balance_lock);
4163 btrfs_exclop_balance(fs_info, BTRFS_EXCLOP_BALANCE);
4164
4165 goto do_balance;
4166 }
4167 } else {
4168 bargs = NULL;
4169 }
4170
4171 if (fs_info->balance_ctl) {
4172 ret = -EINPROGRESS;
4173 goto out_bargs;
4174 }
4175
4176 bctl = kzalloc(sizeof(*bctl), GFP_KERNEL);
4177 if (!bctl) {
4178 ret = -ENOMEM;
4179 goto out_bargs;
4180 }
4181
4182 if (arg) {
4183 memcpy(&bctl->data, &bargs->data, sizeof(bctl->data));
4184 memcpy(&bctl->meta, &bargs->meta, sizeof(bctl->meta));
4185 memcpy(&bctl->sys, &bargs->sys, sizeof(bctl->sys));
4186
4187 bctl->flags = bargs->flags;
4188 } else {
4189 /* balance everything - no filters */
4190 bctl->flags |= BTRFS_BALANCE_TYPE_MASK;
4191 }
4192
4193 if (bctl->flags & ~(BTRFS_BALANCE_ARGS_MASK | BTRFS_BALANCE_TYPE_MASK)) {
4194 ret = -EINVAL;
4195 goto out_bctl;
4196 }
4197
4198 do_balance:
4199 /*
4200 * Ownership of bctl and exclusive operation goes to btrfs_balance.
4201 	 * bctl is freed in reset_balance_state, or, if the restriper was paused
4202 * all the way until unmount, in free_fs_info. The flag should be
4203 * cleared after reset_balance_state.
4204 */
4205 need_unlock = false;
4206
4207 ret = btrfs_balance(fs_info, bctl, bargs);
4208 bctl = NULL;
4209
4210 if ((ret == 0 || ret == -ECANCELED) && arg) {
4211 if (copy_to_user(arg, bargs, sizeof(*bargs)))
4212 ret = -EFAULT;
4213 }
4214
4215 out_bctl:
4216 kfree(bctl);
4217 out_bargs:
4218 kfree(bargs);
4219 out_unlock:
4220 mutex_unlock(&fs_info->balance_mutex);
4221 if (need_unlock)
4222 btrfs_exclop_finish(fs_info);
4223 out:
4224 mnt_drop_write_file(file);
4225 return ret;
4226 }
4227
4228 static long btrfs_ioctl_balance_ctl(struct btrfs_fs_info *fs_info, int cmd)
4229 {
4230 if (!capable(CAP_SYS_ADMIN))
4231 return -EPERM;
4232
4233 switch (cmd) {
4234 case BTRFS_BALANCE_CTL_PAUSE:
4235 return btrfs_pause_balance(fs_info);
4236 case BTRFS_BALANCE_CTL_CANCEL:
4237 return btrfs_cancel_balance(fs_info);
4238 }
4239
4240 return -EINVAL;
4241 }
4242
4243 static long btrfs_ioctl_balance_progress(struct btrfs_fs_info *fs_info,
4244 void __user *arg)
4245 {
4246 struct btrfs_ioctl_balance_args *bargs;
4247 int ret = 0;
4248
4249 if (!capable(CAP_SYS_ADMIN))
4250 return -EPERM;
4251
4252 mutex_lock(&fs_info->balance_mutex);
4253 if (!fs_info->balance_ctl) {
4254 ret = -ENOTCONN;
4255 goto out;
4256 }
4257
4258 bargs = kzalloc(sizeof(*bargs), GFP_KERNEL);
4259 if (!bargs) {
4260 ret = -ENOMEM;
4261 goto out;
4262 }
4263
4264 btrfs_update_ioctl_balance_args(fs_info, bargs);
4265
4266 if (copy_to_user(arg, bargs, sizeof(*bargs)))
4267 ret = -EFAULT;
4268
4269 kfree(bargs);
4270 out:
4271 mutex_unlock(&fs_info->balance_mutex);
4272 return ret;
4273 }
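
/*
 * Illustrative userspace sketch (editor's addition): polling the
 * progress of a running balance. The stat fields are the counters
 * copied under balance_lock above. Assumes <linux/btrfs.h>.
 *
 *	struct btrfs_ioctl_balance_args bargs;
 *
 *	if (ioctl(mount_fd, BTRFS_IOC_BALANCE_PROGRESS, &bargs) == 0)
 *		printf("%llu of %llu chunks done (%llu considered)\n",
 *		       (unsigned long long)bargs.stat.completed,
 *		       (unsigned long long)bargs.stat.expected,
 *		       (unsigned long long)bargs.stat.considered);
 */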
4274
4275 static long btrfs_ioctl_quota_ctl(struct file *file, void __user *arg)
4276 {
4277 struct inode *inode = file_inode(file);
4278 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
4279 struct btrfs_ioctl_quota_ctl_args *sa;
4280 int ret;
4281
4282 if (!capable(CAP_SYS_ADMIN))
4283 return -EPERM;
4284
4285 ret = mnt_want_write_file(file);
4286 if (ret)
4287 return ret;
4288
4289 sa = memdup_user(arg, sizeof(*sa));
4290 if (IS_ERR(sa)) {
4291 ret = PTR_ERR(sa);
4292 goto drop_write;
4293 }
4294
4295 down_write(&fs_info->subvol_sem);
4296
4297 switch (sa->cmd) {
4298 case BTRFS_QUOTA_CTL_ENABLE:
4299 ret = btrfs_quota_enable(fs_info);
4300 break;
4301 case BTRFS_QUOTA_CTL_DISABLE:
4302 ret = btrfs_quota_disable(fs_info);
4303 break;
4304 default:
4305 ret = -EINVAL;
4306 break;
4307 }
4308
4309 kfree(sa);
4310 up_write(&fs_info->subvol_sem);
4311 drop_write:
4312 mnt_drop_write_file(file);
4313 return ret;
4314 }
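
/*
 * Illustrative userspace sketch (editor's addition): enabling quotas,
 * equivalent to "btrfs quota enable". Assumes <linux/btrfs.h>.
 *
 *	struct btrfs_ioctl_quota_ctl_args qca = {
 *		.cmd = BTRFS_QUOTA_CTL_ENABLE,
 *	};
 *
 *	if (ioctl(mount_fd, BTRFS_IOC_QUOTA_CTL, &qca) < 0)
 *		perror("quota enable");
 */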
4315
4316 static long btrfs_ioctl_qgroup_assign(struct file *file, void __user *arg)
4317 {
4318 struct inode *inode = file_inode(file);
4319 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
4320 struct btrfs_root *root = BTRFS_I(inode)->root;
4321 struct btrfs_ioctl_qgroup_assign_args *sa;
4322 struct btrfs_trans_handle *trans;
4323 int ret;
4324 int err;
4325
4326 if (!capable(CAP_SYS_ADMIN))
4327 return -EPERM;
4328
4329 ret = mnt_want_write_file(file);
4330 if (ret)
4331 return ret;
4332
4333 sa = memdup_user(arg, sizeof(*sa));
4334 if (IS_ERR(sa)) {
4335 ret = PTR_ERR(sa);
4336 goto drop_write;
4337 }
4338
4339 trans = btrfs_join_transaction(root);
4340 if (IS_ERR(trans)) {
4341 ret = PTR_ERR(trans);
4342 goto out;
4343 }
4344
4345 if (sa->assign) {
4346 ret = btrfs_add_qgroup_relation(trans, sa->src, sa->dst);
4347 } else {
4348 ret = btrfs_del_qgroup_relation(trans, sa->src, sa->dst);
4349 }
4350
4351 /* update qgroup status and info */
4352 err = btrfs_run_qgroups(trans);
4353 if (err < 0)
4354 btrfs_handle_fs_error(fs_info, err,
4355 "failed to update qgroup status and info");
4356 err = btrfs_end_transaction(trans);
4357 if (err && !ret)
4358 ret = err;
4359
4360 out:
4361 kfree(sa);
4362 drop_write:
4363 mnt_drop_write_file(file);
4364 return ret;
4365 }
4366
4367 static long btrfs_ioctl_qgroup_create(struct file *file, void __user *arg)
4368 {
4369 struct inode *inode = file_inode(file);
4370 struct btrfs_root *root = BTRFS_I(inode)->root;
4371 struct btrfs_ioctl_qgroup_create_args *sa;
4372 struct btrfs_trans_handle *trans;
4373 int ret;
4374 int err;
4375
4376 if (!capable(CAP_SYS_ADMIN))
4377 return -EPERM;
4378
4379 ret = mnt_want_write_file(file);
4380 if (ret)
4381 return ret;
4382
4383 sa = memdup_user(arg, sizeof(*sa));
4384 if (IS_ERR(sa)) {
4385 ret = PTR_ERR(sa);
4386 goto drop_write;
4387 }
4388
4389 if (!sa->qgroupid) {
4390 ret = -EINVAL;
4391 goto out;
4392 }
4393
4394 trans = btrfs_join_transaction(root);
4395 if (IS_ERR(trans)) {
4396 ret = PTR_ERR(trans);
4397 goto out;
4398 }
4399
4400 if (sa->create) {
4401 ret = btrfs_create_qgroup(trans, sa->qgroupid);
4402 } else {
4403 ret = btrfs_remove_qgroup(trans, sa->qgroupid);
4404 }
4405
4406 err = btrfs_end_transaction(trans);
4407 if (err && !ret)
4408 ret = err;
4409
4410 out:
4411 kfree(sa);
4412 drop_write:
4413 mnt_drop_write_file(file);
4414 return ret;
4415 }
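
/*
 * Illustrative userspace sketch (editor's addition): creating the level-1
 * qgroup "1/100", as "btrfs qgroup create 1/100" would. The level lives
 * in the top 16 bits of the qgroupid. Assumes <linux/btrfs.h>.
 *
 *	struct btrfs_ioctl_qgroup_create_args qcreate = {
 *		.create = 1,
 *		.qgroupid = (1ULL << 48) | 100,
 *	};
 *
 *	if (ioctl(mount_fd, BTRFS_IOC_QGROUP_CREATE, &qcreate) < 0)
 *		perror("qgroup create");
 */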
4416
4417 static long btrfs_ioctl_qgroup_limit(struct file *file, void __user *arg)
4418 {
4419 struct inode *inode = file_inode(file);
4420 struct btrfs_root *root = BTRFS_I(inode)->root;
4421 struct btrfs_ioctl_qgroup_limit_args *sa;
4422 struct btrfs_trans_handle *trans;
4423 int ret;
4424 int err;
4425 u64 qgroupid;
4426
4427 if (!capable(CAP_SYS_ADMIN))
4428 return -EPERM;
4429
4430 ret = mnt_want_write_file(file);
4431 if (ret)
4432 return ret;
4433
4434 sa = memdup_user(arg, sizeof(*sa));
4435 if (IS_ERR(sa)) {
4436 ret = PTR_ERR(sa);
4437 goto drop_write;
4438 }
4439
4440 trans = btrfs_join_transaction(root);
4441 if (IS_ERR(trans)) {
4442 ret = PTR_ERR(trans);
4443 goto out;
4444 }
4445
4446 qgroupid = sa->qgroupid;
4447 if (!qgroupid) {
4448 		/* Take the current subvolume as the qgroup */
4449 qgroupid = root->root_key.objectid;
4450 }
4451
4452 ret = btrfs_limit_qgroup(trans, qgroupid, &sa->lim);
4453
4454 err = btrfs_end_transaction(trans);
4455 if (err && !ret)
4456 ret = err;
4457
4458 out:
4459 kfree(sa);
4460 drop_write:
4461 mnt_drop_write_file(file);
4462 return ret;
4463 }
4464
4465 static long btrfs_ioctl_quota_rescan(struct file *file, void __user *arg)
4466 {
4467 struct inode *inode = file_inode(file);
4468 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
4469 struct btrfs_ioctl_quota_rescan_args *qsa;
4470 int ret;
4471
4472 if (!capable(CAP_SYS_ADMIN))
4473 return -EPERM;
4474
4475 ret = mnt_want_write_file(file);
4476 if (ret)
4477 return ret;
4478
4479 qsa = memdup_user(arg, sizeof(*qsa));
4480 if (IS_ERR(qsa)) {
4481 ret = PTR_ERR(qsa);
4482 goto drop_write;
4483 }
4484
4485 if (qsa->flags) {
4486 ret = -EINVAL;
4487 goto out;
4488 }
4489
4490 ret = btrfs_qgroup_rescan(fs_info);
4491
4492 out:
4493 kfree(qsa);
4494 drop_write:
4495 mnt_drop_write_file(file);
4496 return ret;
4497 }
4498
4499 static long btrfs_ioctl_quota_rescan_status(struct btrfs_fs_info *fs_info,
4500 void __user *arg)
4501 {
4502 struct btrfs_ioctl_quota_rescan_args qsa = {0};
4503
4504 if (!capable(CAP_SYS_ADMIN))
4505 return -EPERM;
4506
4507 if (fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_RESCAN) {
4508 qsa.flags = 1;
4509 qsa.progress = fs_info->qgroup_rescan_progress.objectid;
4510 }
4511
4512 if (copy_to_user(arg, &qsa, sizeof(qsa)))
4513 return -EFAULT;
4514
4515 return 0;
4516 }
4517
4518 static long btrfs_ioctl_quota_rescan_wait(struct btrfs_fs_info *fs_info,
4519 void __user *arg)
4520 {
4521 if (!capable(CAP_SYS_ADMIN))
4522 return -EPERM;
4523
4524 return btrfs_qgroup_wait_for_completion(fs_info, true);
4525 }
4526
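/*
 * Stamp a subvolume with the uuid and transids of the subvolume it was
 * received from, plus the receive time. This is what "btrfs receive"
 * invokes once a send stream has been fully applied; the received uuid
 * is also inserted into the uuid tree so the subvolume can be looked up
 * by it later.
 */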
4527 static long _btrfs_ioctl_set_received_subvol(struct file *file,
4528 struct user_namespace *mnt_userns,
4529 struct btrfs_ioctl_received_subvol_args *sa)
4530 {
4531 struct inode *inode = file_inode(file);
4532 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
4533 struct btrfs_root *root = BTRFS_I(inode)->root;
4534 struct btrfs_root_item *root_item = &root->root_item;
4535 struct btrfs_trans_handle *trans;
4536 struct timespec64 ct = current_time(inode);
4537 int ret = 0;
4538 int received_uuid_changed;
4539
4540 if (!inode_owner_or_capable(mnt_userns, inode))
4541 return -EPERM;
4542
4543 ret = mnt_want_write_file(file);
4544 if (ret < 0)
4545 return ret;
4546
4547 down_write(&fs_info->subvol_sem);
4548
4549 if (btrfs_ino(BTRFS_I(inode)) != BTRFS_FIRST_FREE_OBJECTID) {
4550 ret = -EINVAL;
4551 goto out;
4552 }
4553
4554 if (btrfs_root_readonly(root)) {
4555 ret = -EROFS;
4556 goto out;
4557 }
4558
4559 /*
4560 * 1 - root item
4561 * 2 - uuid items (received uuid + subvol uuid)
4562 */
4563 trans = btrfs_start_transaction(root, 3);
4564 if (IS_ERR(trans)) {
4565 ret = PTR_ERR(trans);
4566 trans = NULL;
4567 goto out;
4568 }
4569
4570 sa->rtransid = trans->transid;
4571 sa->rtime.sec = ct.tv_sec;
4572 sa->rtime.nsec = ct.tv_nsec;
4573
4574 received_uuid_changed = memcmp(root_item->received_uuid, sa->uuid,
4575 BTRFS_UUID_SIZE);
4576 if (received_uuid_changed &&
4577 !btrfs_is_empty_uuid(root_item->received_uuid)) {
4578 ret = btrfs_uuid_tree_remove(trans, root_item->received_uuid,
4579 BTRFS_UUID_KEY_RECEIVED_SUBVOL,
4580 root->root_key.objectid);
4581 if (ret && ret != -ENOENT) {
4582 btrfs_abort_transaction(trans, ret);
4583 btrfs_end_transaction(trans);
4584 goto out;
4585 }
4586 }
4587 memcpy(root_item->received_uuid, sa->uuid, BTRFS_UUID_SIZE);
4588 btrfs_set_root_stransid(root_item, sa->stransid);
4589 btrfs_set_root_rtransid(root_item, sa->rtransid);
4590 btrfs_set_stack_timespec_sec(&root_item->stime, sa->stime.sec);
4591 btrfs_set_stack_timespec_nsec(&root_item->stime, sa->stime.nsec);
4592 btrfs_set_stack_timespec_sec(&root_item->rtime, sa->rtime.sec);
4593 btrfs_set_stack_timespec_nsec(&root_item->rtime, sa->rtime.nsec);
4594
4595 ret = btrfs_update_root(trans, fs_info->tree_root,
4596 &root->root_key, &root->root_item);
4597 if (ret < 0) {
4598 btrfs_end_transaction(trans);
4599 goto out;
4600 }
4601 if (received_uuid_changed && !btrfs_is_empty_uuid(sa->uuid)) {
4602 ret = btrfs_uuid_tree_add(trans, sa->uuid,
4603 BTRFS_UUID_KEY_RECEIVED_SUBVOL,
4604 root->root_key.objectid);
4605 if (ret < 0 && ret != -EEXIST) {
4606 btrfs_abort_transaction(trans, ret);
4607 btrfs_end_transaction(trans);
4608 goto out;
4609 }
4610 }
4611 ret = btrfs_commit_transaction(trans);
4612 out:
4613 up_write(&fs_info->subvol_sem);
4614 mnt_drop_write_file(file);
4615 return ret;
4616 }
4617
4618 #ifdef CONFIG_64BIT
4619 static long btrfs_ioctl_set_received_subvol_32(struct file *file,
4620 void __user *arg)
4621 {
4622 struct btrfs_ioctl_received_subvol_args_32 *args32 = NULL;
4623 struct btrfs_ioctl_received_subvol_args *args64 = NULL;
4624 int ret = 0;
4625
4626 args32 = memdup_user(arg, sizeof(*args32));
4627 if (IS_ERR(args32))
4628 return PTR_ERR(args32);
4629
4630 args64 = kmalloc(sizeof(*args64), GFP_KERNEL);
4631 if (!args64) {
4632 ret = -ENOMEM;
4633 goto out;
4634 }
4635
4636 memcpy(args64->uuid, args32->uuid, BTRFS_UUID_SIZE);
4637 args64->stransid = args32->stransid;
4638 args64->rtransid = args32->rtransid;
4639 args64->stime.sec = args32->stime.sec;
4640 args64->stime.nsec = args32->stime.nsec;
4641 args64->rtime.sec = args32->rtime.sec;
4642 args64->rtime.nsec = args32->rtime.nsec;
4643 args64->flags = args32->flags;
4644
4645 ret = _btrfs_ioctl_set_received_subvol(file, file_mnt_user_ns(file), args64);
4646 if (ret)
4647 goto out;
4648
4649 memcpy(args32->uuid, args64->uuid, BTRFS_UUID_SIZE);
4650 args32->stransid = args64->stransid;
4651 args32->rtransid = args64->rtransid;
4652 args32->stime.sec = args64->stime.sec;
4653 args32->stime.nsec = args64->stime.nsec;
4654 args32->rtime.sec = args64->rtime.sec;
4655 args32->rtime.nsec = args64->rtime.nsec;
4656 args32->flags = args64->flags;
4657
4658 ret = copy_to_user(arg, args32, sizeof(*args32));
4659 if (ret)
4660 ret = -EFAULT;
4661
4662 out:
4663 kfree(args32);
4664 kfree(args64);
4665 return ret;
4666 }
4667 #endif
4668
4669 static long btrfs_ioctl_set_received_subvol(struct file *file,
4670 void __user *arg)
4671 {
4672 struct btrfs_ioctl_received_subvol_args *sa = NULL;
4673 int ret = 0;
4674
4675 sa = memdup_user(arg, sizeof(*sa));
4676 if (IS_ERR(sa))
4677 return PTR_ERR(sa);
4678
4679 ret = _btrfs_ioctl_set_received_subvol(file, file_mnt_user_ns(file), sa);
4680
4681 if (ret)
4682 goto out;
4683
4684 ret = copy_to_user(arg, sa, sizeof(*sa));
4685 if (ret)
4686 ret = -EFAULT;
4687
4688 out:
4689 kfree(sa);
4690 return ret;
4691 }
4692
4693 static int btrfs_ioctl_get_fslabel(struct btrfs_fs_info *fs_info,
4694 void __user *arg)
4695 {
4696 size_t len;
4697 int ret;
4698 char label[BTRFS_LABEL_SIZE];
4699
4700 spin_lock(&fs_info->super_lock);
4701 memcpy(label, fs_info->super_copy->label, BTRFS_LABEL_SIZE);
4702 spin_unlock(&fs_info->super_lock);
4703
4704 len = strnlen(label, BTRFS_LABEL_SIZE);
4705
4706 if (len == BTRFS_LABEL_SIZE) {
4707 btrfs_warn(fs_info,
4708 "label is too long, return the first %zu bytes",
4709 --len);
4710 }
4711
4712 ret = copy_to_user(arg, label, len);
4713
4714 return ret ? -EFAULT : 0;
4715 }
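
/*
 * Illustrative userspace sketch (editor's addition): FS_IOC_GETFSLABEL
 * is the generic label ioctl from <linux/fs.h>, so the same call works
 * on other filesystems. Note the handler above copies only the label
 * bytes, without a terminating NUL, so zero the buffer first.
 *
 *	char label[FSLABEL_MAX] = { 0 };
 *
 *	if (ioctl(fd, FS_IOC_GETFSLABEL, label) == 0)
 *		printf("label: %s\n", label);
 */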
4716
4717 static int btrfs_ioctl_set_fslabel(struct file *file, void __user *arg)
4718 {
4719 struct inode *inode = file_inode(file);
4720 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
4721 struct btrfs_root *root = BTRFS_I(inode)->root;
4722 struct btrfs_super_block *super_block = fs_info->super_copy;
4723 struct btrfs_trans_handle *trans;
4724 char label[BTRFS_LABEL_SIZE];
4725 int ret;
4726
4727 if (!capable(CAP_SYS_ADMIN))
4728 return -EPERM;
4729
4730 if (copy_from_user(label, arg, sizeof(label)))
4731 return -EFAULT;
4732
4733 if (strnlen(label, BTRFS_LABEL_SIZE) == BTRFS_LABEL_SIZE) {
4734 btrfs_err(fs_info,
4735 "unable to set label with more than %d bytes",
4736 BTRFS_LABEL_SIZE - 1);
4737 return -EINVAL;
4738 }
4739
4740 ret = mnt_want_write_file(file);
4741 if (ret)
4742 return ret;
4743
4744 trans = btrfs_start_transaction(root, 0);
4745 if (IS_ERR(trans)) {
4746 ret = PTR_ERR(trans);
4747 goto out_unlock;
4748 }
4749
4750 spin_lock(&fs_info->super_lock);
4751 strcpy(super_block->label, label);
4752 spin_unlock(&fs_info->super_lock);
4753 ret = btrfs_commit_transaction(trans);
4754
4755 out_unlock:
4756 mnt_drop_write_file(file);
4757 return ret;
4758 }
4759
4760 #define INIT_FEATURE_FLAGS(suffix) \
4761 { .compat_flags = BTRFS_FEATURE_COMPAT_##suffix, \
4762 .compat_ro_flags = BTRFS_FEATURE_COMPAT_RO_##suffix, \
4763 .incompat_flags = BTRFS_FEATURE_INCOMPAT_##suffix }
4764
4765 int btrfs_ioctl_get_supported_features(void __user *arg)
4766 {
4767 static const struct btrfs_ioctl_feature_flags features[3] = {
4768 INIT_FEATURE_FLAGS(SUPP),
4769 INIT_FEATURE_FLAGS(SAFE_SET),
4770 INIT_FEATURE_FLAGS(SAFE_CLEAR)
4771 };
4772
4773 if (copy_to_user(arg, &features, sizeof(features)))
4774 return -EFAULT;
4775
4776 return 0;
4777 }
4778
4779 static int btrfs_ioctl_get_features(struct btrfs_fs_info *fs_info,
4780 void __user *arg)
4781 {
4782 struct btrfs_super_block *super_block = fs_info->super_copy;
4783 struct btrfs_ioctl_feature_flags features;
4784
4785 features.compat_flags = btrfs_super_compat_flags(super_block);
4786 features.compat_ro_flags = btrfs_super_compat_ro_flags(super_block);
4787 features.incompat_flags = btrfs_super_incompat_flags(super_block);
4788
4789 if (copy_to_user(arg, &features, sizeof(features)))
4790 return -EFAULT;
4791
4792 return 0;
4793 }
4794
4795 static int check_feature_bits(struct btrfs_fs_info *fs_info,
4796 enum btrfs_feature_set set,
4797 u64 change_mask, u64 flags, u64 supported_flags,
4798 u64 safe_set, u64 safe_clear)
4799 {
4800 const char *type = btrfs_feature_set_name(set);
4801 char *names;
4802 u64 disallowed, unsupported;
4803 u64 set_mask = flags & change_mask;
4804 u64 clear_mask = ~flags & change_mask;
4805
4806 unsupported = set_mask & ~supported_flags;
4807 if (unsupported) {
4808 names = btrfs_printable_features(set, unsupported);
4809 if (names) {
4810 btrfs_warn(fs_info,
4811 "this kernel does not support the %s feature bit%s",
4812 names, strchr(names, ',') ? "s" : "");
4813 kfree(names);
4814 } else
4815 btrfs_warn(fs_info,
4816 "this kernel does not support %s bits 0x%llx",
4817 type, unsupported);
4818 return -EOPNOTSUPP;
4819 }
4820
4821 disallowed = set_mask & ~safe_set;
4822 if (disallowed) {
4823 names = btrfs_printable_features(set, disallowed);
4824 if (names) {
4825 btrfs_warn(fs_info,
4826 "can't set the %s feature bit%s while mounted",
4827 names, strchr(names, ',') ? "s" : "");
4828 kfree(names);
4829 } else
4830 btrfs_warn(fs_info,
4831 "can't set %s bits 0x%llx while mounted",
4832 type, disallowed);
4833 return -EPERM;
4834 }
4835
4836 disallowed = clear_mask & ~safe_clear;
4837 if (disallowed) {
4838 names = btrfs_printable_features(set, disallowed);
4839 if (names) {
4840 btrfs_warn(fs_info,
4841 "can't clear the %s feature bit%s while mounted",
4842 names, strchr(names, ',') ? "s" : "");
4843 kfree(names);
4844 } else
4845 btrfs_warn(fs_info,
4846 "can't clear %s bits 0x%llx while mounted",
4847 type, disallowed);
4848 return -EPERM;
4849 }
4850
4851 return 0;
4852 }
4853
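/*
 * Expand the SUPP/SAFE_SET/SAFE_CLEAR masks of one feature set (COMPAT,
 * COMPAT_RO or INCOMPAT) and hand them to check_feature_bits().
 */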
4854 #define check_feature(fs_info, change_mask, flags, mask_base) \
4855 check_feature_bits(fs_info, FEAT_##mask_base, change_mask, flags, \
4856 BTRFS_FEATURE_ ## mask_base ## _SUPP, \
4857 BTRFS_FEATURE_ ## mask_base ## _SAFE_SET, \
4858 BTRFS_FEATURE_ ## mask_base ## _SAFE_CLEAR)
4859
4860 static int btrfs_ioctl_set_features(struct file *file, void __user *arg)
4861 {
4862 struct inode *inode = file_inode(file);
4863 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
4864 struct btrfs_root *root = BTRFS_I(inode)->root;
4865 struct btrfs_super_block *super_block = fs_info->super_copy;
4866 struct btrfs_ioctl_feature_flags flags[2];
4867 struct btrfs_trans_handle *trans;
4868 u64 newflags;
4869 int ret;
4870
4871 if (!capable(CAP_SYS_ADMIN))
4872 return -EPERM;
4873
4874 if (copy_from_user(flags, arg, sizeof(flags)))
4875 return -EFAULT;
4876
4877 /* Nothing to do */
4878 if (!flags[0].compat_flags && !flags[0].compat_ro_flags &&
4879 !flags[0].incompat_flags)
4880 return 0;
4881
4882 ret = check_feature(fs_info, flags[0].compat_flags,
4883 flags[1].compat_flags, COMPAT);
4884 if (ret)
4885 return ret;
4886
4887 ret = check_feature(fs_info, flags[0].compat_ro_flags,
4888 flags[1].compat_ro_flags, COMPAT_RO);
4889 if (ret)
4890 return ret;
4891
4892 ret = check_feature(fs_info, flags[0].incompat_flags,
4893 flags[1].incompat_flags, INCOMPAT);
4894 if (ret)
4895 return ret;
4896
4897 ret = mnt_want_write_file(file);
4898 if (ret)
4899 return ret;
4900
4901 trans = btrfs_start_transaction(root, 0);
4902 if (IS_ERR(trans)) {
4903 ret = PTR_ERR(trans);
4904 goto out_drop_write;
4905 }
4906
4907 spin_lock(&fs_info->super_lock);
4908 newflags = btrfs_super_compat_flags(super_block);
4909 newflags |= flags[0].compat_flags & flags[1].compat_flags;
4910 newflags &= ~(flags[0].compat_flags & ~flags[1].compat_flags);
4911 btrfs_set_super_compat_flags(super_block, newflags);
4912
4913 newflags = btrfs_super_compat_ro_flags(super_block);
4914 newflags |= flags[0].compat_ro_flags & flags[1].compat_ro_flags;
4915 newflags &= ~(flags[0].compat_ro_flags & ~flags[1].compat_ro_flags);
4916 btrfs_set_super_compat_ro_flags(super_block, newflags);
4917
4918 newflags = btrfs_super_incompat_flags(super_block);
4919 newflags |= flags[0].incompat_flags & flags[1].incompat_flags;
4920 newflags &= ~(flags[0].incompat_flags & ~flags[1].incompat_flags);
4921 btrfs_set_super_incompat_flags(super_block, newflags);
4922 spin_unlock(&fs_info->super_lock);
4923
4924 ret = btrfs_commit_transaction(trans);
4925 out_drop_write:
4926 mnt_drop_write_file(file);
4927
4928 return ret;
4929 }
4930
4931 static int _btrfs_ioctl_send(struct file *file, void __user *argp, bool compat)
4932 {
4933 struct btrfs_ioctl_send_args *arg;
4934 int ret;
4935
4936 if (compat) {
4937 #if defined(CONFIG_64BIT) && defined(CONFIG_COMPAT)
4938 struct btrfs_ioctl_send_args_32 args32;
4939
4940 ret = copy_from_user(&args32, argp, sizeof(args32));
4941 if (ret)
4942 return -EFAULT;
4943 arg = kzalloc(sizeof(*arg), GFP_KERNEL);
4944 if (!arg)
4945 return -ENOMEM;
4946 arg->send_fd = args32.send_fd;
4947 arg->clone_sources_count = args32.clone_sources_count;
4948 arg->clone_sources = compat_ptr(args32.clone_sources);
4949 arg->parent_root = args32.parent_root;
4950 arg->flags = args32.flags;
4951 memcpy(arg->reserved, args32.reserved,
4952 sizeof(args32.reserved));
4953 #else
4954 return -ENOTTY;
4955 #endif
4956 } else {
4957 arg = memdup_user(argp, sizeof(*arg));
4958 if (IS_ERR(arg))
4959 return PTR_ERR(arg);
4960 }
4961 ret = btrfs_ioctl_send(file, arg);
4962 kfree(arg);
4963 return ret;
4964 }
4965
4966 long btrfs_ioctl(struct file *file, unsigned int cmd,
4967 		 unsigned long arg)
4968 {
4969 struct inode *inode = file_inode(file);
4970 struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
4971 struct btrfs_root *root = BTRFS_I(inode)->root;
4972 void __user *argp = (void __user *)arg;
4973
4974 switch (cmd) {
4975 case FS_IOC_GETVERSION:
4976 return btrfs_ioctl_getversion(file, argp);
4977 case FS_IOC_GETFSLABEL:
4978 return btrfs_ioctl_get_fslabel(fs_info, argp);
4979 case FS_IOC_SETFSLABEL:
4980 return btrfs_ioctl_set_fslabel(file, argp);
4981 case FITRIM:
4982 return btrfs_ioctl_fitrim(fs_info, argp);
4983 case BTRFS_IOC_SNAP_CREATE:
4984 return btrfs_ioctl_snap_create(file, argp, 0);
4985 case BTRFS_IOC_SNAP_CREATE_V2:
4986 return btrfs_ioctl_snap_create_v2(file, argp, 0);
4987 case BTRFS_IOC_SUBVOL_CREATE:
4988 return btrfs_ioctl_snap_create(file, argp, 1);
4989 case BTRFS_IOC_SUBVOL_CREATE_V2:
4990 return btrfs_ioctl_snap_create_v2(file, argp, 1);
4991 case BTRFS_IOC_SNAP_DESTROY:
4992 return btrfs_ioctl_snap_destroy(file, argp, false);
4993 case BTRFS_IOC_SNAP_DESTROY_V2:
4994 return btrfs_ioctl_snap_destroy(file, argp, true);
4995 case BTRFS_IOC_SUBVOL_GETFLAGS:
4996 return btrfs_ioctl_subvol_getflags(file, argp);
4997 case BTRFS_IOC_SUBVOL_SETFLAGS:
4998 return btrfs_ioctl_subvol_setflags(file, argp);
4999 case BTRFS_IOC_DEFAULT_SUBVOL:
5000 return btrfs_ioctl_default_subvol(file, argp);
5001 case BTRFS_IOC_DEFRAG:
5002 return btrfs_ioctl_defrag(file, NULL);
5003 case BTRFS_IOC_DEFRAG_RANGE:
5004 return btrfs_ioctl_defrag(file, argp);
5005 case BTRFS_IOC_RESIZE:
5006 return btrfs_ioctl_resize(file, argp);
5007 case BTRFS_IOC_ADD_DEV:
5008 return btrfs_ioctl_add_dev(fs_info, argp);
5009 case BTRFS_IOC_RM_DEV:
5010 return btrfs_ioctl_rm_dev(file, argp);
5011 case BTRFS_IOC_RM_DEV_V2:
5012 return btrfs_ioctl_rm_dev_v2(file, argp);
5013 case BTRFS_IOC_FS_INFO:
5014 return btrfs_ioctl_fs_info(fs_info, argp);
5015 case BTRFS_IOC_DEV_INFO:
5016 return btrfs_ioctl_dev_info(fs_info, argp);
5017 case BTRFS_IOC_BALANCE:
5018 return btrfs_ioctl_balance(file, NULL);
5019 case BTRFS_IOC_TREE_SEARCH:
5020 return btrfs_ioctl_tree_search(file, argp);
5021 case BTRFS_IOC_TREE_SEARCH_V2:
5022 return btrfs_ioctl_tree_search_v2(file, argp);
5023 case BTRFS_IOC_INO_LOOKUP:
5024 return btrfs_ioctl_ino_lookup(file, argp);
5025 case BTRFS_IOC_INO_PATHS:
5026 return btrfs_ioctl_ino_to_path(root, argp);
5027 case BTRFS_IOC_LOGICAL_INO:
5028 return btrfs_ioctl_logical_to_ino(fs_info, argp, 1);
5029 case BTRFS_IOC_LOGICAL_INO_V2:
5030 return btrfs_ioctl_logical_to_ino(fs_info, argp, 2);
5031 case BTRFS_IOC_SPACE_INFO:
5032 return btrfs_ioctl_space_info(fs_info, argp);
5033 case BTRFS_IOC_SYNC: {
5034 int ret;
5035
5036 ret = btrfs_start_delalloc_roots(fs_info, LONG_MAX, false);
5037 if (ret)
5038 return ret;
5039 ret = btrfs_sync_fs(inode->i_sb, 1);
5040 /*
5041 * The transaction thread may want to do more work,
5042 * namely it pokes the cleaner kthread that will start
5043 * processing uncleaned subvols.
5044 */
5045 wake_up_process(fs_info->transaction_kthread);
5046 return ret;
5047 }
5048 case BTRFS_IOC_START_SYNC:
5049 return btrfs_ioctl_start_sync(root, argp);
5050 case BTRFS_IOC_WAIT_SYNC:
5051 return btrfs_ioctl_wait_sync(fs_info, argp);
5052 case BTRFS_IOC_SCRUB:
5053 return btrfs_ioctl_scrub(file, argp);
5054 case BTRFS_IOC_SCRUB_CANCEL:
5055 return btrfs_ioctl_scrub_cancel(fs_info);
5056 case BTRFS_IOC_SCRUB_PROGRESS:
5057 return btrfs_ioctl_scrub_progress(fs_info, argp);
5058 case BTRFS_IOC_BALANCE_V2:
5059 return btrfs_ioctl_balance(file, argp);
5060 case BTRFS_IOC_BALANCE_CTL:
5061 return btrfs_ioctl_balance_ctl(fs_info, arg);
5062 case BTRFS_IOC_BALANCE_PROGRESS:
5063 return btrfs_ioctl_balance_progress(fs_info, argp);
5064 case BTRFS_IOC_SET_RECEIVED_SUBVOL:
5065 return btrfs_ioctl_set_received_subvol(file, argp);
5066 #ifdef CONFIG_64BIT
5067 case BTRFS_IOC_SET_RECEIVED_SUBVOL_32:
5068 return btrfs_ioctl_set_received_subvol_32(file, argp);
5069 #endif
5070 case BTRFS_IOC_SEND:
5071 return _btrfs_ioctl_send(file, argp, false);
5072 #if defined(CONFIG_64BIT) && defined(CONFIG_COMPAT)
5073 case BTRFS_IOC_SEND_32:
5074 return _btrfs_ioctl_send(file, argp, true);
5075 #endif
5076 case BTRFS_IOC_GET_DEV_STATS:
5077 return btrfs_ioctl_get_dev_stats(fs_info, argp);
5078 case BTRFS_IOC_QUOTA_CTL:
5079 return btrfs_ioctl_quota_ctl(file, argp);
5080 case BTRFS_IOC_QGROUP_ASSIGN:
5081 return btrfs_ioctl_qgroup_assign(file, argp);
5082 case BTRFS_IOC_QGROUP_CREATE:
5083 return btrfs_ioctl_qgroup_create(file, argp);
5084 case BTRFS_IOC_QGROUP_LIMIT:
5085 return btrfs_ioctl_qgroup_limit(file, argp);
5086 case BTRFS_IOC_QUOTA_RESCAN:
5087 return btrfs_ioctl_quota_rescan(file, argp);
5088 case BTRFS_IOC_QUOTA_RESCAN_STATUS:
5089 return btrfs_ioctl_quota_rescan_status(fs_info, argp);
5090 case BTRFS_IOC_QUOTA_RESCAN_WAIT:
5091 return btrfs_ioctl_quota_rescan_wait(fs_info, argp);
5092 case BTRFS_IOC_DEV_REPLACE:
5093 return btrfs_ioctl_dev_replace(fs_info, argp);
5094 case BTRFS_IOC_GET_SUPPORTED_FEATURES:
5095 return btrfs_ioctl_get_supported_features(argp);
5096 case BTRFS_IOC_GET_FEATURES:
5097 return btrfs_ioctl_get_features(fs_info, argp);
5098 case BTRFS_IOC_SET_FEATURES:
5099 return btrfs_ioctl_set_features(file, argp);
5100 case BTRFS_IOC_GET_SUBVOL_INFO:
5101 return btrfs_ioctl_get_subvol_info(file, argp);
5102 case BTRFS_IOC_GET_SUBVOL_ROOTREF:
5103 return btrfs_ioctl_get_subvol_rootref(file, argp);
5104 case BTRFS_IOC_INO_LOOKUP_USER:
5105 return btrfs_ioctl_ino_lookup_user(file, argp);
5106 case FS_IOC_ENABLE_VERITY:
5107 return fsverity_ioctl_enable(file, (const void __user *)argp);
5108 case FS_IOC_MEASURE_VERITY:
5109 return fsverity_ioctl_measure(file, argp);
5110 }
5111
5112 return -ENOTTY;
5113 }
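
/*
 * Illustrative userspace sketch (editor's addition): BTRFS_IOC_SYNC
 * takes no argument structure; as handled in the switch above, it
 * flushes delalloc and commits the running transaction.
 *
 *	if (ioctl(mount_fd, BTRFS_IOC_SYNC, NULL) < 0)
 *		perror("btrfs sync");
 */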
5114
5115 #ifdef CONFIG_COMPAT
5116 long btrfs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg)
5117 {
5118 /*
5119 * These all access 32-bit values anyway so no further
5120 * handling is necessary.
5121 */
5122 switch (cmd) {
5123 case FS_IOC32_GETVERSION:
5124 cmd = FS_IOC_GETVERSION;
5125 break;
5126 }
5127
5128 return btrfs_ioctl(file, cmd, (unsigned long) compat_ptr(arg));
5129 }
5130 #endif