fs/btrfs/file.c

   1 // SPDX-License-Identifier: GPL-2.0
   2 /*
   3  * Copyright (C) 2007 Oracle.  All rights reserved.
   4  */
   5
   6 #include <linux/fs.h>
   7 #include <linux/pagemap.h>
   8 #include <linux/time.h>
   9 #include <linux/init.h>
  10 #include <linux/string.h>
  11 #include <linux/backing-dev.h>
  12 #include <linux/falloc.h>
  13 #include <linux/writeback.h>
  14 #include <linux/compat.h>
  15 #include <linux/slab.h>
  16 #include <linux/btrfs.h>
  17 #include <linux/uio.h>
  18 #include <linux/iversion.h>
  19 #include <linux/fsverity.h>
  20 #include <linux/iomap.h>
  21 #include "ctree.h"
  22 #include "disk-io.h"
  23 #include "transaction.h"
  24 #include "btrfs_inode.h"
  25 #include "print-tree.h"
  26 #include "tree-log.h"
  27 #include "locking.h"
  28 #include "volumes.h"
  29 #include "qgroup.h"
  30 #include "compression.h"
  31 #include "delalloc-space.h"
  32 #include "reflink.h"
  33 #include "subpage.h"
  34 #include "fs.h"
  35 #include "accessors.h"
  36 #include "extent-tree.h"
  37 #include "file-item.h"
  38 #include "ioctl.h"
  39 #include "file.h"
  40 #include "super.h"
  41
  42 /* simple helper to fault in pages and copy.  This should go away
  43  * and be replaced with calls into generic code.
  44  */
  45 static noinline int btrfs_copy_from_user(loff_t pos, size_t write_bytes,
  46                                          struct page **prepared_pages,
  47                                          struct iov_iter *i)
  48 {
  49         size_t copied = 0;
  50         size_t total_copied = 0;
  51         int pg = 0;
  52         int offset = offset_in_page(pos);
  53
  54         while (write_bytes > 0) {
  55                 size_t count = min_t(size_t,
  56                                      PAGE_SIZE - offset, write_bytes);
  57                 struct page *page = prepared_pages[pg];
  58                 /*
  59                  * Copy data from userspace to the current page
  60                  */
  61                 copied = copy_page_from_iter_atomic(page, offset, count, i);
  62
  63                 /* Flush processor's dcache for this page */
  64                 flush_dcache_page(page);
  65
  66                 /*
  67                  * if we get a partial write, we can end up with
  68                  * partially up to date pages.  These add
  69                  * a lot of complexity, so make sure they don't
  70                  * happen by forcing this copy to be retried.
  71                  *
  72                  * The rest of the btrfs_file_write code will fall
  73                  * back to page at a time copies after we return 0.
  74                  */
  75                 if (unlikely(copied < count)) {
  76                         if (!PageUptodate(page)) {
  77                                 iov_iter_revert(i, copied);
  78                                 copied = 0;
  79                         }
  80                         if (!copied)
  81                                 break;
  82                 }
  83
  84                 write_bytes -= copied;
  85                 total_copied += copied;
  86                 offset += copied;
  87                 if (offset == PAGE_SIZE) {
  88                         pg++;
  89                         offset = 0;
  90                 }
  91         }
  92         return total_copied;
  93 }
  94
  95 /*
  96  * unlocks pages after btrfs_file_write is done with them
  97  */
  98 static void btrfs_drop_pages(struct btrfs_fs_info *fs_info,
  99                              struct page **pages, size_t num_pages,
 100                              u64 pos, u64 copied)
 101 {
 102         size_t i;
 103         u64 block_start = round_down(pos, fs_info->sectorsize);
 104         u64 block_len = round_up(pos + copied, fs_info->sectorsize) - block_start;
 105
 106         ASSERT(block_len <= U32_MAX);
 107         for (i = 0; i < num_pages; i++) {
 108                 /* page checked is some magic around finding pages that
 109                  * have been modified without going through btrfs_set_page_dirty
 110                  * clear it here. There should be no need to mark the pages
 111                  * accessed as prepare_pages should have marked them accessed
 112                  * in prepare_pages via find_or_create_page()
 113                  */
 114                 btrfs_page_clamp_clear_checked(fs_info, pages[i], block_start,
 115                                                block_len);
 116                 unlock_page(pages[i]);
 117                 put_page(pages[i]);
 118         }
 119 }
 120
 121 /*
 122  * After btrfs_copy_from_user(), update the following things for delalloc:
 123  * - Mark newly dirtied pages as DELALLOC in the io tree.
 124  *   Used to advise which range is to be written back.
 125  * - Mark modified pages as Uptodate/Dirty and not needing COW fixup
 126  * - Update inode size for past EOF write
 127  */
 128 int btrfs_dirty_pages(struct btrfs_inode *inode, struct page **pages,
 129                       size_t num_pages, loff_t pos, size_t write_bytes,
 130                       struct extent_state **cached, bool noreserve)
 131 {
 132         struct btrfs_fs_info *fs_info = inode->root->fs_info;
 133         int err = 0;
 134         int i;
 135         u64 num_bytes;
 136         u64 start_pos;
 137         u64 end_of_last_block;
 138         u64 end_pos = pos + write_bytes;
 139         loff_t isize = i_size_read(&inode->vfs_inode);
 140         unsigned int extra_bits = 0;
 141
 142         if (write_bytes == 0)
 143                 return 0;
 144
 145         if (noreserve)
 146                 extra_bits |= EXTENT_NORESERVE;
 147
 148         start_pos = round_down(pos, fs_info->sectorsize);
 149         num_bytes = round_up(write_bytes + pos - start_pos,
 150                              fs_info->sectorsize);
 151         ASSERT(num_bytes <= U32_MAX);
 152
 153         end_of_last_block = start_pos + num_bytes - 1;
 154
 155         /*
 156          * The pages may have already been dirty, clear out old accounting so
 157          * we can set things up properly
 158          */
 159         clear_extent_bit(&inode->io_tree, start_pos, end_of_last_block,
 160                          EXTENT_DELALLOC | EXTENT_DO_ACCOUNTING | EXTENT_DEFRAG,
 161                          cached);
 162
 163         err = btrfs_set_extent_delalloc(inode, start_pos, end_of_last_block,
 164                                         extra_bits, cached);
 165         if (err)
 166                 return err;
 167
 168         for (i = 0; i < num_pages; i++) {
 169                 struct page *p = pages[i];
 170
 171                 btrfs_page_clamp_set_uptodate(fs_info, p, start_pos, num_bytes);
 172                 btrfs_page_clamp_clear_checked(fs_info, p, start_pos, num_bytes);
 173                 btrfs_page_clamp_set_dirty(fs_info, p, start_pos, num_bytes);
 174         }
 175
 176         /*
 177          * we've only changed i_size in ram, and we haven't updated
 178          * the disk i_size.  There is no need to log the inode
 179          * at this time.
 180          */
 181         if (end_pos > isize)
 182                 i_size_write(&inode->vfs_inode, end_pos);
 183         return 0;
 184 }
 185
 186 /*
 187  * this is very complex, but the basic idea is to drop all extents
 188  * in the range start - end.  hint_block is filled in with a block number
 189  * that would be a good hint to the block allocator for this file.
 190  *
 191  * If an extent intersects the range but is not entirely inside the range
 192  * it is either truncated or split.  Anything entirely inside the range
 193  * is deleted from the tree.
 194  *
 195  * Note: the VFS' inode number of bytes is not updated, it's up to the caller
 196  * to deal with that. We set the field 'bytes_found' of the arguments structure
 197  * with the number of allocated bytes found in the target range, so that the
 198  * caller can update the inode's number of bytes in an atomic way when
 199  * replacing extents in a range to avoid races with stat(2).
 200  */
 201 int btrfs_drop_extents(struct btrfs_trans_handle *trans,
 202                        struct btrfs_root *root, struct btrfs_inode *inode,
 203                        struct btrfs_drop_extents_args *args)
 204 {
 205         struct btrfs_fs_info *fs_info = root->fs_info;
 206         struct extent_buffer *leaf;
 207         struct btrfs_file_extent_item *fi;
 208         struct btrfs_ref ref = { 0 };
 209         struct btrfs_key key;
 210         struct btrfs_key new_key;
 211         u64 ino = btrfs_ino(inode);
 212         u64 search_start = args->start;
 213         u64 disk_bytenr = 0;
 214         u64 num_bytes = 0;
 215         u64 extent_offset = 0;
 216         u64 extent_end = 0;
 217         u64 last_end = args->start;
 218         int del_nr = 0;
 219         int del_slot = 0;
 220         int extent_type;
 221         int recow;
 222         int ret;
 223         int modify_tree = -1;
 224         int update_refs;
 225         int found = 0;
 226         struct btrfs_path *path = args->path;
 227
 228         args->bytes_found = 0;
 229         args->extent_inserted = false;
 230
 231         /* Must always have a path if ->replace_extent is true */
 232         ASSERT(!(args->replace_extent && !args->path));
 233
 234         if (!path) {
 235                 path = btrfs_alloc_path();
 236                 if (!path) {
 237                         ret = -ENOMEM;
 238                         goto out;
 239                 }
 240         }
 241
 242         if (args->drop_cache)
 243                 btrfs_drop_extent_map_range(inode, args->start, args->end - 1, false);
 244
 245         if (args->start >= inode->disk_i_size && !args->replace_extent)
 246                 modify_tree = 0;
 247
 248         update_refs = (root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID);
 249         while (1) {
 250                 recow = 0;
 251                 ret = btrfs_lookup_file_extent(trans, root, path, ino,
 252                                                search_start, modify_tree);
 253                 if (ret < 0)
 254                         break;
 255                 if (ret > 0 && path->slots[0] > 0 && search_start == args->start) {
 256                         leaf = path->nodes[0];
 257                         btrfs_item_key_to_cpu(leaf, &key, path->slots[0] - 1);
 258                         if (key.objectid == ino &&
 259                             key.type == BTRFS_EXTENT_DATA_KEY)
 260                                 path->slots[0]--;
 261                 }
 262                 ret = 0;
 263 next_slot:
 264                 leaf = path->nodes[0];
 265                 if (path->slots[0] >= btrfs_header_nritems(leaf)) {
 266                         BUG_ON(del_nr > 0);
 267                         ret = btrfs_next_leaf(root, path);
 268                         if (ret < 0)
 269                                 break;
 270                         if (ret > 0) {
 271                                 ret = 0;
 272                                 break;
 273                         }
 274                         leaf = path->nodes[0];
 275                         recow = 1;
 276                 }
 277
 278                 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
 279
 280                 if (key.objectid > ino)
 281                         break;
 282                 if (WARN_ON_ONCE(key.objectid < ino) ||
 283                     key.type < BTRFS_EXTENT_DATA_KEY) {
 284                         ASSERT(del_nr == 0);
 285                         path->slots[0]++;
 286                         goto next_slot;
 287                 }
 288                 if (key.type > BTRFS_EXTENT_DATA_KEY || key.offset >= args->end)
 289                         break;
 290
 291                 fi = btrfs_item_ptr(leaf, path->slots[0],
 292                                     struct btrfs_file_extent_item);
 293                 extent_type = btrfs_file_extent_type(leaf, fi);
 294
 295                 if (extent_type == BTRFS_FILE_EXTENT_REG ||
 296                     extent_type == BTRFS_FILE_EXTENT_PREALLOC) {
 297                         disk_bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
 298                         num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi);
 299                         extent_offset = btrfs_file_extent_offset(leaf, fi);
 300                         extent_end = key.offset +
 301                                 btrfs_file_extent_num_bytes(leaf, fi);
 302                 } else if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
 303                         extent_end = key.offset +
 304                                 btrfs_file_extent_ram_bytes(leaf, fi);
 305                 } else {
 306                         /* can't happen */
 307                         BUG();
 308                 }
 309
 310                 /*
 311                  * Don't skip extent items representing 0 byte lengths. They
 312                  * used to be created (bug) if while punching holes we hit
 313                  * -ENOSPC condition. So if we find one here, just ensure we
 314                  * delete it, otherwise we would insert a new file extent item
 315                  * with the same key (offset) as that 0 bytes length file
 316                  * extent item in the call to setup_items_for_insert() later
 317                  * in this function.
 318                  */
 319                 if (extent_end == key.offset && extent_end >= search_start) {
 320                         last_end = extent_end;
 321                         goto delete_extent_item;
 322                 }
 323
 324                 if (extent_end <= search_start) {
 325                         path->slots[0]++;
 326                         goto next_slot;
 327                 }
 328
 329                 found = 1;
 330                 search_start = max(key.offset, args->start);
 331                 if (recow || !modify_tree) {
 332                         modify_tree = -1;
 333                         btrfs_release_path(path);
 334                         continue;
 335                 }
 336
 337                 /*
 338                  *     | - range to drop - |
 339                  *  | -------- extent -------- |
 340                  */
 341                 if (args->start > key.offset && args->end < extent_end) {
 342                         BUG_ON(del_nr > 0);
 343                         if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
 344                                 ret = -EOPNOTSUPP;
 345                                 break;
 346                         }
 347
 348                         memcpy(&new_key, &key, sizeof(new_key));
 349                         new_key.offset = args->start;
 350                         ret = btrfs_duplicate_item(trans, root, path,
 351                                                    &new_key);
 352                         if (ret == -EAGAIN) {
 353                                 btrfs_release_path(path);
 354                                 continue;
 355                         }
 356                         if (ret < 0)
 357                                 break;
 358
 359                         leaf = path->nodes[0];
 360                         fi = btrfs_item_ptr(leaf, path->slots[0] - 1,
 361                                             struct btrfs_file_extent_item);
 362                         btrfs_set_file_extent_num_bytes(leaf, fi,
 363                                                         args->start - key.offset);
 364
 365                         fi = btrfs_item_ptr(leaf, path->slots[0],
 366                                             struct btrfs_file_extent_item);
 367
 368                         extent_offset += args->start - key.offset;
 369                         btrfs_set_file_extent_offset(leaf, fi, extent_offset);
 370                         btrfs_set_file_extent_num_bytes(leaf, fi,
 371                                                         extent_end - args->start);
 372                         btrfs_mark_buffer_dirty(trans, leaf);
 373
 374                         if (update_refs && disk_bytenr > 0) {
 375                                 btrfs_init_generic_ref(&ref,
 376                                                 BTRFS_ADD_DELAYED_REF,
 377                                                 disk_bytenr, num_bytes, 0,
 378                                                 root->root_key.objectid);
 379                                 btrfs_init_data_ref(&ref,
 380                                                 root->root_key.objectid,
 381                                                 new_key.objectid,
 382                                                 args->start - extent_offset,
 383                                                 0, false);
 384                                 ret = btrfs_inc_extent_ref(trans, &ref);
 385                                 if (ret) {
 386                                         btrfs_abort_transaction(trans, ret);
 387                                         break;
 388                                 }
 389                         }
 390                         key.offset = args->start;
 391                 }
 392                 /*
 393                  * From here on out we will have actually dropped something, so
 394                  * last_end can be updated.
 395                  */
 396                 last_end = extent_end;
 397
 398                 /*
 399                  *  | ---- range to drop ----- |
 400                  *      | -------- extent -------- |
 401                  */
 402                 if (args->start <= key.offset && args->end < extent_end) {
 403                         if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
 404                                 ret = -EOPNOTSUPP;
 405                                 break;
 406                         }
 407
 408                         memcpy(&new_key, &key, sizeof(new_key));
 409                         new_key.offset = args->end;
 410                         btrfs_set_item_key_safe(trans, path, &new_key);
 411
 412                         extent_offset += args->end - key.offset;
 413                         btrfs_set_file_extent_offset(leaf, fi, extent_offset);
 414                         btrfs_set_file_extent_num_bytes(leaf, fi,
 415                                                         extent_end - args->end);
 416                         btrfs_mark_buffer_dirty(trans, leaf);
 417                         if (update_refs && disk_bytenr > 0)
 418                                 args->bytes_found += args->end - key.offset;
 419                         break;
 420                 }
 421
 422                 search_start = extent_end;
 423                 /*
 424                  *       | ---- range to drop ----- |
 425                  *  | -------- extent -------- |
 426                  */
 427                 if (args->start > key.offset && args->end >= extent_end) {
 428                         BUG_ON(del_nr > 0);
 429                         if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
 430                                 ret = -EOPNOTSUPP;
 431                                 break;
 432                         }
 433
 434                         btrfs_set_file_extent_num_bytes(leaf, fi,
 435                                                         args->start - key.offset);
 436                         btrfs_mark_buffer_dirty(trans, leaf);
 437                         if (update_refs && disk_bytenr > 0)
 438                                 args->bytes_found += extent_end - args->start;
 439                         if (args->end == extent_end)
 440                                 break;
 441
 442                         path->slots[0]++;
 443                         goto next_slot;
 444                 }
 445
 446                 /*
 447                  *  | ---- range to drop ----- |
 448                  *    | ------ extent ------ |
 449                  */
 450                 if (args->start <= key.offset && args->end >= extent_end) {
 451 delete_extent_item:
 452                         if (del_nr == 0) {
 453                                 del_slot = path->slots[0];
 454                                 del_nr = 1;
 455                         } else {
 456                                 BUG_ON(del_slot + del_nr != path->slots[0]);
 457                                 del_nr++;
 458                         }
 459
 460                         if (update_refs &&
 461                             extent_type == BTRFS_FILE_EXTENT_INLINE) {
 462                                 args->bytes_found += extent_end - key.offset;
 463                                 extent_end = ALIGN(extent_end,
 464                                                    fs_info->sectorsize);
 465                         } else if (update_refs && disk_bytenr > 0) {
 466                                 btrfs_init_generic_ref(&ref,
 467                                                 BTRFS_DROP_DELAYED_REF,
 468                                                 disk_bytenr, num_bytes, 0,
 469                                                 root->root_key.objectid);
 470                                 btrfs_init_data_ref(&ref,
 471                                                 root->root_key.objectid,
 472                                                 key.objectid,
 473                                                 key.offset - extent_offset, 0,
 474                                                 false);
 475                                 ret = btrfs_free_extent(trans, &ref);
 476                                 if (ret) {
 477                                         btrfs_abort_transaction(trans, ret);
 478                                         break;
 479                                 }
 480                                 args->bytes_found += extent_end - key.offset;
 481                         }
 482
 483                         if (args->end == extent_end)
 484                                 break;
 485
 486                         if (path->slots[0] + 1 < btrfs_header_nritems(leaf)) {
 487                                 path->slots[0]++;
 488                                 goto next_slot;
 489                         }
 490
 491                         ret = btrfs_del_items(trans, root, path, del_slot,
 492                                               del_nr);
 493                         if (ret) {
 494                                 btrfs_abort_transaction(trans, ret);
 495                                 break;
 496                         }
 497
 498                         del_nr = 0;
 499                         del_slot = 0;
 500
 501                         btrfs_release_path(path);
 502                         continue;
 503                 }
 504
 505                 BUG();
 506         }
 507
 508         if (!ret && del_nr > 0) {
 509                 /*
 510                  * Set path->slots[0] to first slot, so that after the delete
 511                  * if items are move off from our leaf to its immediate left or
 512                  * right neighbor leafs, we end up with a correct and adjusted
 513                  * path->slots[0] for our insertion (if args->replace_extent).
 514                  */
 515                 path->slots[0] = del_slot;
 516                 ret = btrfs_del_items(trans, root, path, del_slot, del_nr);
 517                 if (ret)
 518                         btrfs_abort_transaction(trans, ret);
 519         }
 520
 521         leaf = path->nodes[0];
 522         /*
 523          * If btrfs_del_items() was called, it might have deleted a leaf, in
 524          * which case it unlocked our path, so check path->locks[0] matches a
 525          * write lock.
 526          */
 527         if (!ret && args->replace_extent &&
 528             path->locks[0] == BTRFS_WRITE_LOCK &&
 529             btrfs_leaf_free_space(leaf) >=
 530             sizeof(struct btrfs_item) + args->extent_item_size) {
 531
 532                 key.objectid = ino;
 533                 key.type = BTRFS_EXTENT_DATA_KEY;
 534                 key.offset = args->start;
 535                 if (!del_nr && path->slots[0] < btrfs_header_nritems(leaf)) {
 536                         struct btrfs_key slot_key;
 537
 538                         btrfs_item_key_to_cpu(leaf, &slot_key, path->slots[0]);
 539                         if (btrfs_comp_cpu_keys(&key, &slot_key) > 0)
 540                                 path->slots[0]++;
 541                 }
 542                 btrfs_setup_item_for_insert(trans, root, path, &key,
 543                                             args->extent_item_size);
 544                 args->extent_inserted = true;
 545         }
 546
 547         if (!args->path)
 548                 btrfs_free_path(path);
 549         else if (!args->extent_inserted)
 550                 btrfs_release_path(path);
 551 out:
 552         args->drop_end = found ? min(args->end, last_end) : args->end;
 553
 554         return ret;
 555 }
 556
 557 static int extent_mergeable(struct extent_buffer *leaf, int slot,
 558                             u64 objectid, u64 bytenr, u64 orig_offset,
 559                             u64 *start, u64 *end)
 560 {
 561         struct btrfs_file_extent_item *fi;
 562         struct btrfs_key key;
 563         u64 extent_end;
 564
 565         if (slot < 0 || slot >= btrfs_header_nritems(leaf))
 566                 return 0;
 567
 568         btrfs_item_key_to_cpu(leaf, &key, slot);
 569         if (key.objectid != objectid || key.type != BTRFS_EXTENT_DATA_KEY)
 570                 return 0;
 571
 572         fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
 573         if (btrfs_file_extent_type(leaf, fi) != BTRFS_FILE_EXTENT_REG ||
 574             btrfs_file_extent_disk_bytenr(leaf, fi) != bytenr ||
 575             btrfs_file_extent_offset(leaf, fi) != key.offset - orig_offset ||
 576             btrfs_file_extent_compression(leaf, fi) ||
 577             btrfs_file_extent_encryption(leaf, fi) ||
 578             btrfs_file_extent_other_encoding(leaf, fi))
 579                 return 0;
 580
 581         extent_end = key.offset + btrfs_file_extent_num_bytes(leaf, fi);
 582         if ((*start && *start != key.offset) || (*end && *end != extent_end))
 583                 return 0;
 584
 585         *start = key.offset;
 586         *end = extent_end;
 587         return 1;
 588 }
 589
 590 /*
 591  * Mark extent in the range start - end as written.
 592  *
 593  * This changes extent type from 'pre-allocated' to 'regular'. If only
 594  * part of extent is marked as written, the extent will be split into
 595  * two or three.
 596  */
 597 int btrfs_mark_extent_written(struct btrfs_trans_handle *trans,
 598                               struct btrfs_inode *inode, u64 start, u64 end)
 599 {
 600         struct btrfs_root *root = inode->root;
 601         struct extent_buffer *leaf;
 602         struct btrfs_path *path;
 603         struct btrfs_file_extent_item *fi;
 604         struct btrfs_ref ref = { 0 };
 605         struct btrfs_key key;
 606         struct btrfs_key new_key;
 607         u64 bytenr;
 608         u64 num_bytes;
 609         u64 extent_end;
 610         u64 orig_offset;
 611         u64 other_start;
 612         u64 other_end;
 613         u64 split;
 614         int del_nr = 0;
 615         int del_slot = 0;
 616         int recow;
 617         int ret = 0;
 618         u64 ino = btrfs_ino(inode);
 619
 620         path = btrfs_alloc_path();
 621         if (!path)
 622                 return -ENOMEM;
 623 again:
 624         recow = 0;
 625         split = start;
 626         key.objectid = ino;
 627         key.type = BTRFS_EXTENT_DATA_KEY;
 628         key.offset = split;
 629
 630         ret = btrfs_search_slot(trans, root, &key, path, -1, 1);
 631         if (ret < 0)
 632                 goto out;
 633         if (ret > 0 && path->slots[0] > 0)
 634                 path->slots[0]--;
 635
 636         leaf = path->nodes[0];
 637         btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
 638         if (key.objectid != ino ||
 639             key.type != BTRFS_EXTENT_DATA_KEY) {
 640                 ret = -EINVAL;
 641                 btrfs_abort_transaction(trans, ret);
 642                 goto out;
 643         }
 644         fi = btrfs_item_ptr(leaf, path->slots[0],
 645                             struct btrfs_file_extent_item);
 646         if (btrfs_file_extent_type(leaf, fi) != BTRFS_FILE_EXTENT_PREALLOC) {
 647                 ret = -EINVAL;
 648                 btrfs_abort_transaction(trans, ret);
 649                 goto out;
 650         }
 651         extent_end = key.offset + btrfs_file_extent_num_bytes(leaf, fi);
 652         if (key.offset > start || extent_end < end) {
 653                 ret = -EINVAL;
 654                 btrfs_abort_transaction(trans, ret);
 655                 goto out;
 656         }
 657
 658         bytenr = btrfs_file_extent_disk_bytenr(leaf, fi);
 659         num_bytes = btrfs_file_extent_disk_num_bytes(leaf, fi);
 660         orig_offset = key.offset - btrfs_file_extent_offset(leaf, fi);
 661         memcpy(&new_key, &key, sizeof(new_key));
 662
 663         if (start == key.offset && end < extent_end) {
 664                 other_start = 0;
 665                 other_end = start;
 666                 if (extent_mergeable(leaf, path->slots[0] - 1,
 667                                      ino, bytenr, orig_offset,
 668                                      &other_start, &other_end)) {
 669                         new_key.offset = end;
 670                         btrfs_set_item_key_safe(trans, path, &new_key);
 671                         fi = btrfs_item_ptr(leaf, path->slots[0],
 672                                             struct btrfs_file_extent_item);
 673                         btrfs_set_file_extent_generation(leaf, fi,
 674                                                          trans->transid);
 675                         btrfs_set_file_extent_num_bytes(leaf, fi,
 676                                                         extent_end - end);
 677                         btrfs_set_file_extent_offset(leaf, fi,
 678                                                      end - orig_offset);
 679                         fi = btrfs_item_ptr(leaf, path->slots[0] - 1,
 680                                             struct btrfs_file_extent_item);
 681                         btrfs_set_file_extent_generation(leaf, fi,
 682                                                          trans->transid);
 683                         btrfs_set_file_extent_num_bytes(leaf, fi,
 684                                                         end - other_start);
 685                         btrfs_mark_buffer_dirty(trans, leaf);
 686                         goto out;
 687                 }
 688         }
 689
 690         if (start > key.offset && end == extent_end) {
 691                 other_start = end;
 692                 other_end = 0;
 693                 if (extent_mergeable(leaf, path->slots[0] + 1,
 694                                      ino, bytenr, orig_offset,
 695                                      &other_start, &other_end)) {
 696                         fi = btrfs_item_ptr(leaf, path->slots[0],
 697                                             struct btrfs_file_extent_item);
 698                         btrfs_set_file_extent_num_bytes(leaf, fi,
 699                                                         start - key.offset);
 700                         btrfs_set_file_extent_generation(leaf, fi,
 701                                                          trans->transid);
 702                         path->slots[0]++;
 703                         new_key.offset = start;
 704                         btrfs_set_item_key_safe(trans, path, &new_key);
 705
 706                         fi = btrfs_item_ptr(leaf, path->slots[0],
 707                                             struct btrfs_file_extent_item);
 708                         btrfs_set_file_extent_generation(leaf, fi,
 709                                                          trans->transid);
 710                         btrfs_set_file_extent_num_bytes(leaf, fi,
 711                                                         other_end - start);
 712                         btrfs_set_file_extent_offset(leaf, fi,
 713                                                      start - orig_offset);
 714                         btrfs_mark_buffer_dirty(trans, leaf);
 715                         goto out;
 716                 }
 717         }
 718
 719         while (start > key.offset || end < extent_end) {
 720                 if (key.offset == start)
 721                         split = end;
 722
 723                 new_key.offset = split;
 724                 ret = btrfs_duplicate_item(trans, root, path, &new_key);
 725                 if (ret == -EAGAIN) {
 726                         btrfs_release_path(path);
 727                         goto again;
 728                 }
 729                 if (ret < 0) {
 730                         btrfs_abort_transaction(trans, ret);
 731                         goto out;
 732                 }
 733
 734                 leaf = path->nodes[0];
 735                 fi = btrfs_item_ptr(leaf, path->slots[0] - 1,
 736                                     struct btrfs_file_extent_item);
 737                 btrfs_set_file_extent_generation(leaf, fi, trans->transid);
 738                 btrfs_set_file_extent_num_bytes(leaf, fi,
 739                                                 split - key.offset);
 740
 741                 fi = btrfs_item_ptr(leaf, path->slots[0],
 742                                     struct btrfs_file_extent_item);
 743
 744                 btrfs_set_file_extent_generation(leaf, fi, trans->transid);
 745                 btrfs_set_file_extent_offset(leaf, fi, split - orig_offset);
 746                 btrfs_set_file_extent_num_bytes(leaf, fi,
 747                                                 extent_end - split);
 748                 btrfs_mark_buffer_dirty(trans, leaf);
 749
 750                 btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF, bytenr,
 751                                        num_bytes, 0, root->root_key.objectid);
 752                 btrfs_init_data_ref(&ref, root->root_key.objectid, ino,
 753                                     orig_offset, 0, false);
 754                 ret = btrfs_inc_extent_ref(trans, &ref);
 755                 if (ret) {
 756                         btrfs_abort_transaction(trans, ret);
 757                         goto out;
 758                 }
 759
 760                 if (split == start) {
 761                         key.offset = start;
 762                 } else {
 763                         if (start != key.offset) {
 764                                 ret = -EINVAL;
 765                                 btrfs_abort_transaction(trans, ret);
 766                                 goto out;
 767                         }
 768                         path->slots[0]--;
 769                         extent_end = end;
 770                 }
 771                 recow = 1;
 772         }
 773
 774         other_start = end;
 775         other_end = 0;
 776         btrfs_init_generic_ref(&ref, BTRFS_DROP_DELAYED_REF, bytenr,
 777                                num_bytes, 0, root->root_key.objectid);
 778         btrfs_init_data_ref(&ref, root->root_key.objectid, ino, orig_offset,
 779                             0, false);
 780         if (extent_mergeable(leaf, path->slots[0] + 1,
 781                              ino, bytenr, orig_offset,
 782                              &other_start, &other_end)) {
 783                 if (recow) {
 784                         btrfs_release_path(path);
 785                         goto again;
 786                 }
 787                 extent_end = other_end;
 788                 del_slot = path->slots[0] + 1;
 789                 del_nr++;
 790                 ret = btrfs_free_extent(trans, &ref);
 791                 if (ret) {
 792                         btrfs_abort_transaction(trans, ret);
 793                         goto out;
 794                 }
 795         }
 796         other_start = 0;
 797         other_end = start;
 798         if (extent_mergeable(leaf, path->slots[0] - 1,
 799                              ino, bytenr, orig_offset,
 800                              &other_start, &other_end)) {
 801                 if (recow) {
 802                         btrfs_release_path(path);
 803                         goto again;
 804                 }
 805                 key.offset = other_start;
 806                 del_slot = path->slots[0];
 807                 del_nr++;
 808                 ret = btrfs_free_extent(trans, &ref);
 809                 if (ret) {
 810                         btrfs_abort_transaction(trans, ret);
 811                         goto out;
 812                 }
 813         }
 814         if (del_nr == 0) {
 815                 fi = btrfs_item_ptr(leaf, path->slots[0],
 816                            struct btrfs_file_extent_item);
 817                 btrfs_set_file_extent_type(leaf, fi,
 818                                            BTRFS_FILE_EXTENT_REG);
 819                 btrfs_set_file_extent_generation(leaf, fi, trans->transid);
 820                 btrfs_mark_buffer_dirty(trans, leaf);
 821         } else {
 822                 fi = btrfs_item_ptr(leaf, del_slot - 1,
 823                            struct btrfs_file_extent_item);
 824                 btrfs_set_file_extent_type(leaf, fi,
 825                                            BTRFS_FILE_EXTENT_REG);
 826                 btrfs_set_file_extent_generation(leaf, fi, trans->transid);
 827                 btrfs_set_file_extent_num_bytes(leaf, fi,
 828                                                 extent_end - key.offset);
 829                 btrfs_mark_buffer_dirty(trans, leaf);
 830
 831                 ret = btrfs_del_items(trans, root, path, del_slot, del_nr);
 832                 if (ret < 0) {
 833                         btrfs_abort_transaction(trans, ret);
 834                         goto out;
 835                 }
 836         }
 837 out:
 838         btrfs_free_path(path);
 839         return ret;
 840 }
 841
 842 /*
 843  * on error we return an unlocked page and the error value
 844  * on success we return a locked page and 0
 845  */
 846 static int prepare_uptodate_page(struct inode *inode,
 847                                  struct page *page, u64 pos,
 848                                  bool force_uptodate)
 849 {
 850         struct folio *folio = page_folio(page);
 851         int ret = 0;
 852
 853         if (((pos & (PAGE_SIZE - 1)) || force_uptodate) &&
 854             !PageUptodate(page)) {
 855                 ret = btrfs_read_folio(NULL, folio);
 856                 if (ret)
 857                         return ret;
 858                 lock_page(page);
 859                 if (!PageUptodate(page)) {
 860                         unlock_page(page);
 861                         return -EIO;
 862                 }
 863
 864                 /*
 865                  * Since btrfs_read_folio() will unlock the folio before it
 866                  * returns, there is a window where btrfs_release_folio() can be
 867                  * called to release the page.  Here we check both inode
 868                  * mapping and PagePrivate() to make sure the page was not
 869                  * released.
 870                  *
 871                  * The private flag check is essential for subpage as we need
 872                  * to store extra bitmap using page->private.
 873                  */
 874                 if (page->mapping != inode->i_mapping || !PagePrivate(page)) {
 875                         unlock_page(page);
 876                         return -EAGAIN;
 877                 }
 878         }
 879         return 0;
 880 }
 881
 882 static fgf_t get_prepare_fgp_flags(bool nowait)
 883 {
 884         fgf_t fgp_flags = FGP_LOCK | FGP_ACCESSED | FGP_CREAT;
 885
 886         if (nowait)
 887                 fgp_flags |= FGP_NOWAIT;
 888
 889         return fgp_flags;
 890 }
 891
 892 static gfp_t get_prepare_gfp_flags(struct inode *inode, bool nowait)
 893 {
 894         gfp_t gfp;
 895
 896         gfp = btrfs_alloc_write_mask(inode->i_mapping);
 897         if (nowait) {
 898                 gfp &= ~__GFP_DIRECT_RECLAIM;
 899                 gfp |= GFP_NOWAIT;
 900         }
 901
 902         return gfp;
 903 }
 904
 905 /*
 906  * this just gets pages into the page cache and locks them down.
 907  */
 908 static noinline int prepare_pages(struct inode *inode, struct page **pages,
 909                                   size_t num_pages, loff_t pos,
 910                                   size_t write_bytes, bool force_uptodate,
 911                                   bool nowait)
 912 {
 913         int i;
 914         unsigned long index = pos >> PAGE_SHIFT;
 915         gfp_t mask = get_prepare_gfp_flags(inode, nowait);
 916         fgf_t fgp_flags = get_prepare_fgp_flags(nowait);
 917         int err = 0;
 918         int faili;
 919
 920         for (i = 0; i < num_pages; i++) {
 921 again:
 922                 pages[i] = pagecache_get_page(inode->i_mapping, index + i,
 923                                               fgp_flags, mask | __GFP_WRITE);
 924                 if (!pages[i]) {
 925                         faili = i - 1;
 926                         if (nowait)
 927                                 err = -EAGAIN;
 928                         else
 929                                 err = -ENOMEM;
 930                         goto fail;
 931                 }
 932
 933                 err = set_page_extent_mapped(pages[i]);
 934                 if (err < 0) {
 935                         faili = i;
 936                         goto fail;
 937                 }
 938
 939                 if (i == 0)
 940                         err = prepare_uptodate_page(inode, pages[i], pos,
 941                                                     force_uptodate);
 942                 if (!err && i == num_pages - 1)
 943                         err = prepare_uptodate_page(inode, pages[i],
 944                                                     pos + write_bytes, false);
 945                 if (err) {
 946                         put_page(pages[i]);
 947                         if (!nowait && err == -EAGAIN) {
 948                                 err = 0;
 949                                 goto again;
 950                         }
 951                         faili = i - 1;
 952                         goto fail;
 953                 }
 954                 wait_on_page_writeback(pages[i]);
 955         }
 956
 957         return 0;
 958 fail:
 959         while (faili >= 0) {
 960                 unlock_page(pages[faili]);
 961                 put_page(pages[faili]);
 962                 faili--;
 963         }
 964         return err;
 965
 966 }
 967
 968 /*
 969  * This function locks the extent and properly waits for data=ordered extents
 970  * to finish before allowing the pages to be modified if need.
 971  *
 972  * The return value:
 973  * 1 - the extent is locked
 974  * 0 - the extent is not locked, and everything is OK
 975  * -EAGAIN - need re-prepare the pages
 976  * the other < 0 number - Something wrong happens
 977  */
 978 static noinline int
 979 lock_and_cleanup_extent_if_need(struct btrfs_inode *inode, struct page **pages,
 980                                 size_t num_pages, loff_t pos,
 981                                 size_t write_bytes,
 982                                 u64 *lockstart, u64 *lockend, bool nowait,
 983                                 struct extent_state **cached_state)
 984 {
 985         struct btrfs_fs_info *fs_info = inode->root->fs_info;
 986         u64 start_pos;
 987         u64 last_pos;
 988         int i;
 989         int ret = 0;
 990
 991         start_pos = round_down(pos, fs_info->sectorsize);
 992         last_pos = round_up(pos + write_bytes, fs_info->sectorsize) - 1;
 993
 994         if (start_pos < inode->vfs_inode.i_size) {
 995                 struct btrfs_ordered_extent *ordered;
 996
 997                 if (nowait) {
 998                         if (!try_lock_extent(&inode->io_tree, start_pos, last_pos,
 999                                              cached_state)) {
1000                                 for (i = 0; i < num_pages; i++) {
1001                                         unlock_page(pages[i]);
1002                                         put_page(pages[i]);
1003                                         pages[i] = NULL;
1004                                 }
1005
1006                                 return -EAGAIN;
1007                         }
1008                 } else {
1009                         lock_extent(&inode->io_tree, start_pos, last_pos, cached_state);
1010                 }
1011
1012                 ordered = btrfs_lookup_ordered_range(inode, start_pos,
1013                                                      last_pos - start_pos + 1);
1014                 if (ordered &&
1015                     ordered->file_offset + ordered->num_bytes > start_pos &&
1016                     ordered->file_offset <= last_pos) {
1017                         unlock_extent(&inode->io_tree, start_pos, last_pos,
1018                                       cached_state);
1019                         for (i = 0; i < num_pages; i++) {
1020                                 unlock_page(pages[i]);
1021                                 put_page(pages[i]);
1022                         }
1023                         btrfs_start_ordered_extent(ordered);
1024                         btrfs_put_ordered_extent(ordered);
1025                         return -EAGAIN;
1026                 }
1027                 if (ordered)
1028                         btrfs_put_ordered_extent(ordered);
1029
1030                 *lockstart = start_pos;
1031                 *lockend = last_pos;
1032                 ret = 1;
1033         }
1034
1035         /*
1036          * We should be called after prepare_pages() which should have locked
1037          * all pages in the range.
1038          */
1039         for (i = 0; i < num_pages; i++)
1040                 WARN_ON(!PageLocked(pages[i]));
1041
1042         return ret;
1043 }
1044
1045 /*
1046  * Check if we can do nocow write into the range [@pos, @pos + @write_bytes)
1047  *
1048  * @pos:         File offset.
1049  * @write_bytes: The length to write, will be updated to the nocow writeable
1050  *               range.
1051  *
1052  * This function will flush ordered extents in the range to ensure proper
1053  * nocow checks.
1054  *
1055  * Return:
1056  * > 0          If we can nocow, and updates @write_bytes.
1057  *  0           If we can't do a nocow write.
1058  * -EAGAIN      If we can't do a nocow write because snapshoting of the inode's
1059  *              root is in progress.
1060  * < 0          If an error happened.
1061  *
1062  * NOTE: Callers need to call btrfs_check_nocow_unlock() if we return > 0.
1063  */
1064 int btrfs_check_nocow_lock(struct btrfs_inode *inode, loff_t pos,
1065                            size_t *write_bytes, bool nowait)
1066 {
1067         struct btrfs_fs_info *fs_info = inode->root->fs_info;
1068         struct btrfs_root *root = inode->root;
1069         struct extent_state *cached_state = NULL;
1070         u64 lockstart, lockend;
1071         u64 num_bytes;
1072         int ret;
1073
1074         if (!(inode->flags & (BTRFS_INODE_NODATACOW | BTRFS_INODE_PREALLOC)))
1075                 return 0;
1076
1077         if (!btrfs_drew_try_write_lock(&root->snapshot_lock))
1078                 return -EAGAIN;
1079
1080         lockstart = round_down(pos, fs_info->sectorsize);
1081         lockend = round_up(pos + *write_bytes,
1082                            fs_info->sectorsize) - 1;
1083         num_bytes = lockend - lockstart + 1;
1084
1085         if (nowait) {
1086                 if (!btrfs_try_lock_ordered_range(inode, lockstart, lockend,
1087                                                   &cached_state)) {
1088                         btrfs_drew_write_unlock(&root->snapshot_lock);
1089                         return -EAGAIN;
1090                 }
1091         } else {
1092                 btrfs_lock_and_flush_ordered_range(inode, lockstart, lockend,
1093                                                    &cached_state);
1094         }
1095         ret = can_nocow_extent(&inode->vfs_inode, lockstart, &num_bytes,
1096                         NULL, NULL, NULL, nowait, false);
1097         if (ret <= 0)
1098                 btrfs_drew_write_unlock(&root->snapshot_lock);
1099         else
1100                 *write_bytes = min_t(size_t, *write_bytes ,
1101                                      num_bytes - pos + lockstart);
1102         unlock_extent(&inode->io_tree, lockstart, lockend, &cached_state);
1103
1104         return ret;
1105 }
1106
1107 void btrfs_check_nocow_unlock(struct btrfs_inode *inode)
1108 {
1109         btrfs_drew_write_unlock(&inode->root->snapshot_lock);
1110 }
1111
1112 static void update_time_for_write(struct inode *inode)
1113 {
1114         struct timespec64 now, ctime;
1115
1116         if (IS_NOCMTIME(inode))
1117                 return;
1118
1119         now = current_time(inode);
1120         if (!timespec64_equal(&inode->i_mtime, &now))
1121                 inode->i_mtime = now;
1122
1123         ctime = inode_get_ctime(inode);
1124         if (!timespec64_equal(&ctime, &now))
1125                 inode_set_ctime_to_ts(inode, now);
1126
1127         if (IS_I_VERSION(inode))
1128                 inode_inc_iversion(inode);
1129 }
1130
1131 static int btrfs_write_check(struct kiocb *iocb, struct iov_iter *from,
1132                              size_t count)
1133 {
1134         struct file *file = iocb->ki_filp;
1135         struct inode *inode = file_inode(file);
1136         struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
1137         loff_t pos = iocb->ki_pos;
1138         int ret;
1139         loff_t oldsize;
1140         loff_t start_pos;
1141
1142         /*
1143          * Quickly bail out on NOWAIT writes if we don't have the nodatacow or
1144          * prealloc flags, as without those flags we always have to COW. We will
1145          * later check if we can really COW into the target range (using
1146          * can_nocow_extent() at btrfs_get_blocks_direct_write()).
1147          */
1148         if ((iocb->ki_flags & IOCB_NOWAIT) &&
1149             !(BTRFS_I(inode)->flags & (BTRFS_INODE_NODATACOW | BTRFS_INODE_PREALLOC)))
1150                 return -EAGAIN;
1151
1152         ret = file_remove_privs(file);
1153         if (ret)
1154                 return ret;
1155
1156         /*
1157          * We reserve space for updating the inode when we reserve space for the
1158          * extent we are going to write, so we will enospc out there.  We don't
1159          * need to start yet another transaction to update the inode as we will
1160          * update the inode when we finish writing whatever data we write.
1161          */
1162         update_time_for_write(inode);
1163
1164         start_pos = round_down(pos, fs_info->sectorsize);
1165         oldsize = i_size_read(inode);
1166         if (start_pos > oldsize) {
1167                 /* Expand hole size to cover write data, preventing empty gap */
1168                 loff_t end_pos = round_up(pos + count, fs_info->sectorsize);
1169
1170                 ret = btrfs_cont_expand(BTRFS_I(inode), oldsize, end_pos);
1171                 if (ret)
1172                         return ret;
1173         }
1174
1175         return 0;
1176 }
1177
1178 static noinline ssize_t btrfs_buffered_write(struct kiocb *iocb,
1179                                                struct iov_iter *i)
1180 {
1181         struct file *file = iocb->ki_filp;
1182         loff_t pos;
1183         struct inode *inode = file_inode(file);
1184         struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
1185         struct page **pages = NULL;
1186         struct extent_changeset *data_reserved = NULL;
1187         u64 release_bytes = 0;
1188         u64 lockstart;
1189         u64 lockend;
1190         size_t num_written = 0;
1191         int nrptrs;
1192         ssize_t ret;
1193         bool only_release_metadata = false;
1194         bool force_page_uptodate = false;
1195         loff_t old_isize = i_size_read(inode);
1196         unsigned int ilock_flags = 0;
1197         const bool nowait = (iocb->ki_flags & IOCB_NOWAIT);
1198         unsigned int bdp_flags = (nowait ? BDP_ASYNC : 0);
1199
1200         if (nowait)
1201                 ilock_flags |= BTRFS_ILOCK_TRY;
1202
1203         ret = btrfs_inode_lock(BTRFS_I(inode), ilock_flags);
1204         if (ret < 0)
1205                 return ret;
1206
1207         ret = generic_write_checks(iocb, i);
1208         if (ret <= 0)
1209                 goto out;
1210
1211         ret = btrfs_write_check(iocb, i, ret);
1212         if (ret < 0)
1213                 goto out;
1214
1215         pos = iocb->ki_pos;
1216         nrptrs = min(DIV_ROUND_UP(iov_iter_count(i), PAGE_SIZE),
1217                         PAGE_SIZE / (sizeof(struct page *)));
1218         nrptrs = min(nrptrs, current->nr_dirtied_pause - current->nr_dirtied);
1219         nrptrs = max(nrptrs, 8);
1220         pages = kmalloc_array(nrptrs, sizeof(struct page *), GFP_KERNEL);
1221         if (!pages) {
1222                 ret = -ENOMEM;
1223                 goto out;
1224         }
1225
1226         while (iov_iter_count(i) > 0) {
1227                 struct extent_state *cached_state = NULL;
1228                 size_t offset = offset_in_page(pos);
1229                 size_t sector_offset;
1230                 size_t write_bytes = min(iov_iter_count(i),
1231                                          nrptrs * (size_t)PAGE_SIZE -
1232                                          offset);
1233                 size_t num_pages;
1234                 size_t reserve_bytes;
1235                 size_t dirty_pages;
1236                 size_t copied;
1237                 size_t dirty_sectors;
1238                 size_t num_sectors;
1239                 int extents_locked;
1240
1241                 /*
1242                  * Fault pages before locking them in prepare_pages
1243                  * to avoid recursive lock
1244                  */
1245                 if (unlikely(fault_in_iov_iter_readable(i, write_bytes))) {
1246                         ret = -EFAULT;
1247                         break;
1248                 }
1249
1250                 only_release_metadata = false;
1251                 sector_offset = pos & (fs_info->sectorsize - 1);
1252
1253                 extent_changeset_release(data_reserved);
1254                 ret = btrfs_check_data_free_space(BTRFS_I(inode),
1255                                                   &data_reserved, pos,
1256                                                   write_bytes, nowait);
1257                 if (ret < 0) {
1258                         int can_nocow;
1259
1260                         if (nowait && (ret == -ENOSPC || ret == -EAGAIN)) {
1261                                 ret = -EAGAIN;
1262                                 break;
1263                         }
1264
1265                         /*
1266                          * If we don't have to COW at the offset, reserve
1267                          * metadata only. write_bytes may get smaller than
1268                          * requested here.
1269                          */
1270                         can_nocow = btrfs_check_nocow_lock(BTRFS_I(inode), pos,
1271                                                            &write_bytes, nowait);
1272                         if (can_nocow < 0)
1273                                 ret = can_nocow;
1274                         if (can_nocow > 0)
1275                                 ret = 0;
1276                         if (ret)
1277                                 break;
1278                         only_release_metadata = true;
1279                 }
1280
1281                 num_pages = DIV_ROUND_UP(write_bytes + offset, PAGE_SIZE);
1282                 WARN_ON(num_pages > nrptrs);
1283                 reserve_bytes = round_up(write_bytes + sector_offset,
1284                                          fs_info->sectorsize);
1285                 WARN_ON(reserve_bytes == 0);
1286                 ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode),
1287                                                       reserve_bytes,
1288                                                       reserve_bytes, nowait);
1289                 if (ret) {
1290                         if (!only_release_metadata)
1291                                 btrfs_free_reserved_data_space(BTRFS_I(inode),
1292                                                 data_reserved, pos,
1293                                                 write_bytes);
1294                         else
1295                                 btrfs_check_nocow_unlock(BTRFS_I(inode));
1296
1297                         if (nowait && ret == -ENOSPC)
1298                                 ret = -EAGAIN;
1299                         break;
1300                 }
1301
1302                 release_bytes = reserve_bytes;
1303 again:
1304                 ret = balance_dirty_pages_ratelimited_flags(inode->i_mapping, bdp_flags);
1305                 if (ret) {
1306                         btrfs_delalloc_release_extents(BTRFS_I(inode), reserve_bytes);
1307                         break;
1308                 }
1309
1310                 /*
1311                  * This is going to setup the pages array with the number of
1312                  * pages we want, so we don't really need to worry about the
1313                  * contents of pages from loop to loop
1314                  */
1315                 ret = prepare_pages(inode, pages, num_pages,
1316                                     pos, write_bytes, force_page_uptodate, false);
1317                 if (ret) {
1318                         btrfs_delalloc_release_extents(BTRFS_I(inode),
1319                                                        reserve_bytes);
1320                         break;
1321                 }
1322
1323                 extents_locked = lock_and_cleanup_extent_if_need(
1324                                 BTRFS_I(inode), pages,
1325                                 num_pages, pos, write_bytes, &lockstart,
1326                                 &lockend, nowait, &cached_state);
1327                 if (extents_locked < 0) {
1328                         if (!nowait && extents_locked == -EAGAIN)
1329                                 goto again;
1330
1331                         btrfs_delalloc_release_extents(BTRFS_I(inode),
1332                                                        reserve_bytes);
1333                         ret = extents_locked;
1334                         break;
1335                 }
1336
1337                 copied = btrfs_copy_from_user(pos, write_bytes, pages, i);
1338
1339                 num_sectors = BTRFS_BYTES_TO_BLKS(fs_info, reserve_bytes);
1340                 dirty_sectors = round_up(copied + sector_offset,
1341                                         fs_info->sectorsize);
1342                 dirty_sectors = BTRFS_BYTES_TO_BLKS(fs_info, dirty_sectors);
1343
1344                 /*
1345                  * if we have trouble faulting in the pages, fall
1346                  * back to one page at a time
1347                  */
1348                 if (copied < write_bytes)
1349                         nrptrs = 1;
1350
1351                 if (copied == 0) {
1352                         force_page_uptodate = true;
1353                         dirty_sectors = 0;
1354                         dirty_pages = 0;
1355                 } else {
1356                         force_page_uptodate = false;
1357                         dirty_pages = DIV_ROUND_UP(copied + offset,
1358                                                    PAGE_SIZE);
1359                 }
1360
1361                 if (num_sectors > dirty_sectors) {
1362                         /* release everything except the sectors we dirtied */
1363                         release_bytes -= dirty_sectors << fs_info->sectorsize_bits;
1364                         if (only_release_metadata) {
1365                                 btrfs_delalloc_release_metadata(BTRFS_I(inode),
1366                                                         release_bytes, true);
1367                         } else {
1368                                 u64 __pos;
1369
1370                                 __pos = round_down(pos,
1371                                                    fs_info->sectorsize) +
1372                                         (dirty_pages << PAGE_SHIFT);
1373                                 btrfs_delalloc_release_space(BTRFS_I(inode),
1374                                                 data_reserved, __pos,
1375                                                 release_bytes, true);
1376                         }
1377                 }
1378
1379                 release_bytes = round_up(copied + sector_offset,
1380                                         fs_info->sectorsize);
1381
1382                 ret = btrfs_dirty_pages(BTRFS_I(inode), pages,
1383                                         dirty_pages, pos, copied,
1384                                         &cached_state, only_release_metadata);
1385
1386                 /*
1387                  * If we have not locked the extent range, because the range's
1388                  * start offset is >= i_size, we might still have a non-NULL
1389                  * cached extent state, acquired while marking the extent range
1390                  * as delalloc through btrfs_dirty_pages(). Therefore free any
1391                  * possible cached extent state to avoid a memory leak.
1392                  */
1393                 if (extents_locked)
1394                         unlock_extent(&BTRFS_I(inode)->io_tree, lockstart,
1395                                       lockend, &cached_state);
1396                 else
1397                         free_extent_state(cached_state);
1398
1399                 btrfs_delalloc_release_extents(BTRFS_I(inode), reserve_bytes);
1400                 if (ret) {
1401                         btrfs_drop_pages(fs_info, pages, num_pages, pos, copied);
1402                         break;
1403                 }
1404
1405                 release_bytes = 0;
1406                 if (only_release_metadata)
1407                         btrfs_check_nocow_unlock(BTRFS_I(inode));
1408
1409                 btrfs_drop_pages(fs_info, pages, num_pages, pos, copied);
1410
1411                 cond_resched();
1412
1413                 pos += copied;
1414                 num_written += copied;
1415         }
1416
1417         kfree(pages);
1418
1419         if (release_bytes) {
1420                 if (only_release_metadata) {
1421                         btrfs_check_nocow_unlock(BTRFS_I(inode));
1422                         btrfs_delalloc_release_metadata(BTRFS_I(inode),
1423                                         release_bytes, true);
1424                 } else {
1425                         btrfs_delalloc_release_space(BTRFS_I(inode),
1426                                         data_reserved,
1427                                         round_down(pos, fs_info->sectorsize),
1428                                         release_bytes, true);
1429                 }
1430         }
1431
1432         extent_changeset_free(data_reserved);
1433         if (num_written > 0) {
1434                 pagecache_isize_extended(inode, old_isize, iocb->ki_pos);
1435                 iocb->ki_pos += num_written;
1436         }
1437 out:
1438         btrfs_inode_unlock(BTRFS_I(inode), ilock_flags);
1439         return num_written ? num_written : ret;
1440 }
1441
1442 static ssize_t check_direct_IO(struct btrfs_fs_info *fs_info,
1443                                const struct iov_iter *iter, loff_t offset)
1444 {
1445         const u32 blocksize_mask = fs_info->sectorsize - 1;
1446
1447         if (offset & blocksize_mask)
1448                 return -EINVAL;
1449
1450         if (iov_iter_alignment(iter) & blocksize_mask)
1451                 return -EINVAL;
1452
1453         return 0;
1454 }
1455
1456 static ssize_t btrfs_direct_write(struct kiocb *iocb, struct iov_iter *from)
1457 {
1458         struct file *file = iocb->ki_filp;
1459         struct inode *inode = file_inode(file);
1460         struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
1461         loff_t pos;
1462         ssize_t written = 0;
1463         ssize_t written_buffered;
1464         size_t prev_left = 0;
1465         loff_t endbyte;
1466         ssize_t err;
1467         unsigned int ilock_flags = 0;
1468         struct iomap_dio *dio;
1469
1470         if (iocb->ki_flags & IOCB_NOWAIT)
1471                 ilock_flags |= BTRFS_ILOCK_TRY;
1472
1473         /*
1474          * If the write DIO is within EOF, use a shared lock and also only if
1475          * security bits will likely not be dropped by file_remove_privs() called
1476          * from btrfs_write_check(). Either will need to be rechecked after the
1477          * lock was acquired.
1478          */
1479         if (iocb->ki_pos + iov_iter_count(from) <= i_size_read(inode) && IS_NOSEC(inode))
1480                 ilock_flags |= BTRFS_ILOCK_SHARED;
1481
1482 relock:
1483         err = btrfs_inode_lock(BTRFS_I(inode), ilock_flags);
1484         if (err < 0)
1485                 return err;
1486
1487         /* Shared lock cannot be used with security bits set. */
1488         if ((ilock_flags & BTRFS_ILOCK_SHARED) && !IS_NOSEC(inode)) {
1489                 btrfs_inode_unlock(BTRFS_I(inode), ilock_flags);
1490                 ilock_flags &= ~BTRFS_ILOCK_SHARED;
1491                 goto relock;
1492         }
1493
1494         err = generic_write_checks(iocb, from);
1495         if (err <= 0) {
1496                 btrfs_inode_unlock(BTRFS_I(inode), ilock_flags);
1497                 return err;
1498         }
1499
1500         err = btrfs_write_check(iocb, from, err);
1501         if (err < 0) {
1502                 btrfs_inode_unlock(BTRFS_I(inode), ilock_flags);
1503                 goto out;
1504         }
1505
1506         pos = iocb->ki_pos;
1507         /*
1508          * Re-check since file size may have changed just before taking the
1509          * lock or pos may have changed because of O_APPEND in generic_write_check()
1510          */
1511         if ((ilock_flags & BTRFS_ILOCK_SHARED) &&
1512             pos + iov_iter_count(from) > i_size_read(inode)) {
1513                 btrfs_inode_unlock(BTRFS_I(inode), ilock_flags);
1514                 ilock_flags &= ~BTRFS_ILOCK_SHARED;
1515                 goto relock;
1516         }
1517
1518         if (check_direct_IO(fs_info, from, pos)) {
1519                 btrfs_inode_unlock(BTRFS_I(inode), ilock_flags);
1520                 goto buffered;
1521         }
1522
1523         /*
1524          * The iov_iter can be mapped to the same file range we are writing to.
1525          * If that's the case, then we will deadlock in the iomap code, because
1526          * it first calls our callback btrfs_dio_iomap_begin(), which will create
1527          * an ordered extent, and after that it will fault in the pages that the
1528          * iov_iter refers to. During the fault in we end up in the readahead
1529          * pages code (starting at btrfs_readahead()), which will lock the range,
1530          * find that ordered extent and then wait for it to complete (at
1531          * btrfs_lock_and_flush_ordered_range()), resulting in a deadlock since
1532          * obviously the ordered extent can never complete as we didn't submit
1533          * yet the respective bio(s). This always happens when the buffer is
1534          * memory mapped to the same file range, since the iomap DIO code always
1535          * invalidates pages in the target file range (after starting and waiting
1536          * for any writeback).
1537          *
1538          * So here we disable page faults in the iov_iter and then retry if we
1539          * got -EFAULT, faulting in the pages before the retry.
1540          */
1541         from->nofault = true;
1542         dio = btrfs_dio_write(iocb, from, written);
1543         from->nofault = false;
1544
1545         /*
1546          * iomap_dio_complete() will call btrfs_sync_file() if we have a dsync
1547          * iocb, and that needs to lock the inode. So unlock it before calling
1548          * iomap_dio_complete() to avoid a deadlock.
1549          */
1550         btrfs_inode_unlock(BTRFS_I(inode), ilock_flags);
1551
1552         if (IS_ERR_OR_NULL(dio))
1553                 err = PTR_ERR_OR_ZERO(dio);
1554         else
1555                 err = iomap_dio_complete(dio);
1556
1557         /* No increment (+=) because iomap returns a cumulative value. */
1558         if (err > 0)
1559                 written = err;
1560
1561         if (iov_iter_count(from) > 0 && (err == -EFAULT || err > 0)) {
1562                 const size_t left = iov_iter_count(from);
1563                 /*
1564                  * We have more data left to write. Try to fault in as many as
1565                  * possible of the remainder pages and retry. We do this without
1566                  * releasing and locking again the inode, to prevent races with
1567                  * truncate.
1568                  *
1569                  * Also, in case the iov refers to pages in the file range of the
1570                  * file we want to write to (due to a mmap), we could enter an
1571                  * infinite loop if we retry after faulting the pages in, since
1572                  * iomap will invalidate any pages in the range early on, before
1573                  * it tries to fault in the pages of the iov. So we keep track of
1574                  * how much was left of iov in the previous EFAULT and fallback
1575                  * to buffered IO in case we haven't made any progress.
1576                  */
1577                 if (left == prev_left) {
1578                         err = -ENOTBLK;
1579                 } else {
1580                         fault_in_iov_iter_readable(from, left);
1581                         prev_left = left;
1582                         goto relock;
1583                 }
1584         }
1585
1586         /*
1587          * If 'err' is -ENOTBLK or we have not written all data, then it means
1588          * we must fallback to buffered IO.
1589          */
1590         if ((err < 0 && err != -ENOTBLK) || !iov_iter_count(from))
1591                 goto out;
1592
1593 buffered:
1594         /*
1595          * If we are in a NOWAIT context, then return -EAGAIN to signal the caller
1596          * it must retry the operation in a context where blocking is acceptable,
1597          * because even if we end up not blocking during the buffered IO attempt
1598          * below, we will block when flushing and waiting for the IO.
1599          */
1600         if (iocb->ki_flags & IOCB_NOWAIT) {
1601                 err = -EAGAIN;
1602                 goto out;
1603         }
1604
1605         pos = iocb->ki_pos;
1606         written_buffered = btrfs_buffered_write(iocb, from);
1607         if (written_buffered < 0) {
1608                 err = written_buffered;
1609                 goto out;
1610         }
1611         /*
1612          * Ensure all data is persisted. We want the next direct IO read to be
1613          * able to read what was just written.
1614          */
1615         endbyte = pos + written_buffered - 1;
1616         err = btrfs_fdatawrite_range(inode, pos, endbyte);
1617         if (err)
1618                 goto out;
1619         err = filemap_fdatawait_range(inode->i_mapping, pos, endbyte);
1620         if (err)
1621                 goto out;
1622         written += written_buffered;
1623         iocb->ki_pos = pos + written_buffered;
1624         invalidate_mapping_pages(file->f_mapping, pos >> PAGE_SHIFT,
1625                                  endbyte >> PAGE_SHIFT);
1626 out:
1627         return err < 0 ? err : written;
1628 }
1629
1630 static ssize_t btrfs_encoded_write(struct kiocb *iocb, struct iov_iter *from,
1631                         const struct btrfs_ioctl_encoded_io_args *encoded)
1632 {
1633         struct file *file = iocb->ki_filp;
1634         struct inode *inode = file_inode(file);
1635         loff_t count;
1636         ssize_t ret;
1637
1638         btrfs_inode_lock(BTRFS_I(inode), 0);
1639         count = encoded->len;
1640         ret = generic_write_checks_count(iocb, &count);
1641         if (ret == 0 && count != encoded->len) {
1642                 /*
1643                  * The write got truncated by generic_write_checks_count(). We
1644                  * can't do a partial encoded write.
1645                  */
1646                 ret = -EFBIG;
1647         }
1648         if (ret || encoded->len == 0)
1649                 goto out;
1650
1651         ret = btrfs_write_check(iocb, from, encoded->len);
1652         if (ret < 0)
1653                 goto out;
1654
1655         ret = btrfs_do_encoded_write(iocb, from, encoded);
1656 out:
1657         btrfs_inode_unlock(BTRFS_I(inode), 0);
1658         return ret;
1659 }
1660
1661 ssize_t btrfs_do_write_iter(struct kiocb *iocb, struct iov_iter *from,
1662                             const struct btrfs_ioctl_encoded_io_args *encoded)
1663 {
1664         struct file *file = iocb->ki_filp;
1665         struct btrfs_inode *inode = BTRFS_I(file_inode(file));
1666         ssize_t num_written, num_sync;
1667
1668         /*
1669          * If the fs flips readonly due to some impossible error, although we
1670          * have opened a file as writable, we have to stop this write operation
1671          * to ensure consistency.
1672          */
1673         if (BTRFS_FS_ERROR(inode->root->fs_info))
1674                 return -EROFS;
1675
1676         if (encoded && (iocb->ki_flags & IOCB_NOWAIT))
1677                 return -EOPNOTSUPP;
1678
1679         if (encoded) {
1680                 num_written = btrfs_encoded_write(iocb, from, encoded);
1681                 num_sync = encoded->len;
1682         } else if (iocb->ki_flags & IOCB_DIRECT) {
1683                 num_written = btrfs_direct_write(iocb, from);
1684                 num_sync = num_written;
1685         } else {
1686                 num_written = btrfs_buffered_write(iocb, from);
1687                 num_sync = num_written;
1688         }
1689
1690         btrfs_set_inode_last_sub_trans(inode);
1691
1692         if (num_sync > 0) {
1693                 num_sync = generic_write_sync(iocb, num_sync);
1694                 if (num_sync < 0)
1695                         num_written = num_sync;
1696         }
1697
1698         return num_written;
1699 }
1700
1701 static ssize_t btrfs_file_write_iter(struct kiocb *iocb, struct iov_iter *from)
1702 {
1703         return btrfs_do_write_iter(iocb, from, NULL);
1704 }
1705
1706 int btrfs_release_file(struct inode *inode, struct file *filp)
1707 {
1708         struct btrfs_file_private *private = filp->private_data;
1709
1710         if (private) {
1711                 kfree(private->filldir_buf);
1712                 free_extent_state(private->llseek_cached_state);
1713                 kfree(private);
1714                 filp->private_data = NULL;
1715         }
1716
1717         /*
1718          * Set by setattr when we are about to truncate a file from a non-zero
1719          * size to a zero size.  This tries to flush down new bytes that may
1720          * have been written if the application were using truncate to replace
1721          * a file in place.
1722          */
1723         if (test_and_clear_bit(BTRFS_INODE_FLUSH_ON_CLOSE,
1724                                &BTRFS_I(inode)->runtime_flags))
1725                         filemap_flush(inode->i_mapping);
1726         return 0;
1727 }
1728
1729 static int start_ordered_ops(struct inode *inode, loff_t start, loff_t end)
1730 {
1731         int ret;
1732         struct blk_plug plug;
1733
1734         /*
1735          * This is only called in fsync, which would do synchronous writes, so
1736          * a plug can merge adjacent IOs as much as possible.  Esp. in case of
1737          * multiple disks using raid profile, a large IO can be split to
1738          * several segments of stripe length (currently 64K).
1739          */
1740         blk_start_plug(&plug);
1741         ret = btrfs_fdatawrite_range(inode, start, end);
1742         blk_finish_plug(&plug);
1743
1744         return ret;
1745 }
1746
1747 static inline bool skip_inode_logging(const struct btrfs_log_ctx *ctx)
1748 {
1749         struct btrfs_inode *inode = BTRFS_I(ctx->inode);
1750         struct btrfs_fs_info *fs_info = inode->root->fs_info;
1751
1752         if (btrfs_inode_in_log(inode, fs_info->generation) &&
1753             list_empty(&ctx->ordered_extents))
1754                 return true;
1755
1756         /*
1757          * If we are doing a fast fsync we can not bail out if the inode's
1758          * last_trans is <= then the last committed transaction, because we only
1759          * update the last_trans of the inode during ordered extent completion,
1760          * and for a fast fsync we don't wait for that, we only wait for the
1761          * writeback to complete.
1762          */
1763         if (inode->last_trans <= fs_info->last_trans_committed &&
1764             (test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, &inode->runtime_flags) ||
1765              list_empty(&ctx->ordered_extents)))
1766                 return true;
1767
1768         return false;
1769 }
1770
1771 /*
1772  * fsync call for both files and directories.  This logs the inode into
1773  * the tree log instead of forcing full commits whenever possible.
1774  *
1775  * It needs to call filemap_fdatawait so that all ordered extent updates are
1776  * in the metadata btree are up to date for copying to the log.
1777  *
1778  * It drops the inode mutex before doing the tree log commit.  This is an
1779  * important optimization for directories because holding the mutex prevents
1780  * new operations on the dir while we write to disk.
1781  */
1782 int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
1783 {
1784         struct dentry *dentry = file_dentry(file);
1785         struct inode *inode = d_inode(dentry);
1786         struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
1787         struct btrfs_root *root = BTRFS_I(inode)->root;
1788         struct btrfs_trans_handle *trans;
1789         struct btrfs_log_ctx ctx;
1790         int ret = 0, err;
1791         u64 len;
1792         bool full_sync;
1793
1794         trace_btrfs_sync_file(file, datasync);
1795
1796         btrfs_init_log_ctx(&ctx, inode);
1797
1798         /*
1799          * Always set the range to a full range, otherwise we can get into
1800          * several problems, from missing file extent items to represent holes
1801          * when not using the NO_HOLES feature, to log tree corruption due to
1802          * races between hole detection during logging and completion of ordered
1803          * extents outside the range, to missing checksums due to ordered extents
1804          * for which we flushed only a subset of their pages.
1805          */
1806         start = 0;
1807         end = LLONG_MAX;
1808         len = (u64)LLONG_MAX + 1;
1809
1810         /*
1811          * We write the dirty pages in the range and wait until they complete
1812          * out of the ->i_mutex. If so, we can flush the dirty pages by
1813          * multi-task, and make the performance up.  See
1814          * btrfs_wait_ordered_range for an explanation of the ASYNC check.
1815          */
1816         ret = start_ordered_ops(inode, start, end);
1817         if (ret)
1818                 goto out;
1819
1820         btrfs_inode_lock(BTRFS_I(inode), BTRFS_ILOCK_MMAP);
1821
1822         atomic_inc(&root->log_batch);
1823
1824         /*
1825          * Before we acquired the inode's lock and the mmap lock, someone may
1826          * have dirtied more pages in the target range. We need to make sure
1827          * that writeback for any such pages does not start while we are logging
1828          * the inode, because if it does, any of the following might happen when
1829          * we are not doing a full inode sync:
1830          *
1831          * 1) We log an extent after its writeback finishes but before its
1832          *    checksums are added to the csum tree, leading to -EIO errors
1833          *    when attempting to read the extent after a log replay.
1834          *
1835          * 2) We can end up logging an extent before its writeback finishes.
1836          *    Therefore after the log replay we will have a file extent item
1837          *    pointing to an unwritten extent (and no data checksums as well).
1838          *
1839          * So trigger writeback for any eventual new dirty pages and then we
1840          * wait for all ordered extents to complete below.
1841          */
1842         ret = start_ordered_ops(inode, start, end);
1843         if (ret) {
1844                 btrfs_inode_unlock(BTRFS_I(inode), BTRFS_ILOCK_MMAP);
1845                 goto out;
1846         }
1847
1848         /*
1849          * Always check for the full sync flag while holding the inode's lock,
1850          * to avoid races with other tasks. The flag must be either set all the
1851          * time during logging or always off all the time while logging.
1852          * We check the flag here after starting delalloc above, because when
1853          * running delalloc the full sync flag may be set if we need to drop
1854          * extra extent map ranges due to temporary memory allocation failures.
1855          */
1856         full_sync = test_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
1857                              &BTRFS_I(inode)->runtime_flags);
1858
1859         /*
1860          * We have to do this here to avoid the priority inversion of waiting on
1861          * IO of a lower priority task while holding a transaction open.
1862          *
1863          * For a full fsync we wait for the ordered extents to complete while
1864          * for a fast fsync we wait just for writeback to complete, and then
1865          * attach the ordered extents to the transaction so that a transaction
1866          * commit waits for their completion, to avoid data loss if we fsync,
1867          * the current transaction commits before the ordered extents complete
1868          * and a power failure happens right after that.
1869          *
1870          * For zoned filesystem, if a write IO uses a ZONE_APPEND command, the
1871          * logical address recorded in the ordered extent may change. We need
1872          * to wait for the IO to stabilize the logical address.
1873          */
1874         if (full_sync || btrfs_is_zoned(fs_info)) {
1875                 ret = btrfs_wait_ordered_range(inode, start, len);
1876         } else {
1877                 /*
1878                  * Get our ordered extents as soon as possible to avoid doing
1879                  * checksum lookups in the csum tree, and use instead the
1880                  * checksums attached to the ordered extents.
1881                  */
1882                 btrfs_get_ordered_extents_for_logging(BTRFS_I(inode),
1883                                                       &ctx.ordered_extents);
1884                 ret = filemap_fdatawait_range(inode->i_mapping, start, end);
1885         }
1886
1887         if (ret)
1888                 goto out_release_extents;
1889
1890         atomic_inc(&root->log_batch);
1891
1892         smp_mb();
1893         if (skip_inode_logging(&ctx)) {
1894                 /*
1895                  * We've had everything committed since the last time we were
1896                  * modified so clear this flag in case it was set for whatever
1897                  * reason, it's no longer relevant.
1898                  */
1899                 clear_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
1900                           &BTRFS_I(inode)->runtime_flags);
1901                 /*
1902                  * An ordered extent might have started before and completed
1903                  * already with io errors, in which case the inode was not
1904                  * updated and we end up here. So check the inode's mapping
1905                  * for any errors that might have happened since we last
1906                  * checked called fsync.
1907                  */
1908                 ret = filemap_check_wb_err(inode->i_mapping, file->f_wb_err);
1909                 goto out_release_extents;
1910         }
1911
1912         /*
1913          * We use start here because we will need to wait on the IO to complete
1914          * in btrfs_sync_log, which could require joining a transaction (for
1915          * example checking cross references in the nocow path).  If we use join
1916          * here we could get into a situation where we're waiting on IO to
1917          * happen that is blocked on a transaction trying to commit.  With start
1918          * we inc the extwriter counter, so we wait for all extwriters to exit
1919          * before we start blocking joiners.  This comment is to keep somebody
1920          * from thinking they are super smart and changing this to
1921          * btrfs_join_transaction *cough*Josef*cough*.
1922          */
1923         trans = btrfs_start_transaction(root, 0);
1924         if (IS_ERR(trans)) {
1925                 ret = PTR_ERR(trans);
1926                 goto out_release_extents;
1927         }
1928         trans->in_fsync = true;
1929
1930         ret = btrfs_log_dentry_safe(trans, dentry, &ctx);
1931         btrfs_release_log_ctx_extents(&ctx);
1932         if (ret < 0) {
1933                 /* Fallthrough and commit/free transaction. */
1934                 ret = BTRFS_LOG_FORCE_COMMIT;
1935         }
1936
1937         /* we've logged all the items and now have a consistent
1938          * version of the file in the log.  It is possible that
1939          * someone will come in and modify the file, but that's
1940          * fine because the log is consistent on disk, and we
1941          * have references to all of the file's extents
1942          *
1943          * It is possible that someone will come in and log the
1944          * file again, but that will end up using the synchronization
1945          * inside btrfs_sync_log to keep things safe.
1946          */
1947         btrfs_inode_unlock(BTRFS_I(inode), BTRFS_ILOCK_MMAP);
1948
1949         if (ret == BTRFS_NO_LOG_SYNC) {
1950                 ret = btrfs_end_transaction(trans);
1951                 goto out;
1952         }
1953
1954         /* We successfully logged the inode, attempt to sync the log. */
1955         if (!ret) {
1956                 ret = btrfs_sync_log(trans, root, &ctx);
1957                 if (!ret) {
1958                         ret = btrfs_end_transaction(trans);
1959                         goto out;
1960                 }
1961         }
1962
1963         /*
1964          * At this point we need to commit the transaction because we had
1965          * btrfs_need_log_full_commit() or some other error.
1966          *
1967          * If we didn't do a full sync we have to stop the trans handle, wait on
1968          * the ordered extents, start it again and commit the transaction.  If
1969          * we attempt to wait on the ordered extents here we could deadlock with
1970          * something like fallocate() that is holding the extent lock trying to
1971          * start a transaction while some other thread is trying to commit the
1972          * transaction while we (fsync) are currently holding the transaction
1973          * open.
1974          */
1975         if (!full_sync) {
1976                 ret = btrfs_end_transaction(trans);
1977                 if (ret)
1978                         goto out;
1979                 ret = btrfs_wait_ordered_range(inode, start, len);
1980                 if (ret)
1981                         goto out;
1982
1983                 /*
1984                  * This is safe to use here because we're only interested in
1985                  * making sure the transaction that had the ordered extents is
1986                  * committed.  We aren't waiting on anything past this point,
1987                  * we're purely getting the transaction and committing it.
1988                  */
1989                 trans = btrfs_attach_transaction_barrier(root);
1990                 if (IS_ERR(trans)) {
1991                         ret = PTR_ERR(trans);
1992
1993                         /*
1994                          * We committed the transaction and there's no currently
1995                          * running transaction, this means everything we care
1996                          * about made it to disk and we are done.
1997                          */
1998                         if (ret == -ENOENT)
1999                                 ret = 0;
2000                         goto out;
2001                 }
2002         }
2003
2004         ret = btrfs_commit_transaction(trans);
2005 out:
2006         ASSERT(list_empty(&ctx.list));
2007         ASSERT(list_empty(&ctx.conflict_inodes));
2008         err = file_check_and_advance_wb_err(file);
2009         if (!ret)
2010                 ret = err;
2011         return ret > 0 ? -EIO : ret;
2012
2013 out_release_extents:
2014         btrfs_release_log_ctx_extents(&ctx);
2015         btrfs_inode_unlock(BTRFS_I(inode), BTRFS_ILOCK_MMAP);
2016         goto out;
2017 }
2018
2019 static const struct vm_operations_struct btrfs_file_vm_ops = {
2020         .fault          = filemap_fault,
2021         .map_pages      = filemap_map_pages,
2022         .page_mkwrite   = btrfs_page_mkwrite,
2023 };
2024
2025 static int btrfs_file_mmap(struct file  *filp, struct vm_area_struct *vma)
2026 {
2027         struct address_space *mapping = filp->f_mapping;
2028
2029         if (!mapping->a_ops->read_folio)
2030                 return -ENOEXEC;
2031
2032         file_accessed(filp);
2033         vma->vm_ops = &btrfs_file_vm_ops;
2034
2035         return 0;
2036 }
2037
2038 static int hole_mergeable(struct btrfs_inode *inode, struct extent_buffer *leaf,
2039                           int slot, u64 start, u64 end)
2040 {
2041         struct btrfs_file_extent_item *fi;
2042         struct btrfs_key key;
2043
2044         if (slot < 0 || slot >= btrfs_header_nritems(leaf))
2045                 return 0;
2046
2047         btrfs_item_key_to_cpu(leaf, &key, slot);
2048         if (key.objectid != btrfs_ino(inode) ||
2049             key.type != BTRFS_EXTENT_DATA_KEY)
2050                 return 0;
2051
2052         fi = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
2053
2054         if (btrfs_file_extent_type(leaf, fi) != BTRFS_FILE_EXTENT_REG)
2055                 return 0;
2056
2057         if (btrfs_file_extent_disk_bytenr(leaf, fi))
2058                 return 0;
2059
2060         if (key.offset == end)
2061                 return 1;
2062         if (key.offset + btrfs_file_extent_num_bytes(leaf, fi) == start)
2063                 return 1;
2064         return 0;
2065 }
2066
2067 static int fill_holes(struct btrfs_trans_handle *trans,
2068                 struct btrfs_inode *inode,
2069                 struct btrfs_path *path, u64 offset, u64 end)
2070 {
2071         struct btrfs_fs_info *fs_info = trans->fs_info;
2072         struct btrfs_root *root = inode->root;
2073         struct extent_buffer *leaf;
2074         struct btrfs_file_extent_item *fi;
2075         struct extent_map *hole_em;
2076         struct btrfs_key key;
2077         int ret;
2078
2079         if (btrfs_fs_incompat(fs_info, NO_HOLES))
2080                 goto out;
2081
2082         key.objectid = btrfs_ino(inode);
2083         key.type = BTRFS_EXTENT_DATA_KEY;
2084         key.offset = offset;
2085
2086         ret = btrfs_search_slot(trans, root, &key, path, 0, 1);
2087         if (ret <= 0) {
2088                 /*
2089                  * We should have dropped this offset, so if we find it then
2090                  * something has gone horribly wrong.
2091                  */
2092                 if (ret == 0)
2093                         ret = -EINVAL;
2094                 return ret;
2095         }
2096
2097         leaf = path->nodes[0];
2098         if (hole_mergeable(inode, leaf, path->slots[0] - 1, offset, end)) {
2099                 u64 num_bytes;
2100
2101                 path->slots[0]--;
2102                 fi = btrfs_item_ptr(leaf, path->slots[0],
2103                                     struct btrfs_file_extent_item);
2104                 num_bytes = btrfs_file_extent_num_bytes(leaf, fi) +
2105                         end - offset;
2106                 btrfs_set_file_extent_num_bytes(leaf, fi, num_bytes);
2107                 btrfs_set_file_extent_ram_bytes(leaf, fi, num_bytes);
2108                 btrfs_set_file_extent_offset(leaf, fi, 0);
2109                 btrfs_set_file_extent_generation(leaf, fi, trans->transid);
2110                 btrfs_mark_buffer_dirty(trans, leaf);
2111                 goto out;
2112         }
2113
2114         if (hole_mergeable(inode, leaf, path->slots[0], offset, end)) {
2115                 u64 num_bytes;
2116
2117                 key.offset = offset;
2118                 btrfs_set_item_key_safe(trans, path, &key);
2119                 fi = btrfs_item_ptr(leaf, path->slots[0],
2120                                     struct btrfs_file_extent_item);
2121                 num_bytes = btrfs_file_extent_num_bytes(leaf, fi) + end -
2122                         offset;
2123                 btrfs_set_file_extent_num_bytes(leaf, fi, num_bytes);
2124                 btrfs_set_file_extent_ram_bytes(leaf, fi, num_bytes);
2125                 btrfs_set_file_extent_offset(leaf, fi, 0);
2126                 btrfs_set_file_extent_generation(leaf, fi, trans->transid);
2127                 btrfs_mark_buffer_dirty(trans, leaf);
2128                 goto out;
2129         }
2130         btrfs_release_path(path);
2131
2132         ret = btrfs_insert_hole_extent(trans, root, btrfs_ino(inode), offset,
2133                                        end - offset);
2134         if (ret)
2135                 return ret;
2136
2137 out:
2138         btrfs_release_path(path);
2139
2140         hole_em = alloc_extent_map();
2141         if (!hole_em) {
2142                 btrfs_drop_extent_map_range(inode, offset, end - 1, false);
2143                 btrfs_set_inode_full_sync(inode);
2144         } else {
2145                 hole_em->start = offset;
2146                 hole_em->len = end - offset;
2147                 hole_em->ram_bytes = hole_em->len;
2148                 hole_em->orig_start = offset;
2149
2150                 hole_em->block_start = EXTENT_MAP_HOLE;
2151                 hole_em->block_len = 0;
2152                 hole_em->orig_block_len = 0;
2153                 hole_em->compress_type = BTRFS_COMPRESS_NONE;
2154                 hole_em->generation = trans->transid;
2155
2156                 ret = btrfs_replace_extent_map_range(inode, hole_em, true);
2157                 free_extent_map(hole_em);
2158                 if (ret)
2159                         btrfs_set_inode_full_sync(inode);
2160         }
2161
2162         return 0;
2163 }
2164
2165 /*
2166  * Find a hole extent on given inode and change start/len to the end of hole
2167  * extent.(hole/vacuum extent whose em->start <= start &&
2168  *         em->start + em->len > start)
2169  * When a hole extent is found, return 1 and modify start/len.
2170  */
2171 static int find_first_non_hole(struct btrfs_inode *inode, u64 *start, u64 *len)
2172 {
2173         struct btrfs_fs_info *fs_info = inode->root->fs_info;
2174         struct extent_map *em;
2175         int ret = 0;
2176
2177         em = btrfs_get_extent(inode, NULL, 0,
2178                               round_down(*start, fs_info->sectorsize),
2179                               round_up(*len, fs_info->sectorsize));
2180         if (IS_ERR(em))
2181                 return PTR_ERR(em);
2182
2183         /* Hole or vacuum extent(only exists in no-hole mode) */
2184         if (em->block_start == EXTENT_MAP_HOLE) {
2185                 ret = 1;
2186                 *len = em->start + em->len > *start + *len ?
2187                        0 : *start + *len - em->start - em->len;
2188                 *start = em->start + em->len;
2189         }
2190         free_extent_map(em);
2191         return ret;
2192 }
2193
2194 static void btrfs_punch_hole_lock_range(struct inode *inode,
2195                                         const u64 lockstart,
2196                                         const u64 lockend,
2197                                         struct extent_state **cached_state)
2198 {
2199         /*
2200          * For subpage case, if the range is not at page boundary, we could
2201          * have pages at the leading/tailing part of the range.
2202          * This could lead to dead loop since filemap_range_has_page()
2203          * will always return true.
2204          * So here we need to do extra page alignment for
2205          * filemap_range_has_page().
2206          */
2207         const u64 page_lockstart = round_up(lockstart, PAGE_SIZE);
2208         const u64 page_lockend = round_down(lockend + 1, PAGE_SIZE) - 1;
2209
2210         while (1) {
2211                 truncate_pagecache_range(inode, lockstart, lockend);
2212
2213                 lock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend,
2214                             cached_state);
2215                 /*
2216                  * We can't have ordered extents in the range, nor dirty/writeback
2217                  * pages, because we have locked the inode's VFS lock in exclusive
2218                  * mode, we have locked the inode's i_mmap_lock in exclusive mode,
2219                  * we have flushed all delalloc in the range and we have waited
2220                  * for any ordered extents in the range to complete.
2221                  * We can race with anyone reading pages from this range, so after
2222                  * locking the range check if we have pages in the range, and if
2223                  * we do, unlock the range and retry.
2224                  */
2225                 if (!filemap_range_has_page(inode->i_mapping, page_lockstart,
2226                                             page_lockend))
2227                         break;
2228
2229                 unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend,
2230                               cached_state);
2231         }
2232
2233         btrfs_assert_inode_range_clean(BTRFS_I(inode), lockstart, lockend);
2234 }
2235
2236 static int btrfs_insert_replace_extent(struct btrfs_trans_handle *trans,
2237                                      struct btrfs_inode *inode,
2238                                      struct btrfs_path *path,
2239                                      struct btrfs_replace_extent_info *extent_info,
2240                                      const u64 replace_len,
2241                                      const u64 bytes_to_drop)
2242 {
2243         struct btrfs_fs_info *fs_info = trans->fs_info;
2244         struct btrfs_root *root = inode->root;
2245         struct btrfs_file_extent_item *extent;
2246         struct extent_buffer *leaf;
2247         struct btrfs_key key;
2248         int slot;
2249         struct btrfs_ref ref = { 0 };
2250         int ret;
2251
2252         if (replace_len == 0)
2253                 return 0;
2254
2255         if (extent_info->disk_offset == 0 &&
2256             btrfs_fs_incompat(fs_info, NO_HOLES)) {
2257                 btrfs_update_inode_bytes(inode, 0, bytes_to_drop);
2258                 return 0;
2259         }
2260
2261         key.objectid = btrfs_ino(inode);
2262         key.type = BTRFS_EXTENT_DATA_KEY;
2263         key.offset = extent_info->file_offset;
2264         ret = btrfs_insert_empty_item(trans, root, path, &key,
2265                                       sizeof(struct btrfs_file_extent_item));
2266         if (ret)
2267                 return ret;
2268         leaf = path->nodes[0];
2269         slot = path->slots[0];
2270         write_extent_buffer(leaf, extent_info->extent_buf,
2271                             btrfs_item_ptr_offset(leaf, slot),
2272                             sizeof(struct btrfs_file_extent_item));
2273         extent = btrfs_item_ptr(leaf, slot, struct btrfs_file_extent_item);
2274         ASSERT(btrfs_file_extent_type(leaf, extent) != BTRFS_FILE_EXTENT_INLINE);
2275         btrfs_set_file_extent_offset(leaf, extent, extent_info->data_offset);
2276         btrfs_set_file_extent_num_bytes(leaf, extent, replace_len);
2277         if (extent_info->is_new_extent)
2278                 btrfs_set_file_extent_generation(leaf, extent, trans->transid);
2279         btrfs_mark_buffer_dirty(trans, leaf);
2280         btrfs_release_path(path);
2281
2282         ret = btrfs_inode_set_file_extent_range(inode, extent_info->file_offset,
2283                                                 replace_len);
2284         if (ret)
2285                 return ret;
2286
2287         /* If it's a hole, nothing more needs to be done. */
2288         if (extent_info->disk_offset == 0) {
2289                 btrfs_update_inode_bytes(inode, 0, bytes_to_drop);
2290                 return 0;
2291         }
2292
2293         btrfs_update_inode_bytes(inode, replace_len, bytes_to_drop);
2294
2295         if (extent_info->is_new_extent && extent_info->insertions == 0) {
2296                 key.objectid = extent_info->disk_offset;
2297                 key.type = BTRFS_EXTENT_ITEM_KEY;
2298                 key.offset = extent_info->disk_len;
2299                 ret = btrfs_alloc_reserved_file_extent(trans, root,
2300                                                        btrfs_ino(inode),
2301                                                        extent_info->file_offset,
2302                                                        extent_info->qgroup_reserved,
2303                                                        &key);
2304         } else {
2305                 u64 ref_offset;
2306
2307                 btrfs_init_generic_ref(&ref, BTRFS_ADD_DELAYED_REF,
2308                                        extent_info->disk_offset,
2309                                        extent_info->disk_len, 0,
2310                                        root->root_key.objectid);
2311                 ref_offset = extent_info->file_offset - extent_info->data_offset;
2312                 btrfs_init_data_ref(&ref, root->root_key.objectid,
2313                                     btrfs_ino(inode), ref_offset, 0, false);
2314                 ret = btrfs_inc_extent_ref(trans, &ref);
2315         }
2316
2317         extent_info->insertions++;
2318
2319         return ret;
2320 }
2321
2322 /*
2323  * The respective range must have been previously locked, as well as the inode.
2324  * The end offset is inclusive (last byte of the range).
2325  * @extent_info is NULL for fallocate's hole punching and non-NULL when replacing
2326  * the file range with an extent.
2327  * When not punching a hole, we don't want to end up in a state where we dropped
2328  * extents without inserting a new one, so we must abort the transaction to avoid
2329  * a corruption.
2330  */
2331 int btrfs_replace_file_extents(struct btrfs_inode *inode,
2332                                struct btrfs_path *path, const u64 start,
2333                                const u64 end,
2334                                struct btrfs_replace_extent_info *extent_info,
2335                                struct btrfs_trans_handle **trans_out)
2336 {
2337         struct btrfs_drop_extents_args drop_args = { 0 };
2338         struct btrfs_root *root = inode->root;
2339         struct btrfs_fs_info *fs_info = root->fs_info;
2340         u64 min_size = btrfs_calc_insert_metadata_size(fs_info, 1);
2341         u64 ino_size = round_up(inode->vfs_inode.i_size, fs_info->sectorsize);
2342         struct btrfs_trans_handle *trans = NULL;
2343         struct btrfs_block_rsv *rsv;
2344         unsigned int rsv_count;
2345         u64 cur_offset;
2346         u64 len = end - start;
2347         int ret = 0;
2348
2349         if (end <= start)
2350                 return -EINVAL;
2351
2352         rsv = btrfs_alloc_block_rsv(fs_info, BTRFS_BLOCK_RSV_TEMP);
2353         if (!rsv) {
2354                 ret = -ENOMEM;
2355                 goto out;
2356         }
2357         rsv->size = btrfs_calc_insert_metadata_size(fs_info, 1);
2358         rsv->failfast = true;
2359
2360         /*
2361          * 1 - update the inode
2362          * 1 - removing the extents in the range
2363          * 1 - adding the hole extent if no_holes isn't set or if we are
2364          *     replacing the range with a new extent
2365          */
2366         if (!btrfs_fs_incompat(fs_info, NO_HOLES) || extent_info)
2367                 rsv_count = 3;
2368         else
2369                 rsv_count = 2;
2370
2371         trans = btrfs_start_transaction(root, rsv_count);
2372         if (IS_ERR(trans)) {
2373                 ret = PTR_ERR(trans);
2374                 trans = NULL;
2375                 goto out_free;
2376         }
2377
2378         ret = btrfs_block_rsv_migrate(&fs_info->trans_block_rsv, rsv,
2379                                       min_size, false);
2380         if (WARN_ON(ret))
2381                 goto out_trans;
2382         trans->block_rsv = rsv;
2383
2384         cur_offset = start;
2385         drop_args.path = path;
2386         drop_args.end = end + 1;
2387         drop_args.drop_cache = true;
2388         while (cur_offset < end) {
2389                 drop_args.start = cur_offset;
2390                 ret = btrfs_drop_extents(trans, root, inode, &drop_args);
2391                 /* If we are punching a hole decrement the inode's byte count */
2392                 if (!extent_info)
2393                         btrfs_update_inode_bytes(inode, 0,
2394                                                  drop_args.bytes_found);
2395                 if (ret != -ENOSPC) {
2396                         /*
2397                          * The only time we don't want to abort is if we are
2398                          * attempting to clone a partial inline extent, in which
2399                          * case we'll get EOPNOTSUPP.  However if we aren't
2400                          * clone we need to abort no matter what, because if we
2401                          * got EOPNOTSUPP via prealloc then we messed up and
2402                          * need to abort.
2403                          */
2404                         if (ret &&
2405                             (ret != -EOPNOTSUPP ||
2406                              (extent_info && extent_info->is_new_extent)))
2407                                 btrfs_abort_transaction(trans, ret);
2408                         break;
2409                 }
2410
2411                 trans->block_rsv = &fs_info->trans_block_rsv;
2412
2413                 if (!extent_info && cur_offset < drop_args.drop_end &&
2414                     cur_offset < ino_size) {
2415                         ret = fill_holes(trans, inode, path, cur_offset,
2416                                          drop_args.drop_end);
2417                         if (ret) {
2418                                 /*
2419                                  * If we failed then we didn't insert our hole
2420                                  * entries for the area we dropped, so now the
2421                                  * fs is corrupted, so we must abort the
2422                                  * transaction.
2423                                  */
2424                                 btrfs_abort_transaction(trans, ret);
2425                                 break;
2426                         }
2427                 } else if (!extent_info && cur_offset < drop_args.drop_end) {
2428                         /*
2429                          * We are past the i_size here, but since we didn't
2430                          * insert holes we need to clear the mapped area so we
2431                          * know to not set disk_i_size in this area until a new
2432                          * file extent is inserted here.
2433                          */
2434                         ret = btrfs_inode_clear_file_extent_range(inode,
2435                                         cur_offset,
2436                                         drop_args.drop_end - cur_offset);
2437                         if (ret) {
2438                                 /*
2439                                  * We couldn't clear our area, so we could
2440                                  * presumably adjust up and corrupt the fs, so
2441                                  * we need to abort.
2442                                  */
2443                                 btrfs_abort_transaction(trans, ret);
2444                                 break;
2445                         }
2446                 }
2447
2448                 if (extent_info &&
2449                     drop_args.drop_end > extent_info->file_offset) {
2450                         u64 replace_len = drop_args.drop_end -
2451                                           extent_info->file_offset;
2452
2453                         ret = btrfs_insert_replace_extent(trans, inode, path,
2454                                         extent_info, replace_len,
2455                                         drop_args.bytes_found);
2456                         if (ret) {
2457                                 btrfs_abort_transaction(trans, ret);
2458                                 break;
2459                         }
2460                         extent_info->data_len -= replace_len;
2461                         extent_info->data_offset += replace_len;
2462                         extent_info->file_offset += replace_len;
2463                 }
2464
2465                 /*
2466                  * We are releasing our handle on the transaction, balance the
2467                  * dirty pages of the btree inode and flush delayed items, and
2468                  * then get a new transaction handle, which may now point to a
2469                  * new transaction in case someone else may have committed the
2470                  * transaction we used to replace/drop file extent items. So
2471                  * bump the inode's iversion and update mtime and ctime except
2472                  * if we are called from a dedupe context. This is because a
2473                  * power failure/crash may happen after the transaction is
2474                  * committed and before we finish replacing/dropping all the
2475                  * file extent items we need.
2476                  */
2477                 inode_inc_iversion(&inode->vfs_inode);
2478
2479                 if (!extent_info || extent_info->update_times)
2480                         inode->vfs_inode.i_mtime = inode_set_ctime_current(&inode->vfs_inode);
2481
2482                 ret = btrfs_update_inode(trans, root, inode);
2483                 if (ret)
2484                         break;
2485
2486                 btrfs_end_transaction(trans);
2487                 btrfs_btree_balance_dirty(fs_info);
2488
2489                 trans = btrfs_start_transaction(root, rsv_count);
2490                 if (IS_ERR(trans)) {
2491                         ret = PTR_ERR(trans);
2492                         trans = NULL;
2493                         break;
2494                 }
2495
2496                 ret = btrfs_block_rsv_migrate(&fs_info->trans_block_rsv,
2497                                               rsv, min_size, false);
2498                 if (WARN_ON(ret))
2499                         break;
2500                 trans->block_rsv = rsv;
2501
2502                 cur_offset = drop_args.drop_end;
2503                 len = end - cur_offset;
2504                 if (!extent_info && len) {
2505                         ret = find_first_non_hole(inode, &cur_offset, &len);
2506                         if (unlikely(ret < 0))
2507                                 break;
2508                         if (ret && !len) {
2509                                 ret = 0;
2510                                 break;
2511                         }
2512                 }
2513         }
2514
2515         /*
2516          * If we were cloning, force the next fsync to be a full one since we
2517          * we replaced (or just dropped in the case of cloning holes when
2518          * NO_HOLES is enabled) file extent items and did not setup new extent
2519          * maps for the replacement extents (or holes).
2520          */
2521         if (extent_info && !extent_info->is_new_extent)
2522                 btrfs_set_inode_full_sync(inode);
2523
2524         if (ret)
2525                 goto out_trans;
2526
2527         trans->block_rsv = &fs_info->trans_block_rsv;
2528         /*
2529          * If we are using the NO_HOLES feature we might have had already an
2530          * hole that overlaps a part of the region [lockstart, lockend] and
2531          * ends at (or beyond) lockend. Since we have no file extent items to
2532          * represent holes, drop_end can be less than lockend and so we must
2533          * make sure we have an extent map representing the existing hole (the
2534          * call to __btrfs_drop_extents() might have dropped the existing extent
2535          * map representing the existing hole), otherwise the fast fsync path
2536          * will not record the existence of the hole region
2537          * [existing_hole_start, lockend].
2538          */
2539         if (drop_args.drop_end <= end)
2540                 drop_args.drop_end = end + 1;
2541         /*
2542          * Don't insert file hole extent item if it's for a range beyond eof
2543          * (because it's useless) or if it represents a 0 bytes range (when
2544          * cur_offset == drop_end).
2545          */
2546         if (!extent_info && cur_offset < ino_size &&
2547             cur_offset < drop_args.drop_end) {
2548                 ret = fill_holes(trans, inode, path, cur_offset,
2549                                  drop_args.drop_end);
2550                 if (ret) {
2551                         /* Same comment as above. */
2552                         btrfs_abort_transaction(trans, ret);
2553                         goto out_trans;
2554                 }
2555         } else if (!extent_info && cur_offset < drop_args.drop_end) {
2556                 /* See the comment in the loop above for the reasoning here. */
2557                 ret = btrfs_inode_clear_file_extent_range(inode, cur_offset,
2558                                         drop_args.drop_end - cur_offset);
2559                 if (ret) {
2560                         btrfs_abort_transaction(trans, ret);
2561                         goto out_trans;
2562                 }
2563
2564         }
2565         if (extent_info) {
2566                 ret = btrfs_insert_replace_extent(trans, inode, path,
2567                                 extent_info, extent_info->data_len,
2568                                 drop_args.bytes_found);
2569                 if (ret) {
2570                         btrfs_abort_transaction(trans, ret);
2571                         goto out_trans;
2572                 }
2573         }
2574
2575 out_trans:
2576         if (!trans)
2577                 goto out_free;
2578
2579         trans->block_rsv = &fs_info->trans_block_rsv;
2580         if (ret)
2581                 btrfs_end_transaction(trans);
2582         else
2583                 *trans_out = trans;
2584 out_free:
2585         btrfs_free_block_rsv(fs_info, rsv);
2586 out:
2587         return ret;
2588 }
2589
2590 static int btrfs_punch_hole(struct file *file, loff_t offset, loff_t len)
2591 {
2592         struct inode *inode = file_inode(file);
2593         struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
2594         struct btrfs_root *root = BTRFS_I(inode)->root;
2595         struct extent_state *cached_state = NULL;
2596         struct btrfs_path *path;
2597         struct btrfs_trans_handle *trans = NULL;
2598         u64 lockstart;
2599         u64 lockend;
2600         u64 tail_start;
2601         u64 tail_len;
2602         u64 orig_start = offset;
2603         int ret = 0;
2604         bool same_block;
2605         u64 ino_size;
2606         bool truncated_block = false;
2607         bool updated_inode = false;
2608
2609         btrfs_inode_lock(BTRFS_I(inode), BTRFS_ILOCK_MMAP);
2610
2611         ret = btrfs_wait_ordered_range(inode, offset, len);
2612         if (ret)
2613                 goto out_only_mutex;
2614
2615         ino_size = round_up(inode->i_size, fs_info->sectorsize);
2616         ret = find_first_non_hole(BTRFS_I(inode), &offset, &len);
2617         if (ret < 0)
2618                 goto out_only_mutex;
2619         if (ret && !len) {
2620                 /* Already in a large hole */
2621                 ret = 0;
2622                 goto out_only_mutex;
2623         }
2624
2625         ret = file_modified(file);
2626         if (ret)
2627                 goto out_only_mutex;
2628
2629         lockstart = round_up(offset, fs_info->sectorsize);
2630         lockend = round_down(offset + len, fs_info->sectorsize) - 1;
2631         same_block = (BTRFS_BYTES_TO_BLKS(fs_info, offset))
2632                 == (BTRFS_BYTES_TO_BLKS(fs_info, offset + len - 1));
2633         /*
2634          * We needn't truncate any block which is beyond the end of the file
2635          * because we are sure there is no data there.
2636          */
2637         /*
2638          * Only do this if we are in the same block and we aren't doing the
2639          * entire block.
2640          */
2641         if (same_block && len < fs_info->sectorsize) {
2642                 if (offset < ino_size) {
2643                         truncated_block = true;
2644                         ret = btrfs_truncate_block(BTRFS_I(inode), offset, len,
2645                                                    0);
2646                 } else {
2647                         ret = 0;
2648                 }
2649                 goto out_only_mutex;
2650         }
2651
2652         /* zero back part of the first block */
2653         if (offset < ino_size) {
2654                 truncated_block = true;
2655                 ret = btrfs_truncate_block(BTRFS_I(inode), offset, 0, 0);
2656                 if (ret) {
2657                         btrfs_inode_unlock(BTRFS_I(inode), BTRFS_ILOCK_MMAP);
2658                         return ret;
2659                 }
2660         }
2661
2662         /* Check the aligned pages after the first unaligned page,
2663          * if offset != orig_start, which means the first unaligned page
2664          * including several following pages are already in holes,
2665          * the extra check can be skipped */
2666         if (offset == orig_start) {
2667                 /* after truncate page, check hole again */
2668                 len = offset + len - lockstart;
2669                 offset = lockstart;
2670                 ret = find_first_non_hole(BTRFS_I(inode), &offset, &len);
2671                 if (ret < 0)
2672                         goto out_only_mutex;
2673                 if (ret && !len) {
2674                         ret = 0;
2675                         goto out_only_mutex;
2676                 }
2677                 lockstart = offset;
2678         }
2679
2680         /* Check the tail unaligned part is in a hole */
2681         tail_start = lockend + 1;
2682         tail_len = offset + len - tail_start;
2683         if (tail_len) {
2684                 ret = find_first_non_hole(BTRFS_I(inode), &tail_start, &tail_len);
2685                 if (unlikely(ret < 0))
2686                         goto out_only_mutex;
2687                 if (!ret) {
2688                         /* zero the front end of the last page */
2689                         if (tail_start + tail_len < ino_size) {
2690                                 truncated_block = true;
2691                                 ret = btrfs_truncate_block(BTRFS_I(inode),
2692                                                         tail_start + tail_len,
2693                                                         0, 1);
2694                                 if (ret)
2695                                         goto out_only_mutex;
2696                         }
2697                 }
2698         }
2699
2700         if (lockend < lockstart) {
2701                 ret = 0;
2702                 goto out_only_mutex;
2703         }
2704
2705         btrfs_punch_hole_lock_range(inode, lockstart, lockend, &cached_state);
2706
2707         path = btrfs_alloc_path();
2708         if (!path) {
2709                 ret = -ENOMEM;
2710                 goto out;
2711         }
2712
2713         ret = btrfs_replace_file_extents(BTRFS_I(inode), path, lockstart,
2714                                          lockend, NULL, &trans);
2715         btrfs_free_path(path);
2716         if (ret)
2717                 goto out;
2718
2719         ASSERT(trans != NULL);
2720         inode_inc_iversion(inode);
2721         inode->i_mtime = inode_set_ctime_current(inode);
2722         ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
2723         updated_inode = true;
2724         btrfs_end_transaction(trans);
2725         btrfs_btree_balance_dirty(fs_info);
2726 out:
2727         unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend,
2728                       &cached_state);
2729 out_only_mutex:
2730         if (!updated_inode && truncated_block && !ret) {
2731                 /*
2732                  * If we only end up zeroing part of a page, we still need to
2733                  * update the inode item, so that all the time fields are
2734                  * updated as well as the necessary btrfs inode in memory fields
2735                  * for detecting, at fsync time, if the inode isn't yet in the
2736                  * log tree or it's there but not up to date.
2737                  */
2738                 struct timespec64 now = inode_set_ctime_current(inode);
2739
2740                 inode_inc_iversion(inode);
2741                 inode->i_mtime = now;
2742                 trans = btrfs_start_transaction(root, 1);
2743                 if (IS_ERR(trans)) {
2744                         ret = PTR_ERR(trans);
2745                 } else {
2746                         int ret2;
2747
2748                         ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
2749                         ret2 = btrfs_end_transaction(trans);
2750                         if (!ret)
2751                                 ret = ret2;
2752                 }
2753         }
2754         btrfs_inode_unlock(BTRFS_I(inode), BTRFS_ILOCK_MMAP);
2755         return ret;
2756 }
2757
2758 /* Helper structure to record which range is already reserved */
2759 struct falloc_range {
2760         struct list_head list;
2761         u64 start;
2762         u64 len;
2763 };
2764
2765 /*
2766  * Helper function to add falloc range
2767  *
2768  * Caller should have locked the larger range of extent containing
2769  * [start, len)
2770  */
2771 static int add_falloc_range(struct list_head *head, u64 start, u64 len)
2772 {
2773         struct falloc_range *range = NULL;
2774
2775         if (!list_empty(head)) {
2776                 /*
2777                  * As fallocate iterates by bytenr order, we only need to check
2778                  * the last range.
2779                  */
2780                 range = list_last_entry(head, struct falloc_range, list);
2781                 if (range->start + range->len == start) {
2782                         range->len += len;
2783                         return 0;
2784                 }
2785         }
2786
2787         range = kmalloc(sizeof(*range), GFP_KERNEL);
2788         if (!range)
2789                 return -ENOMEM;
2790         range->start = start;
2791         range->len = len;
2792         list_add_tail(&range->list, head);
2793         return 0;
2794 }
2795
2796 static int btrfs_fallocate_update_isize(struct inode *inode,
2797                                         const u64 end,
2798                                         const int mode)
2799 {
2800         struct btrfs_trans_handle *trans;
2801         struct btrfs_root *root = BTRFS_I(inode)->root;
2802         int ret;
2803         int ret2;
2804
2805         if (mode & FALLOC_FL_KEEP_SIZE || end <= i_size_read(inode))
2806                 return 0;
2807
2808         trans = btrfs_start_transaction(root, 1);
2809         if (IS_ERR(trans))
2810                 return PTR_ERR(trans);
2811
2812         inode_set_ctime_current(inode);
2813         i_size_write(inode, end);
2814         btrfs_inode_safe_disk_i_size_write(BTRFS_I(inode), 0);
2815         ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
2816         ret2 = btrfs_end_transaction(trans);
2817
2818         return ret ? ret : ret2;
2819 }
2820
2821 enum {
2822         RANGE_BOUNDARY_WRITTEN_EXTENT,
2823         RANGE_BOUNDARY_PREALLOC_EXTENT,
2824         RANGE_BOUNDARY_HOLE,
2825 };
2826
2827 static int btrfs_zero_range_check_range_boundary(struct btrfs_inode *inode,
2828                                                  u64 offset)
2829 {
2830         const u64 sectorsize = inode->root->fs_info->sectorsize;
2831         struct extent_map *em;
2832         int ret;
2833
2834         offset = round_down(offset, sectorsize);
2835         em = btrfs_get_extent(inode, NULL, 0, offset, sectorsize);
2836         if (IS_ERR(em))
2837                 return PTR_ERR(em);
2838
2839         if (em->block_start == EXTENT_MAP_HOLE)
2840                 ret = RANGE_BOUNDARY_HOLE;
2841         else if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
2842                 ret = RANGE_BOUNDARY_PREALLOC_EXTENT;
2843         else
2844                 ret = RANGE_BOUNDARY_WRITTEN_EXTENT;
2845
2846         free_extent_map(em);
2847         return ret;
2848 }
2849
2850 static int btrfs_zero_range(struct inode *inode,
2851                             loff_t offset,
2852                             loff_t len,
2853                             const int mode)
2854 {
2855         struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
2856         struct extent_map *em;
2857         struct extent_changeset *data_reserved = NULL;
2858         int ret;
2859         u64 alloc_hint = 0;
2860         const u64 sectorsize = fs_info->sectorsize;
2861         u64 alloc_start = round_down(offset, sectorsize);
2862         u64 alloc_end = round_up(offset + len, sectorsize);
2863         u64 bytes_to_reserve = 0;
2864         bool space_reserved = false;
2865
2866         em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, alloc_start,
2867                               alloc_end - alloc_start);
2868         if (IS_ERR(em)) {
2869                 ret = PTR_ERR(em);
2870                 goto out;
2871         }
2872
2873         /*
2874          * Avoid hole punching and extent allocation for some cases. More cases
2875          * could be considered, but these are unlikely common and we keep things
2876          * as simple as possible for now. Also, intentionally, if the target
2877          * range contains one or more prealloc extents together with regular
2878          * extents and holes, we drop all the existing extents and allocate a
2879          * new prealloc extent, so that we get a larger contiguous disk extent.
2880          */
2881         if (em->start <= alloc_start &&
2882             test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) {
2883                 const u64 em_end = em->start + em->len;
2884
2885                 if (em_end >= offset + len) {
2886                         /*
2887                          * The whole range is already a prealloc extent,
2888                          * do nothing except updating the inode's i_size if
2889                          * needed.
2890                          */
2891                         free_extent_map(em);
2892                         ret = btrfs_fallocate_update_isize(inode, offset + len,
2893                                                            mode);
2894                         goto out;
2895                 }
2896                 /*
2897                  * Part of the range is already a prealloc extent, so operate
2898                  * only on the remaining part of the range.
2899                  */
2900                 alloc_start = em_end;
2901                 ASSERT(IS_ALIGNED(alloc_start, sectorsize));
2902                 len = offset + len - alloc_start;
2903                 offset = alloc_start;
2904                 alloc_hint = em->block_start + em->len;
2905         }
2906         free_extent_map(em);
2907
2908         if (BTRFS_BYTES_TO_BLKS(fs_info, offset) ==
2909             BTRFS_BYTES_TO_BLKS(fs_info, offset + len - 1)) {
2910                 em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, alloc_start,
2911                                       sectorsize);
2912                 if (IS_ERR(em)) {
2913                         ret = PTR_ERR(em);
2914                         goto out;
2915                 }
2916
2917                 if (test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) {
2918                         free_extent_map(em);
2919                         ret = btrfs_fallocate_update_isize(inode, offset + len,
2920                                                            mode);
2921                         goto out;
2922                 }
2923                 if (len < sectorsize && em->block_start != EXTENT_MAP_HOLE) {
2924                         free_extent_map(em);
2925                         ret = btrfs_truncate_block(BTRFS_I(inode), offset, len,
2926                                                    0);
2927                         if (!ret)
2928                                 ret = btrfs_fallocate_update_isize(inode,
2929                                                                    offset + len,
2930                                                                    mode);
2931                         return ret;
2932                 }
2933                 free_extent_map(em);
2934                 alloc_start = round_down(offset, sectorsize);
2935                 alloc_end = alloc_start + sectorsize;
2936                 goto reserve_space;
2937         }
2938
2939         alloc_start = round_up(offset, sectorsize);
2940         alloc_end = round_down(offset + len, sectorsize);
2941
2942         /*
2943          * For unaligned ranges, check the pages at the boundaries, they might
2944          * map to an extent, in which case we need to partially zero them, or
2945          * they might map to a hole, in which case we need our allocation range
2946          * to cover them.
2947          */
2948         if (!IS_ALIGNED(offset, sectorsize)) {
2949                 ret = btrfs_zero_range_check_range_boundary(BTRFS_I(inode),
2950                                                             offset);
2951                 if (ret < 0)
2952                         goto out;
2953                 if (ret == RANGE_BOUNDARY_HOLE) {
2954                         alloc_start = round_down(offset, sectorsize);
2955                         ret = 0;
2956                 } else if (ret == RANGE_BOUNDARY_WRITTEN_EXTENT) {
2957                         ret = btrfs_truncate_block(BTRFS_I(inode), offset, 0, 0);
2958                         if (ret)
2959                                 goto out;
2960                 } else {
2961                         ret = 0;
2962                 }
2963         }
2964
2965         if (!IS_ALIGNED(offset + len, sectorsize)) {
2966                 ret = btrfs_zero_range_check_range_boundary(BTRFS_I(inode),
2967                                                             offset + len);
2968                 if (ret < 0)
2969                         goto out;
2970                 if (ret == RANGE_BOUNDARY_HOLE) {
2971                         alloc_end = round_up(offset + len, sectorsize);
2972                         ret = 0;
2973                 } else if (ret == RANGE_BOUNDARY_WRITTEN_EXTENT) {
2974                         ret = btrfs_truncate_block(BTRFS_I(inode), offset + len,
2975                                                    0, 1);
2976                         if (ret)
2977                                 goto out;
2978                 } else {
2979                         ret = 0;
2980                 }
2981         }
2982
2983 reserve_space:
2984         if (alloc_start < alloc_end) {
2985                 struct extent_state *cached_state = NULL;
2986                 const u64 lockstart = alloc_start;
2987                 const u64 lockend = alloc_end - 1;
2988
2989                 bytes_to_reserve = alloc_end - alloc_start;
2990                 ret = btrfs_alloc_data_chunk_ondemand(BTRFS_I(inode),
2991                                                       bytes_to_reserve);
2992                 if (ret < 0)
2993                         goto out;
2994                 space_reserved = true;
2995                 btrfs_punch_hole_lock_range(inode, lockstart, lockend,
2996                                             &cached_state);
2997                 ret = btrfs_qgroup_reserve_data(BTRFS_I(inode), &data_reserved,
2998                                                 alloc_start, bytes_to_reserve);
2999                 if (ret) {
3000                         unlock_extent(&BTRFS_I(inode)->io_tree, lockstart,
3001                                       lockend, &cached_state);
3002                         goto out;
3003                 }
3004                 ret = btrfs_prealloc_file_range(inode, mode, alloc_start,
3005                                                 alloc_end - alloc_start,
3006                                                 i_blocksize(inode),
3007                                                 offset + len, &alloc_hint);
3008                 unlock_extent(&BTRFS_I(inode)->io_tree, lockstart, lockend,
3009                               &cached_state);
3010                 /* btrfs_prealloc_file_range releases reserved space on error */
3011                 if (ret) {
3012                         space_reserved = false;
3013                         goto out;
3014                 }
3015         }
3016         ret = btrfs_fallocate_update_isize(inode, offset + len, mode);
3017  out:
3018         if (ret && space_reserved)
3019                 btrfs_free_reserved_data_space(BTRFS_I(inode), data_reserved,
3020                                                alloc_start, bytes_to_reserve);
3021         extent_changeset_free(data_reserved);
3022
3023         return ret;
3024 }
3025
3026 static long btrfs_fallocate(struct file *file, int mode,
3027                             loff_t offset, loff_t len)
3028 {
3029         struct inode *inode = file_inode(file);
3030         struct extent_state *cached_state = NULL;
3031         struct extent_changeset *data_reserved = NULL;
3032         struct falloc_range *range;
3033         struct falloc_range *tmp;
3034         LIST_HEAD(reserve_list);
3035         u64 cur_offset;
3036         u64 last_byte;
3037         u64 alloc_start;
3038         u64 alloc_end;
3039         u64 alloc_hint = 0;
3040         u64 locked_end;
3041         u64 actual_end = 0;
3042         u64 data_space_needed = 0;
3043         u64 data_space_reserved = 0;
3044         u64 qgroup_reserved = 0;
3045         struct extent_map *em;
3046         int blocksize = BTRFS_I(inode)->root->fs_info->sectorsize;
3047         int ret;
3048
3049         /* Do not allow fallocate in ZONED mode */
3050         if (btrfs_is_zoned(btrfs_sb(inode->i_sb)))
3051                 return -EOPNOTSUPP;
3052
3053         alloc_start = round_down(offset, blocksize);
3054         alloc_end = round_up(offset + len, blocksize);
3055         cur_offset = alloc_start;
3056
3057         /* Make sure we aren't being give some crap mode */
3058         if (mode & ~(FALLOC_FL_KEEP_SIZE | FALLOC_FL_PUNCH_HOLE |
3059                      FALLOC_FL_ZERO_RANGE))
3060                 return -EOPNOTSUPP;
3061
3062         if (mode & FALLOC_FL_PUNCH_HOLE)
3063                 return btrfs_punch_hole(file, offset, len);
3064
3065         btrfs_inode_lock(BTRFS_I(inode), BTRFS_ILOCK_MMAP);
3066
3067         if (!(mode & FALLOC_FL_KEEP_SIZE) && offset + len > inode->i_size) {
3068                 ret = inode_newsize_ok(inode, offset + len);
3069                 if (ret)
3070                         goto out;
3071         }
3072
3073         ret = file_modified(file);
3074         if (ret)
3075                 goto out;
3076
3077         /*
3078          * TODO: Move these two operations after we have checked
3079          * accurate reserved space, or fallocate can still fail but
3080          * with page truncated or size expanded.
3081          *
3082          * But that's a minor problem and won't do much harm BTW.
3083          */
3084         if (alloc_start > inode->i_size) {
3085                 ret = btrfs_cont_expand(BTRFS_I(inode), i_size_read(inode),
3086                                         alloc_start);
3087                 if (ret)
3088                         goto out;
3089         } else if (offset + len > inode->i_size) {
3090                 /*
3091                  * If we are fallocating from the end of the file onward we
3092                  * need to zero out the end of the block if i_size lands in the
3093                  * middle of a block.
3094                  */
3095                 ret = btrfs_truncate_block(BTRFS_I(inode), inode->i_size, 0, 0);
3096                 if (ret)
3097                         goto out;
3098         }
3099
3100         /*
3101          * We have locked the inode at the VFS level (in exclusive mode) and we
3102          * have locked the i_mmap_lock lock (in exclusive mode). Now before
3103          * locking the file range, flush all dealloc in the range and wait for
3104          * all ordered extents in the range to complete. After this we can lock
3105          * the file range and, due to the previous locking we did, we know there
3106          * can't be more delalloc or ordered extents in the range.
3107          */
3108         ret = btrfs_wait_ordered_range(inode, alloc_start,
3109                                        alloc_end - alloc_start);
3110         if (ret)
3111                 goto out;
3112
3113         if (mode & FALLOC_FL_ZERO_RANGE) {
3114                 ret = btrfs_zero_range(inode, offset, len, mode);
3115                 btrfs_inode_unlock(BTRFS_I(inode), BTRFS_ILOCK_MMAP);
3116                 return ret;
3117         }
3118
3119         locked_end = alloc_end - 1;
3120         lock_extent(&BTRFS_I(inode)->io_tree, alloc_start, locked_end,
3121                     &cached_state);
3122
3123         btrfs_assert_inode_range_clean(BTRFS_I(inode), alloc_start, locked_end);
3124
3125         /* First, check if we exceed the qgroup limit */
3126         while (cur_offset < alloc_end) {
3127                 em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, cur_offset,
3128                                       alloc_end - cur_offset);
3129                 if (IS_ERR(em)) {
3130                         ret = PTR_ERR(em);
3131                         break;
3132                 }
3133                 last_byte = min(extent_map_end(em), alloc_end);
3134                 actual_end = min_t(u64, extent_map_end(em), offset + len);
3135                 last_byte = ALIGN(last_byte, blocksize);
3136                 if (em->block_start == EXTENT_MAP_HOLE ||
3137                     (cur_offset >= inode->i_size &&
3138                      !test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) {
3139                         const u64 range_len = last_byte - cur_offset;
3140
3141                         ret = add_falloc_range(&reserve_list, cur_offset, range_len);
3142                         if (ret < 0) {
3143                                 free_extent_map(em);
3144                                 break;
3145                         }
3146                         ret = btrfs_qgroup_reserve_data(BTRFS_I(inode),
3147                                         &data_reserved, cur_offset, range_len);
3148                         if (ret < 0) {
3149                                 free_extent_map(em);
3150                                 break;
3151                         }
3152                         qgroup_reserved += range_len;
3153                         data_space_needed += range_len;
3154                 }
3155                 free_extent_map(em);
3156                 cur_offset = last_byte;
3157         }
3158
3159         if (!ret && data_space_needed > 0) {
3160                 /*
3161                  * We are safe to reserve space here as we can't have delalloc
3162                  * in the range, see above.
3163                  */
3164                 ret = btrfs_alloc_data_chunk_ondemand(BTRFS_I(inode),
3165                                                       data_space_needed);
3166                 if (!ret)
3167                         data_space_reserved = data_space_needed;
3168         }
3169
3170         /*
3171          * If ret is still 0, means we're OK to fallocate.
3172          * Or just cleanup the list and exit.
3173          */
3174         list_for_each_entry_safe(range, tmp, &reserve_list, list) {
3175                 if (!ret) {
3176                         ret = btrfs_prealloc_file_range(inode, mode,
3177                                         range->start,
3178                                         range->len, i_blocksize(inode),
3179                                         offset + len, &alloc_hint);
3180                         /*
3181                          * btrfs_prealloc_file_range() releases space even
3182                          * if it returns an error.
3183                          */
3184                         data_space_reserved -= range->len;
3185                         qgroup_reserved -= range->len;
3186                 } else if (data_space_reserved > 0) {
3187                         btrfs_free_reserved_data_space(BTRFS_I(inode),
3188                                                data_reserved, range->start,
3189                                                range->len);
3190                         data_space_reserved -= range->len;
3191                         qgroup_reserved -= range->len;
3192                 } else if (qgroup_reserved > 0) {
3193                         btrfs_qgroup_free_data(BTRFS_I(inode), data_reserved,
3194                                                range->start, range->len);
3195                         qgroup_reserved -= range->len;
3196                 }
3197                 list_del(&range->list);
3198                 kfree(range);
3199         }
3200         if (ret < 0)
3201                 goto out_unlock;
3202
3203         /*
3204          * We didn't need to allocate any more space, but we still extended the
3205          * size of the file so we need to update i_size and the inode item.
3206          */
3207         ret = btrfs_fallocate_update_isize(inode, actual_end, mode);
3208 out_unlock:
3209         unlock_extent(&BTRFS_I(inode)->io_tree, alloc_start, locked_end,
3210                       &cached_state);
3211 out:
3212         btrfs_inode_unlock(BTRFS_I(inode), BTRFS_ILOCK_MMAP);
3213         extent_changeset_free(data_reserved);
3214         return ret;
3215 }
3216
3217 /*
3218  * Helper for btrfs_find_delalloc_in_range(). Find a subrange in a given range
3219  * that has unflushed and/or flushing delalloc. There might be other adjacent
3220  * subranges after the one it found, so btrfs_find_delalloc_in_range() keeps
3221  * looping while it gets adjacent subranges, and merging them together.
3222  */
3223 static bool find_delalloc_subrange(struct btrfs_inode *inode, u64 start, u64 end,
3224                                    struct extent_state **cached_state,
3225                                    bool *search_io_tree,
3226                                    u64 *delalloc_start_ret, u64 *delalloc_end_ret)
3227 {
3228         u64 len = end + 1 - start;
3229         u64 delalloc_len = 0;
3230         struct btrfs_ordered_extent *oe;
3231         u64 oe_start;
3232         u64 oe_end;
3233
3234         /*
3235          * Search the io tree first for EXTENT_DELALLOC. If we find any, it
3236          * means we have delalloc (dirty pages) for which writeback has not
3237          * started yet.
3238          */
3239         if (*search_io_tree) {
3240                 spin_lock(&inode->lock);
3241                 if (inode->delalloc_bytes > 0) {
3242                         spin_unlock(&inode->lock);
3243                         *delalloc_start_ret = start;
3244                         delalloc_len = count_range_bits(&inode->io_tree,
3245                                                         delalloc_start_ret, end,
3246                                                         len, EXTENT_DELALLOC, 1,
3247                                                         cached_state);
3248                 } else {
3249                         spin_unlock(&inode->lock);
3250                 }
3251         }
3252
3253         if (delalloc_len > 0) {
3254                 /*
3255                  * If delalloc was found then *delalloc_start_ret has a sector size
3256                  * aligned value (rounded down).
3257                  */
3258                 *delalloc_end_ret = *delalloc_start_ret + delalloc_len - 1;
3259
3260                 if (*delalloc_start_ret == start) {
3261                         /* Delalloc for the whole range, nothing more to do. */
3262                         if (*delalloc_end_ret == end)
3263                                 return true;
3264                         /* Else trim our search range for ordered extents. */
3265                         start = *delalloc_end_ret + 1;
3266                         len = end + 1 - start;
3267                 }
3268         } else {
3269                 /* No delalloc, future calls don't need to search again. */
3270                 *search_io_tree = false;
3271         }
3272
3273         /*
3274          * Now also check if there's any ordered extent in the range.
3275          * We do this because:
3276          *
3277          * 1) When delalloc is flushed, the file range is locked, we clear the
3278          *    EXTENT_DELALLOC bit from the io tree and create an extent map and
3279          *    an ordered extent for the write. So we might just have been called
3280          *    after delalloc is flushed and before the ordered extent completes
3281          *    and inserts the new file extent item in the subvolume's btree;
3282          *
3283          * 2) We may have an ordered extent created by flushing delalloc for a
3284          *    subrange that starts before the subrange we found marked with
3285          *    EXTENT_DELALLOC in the io tree.
3286          *
3287          * We could also use the extent map tree to find such delalloc that is
3288          * being flushed, but using the ordered extents tree is more efficient
3289          * because it's usually much smaller as ordered extents are removed from
3290          * the tree once they complete. With the extent maps, we mau have them
3291          * in the extent map tree for a very long time, and they were either
3292          * created by previous writes or loaded by read operations.
3293          */
3294         oe = btrfs_lookup_first_ordered_range(inode, start, len);
3295         if (!oe)
3296                 return (delalloc_len > 0);
3297
3298         /* The ordered extent may span beyond our search range. */
3299         oe_start = max(oe->file_offset, start);
3300         oe_end = min(oe->file_offset + oe->num_bytes - 1, end);
3301
3302         btrfs_put_ordered_extent(oe);
3303
3304         /* Don't have unflushed delalloc, return the ordered extent range. */
3305         if (delalloc_len == 0) {
3306                 *delalloc_start_ret = oe_start;
3307                 *delalloc_end_ret = oe_end;
3308                 return true;
3309         }
3310
3311         /*
3312          * We have both unflushed delalloc (io_tree) and an ordered extent.
3313          * If the ranges are adjacent returned a combined range, otherwise
3314          * return the leftmost range.
3315          */
3316         if (oe_start < *delalloc_start_ret) {
3317                 if (oe_end < *delalloc_start_ret)
3318                         *delalloc_end_ret = oe_end;
3319                 *delalloc_start_ret = oe_start;
3320         } else if (*delalloc_end_ret + 1 == oe_start) {
3321                 *delalloc_end_ret = oe_end;
3322         }
3323
3324         return true;
3325 }
3326
3327 /*
3328  * Check if there's delalloc in a given range.
3329  *
3330  * @inode:               The inode.
3331  * @start:               The start offset of the range. It does not need to be
3332  *                       sector size aligned.
3333  * @end:                 The end offset (inclusive value) of the search range.
3334  *                       It does not need to be sector size aligned.
3335  * @cached_state:        Extent state record used for speeding up delalloc
3336  *                       searches in the inode's io_tree. Can be NULL.
3337  * @delalloc_start_ret:  Output argument, set to the start offset of the
3338  *                       subrange found with delalloc (may not be sector size
3339  *                       aligned).
3340  * @delalloc_end_ret:    Output argument, set to he end offset (inclusive value)
3341  *                       of the subrange found with delalloc.
3342  *
3343  * Returns true if a subrange with delalloc is found within the given range, and
3344  * if so it sets @delalloc_start_ret and @delalloc_end_ret with the start and
3345  * end offsets of the subrange.
3346  */
3347 bool btrfs_find_delalloc_in_range(struct btrfs_inode *inode, u64 start, u64 end,
3348                                   struct extent_state **cached_state,
3349                                   u64 *delalloc_start_ret, u64 *delalloc_end_ret)
3350 {
3351         u64 cur_offset = round_down(start, inode->root->fs_info->sectorsize);
3352         u64 prev_delalloc_end = 0;
3353         bool search_io_tree = true;
3354         bool ret = false;
3355
3356         while (cur_offset <= end) {
3357                 u64 delalloc_start;
3358                 u64 delalloc_end;
3359                 bool delalloc;
3360
3361                 delalloc = find_delalloc_subrange(inode, cur_offset, end,
3362                                                   cached_state, &search_io_tree,
3363                                                   &delalloc_start,
3364                                                   &delalloc_end);
3365                 if (!delalloc)
3366                         break;
3367
3368                 if (prev_delalloc_end == 0) {
3369                         /* First subrange found. */
3370                         *delalloc_start_ret = max(delalloc_start, start);
3371                         *delalloc_end_ret = delalloc_end;
3372                         ret = true;
3373                 } else if (delalloc_start == prev_delalloc_end + 1) {
3374                         /* Subrange adjacent to the previous one, merge them. */
3375                         *delalloc_end_ret = delalloc_end;
3376                 } else {
3377                         /* Subrange not adjacent to the previous one, exit. */
3378                         break;
3379                 }
3380
3381                 prev_delalloc_end = delalloc_end;
3382                 cur_offset = delalloc_end + 1;
3383                 cond_resched();
3384         }
3385
3386         return ret;
3387 }
3388
3389 /*
3390  * Check if there's a hole or delalloc range in a range representing a hole (or
3391  * prealloc extent) found in the inode's subvolume btree.
3392  *
3393  * @inode:      The inode.
3394  * @whence:     Seek mode (SEEK_DATA or SEEK_HOLE).
3395  * @start:      Start offset of the hole region. It does not need to be sector
3396  *              size aligned.
3397  * @end:        End offset (inclusive value) of the hole region. It does not
3398  *              need to be sector size aligned.
3399  * @start_ret:  Return parameter, used to set the start of the subrange in the
3400  *              hole that matches the search criteria (seek mode), if such
3401  *              subrange is found (return value of the function is true).
3402  *              The value returned here may not be sector size aligned.
3403  *
3404  * Returns true if a subrange matching the given seek mode is found, and if one
3405  * is found, it updates @start_ret with the start of the subrange.
3406  */
3407 static bool find_desired_extent_in_hole(struct btrfs_inode *inode, int whence,
3408                                         struct extent_state **cached_state,
3409                                         u64 start, u64 end, u64 *start_ret)
3410 {
3411         u64 delalloc_start;
3412         u64 delalloc_end;
3413         bool delalloc;
3414
3415         delalloc = btrfs_find_delalloc_in_range(inode, start, end, cached_state,
3416                                                 &delalloc_start, &delalloc_end);
3417         if (delalloc && whence == SEEK_DATA) {
3418                 *start_ret = delalloc_start;
3419                 return true;
3420         }
3421
3422         if (delalloc && whence == SEEK_HOLE) {
3423                 /*
3424                  * We found delalloc but it starts after out start offset. So we
3425                  * have a hole between our start offset and the delalloc start.
3426                  */
3427                 if (start < delalloc_start) {
3428                         *start_ret = start;
3429                         return true;
3430                 }
3431                 /*
3432                  * Delalloc range starts at our start offset.
3433                  * If the delalloc range's length is smaller than our range,
3434                  * then it means we have a hole that starts where the delalloc
3435                  * subrange ends.
3436                  */
3437                 if (delalloc_end < end) {
3438                         *start_ret = delalloc_end + 1;
3439                         return true;
3440                 }
3441
3442                 /* There's delalloc for the whole range. */
3443                 return false;
3444         }
3445
3446         if (!delalloc && whence == SEEK_HOLE) {
3447                 *start_ret = start;
3448                 return true;
3449         }
3450
3451         /*
3452          * No delalloc in the range and we are seeking for data. The caller has
3453          * to iterate to the next extent item in the subvolume btree.
3454          */
3455         return false;
3456 }
3457
3458 static loff_t find_desired_extent(struct file *file, loff_t offset, int whence)
3459 {
3460         struct btrfs_inode *inode = BTRFS_I(file->f_mapping->host);
3461         struct btrfs_file_private *private = file->private_data;
3462         struct btrfs_fs_info *fs_info = inode->root->fs_info;
3463         struct extent_state *cached_state = NULL;
3464         struct extent_state **delalloc_cached_state;
3465         const loff_t i_size = i_size_read(&inode->vfs_inode);
3466         const u64 ino = btrfs_ino(inode);
3467         struct btrfs_root *root = inode->root;
3468         struct btrfs_path *path;
3469         struct btrfs_key key;
3470         u64 last_extent_end;
3471         u64 lockstart;
3472         u64 lockend;
3473         u64 start;
3474         int ret;
3475         bool found = false;
3476
3477         if (i_size == 0 || offset >= i_size)
3478                 return -ENXIO;
3479
3480         /*
3481          * Quick path. If the inode has no prealloc extents and its number of
3482          * bytes used matches its i_size, then it can not have holes.
3483          */
3484         if (whence == SEEK_HOLE &&
3485             !(inode->flags & BTRFS_INODE_PREALLOC) &&
3486             inode_get_bytes(&inode->vfs_inode) == i_size)
3487                 return i_size;
3488
3489         if (!private) {
3490                 private = kzalloc(sizeof(*private), GFP_KERNEL);
3491                 /*
3492                  * No worries if memory allocation failed.
3493                  * The private structure is used only for speeding up multiple
3494                  * lseek SEEK_HOLE/DATA calls to a file when there's delalloc,
3495                  * so everything will still be correct.
3496                  */
3497                 file->private_data = private;
3498         }
3499
3500         if (private)
3501                 delalloc_cached_state = &private->llseek_cached_state;
3502         else
3503                 delalloc_cached_state = NULL;
3504
3505         /*
3506          * offset can be negative, in this case we start finding DATA/HOLE from
3507          * the very start of the file.
3508          */
3509         start = max_t(loff_t, 0, offset);
3510
3511         lockstart = round_down(start, fs_info->sectorsize);
3512         lockend = round_up(i_size, fs_info->sectorsize);
3513         if (lockend <= lockstart)
3514                 lockend = lockstart + fs_info->sectorsize;
3515         lockend--;
3516
3517         path = btrfs_alloc_path();
3518         if (!path)
3519                 return -ENOMEM;
3520         path->reada = READA_FORWARD;
3521
3522         key.objectid = ino;
3523         key.type = BTRFS_EXTENT_DATA_KEY;
3524         key.offset = start;
3525
3526         last_extent_end = lockstart;
3527
3528         lock_extent(&inode->io_tree, lockstart, lockend, &cached_state);
3529
3530         ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
3531         if (ret < 0) {
3532                 goto out;
3533         } else if (ret > 0 && path->slots[0] > 0) {
3534                 btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0] - 1);
3535                 if (key.objectid == ino && key.type == BTRFS_EXTENT_DATA_KEY)
3536                         path->slots[0]--;
3537         }
3538
3539         while (start < i_size) {
3540                 struct extent_buffer *leaf = path->nodes[0];
3541                 struct btrfs_file_extent_item *extent;
3542                 u64 extent_end;
3543                 u8 type;
3544
3545                 if (path->slots[0] >= btrfs_header_nritems(leaf)) {
3546                         ret = btrfs_next_leaf(root, path);
3547                         if (ret < 0)
3548                                 goto out;
3549                         else if (ret > 0)
3550                                 break;
3551
3552                         leaf = path->nodes[0];
3553                 }
3554
3555                 btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
3556                 if (key.objectid != ino || key.type != BTRFS_EXTENT_DATA_KEY)
3557                         break;
3558
3559                 extent_end = btrfs_file_extent_end(path);
3560
3561                 /*
3562                  * In the first iteration we may have a slot that points to an
3563                  * extent that ends before our start offset, so skip it.
3564                  */
3565                 if (extent_end <= start) {
3566                         path->slots[0]++;
3567                         continue;
3568                 }
3569
3570                 /* We have an implicit hole, NO_HOLES feature is likely set. */
3571                 if (last_extent_end < key.offset) {
3572                         u64 search_start = last_extent_end;
3573                         u64 found_start;
3574
3575                         /*
3576                          * First iteration, @start matches @offset and it's
3577                          * within the hole.
3578                          */
3579                         if (start == offset)
3580                                 search_start = offset;
3581
3582                         found = find_desired_extent_in_hole(inode, whence,
3583                                                             delalloc_cached_state,
3584                                                             search_start,
3585                                                             key.offset - 1,
3586                                                             &found_start);
3587                         if (found) {
3588                                 start = found_start;
3589                                 break;
3590                         }
3591                         /*
3592                          * Didn't find data or a hole (due to delalloc) in the
3593                          * implicit hole range, so need to analyze the extent.
3594                          */
3595                 }
3596
3597                 extent = btrfs_item_ptr(leaf, path->slots[0],
3598                                         struct btrfs_file_extent_item);
3599                 type = btrfs_file_extent_type(leaf, extent);
3600
3601                 /*
3602                  * Can't access the extent's disk_bytenr field if this is an
3603                  * inline extent, since at that offset, it's where the extent
3604                  * data starts.
3605                  */
3606                 if (type == BTRFS_FILE_EXTENT_PREALLOC ||
3607                     (type == BTRFS_FILE_EXTENT_REG &&
3608                      btrfs_file_extent_disk_bytenr(leaf, extent) == 0)) {
3609                         /*
3610                          * Explicit hole or prealloc extent, search for delalloc.
3611                          * A prealloc extent is treated like a hole.
3612                          */
3613                         u64 search_start = key.offset;
3614                         u64 found_start;
3615
3616                         /*
3617                          * First iteration, @start matches @offset and it's
3618                          * within the hole.
3619                          */
3620                         if (start == offset)
3621                                 search_start = offset;
3622
3623                         found = find_desired_extent_in_hole(inode, whence,
3624                                                             delalloc_cached_state,
3625                                                             search_start,
3626                                                             extent_end - 1,
3627                                                             &found_start);
3628                         if (found) {
3629                                 start = found_start;
3630                                 break;
3631                         }
3632                         /*
3633                          * Didn't find data or a hole (due to delalloc) in the
3634                          * implicit hole range, so need to analyze the next
3635                          * extent item.
3636                          */
3637                 } else {
3638                         /*
3639                          * Found a regular or inline extent.
3640                          * If we are seeking for data, adjust the start offset
3641                          * and stop, we're done.
3642                          */
3643                         if (whence == SEEK_DATA) {
3644                                 start = max_t(u64, key.offset, offset);
3645                                 found = true;
3646                                 break;
3647                         }
3648                         /*
3649                          * Else, we are seeking for a hole, check the next file
3650                          * extent item.
3651                          */
3652                 }
3653
3654                 start = extent_end;
3655                 last_extent_end = extent_end;
3656                 path->slots[0]++;
3657                 if (fatal_signal_pending(current)) {
3658                         ret = -EINTR;
3659                         goto out;
3660                 }
3661                 cond_resched();
3662         }
3663
3664         /* We have an implicit hole from the last extent found up to i_size. */
3665         if (!found && start < i_size) {
3666                 found = find_desired_extent_in_hole(inode, whence,
3667                                                     delalloc_cached_state, start,
3668                                                     i_size - 1, &start);
3669                 if (!found)
3670                         start = i_size;
3671         }
3672
3673 out:
3674         unlock_extent(&inode->io_tree, lockstart, lockend, &cached_state);
3675         btrfs_free_path(path);
3676
3677         if (ret < 0)
3678                 return ret;
3679
3680         if (whence == SEEK_DATA && start >= i_size)
3681                 return -ENXIO;
3682
3683         return min_t(loff_t, start, i_size);
3684 }
3685
3686 static loff_t btrfs_file_llseek(struct file *file, loff_t offset, int whence)
3687 {
3688         struct inode *inode = file->f_mapping->host;
3689
3690         switch (whence) {
3691         default:
3692                 return generic_file_llseek(file, offset, whence);
3693         case SEEK_DATA:
3694         case SEEK_HOLE:
3695                 btrfs_inode_lock(BTRFS_I(inode), BTRFS_ILOCK_SHARED);
3696                 offset = find_desired_extent(file, offset, whence);
3697                 btrfs_inode_unlock(BTRFS_I(inode), BTRFS_ILOCK_SHARED);
3698                 break;
3699         }
3700
3701         if (offset < 0)
3702                 return offset;
3703
3704         return vfs_setpos(file, offset, inode->i_sb->s_maxbytes);
3705 }
3706
3707 static int btrfs_file_open(struct inode *inode, struct file *filp)
3708 {
3709         int ret;
3710
3711         filp->f_mode |= FMODE_NOWAIT | FMODE_BUF_RASYNC | FMODE_BUF_WASYNC |
3712                         FMODE_CAN_ODIRECT;
3713
3714         ret = fsverity_file_open(inode, filp);
3715         if (ret)
3716                 return ret;
3717         return generic_file_open(inode, filp);
3718 }
3719
3720 static int check_direct_read(struct btrfs_fs_info *fs_info,
3721                              const struct iov_iter *iter, loff_t offset)
3722 {
3723         int ret;
3724         int i, seg;
3725
3726         ret = check_direct_IO(fs_info, iter, offset);
3727         if (ret < 0)
3728                 return ret;
3729
3730         if (!iter_is_iovec(iter))
3731                 return 0;
3732
3733         for (seg = 0; seg < iter->nr_segs; seg++) {
3734                 for (i = seg + 1; i < iter->nr_segs; i++) {
3735                         const struct iovec *iov1 = iter_iov(iter) + seg;
3736                         const struct iovec *iov2 = iter_iov(iter) + i;
3737
3738                         if (iov1->iov_base == iov2->iov_base)
3739                                 return -EINVAL;
3740                 }
3741         }
3742         return 0;
3743 }
3744
3745 static ssize_t btrfs_direct_read(struct kiocb *iocb, struct iov_iter *to)
3746 {
3747         struct inode *inode = file_inode(iocb->ki_filp);
3748         size_t prev_left = 0;
3749         ssize_t read = 0;
3750         ssize_t ret;
3751
3752         if (fsverity_active(inode))
3753                 return 0;
3754
3755         if (check_direct_read(btrfs_sb(inode->i_sb), to, iocb->ki_pos))
3756                 return 0;
3757
3758         btrfs_inode_lock(BTRFS_I(inode), BTRFS_ILOCK_SHARED);
3759 again:
3760         /*
3761          * This is similar to what we do for direct IO writes, see the comment
3762          * at btrfs_direct_write(), but we also disable page faults in addition
3763          * to disabling them only at the iov_iter level. This is because when
3764          * reading from a hole or prealloc extent, iomap calls iov_iter_zero(),
3765          * which can still trigger page fault ins despite having set ->nofault
3766          * to true of our 'to' iov_iter.
3767          *
3768          * The difference to direct IO writes is that we deadlock when trying
3769          * to lock the extent range in the inode's tree during he page reads
3770          * triggered by the fault in (while for writes it is due to waiting for
3771          * our own ordered extent). This is because for direct IO reads,
3772          * btrfs_dio_iomap_begin() returns with the extent range locked, which
3773          * is only unlocked in the endio callback (end_bio_extent_readpage()).
3774          */
3775         pagefault_disable();
3776         to->nofault = true;
3777         ret = btrfs_dio_read(iocb, to, read);
3778         to->nofault = false;
3779         pagefault_enable();
3780
3781         /* No increment (+=) because iomap returns a cumulative value. */
3782         if (ret > 0)
3783                 read = ret;
3784
3785         if (iov_iter_count(to) > 0 && (ret == -EFAULT || ret > 0)) {
3786                 const size_t left = iov_iter_count(to);
3787
3788                 if (left == prev_left) {
3789                         /*
3790                          * We didn't make any progress since the last attempt,
3791                          * fallback to a buffered read for the remainder of the
3792                          * range. This is just to avoid any possibility of looping
3793                          * for too long.
3794                          */
3795                         ret = read;
3796                 } else {
3797                         /*
3798                          * We made some progress since the last retry or this is
3799                          * the first time we are retrying. Fault in as many pages
3800                          * as possible and retry.
3801                          */
3802                         fault_in_iov_iter_writeable(to, left);
3803                         prev_left = left;
3804                         goto again;
3805                 }
3806         }
3807         btrfs_inode_unlock(BTRFS_I(inode), BTRFS_ILOCK_SHARED);
3808         return ret < 0 ? ret : read;
3809 }
3810
3811 static ssize_t btrfs_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
3812 {
3813         ssize_t ret = 0;
3814
3815         if (iocb->ki_flags & IOCB_DIRECT) {
3816                 ret = btrfs_direct_read(iocb, to);
3817                 if (ret < 0 || !iov_iter_count(to) ||
3818                     iocb->ki_pos >= i_size_read(file_inode(iocb->ki_filp)))
3819                         return ret;
3820         }
3821
3822         return filemap_read(iocb, to, ret);
3823 }
3824
3825 const struct file_operations btrfs_file_operations = {
3826         .llseek         = btrfs_file_llseek,
3827         .read_iter      = btrfs_file_read_iter,
3828         .splice_read    = filemap_splice_read,
3829         .write_iter     = btrfs_file_write_iter,
3830         .splice_write   = iter_file_splice_write,
3831         .mmap           = btrfs_file_mmap,
3832         .open           = btrfs_file_open,
3833         .release        = btrfs_release_file,
3834         .get_unmapped_area = thp_get_unmapped_area,
3835         .fsync          = btrfs_sync_file,
3836         .fallocate      = btrfs_fallocate,
3837         .unlocked_ioctl = btrfs_ioctl,
3838 #ifdef CONFIG_COMPAT
3839         .compat_ioctl   = btrfs_compat_ioctl,
3840 #endif
3841         .remap_file_range = btrfs_remap_file_range,
3842 };
3843
3844 int btrfs_fdatawrite_range(struct inode *inode, loff_t start, loff_t end)
3845 {
3846         int ret;
3847
3848         /*
3849          * So with compression we will find and lock a dirty page and clear the
3850          * first one as dirty, setup an async extent, and immediately return
3851          * with the entire range locked but with nobody actually marked with
3852          * writeback.  So we can't just filemap_write_and_wait_range() and
3853          * expect it to work since it will just kick off a thread to do the
3854          * actual work.  So we need to call filemap_fdatawrite_range _again_
3855          * since it will wait on the page lock, which won't be unlocked until
3856          * after the pages have been marked as writeback and so we're good to go
3857          * from there.  We have to do this otherwise we'll miss the ordered
3858          * extents and that results in badness.  Please Josef, do not think you
3859          * know better and pull this out at some point in the future, it is
3860          * right and you are wrong.
3861          */
3862         ret = filemap_fdatawrite_range(inode->i_mapping, start, end);
3863         if (!ret && test_bit(BTRFS_INODE_HAS_ASYNC_EXTENT,
3864                              &BTRFS_I(inode)->runtime_flags))
3865                 ret = filemap_fdatawrite_range(inode->i_mapping, start, end);
3866
3867         return ret;
3868 }