diff --git a/fs/btrfs/inode.c b/fs/btrfs/inode.c
index 05e0c4a5affda0268cbce2260ff84998d862266e..1372210869b14cda075d3e10fb583ac98d2d4a07 100644 (file)
@@ -114,21 +114,17 @@ struct kmem_cache *btrfs_free_space_bitmap_cachep;
 
 static int btrfs_setsize(struct inode *inode, struct iattr *attr);
 static int btrfs_truncate(struct inode *inode, bool skip_writeback);
-static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent);
 static noinline int cow_file_range(struct btrfs_inode *inode,
                                   struct page *locked_page,
                                   u64 start, u64 end, int *page_started,
-                                  unsigned long *nr_written, int unlock);
+                                  unsigned long *nr_written, int unlock,
+                                  u64 *done_offset);
 static struct extent_map *create_io_em(struct btrfs_inode *inode, u64 start,
                                       u64 len, u64 orig_start, u64 block_start,
                                       u64 block_len, u64 orig_block_len,
                                       u64 ram_bytes, int compress_type,
                                       int type);
 
-static void __endio_write_update_ordered(struct btrfs_inode *inode,
-                                        const u64 offset, const u64 bytes,
-                                        const bool uptodate);
-
 /*
  * btrfs_inode_lock - lock inode i_rwsem based on arguments passed
  *
@@ -195,11 +191,14 @@ static inline void btrfs_cleanup_ordered_extents(struct btrfs_inode *inode,
 {
        unsigned long index = offset >> PAGE_SHIFT;
        unsigned long end_index = (offset + bytes - 1) >> PAGE_SHIFT;
-       u64 page_start = page_offset(locked_page);
-       u64 page_end = page_start + PAGE_SIZE - 1;
-
+       u64 page_start, page_end;
        struct page *page;
 
+       if (locked_page) {
+               page_start = page_offset(locked_page);
+               page_end = page_start + PAGE_SIZE - 1;
+       }
+
        while (index <= end_index) {
                /*
                 * For locked page, we will call end_extent_writepage() on it
@@ -212,7 +211,7 @@ static inline void btrfs_cleanup_ordered_extents(struct btrfs_inode *inode,
                 * btrfs_mark_ordered_io_finished() would skip the accounting
                 * for the page range, and the ordered extent will never finish.
                 */
-               if (index == (page_offset(locked_page) >> PAGE_SHIFT)) {
+               if (locked_page && index == (page_start >> PAGE_SHIFT)) {
                        index++;
                        continue;
                }
@@ -223,7 +222,7 @@ static inline void btrfs_cleanup_ordered_extents(struct btrfs_inode *inode,
 
                /*
                 * Here we just clear all Ordered bits for every page in the
-                * range, then __endio_write_update_ordered() will handle
+                * range, then btrfs_mark_ordered_io_finished() will handle
                 * the ordered extent accounting for the range.
                 */
                btrfs_page_clamp_clear_ordered(inode->root->fs_info, page,
@@ -231,20 +230,23 @@ static inline void btrfs_cleanup_ordered_extents(struct btrfs_inode *inode,
                put_page(page);
        }
 
-       /* The locked page covers the full range, nothing needs to be done */
-       if (bytes + offset <= page_offset(locked_page) + PAGE_SIZE)
-               return;
-       /*
-        * In case this page belongs to the delalloc range being instantiated
-        * then skip it, since the first page of a range is going to be
-        * properly cleaned up by the caller of run_delalloc_range
-        */
-       if (page_start >= offset && page_end <= (offset + bytes - 1)) {
-               bytes = offset + bytes - page_offset(locked_page) - PAGE_SIZE;
-               offset = page_offset(locked_page) + PAGE_SIZE;
+       if (locked_page) {
+               /* The locked page covers the full range, nothing needs to be done */
+               if (bytes + offset <= page_start + PAGE_SIZE)
+                       return;
+               /*
+                * If this page belongs to the delalloc range being
+                * instantiated, skip it, since the first page of a range is
+                * going to be properly cleaned up by the caller of
+                * run_delalloc_range().
+                */
+               if (page_start >= offset && page_end <= (offset + bytes - 1)) {
+                       bytes = offset + bytes - page_offset(locked_page) - PAGE_SIZE;
+                       offset = page_offset(locked_page) + PAGE_SIZE;
+               }
        }
 
-       return __endio_write_update_ordered(inode, offset, bytes, false);
+       return btrfs_mark_ordered_io_finished(inode, NULL, offset, bytes, false);
 }
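
The hunks above make btrfs_cleanup_ordered_extents() tolerate a NULL @locked_page, since submit_uncompressed_range() below can reach it without one. A minimal sketch of the two calling conventions, with arguments mirroring the call sites later in this diff:

	/* With a locked page: the page itself is skipped here and finished
	 * by the caller of run_delalloc_range() instead. */
	btrfs_cleanup_ordered_extents(inode, locked_page, start, end - start + 1);

	/* Without one (locked_page may be NULL in the async submit paths):
	 * every page of the range is cleaned up right here. */
	btrfs_cleanup_ordered_extents(inode, NULL, start, end - start + 1);
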
 
 static int btrfs_dirty_inode(struct inode *inode);
@@ -332,9 +334,9 @@ static int insert_inline_extent(struct btrfs_trans_handle *trans,
                        cur_size = min_t(unsigned long, compressed_size,
                                       PAGE_SIZE);
 
-                       kaddr = kmap_atomic(cpage);
+                       kaddr = kmap_local_page(cpage);
                        write_extent_buffer(leaf, kaddr, ptr, cur_size);
-                       kunmap_atomic(kaddr);
+                       kunmap_local(kaddr);
 
                        i++;
                        ptr += cur_size;
@@ -345,9 +347,9 @@ static int insert_inline_extent(struct btrfs_trans_handle *trans,
        } else {
                page = find_get_page(inode->vfs_inode.i_mapping, 0);
                btrfs_set_file_extent_compression(leaf, ei, 0);
-               kaddr = kmap_atomic(page);
+               kaddr = kmap_local_page(page);
                write_extent_buffer(leaf, kaddr, ptr, size);
-               kunmap_atomic(kaddr);
+               kunmap_local(kaddr);
                put_page(page);
        }
        btrfs_mark_buffer_dirty(leaf);
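
Both kmap hunks follow the kernel-wide move away from kmap_atomic(); kmap_local_page() keeps preemption enabled and only requires map/unmap calls to nest strictly. The general pattern, as a sketch (dst and len are placeholders):

	char *kaddr;

	kaddr = kmap_local_page(page);
	memcpy(dst, kaddr, len);	/* unlike kmap_atomic(), preemption stays enabled */
	kunmap_local(kaddr);
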
@@ -485,7 +487,7 @@ struct async_chunk {
        struct page *locked_page;
        u64 start;
        u64 end;
-       unsigned int write_flags;
+       blk_opf_t write_flags;
        struct list_head extents;
        struct cgroup_subsys_state *blkcg_css;
        struct btrfs_work work;
@@ -560,8 +562,8 @@ static inline int inode_need_compress(struct btrfs_inode *inode, u64 start,
         * will unlock the full page.
         */
        if (fs_info->sectorsize < PAGE_SIZE) {
-               if (!IS_ALIGNED(start, PAGE_SIZE) ||
-                   !IS_ALIGNED(end + 1, PAGE_SIZE))
+               if (!PAGE_ALIGNED(start) ||
+                   !PAGE_ALIGNED(end + 1))
                        return 0;
        }
 
@@ -678,8 +680,8 @@ again:
         * Thus we must also check against @actual_end, not just @end.
         */
        if (blocksize < PAGE_SIZE) {
-               if (!IS_ALIGNED(start, PAGE_SIZE) ||
-                   !IS_ALIGNED(round_up(actual_end, blocksize), PAGE_SIZE))
+               if (!PAGE_ALIGNED(start) ||
+                   !PAGE_ALIGNED(round_up(actual_end, blocksize)))
                        goto cleanup_and_bail_uncompressed;
        }
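
The two alignment hunks are cosmetic; PAGE_ALIGNED() from include/linux/mm.h expands to the same check that was open-coded before:

	#define PAGE_ALIGNED(addr)	IS_ALIGNED((unsigned long)(addr), PAGE_SIZE)
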
 
@@ -920,15 +922,25 @@ static int submit_uncompressed_range(struct btrfs_inode *inode,
         * can directly submit them without interruption.
         */
        ret = cow_file_range(inode, locked_page, start, end, &page_started,
-                            &nr_written, 0);
+                            &nr_written, 0, NULL);
        /* Inline extent inserted, page gets unlocked and everything is done */
        if (page_started) {
                ret = 0;
                goto out;
        }
        if (ret < 0) {
-               if (locked_page)
+               btrfs_cleanup_ordered_extents(inode, locked_page, start, end - start + 1);
+               if (locked_page) {
+                       const u64 page_start = page_offset(locked_page);
+                       const u64 page_end = page_start + PAGE_SIZE - 1;
+
+                       btrfs_page_set_error(inode->root->fs_info, locked_page,
+                                            page_start, PAGE_SIZE);
+                       set_page_writeback(locked_page);
+                       end_page_writeback(locked_page);
+                       end_extent_writepage(locked_page, ret, page_start, page_end);
                        unlock_page(locked_page);
+               }
                goto out;
        }
 
@@ -1133,15 +1145,39 @@ static u64 get_extent_allocation_hint(struct btrfs_inode *inode, u64 start,
  * *page_started is set to one if we unlock locked_page and do everything
  * required to start IO on it.  It may be clean and already done with
  * IO when we return.
+ *
+ * When unlock == 1, we unlock the pages in successfully allocated regions.
+ * When unlock == 0, we leave them locked for writing them out.
+ *
+ * However, we unlock all the pages except @locked_page in case of failure.
+ *
+ * In summary, page locking state will be as follows:
+ *
+ * - page_started == 1 (return value)
+ *     - All the pages are unlocked. IO is started.
+ *     - Note that this can happen only on success
+ * - unlock == 1
+ *     - All the pages except @locked_page are unlocked in any case
+ * - unlock == 0
+ *     - On success, all the pages are locked for writing them out
+ *     - On failure, all the pages except @locked_page are unlocked
+ *
+ * When a failure happens in the second or later iteration of the
+ * while-loop, the ordered extents created in previous iterations are kept
+ * intact. So, the caller must clean them up by calling
+ * btrfs_cleanup_ordered_extents(). See btrfs_run_delalloc_range() for an
+ * example.
  */
 static noinline int cow_file_range(struct btrfs_inode *inode,
                                   struct page *locked_page,
                                   u64 start, u64 end, int *page_started,
-                                  unsigned long *nr_written, int unlock)
+                                  unsigned long *nr_written, int unlock,
+                                  u64 *done_offset)
 {
        struct btrfs_root *root = inode->root;
        struct btrfs_fs_info *fs_info = root->fs_info;
        u64 alloc_hint = 0;
+       u64 orig_start = start;
        u64 num_bytes;
        unsigned long ram_size;
        u64 cur_alloc_size = 0;
@@ -1329,18 +1365,62 @@ out_reserve:
        btrfs_dec_block_group_reservations(fs_info, ins.objectid);
        btrfs_free_reserved_extent(fs_info, ins.objectid, ins.offset, 1);
 out_unlock:
+       /*
+        * If done_offset is non-NULL and ret == -EAGAIN, we expect the
+        * caller to write out the successfully allocated region and retry.
+        */
+       if (done_offset && ret == -EAGAIN) {
+               if (orig_start < start)
+                       *done_offset = start - 1;
+               else
+                       *done_offset = start;
+               return ret;
+       } else if (ret == -EAGAIN) {
+               /* Convert to -ENOSPC since the caller cannot retry. */
+               ret = -ENOSPC;
+       }
+
+       /*
+        * Now, we have three regions to clean up:
+        *
+        * |-------(1)----|---(2)---|-------------(3)----------|
+        * `- orig_start  `- start  `- start + cur_alloc_size  `- end
+        *
+        * We process each region below.
+        */
+
        clear_bits = EXTENT_LOCKED | EXTENT_DELALLOC | EXTENT_DELALLOC_NEW |
                EXTENT_DEFRAG | EXTENT_CLEAR_META_RESV;
        page_ops = PAGE_UNLOCK | PAGE_START_WRITEBACK | PAGE_END_WRITEBACK;
+
        /*
-        * If we reserved an extent for our delalloc range (or a subrange) and
-        * failed to create the respective ordered extent, then it means that
-        * when we reserved the extent we decremented the extent's size from
-        * the data space_info's bytes_may_use counter and incremented the
-        * space_info's bytes_reserved counter by the same amount. We must make
-        * sure extent_clear_unlock_delalloc() does not try to decrement again
-        * the data space_info's bytes_may_use counter, therefore we do not pass
-        * it the flag EXTENT_CLEAR_DATA_RESV.
+        * For the range (1). We have already instantiated the ordered extents
+        * for this region. They are cleaned up by
+        * btrfs_cleanup_ordered_extents() in e.g.
+        * btrfs_run_delalloc_range(). EXTENT_LOCKED | EXTENT_DELALLOC are
+        * already cleared in the above loop. And, EXTENT_DELALLOC_NEW |
+        * EXTENT_DEFRAG | EXTENT_CLEAR_META_RESV are handled by the cleanup
+        * function.
+        *
+        * However, in case of unlock == 0, we still need to unlock the pages
+        * (except @locked_page) to ensure all the pages are unlocked.
+        */
+       if (!unlock && orig_start < start) {
+               if (!locked_page)
+                       mapping_set_error(inode->vfs_inode.i_mapping, ret);
+               extent_clear_unlock_delalloc(inode, orig_start, start - 1,
+                                            locked_page, 0, page_ops);
+       }
+
+       /*
+        * For the range (2). If we reserved an extent for our delalloc range
+        * (or a subrange) and failed to create the respective ordered extent,
+        * then it means that when we reserved the extent we decremented the
+        * extent's size from the data space_info's bytes_may_use counter and
+        * incremented the space_info's bytes_reserved counter by the same
+        * amount. We must make sure extent_clear_unlock_delalloc() does not try
+        * to decrement again the data space_info's bytes_may_use counter,
+        * therefore we do not pass it the flag EXTENT_CLEAR_DATA_RESV.
         */
        if (extent_reserved) {
                extent_clear_unlock_delalloc(inode, start,
@@ -1350,12 +1430,19 @@ out_unlock:
                                             page_ops);
                start += cur_alloc_size;
                if (start >= end)
-                       goto out;
+                       return ret;
        }
+
+       /*
+        * For the range (3). We never touched the region. In addition to the
+        * clear_bits above, we add EXTENT_CLEAR_DATA_RESV to release the data
+        * space_info's bytes_may_use counter, reserved in
+        * btrfs_check_data_free_space().
+        */
        extent_clear_unlock_delalloc(inode, start, end, locked_page,
                                     clear_bits | EXTENT_CLEAR_DATA_RESV,
                                     page_ops);
-       goto out;
+       return ret;
 }
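
To make the three-region cleanup concrete, a worked example with hypothetical offsets:

	/*
	 * The caller asked to COW [0, 1M).  Two 128K extents were allocated
	 * and their ordered extents created; a third 128K extent was
	 * reserved when creating its ordered extent failed:
	 *
	 *   (1) [0, 256K)    - ordered extents exist; cleaned up later via
	 *                      btrfs_cleanup_ordered_extents()
	 *   (2) [256K, 384K) - extent reserved; must not clear DATA_RESV again
	 *   (3) [384K, 1M)   - untouched; EXTENT_CLEAR_DATA_RESV is added
	 */
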
 
 /*
@@ -1435,7 +1522,7 @@ static int cow_file_range_async(struct btrfs_inode *inode,
        int i;
        bool should_compress;
        unsigned nofs_flag;
-       const unsigned int write_flags = wbc_to_write_flags(wbc);
+       const blk_opf_t write_flags = wbc_to_write_flags(wbc);
 
        unlock_extent(&inode->io_tree, start, end);
 
@@ -1538,19 +1625,41 @@ static noinline int run_delalloc_zoned(struct btrfs_inode *inode,
                                       u64 end, int *page_started,
                                       unsigned long *nr_written)
 {
+       u64 done_offset = end;
        int ret;
+       bool locked_page_done = false;
 
-       ret = cow_file_range(inode, locked_page, start, end, page_started,
-                            nr_written, 0);
-       if (ret)
-               return ret;
+       while (start <= end) {
+               ret = cow_file_range(inode, locked_page, start, end, page_started,
+                                    nr_written, 0, &done_offset);
+               if (ret && ret != -EAGAIN)
+                       return ret;
 
-       if (*page_started)
-               return 0;
+               if (*page_started) {
+                       ASSERT(ret == 0);
+                       return 0;
+               }
+
+               if (ret == 0)
+                       done_offset = end;
+
+               if (done_offset == start) {
+                       wait_on_bit_io(&inode->root->fs_info->flags,
+                                      BTRFS_FS_NEED_ZONE_FINISH,
+                                      TASK_UNINTERRUPTIBLE);
+                       continue;
+               }
+
+               if (!locked_page_done) {
+                       __set_page_dirty_nobuffers(locked_page);
+                       account_page_redirty(locked_page);
+               }
+               locked_page_done = true;
+               extent_write_locked_range(&inode->vfs_inode, start, done_offset);
+
+               start = done_offset + 1;
+       }
 
-       __set_page_dirty_nobuffers(locked_page);
-       account_page_redirty(locked_page);
-       extent_write_locked_range(&inode->vfs_inode, start, end);
        *page_started = 1;
 
        return 0;
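
When done_offset comes back equal to start, no zone could take even one extent, so the loop parks on BTRFS_FS_NEED_ZONE_FINISH. The conventional wake-up side of such a wait_on_bit_io() would be (an assumption; the zone-finishing code is outside this diff):

	clear_bit(BTRFS_FS_NEED_ZONE_FINISH, &fs_info->flags);
	/* Pairs with the wait_on_bit_io() in run_delalloc_zoned(). */
	wake_up_bit(&fs_info->flags, BTRFS_FS_NEED_ZONE_FINISH);
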
@@ -1642,7 +1751,7 @@ static int fallback_to_cow(struct btrfs_inode *inode, struct page *locked_page,
        }
 
        return cow_file_range(inode, locked_page, start, end, page_started,
-                             nr_written, 1);
+                             nr_written, 1, NULL);
 }
 
 struct can_nocow_file_extent_args {
@@ -2115,7 +2224,7 @@ int btrfs_run_delalloc_range(struct btrfs_inode *inode, struct page *locked_page
                                                 page_started, nr_written);
                else
                        ret = cow_file_range(inode, locked_page, start, end,
-                                            page_started, nr_written, 1);
+                                            page_started, nr_written, 1, NULL);
        } else {
                set_bit(BTRFS_INODE_HAS_ASYNC_EXTENT, &inode->runtime_flags);
                ret = cow_file_range_async(inode, wbc, locked_page, start, end,
@@ -2131,6 +2240,7 @@ int btrfs_run_delalloc_range(struct btrfs_inode *inode, struct page *locked_page
 void btrfs_split_delalloc_extent(struct inode *inode,
                                 struct extent_state *orig, u64 split)
 {
+       struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
        u64 size;
 
        /* not delalloc, ignore it */
@@ -2138,7 +2248,7 @@ void btrfs_split_delalloc_extent(struct inode *inode,
                return;
 
        size = orig->end - orig->start + 1;
-       if (size > BTRFS_MAX_EXTENT_SIZE) {
+       if (size > fs_info->max_extent_size) {
                u32 num_extents;
                u64 new_size;
 
@@ -2147,10 +2257,10 @@ void btrfs_split_delalloc_extent(struct inode *inode,
                 * applies here, just in reverse.
                 */
                new_size = orig->end - split + 1;
-               num_extents = count_max_extents(new_size);
+               num_extents = count_max_extents(fs_info, new_size);
                new_size = split - orig->start;
-               num_extents += count_max_extents(new_size);
-               if (count_max_extents(size) >= num_extents)
+               num_extents += count_max_extents(fs_info, new_size);
+               if (count_max_extents(fs_info, size) >= num_extents)
                        return;
        }
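
count_max_extents() now takes fs_info because zoned filesystems can cap the maximum extent size below BTRFS_MAX_EXTENT_SIZE. A sketch of the helper's expected shape, inferred from its call sites (its definition is not part of this diff):

	static inline u32 count_max_extents(struct btrfs_fs_info *fs_info, u64 size)
	{
		return div_u64(size + fs_info->max_extent_size - 1,
			       fs_info->max_extent_size);
	}
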
 
@@ -2167,6 +2277,7 @@ void btrfs_split_delalloc_extent(struct inode *inode,
 void btrfs_merge_delalloc_extent(struct inode *inode, struct extent_state *new,
                                 struct extent_state *other)
 {
+       struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
        u64 new_size, old_size;
        u32 num_extents;
 
@@ -2180,7 +2291,7 @@ void btrfs_merge_delalloc_extent(struct inode *inode, struct extent_state *new,
                new_size = other->end - new->start + 1;
 
        /* we're not bigger than the max, unreserve the space and go */
-       if (new_size <= BTRFS_MAX_EXTENT_SIZE) {
+       if (new_size <= fs_info->max_extent_size) {
                spin_lock(&BTRFS_I(inode)->lock);
                btrfs_mod_outstanding_extents(BTRFS_I(inode), -1);
                spin_unlock(&BTRFS_I(inode)->lock);
@@ -2206,10 +2317,10 @@ void btrfs_merge_delalloc_extent(struct inode *inode, struct extent_state *new,
         * this case.
         */
        old_size = other->end - other->start + 1;
-       num_extents = count_max_extents(old_size);
+       num_extents = count_max_extents(fs_info, old_size);
        old_size = new->end - new->start + 1;
-       num_extents += count_max_extents(old_size);
-       if (count_max_extents(new_size) >= num_extents)
+       num_extents += count_max_extents(fs_info, old_size);
+       if (count_max_extents(fs_info, new_size) >= num_extents)
                return;
 
        spin_lock(&BTRFS_I(inode)->lock);
@@ -2274,21 +2385,21 @@ static void btrfs_del_delalloc_inode(struct btrfs_root *root,
  * list of inodes that have pending delalloc work to be done.
  */
 void btrfs_set_delalloc_extent(struct inode *inode, struct extent_state *state,
-                              unsigned *bits)
+                              u32 bits)
 {
        struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
 
-       if ((*bits & EXTENT_DEFRAG) && !(*bits & EXTENT_DELALLOC))
+       if ((bits & EXTENT_DEFRAG) && !(bits & EXTENT_DELALLOC))
                WARN_ON(1);
        /*
         * set_bit and clear bit hooks normally require _irqsave/restore
         * but in this case, we are only testing for the DELALLOC
         * bit, which is only set or cleared with irqs on
         */
-       if (!(state->state & EXTENT_DELALLOC) && (*bits & EXTENT_DELALLOC)) {
+       if (!(state->state & EXTENT_DELALLOC) && (bits & EXTENT_DELALLOC)) {
                struct btrfs_root *root = BTRFS_I(inode)->root;
                u64 len = state->end + 1 - state->start;
-               u32 num_extents = count_max_extents(len);
+               u32 num_extents = count_max_extents(fs_info, len);
                bool do_list = !btrfs_is_free_space_inode(BTRFS_I(inode));
 
                spin_lock(&BTRFS_I(inode)->lock);
@@ -2303,7 +2414,7 @@ void btrfs_set_delalloc_extent(struct inode *inode, struct extent_state *state,
                                         fs_info->delalloc_batch);
                spin_lock(&BTRFS_I(inode)->lock);
                BTRFS_I(inode)->delalloc_bytes += len;
-               if (*bits & EXTENT_DEFRAG)
+               if (bits & EXTENT_DEFRAG)
                        BTRFS_I(inode)->defrag_bytes += len;
                if (do_list && !test_bit(BTRFS_INODE_IN_DELALLOC_LIST,
                                         &BTRFS_I(inode)->runtime_flags))
@@ -2312,7 +2423,7 @@ void btrfs_set_delalloc_extent(struct inode *inode, struct extent_state *state,
        }
 
        if (!(state->state & EXTENT_DELALLOC_NEW) &&
-           (*bits & EXTENT_DELALLOC_NEW)) {
+           (bits & EXTENT_DELALLOC_NEW)) {
                spin_lock(&BTRFS_I(inode)->lock);
                BTRFS_I(inode)->new_delalloc_bytes += state->end + 1 -
                        state->start;
@@ -2325,14 +2436,14 @@ void btrfs_set_delalloc_extent(struct inode *inode, struct extent_state *state,
  * accounting happens.
  */
 void btrfs_clear_delalloc_extent(struct inode *vfs_inode,
-                                struct extent_state *state, unsigned *bits)
+                                struct extent_state *state, u32 bits)
 {
        struct btrfs_inode *inode = BTRFS_I(vfs_inode);
        struct btrfs_fs_info *fs_info = btrfs_sb(vfs_inode->i_sb);
        u64 len = state->end + 1 - state->start;
-       u32 num_extents = count_max_extents(len);
+       u32 num_extents = count_max_extents(fs_info, len);
 
-       if ((state->state & EXTENT_DEFRAG) && (*bits & EXTENT_DEFRAG)) {
+       if ((state->state & EXTENT_DEFRAG) && (bits & EXTENT_DEFRAG)) {
                spin_lock(&inode->lock);
                inode->defrag_bytes -= len;
                spin_unlock(&inode->lock);
@@ -2343,7 +2454,7 @@ void btrfs_clear_delalloc_extent(struct inode *vfs_inode,
         * but in this case, we are only testing for the DELALLOC
         * bit, which is only set or cleared with irqs on
         */
-       if ((state->state & EXTENT_DELALLOC) && (*bits & EXTENT_DELALLOC)) {
+       if ((state->state & EXTENT_DELALLOC) && (bits & EXTENT_DELALLOC)) {
                struct btrfs_root *root = inode->root;
                bool do_list = !btrfs_is_free_space_inode(inode);
 
@@ -2356,7 +2467,7 @@ void btrfs_clear_delalloc_extent(struct inode *vfs_inode,
                 * don't need to call delalloc_release_metadata if there is an
                 * error.
                 */
-               if (*bits & EXTENT_CLEAR_META_RESV &&
+               if (bits & EXTENT_CLEAR_META_RESV &&
                    root != fs_info->tree_root)
                        btrfs_delalloc_release_metadata(inode, len, false);
 
@@ -2366,7 +2477,7 @@ void btrfs_clear_delalloc_extent(struct inode *vfs_inode,
 
                if (!btrfs_is_data_reloc_root(root) &&
                    do_list && !(state->state & EXTENT_NORESERVE) &&
-                   (*bits & EXTENT_CLEAR_DATA_RESV))
+                   (bits & EXTENT_CLEAR_DATA_RESV))
                        btrfs_free_reserved_data_space_noquota(fs_info, len);
 
                percpu_counter_add_batch(&fs_info->delalloc_bytes, -len,
@@ -2381,11 +2492,11 @@ void btrfs_clear_delalloc_extent(struct inode *vfs_inode,
        }
 
        if ((state->state & EXTENT_DELALLOC_NEW) &&
-           (*bits & EXTENT_DELALLOC_NEW)) {
+           (bits & EXTENT_DELALLOC_NEW)) {
                spin_lock(&inode->lock);
                ASSERT(inode->new_delalloc_bytes >= len);
                inode->new_delalloc_bytes -= len;
-               if (*bits & EXTENT_ADD_INODE_BYTES)
+               if (bits & EXTENT_ADD_INODE_BYTES)
                        inode_add_bytes(&inode->vfs_inode, len);
                spin_unlock(&inode->lock);
        }
@@ -2580,95 +2691,78 @@ out:
        return errno_to_blk_status(ret);
 }
 
-/*
- * extent_io.c submission hook. This does the right thing for csum calculation
- * on write, or reading the csums from the tree before a read.
- *
- * Rules about async/sync submit,
- * a) read:                            sync submit
- *
- * b) write without checksum:          sync submit
- *
- * c) write with checksum:
- *    c-1) if bio is issued by fsync:  sync submit
- *         (sync_writers != 0)
- *
- *    c-2) if root is reloc root:      sync submit
- *         (only in case of buffered IO)
- *
- *    c-3) otherwise:                  async submit
- */
-void btrfs_submit_data_bio(struct inode *inode, struct bio *bio,
-                          int mirror_num, enum btrfs_compression_type compress_type)
+void btrfs_submit_data_write_bio(struct inode *inode, struct bio *bio, int mirror_num)
 {
        struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
-       struct btrfs_root *root = BTRFS_I(inode)->root;
-       enum btrfs_wq_endio_type metadata = BTRFS_WQ_ENDIO_DATA;
-       blk_status_t ret = 0;
-       int skip_sum;
-       int async = !atomic_read(&BTRFS_I(inode)->sync_writers);
-
-       skip_sum = (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM) ||
-               test_bit(BTRFS_FS_STATE_NO_CSUMS, &fs_info->fs_state);
-
-       if (btrfs_is_free_space_inode(BTRFS_I(inode)))
-               metadata = BTRFS_WQ_ENDIO_FREE_SPACE;
+       struct btrfs_inode *bi = BTRFS_I(inode);
+       blk_status_t ret;
 
        if (bio_op(bio) == REQ_OP_ZONE_APPEND) {
-               struct page *page = bio_first_bvec_all(bio)->bv_page;
-               loff_t file_offset = page_offset(page);
-
-               ret = extract_ordered_extent(BTRFS_I(inode), bio, file_offset);
+               ret = extract_ordered_extent(bi, bio,
+                               page_offset(bio_first_bvec_all(bio)->bv_page));
                if (ret)
                        goto out;
        }
 
-       if (btrfs_op(bio) != BTRFS_MAP_WRITE) {
-               ret = btrfs_bio_wq_end_io(fs_info, bio, metadata);
-               if (ret)
-                       goto out;
-
-               if (compress_type != BTRFS_COMPRESS_NONE) {
-                       /*
-                        * btrfs_submit_compressed_read will handle completing
-                        * the bio if there were any errors, so just return
-                        * here.
-                        */
-                       btrfs_submit_compressed_read(inode, bio, mirror_num);
+       /*
+        * If we need to checksum, and the I/O is not issued by fsync and
+        * friends (which raise ->sync_writers), defer the submission to a
+        * workqueue to parallelize it.
+        *
+        * Csum items for reloc roots have already been cloned at this point,
+        * so they are handled as part of the no-checksum case.
+        */
+       if (!(bi->flags & BTRFS_INODE_NODATASUM) &&
+           !test_bit(BTRFS_FS_STATE_NO_CSUMS, &fs_info->fs_state) &&
+           !btrfs_is_data_reloc_root(bi->root)) {
+               if (!atomic_read(&bi->sync_writers) &&
+                   btrfs_wq_submit_bio(inode, bio, mirror_num, 0,
+                                       btrfs_submit_bio_start))
                        return;
-               } else {
-                       /*
-                        * Lookup bio sums does extra checks around whether we
-                        * need to csum or not, which is why we ignore skip_sum
-                        * here.
-                        */
-                       ret = btrfs_lookup_bio_sums(inode, bio, NULL);
-                       if (ret)
-                               goto out;
-               }
-               goto mapit;
-       } else if (async && !skip_sum) {
-               /* csum items have already been cloned */
-               if (btrfs_is_data_reloc_root(root))
-                       goto mapit;
-               /* we're doing a write, do the async checksumming */
-               ret = btrfs_wq_submit_bio(inode, bio, mirror_num,
-                                         0, btrfs_submit_bio_start);
-               goto out;
-       } else if (!skip_sum) {
-               ret = btrfs_csum_one_bio(BTRFS_I(inode), bio, (u64)-1, false);
+
+               ret = btrfs_csum_one_bio(bi, bio, (u64)-1, false);
                if (ret)
                        goto out;
        }
+       btrfs_submit_bio(fs_info, bio, mirror_num);
+       return;
+out:
+       if (ret) {
+               bio->bi_status = ret;
+               bio_endio(bio);
+       }
+}
 
-mapit:
-       ret = btrfs_map_bio(fs_info, bio, mirror_num);
+void btrfs_submit_data_read_bio(struct inode *inode, struct bio *bio,
+                       int mirror_num, enum btrfs_compression_type compress_type)
+{
+       struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
+       blk_status_t ret;
 
-out:
+       if (compress_type != BTRFS_COMPRESS_NONE) {
+               /*
+                * btrfs_submit_compressed_read will handle completing the bio
+                * if there were any errors, so just return here.
+                */
+               btrfs_submit_compressed_read(inode, bio, mirror_num);
+               return;
+       }
+
+       /* Save the original iter for read repair */
+       btrfs_bio(bio)->iter = bio->bi_iter;
+
+       /*
+        * btrfs_lookup_bio_sums() does its own checks around whether we
+        * actually need to csum or not, so no explicit NODATASUM check is
+        * needed here.
+        */
+       ret = btrfs_lookup_bio_sums(inode, bio, NULL);
        if (ret) {
                bio->bi_status = ret;
                bio_endio(bio);
+               return;
        }
+
+       btrfs_submit_bio(fs_info, bio, mirror_num);
 }
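
With btrfs_submit_data_bio() split by direction, the submission hook in extent_io.c is expected to dispatch roughly as follows (an assumption; that caller is outside this diff):

	if (btrfs_op(bio) == BTRFS_MAP_WRITE)
		btrfs_submit_data_write_bio(inode, bio, mirror_num);
	else
		btrfs_submit_data_read_bio(inode, bio, mirror_num, compress_type);
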
 
 /*
@@ -3075,8 +3169,10 @@ static int insert_ordered_extent_file_extent(struct btrfs_trans_handle *trans,
        btrfs_set_stack_file_extent_disk_num_bytes(&stack_fi,
                                                   oe->disk_num_bytes);
        btrfs_set_stack_file_extent_offset(&stack_fi, oe->offset);
-       if (test_bit(BTRFS_ORDERED_TRUNCATED, &oe->flags))
-               num_bytes = ram_bytes = oe->truncated_len;
+       if (test_bit(BTRFS_ORDERED_TRUNCATED, &oe->flags)) {
+               num_bytes = oe->truncated_len;
+               ram_bytes = num_bytes;
+       }
        btrfs_set_stack_file_extent_num_bytes(&stack_fi, num_bytes);
        btrfs_set_stack_file_extent_ram_bytes(&stack_fi, ram_bytes);
        btrfs_set_stack_file_extent_compression(&stack_fi, oe->compress_type);
@@ -3102,7 +3198,7 @@ static int insert_ordered_extent_file_extent(struct btrfs_trans_handle *trans,
  * an ordered extent if the range of bytes in the file it covers are
  * fully written.
  */
-static int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
+int btrfs_finish_ordered_io(struct btrfs_ordered_extent *ordered_extent)
 {
        struct btrfs_inode *inode = BTRFS_I(ordered_extent->inode);
        struct btrfs_root *root = inode->root;
@@ -3311,65 +3407,71 @@ out:
        return ret;
 }
 
-static void finish_ordered_fn(struct btrfs_work *work)
-{
-       struct btrfs_ordered_extent *ordered_extent;
-       ordered_extent = container_of(work, struct btrfs_ordered_extent, work);
-       btrfs_finish_ordered_io(ordered_extent);
-}
-
 void btrfs_writepage_endio_finish_ordered(struct btrfs_inode *inode,
                                          struct page *page, u64 start,
                                          u64 end, bool uptodate)
 {
        trace_btrfs_writepage_end_io_hook(inode, start, end, uptodate);
 
-       btrfs_mark_ordered_io_finished(inode, page, start, end + 1 - start,
-                                      finish_ordered_fn, uptodate);
+       btrfs_mark_ordered_io_finished(inode, page, start, end + 1 - start, uptodate);
+}
+
+/*
+ * Verify the checksum for a single sector without any extra actions that depend
+ * on the type of I/O.
+ */
+int btrfs_check_sector_csum(struct btrfs_fs_info *fs_info, struct page *page,
+                           u32 pgoff, u8 *csum, const u8 * const csum_expected)
+{
+       SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
+       char *kaddr;
+
+       ASSERT(pgoff + fs_info->sectorsize <= PAGE_SIZE);
+
+       shash->tfm = fs_info->csum_shash;
+
+       kaddr = kmap_local_page(page) + pgoff;
+       crypto_shash_digest(shash, kaddr, fs_info->sectorsize, csum);
+       kunmap_local(kaddr);
+
+       if (memcmp(csum, csum_expected, fs_info->csum_size))
+               return -EIO;
+       return 0;
 }
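
btrfs_check_sector_csum() is the direction-neutral core shared by the buffered and direct read paths. A minimal usage sketch, where csum_expected is whatever the caller looked up and handle_bad_sector() is a hypothetical reaction:

	u8 csum[BTRFS_CSUM_SIZE];

	if (btrfs_check_sector_csum(fs_info, page, pgoff, csum, csum_expected))
		handle_bad_sector();	/* -EIO: contents do not match the stored csum */
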
 
 /*
  * check_data_csum - verify checksum of one sector of uncompressed data
  * @inode:     inode
- * @io_bio:    btrfs_io_bio which contains the csum
+ * @bbio:      btrfs_bio which contains the csum
  * @bio_offset:        offset to the beginning of the bio (in bytes)
  * @page:      page where is the data to be verified
  * @pgoff:     offset inside the page
- * @start:     logical offset in the file
  *
  * The length of such check is always one sector size.
+ *
+ * When a csum mismatch is detected, we also report the error and fill the
+ * corrupted range with zeros (thus it needs the extra parameters).
  */
-static int check_data_csum(struct inode *inode, struct btrfs_bio *bbio,
-                          u32 bio_offset, struct page *page, u32 pgoff,
-                          u64 start)
+int btrfs_check_data_csum(struct inode *inode, struct btrfs_bio *bbio,
+                         u32 bio_offset, struct page *page, u32 pgoff)
 {
        struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
-       SHASH_DESC_ON_STACK(shash, fs_info->csum_shash);
-       char *kaddr;
        u32 len = fs_info->sectorsize;
-       const u32 csum_size = fs_info->csum_size;
-       unsigned int offset_sectors;
        u8 *csum_expected;
        u8 csum[BTRFS_CSUM_SIZE];
 
        ASSERT(pgoff + len <= PAGE_SIZE);
 
-       offset_sectors = bio_offset >> fs_info->sectorsize_bits;
-       csum_expected = ((u8 *)bbio->csum) + offset_sectors * csum_size;
+       csum_expected = btrfs_csum_ptr(fs_info, bbio->csum, bio_offset);
 
-       kaddr = kmap_atomic(page);
-       shash->tfm = fs_info->csum_shash;
-
-       crypto_shash_digest(shash, kaddr + pgoff, len, csum);
-       kunmap_atomic(kaddr);
-
-       if (memcmp(csum, csum_expected, csum_size))
+       if (btrfs_check_sector_csum(fs_info, page, pgoff, csum, csum_expected))
                goto zeroit;
-
        return 0;
+
 zeroit:
-       btrfs_print_data_csum_error(BTRFS_I(inode), start, csum, csum_expected,
-                                   bbio->mirror_num);
+       btrfs_print_data_csum_error(BTRFS_I(inode),
+                                   bbio->file_offset + bio_offset,
+                                   csum, csum_expected, bbio->mirror_num);
        if (bbio->device)
                btrfs_dev_stat_inc_and_print(bbio->device,
                                             BTRFS_DEV_STAT_CORRUPTION_ERRS);
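
btrfs_csum_ptr() replaces the open-coded sector arithmetic deleted above; assuming it matches that arithmetic, the helper looks like:

	static inline u8 *btrfs_csum_ptr(const struct btrfs_fs_info *fs_info,
					 u8 *csums, u64 offset)
	{
		const u64 offset_in_sectors = offset >> fs_info->sectorsize_bits;

		return csums + offset_in_sectors * fs_info->csum_size;
	}
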
@@ -3401,11 +3503,6 @@ unsigned int btrfs_verify_data_csum(struct btrfs_bio *bbio,
        u32 pg_off;
        unsigned int result = 0;
 
-       if (btrfs_page_test_checked(fs_info, page, start, end + 1 - start)) {
-               btrfs_page_clear_checked(fs_info, page, start, end + 1 - start);
-               return 0;
-       }
-
        /*
         * This only happens for NODATASUM or compressed read.
         * Normally this should be covered by above check for compressed read
@@ -3438,8 +3535,7 @@ unsigned int btrfs_verify_data_csum(struct btrfs_bio *bbio,
                                          EXTENT_NODATASUM);
                        continue;
                }
-               ret = check_data_csum(inode, bbio, bio_offset, page, pg_off,
-                                     page_offset(page) + pg_off);
+               ret = btrfs_check_data_csum(inode, bbio, bio_offset, page, pg_off);
                if (ret < 0) {
                        const int nr_bit = (pg_off - offset_in_page(start)) >>
                                     root->fs_info->sectorsize_bits;
@@ -3578,7 +3674,6 @@ int btrfs_orphan_cleanup(struct btrfs_root *root)
        u64 last_objectid = 0;
        int ret = 0, nr_unlink = 0;
 
-       /* Bail out if the cleanup is already running. */
        if (test_and_set_bit(BTRFS_ROOT_ORPHAN_CLEANUP, &root->state))
                return 0;
 
@@ -3661,17 +3756,17 @@ int btrfs_orphan_cleanup(struct btrfs_root *root)
                         *
                         * btrfs_find_orphan_roots() ran before us, which has
                         * found all deleted roots and loaded them into
-                        * fs_info->fs_roots. So here we can find if an
+                        * fs_info->fs_roots_radix. So here we can find if an
                         * orphan item corresponds to a deleted root by looking
-                        * up the root from that xarray.
+                        * up the root from that radix tree.
                         */
 
-                       spin_lock(&fs_info->fs_roots_lock);
-                       dead_root = xa_load(&fs_info->fs_roots,
-                                           (unsigned long)found_key.objectid);
+                       spin_lock(&fs_info->fs_roots_radix_lock);
+                       dead_root = radix_tree_lookup(&fs_info->fs_roots_radix,
+                                                        (unsigned long)found_key.objectid);
                        if (dead_root && btrfs_root_refs(&dead_root->root_item) == 0)
                                is_dead_root = 1;
-                       spin_unlock(&fs_info->fs_roots_lock);
+                       spin_unlock(&fs_info->fs_roots_radix_lock);
 
                        if (is_dead_root) {
                                /* prevent this orphan from being found again */
@@ -3911,7 +4006,7 @@ cache_index:
         * cache.
         *
         * This is required for both inode re-read from disk and delayed inode
-        * in the delayed_nodes xarray.
+        * in delayed_nodes_tree.
         */
        if (BTRFS_I(inode)->last_trans == fs_info->generation)
                set_bit(BTRFS_INODE_NEEDS_FULL_SYNC,
@@ -4229,7 +4324,7 @@ skip_backref:
        /*
         * If we are in a rename context, we don't need to update anything in the
         * log. That will be done later during the rename by btrfs_log_new_name().
-        * Besides that, doing it here would only cause extra unncessary btree
+        * Besides that, doing it here would only cause extra unnecessary btree
         * operations on the log tree, increasing latency for applications.
         */
        if (!rename_ctx) {
@@ -4257,8 +4352,9 @@ err:
        btrfs_i_size_write(dir, dir->vfs_inode.i_size - name_len * 2);
        inode_inc_iversion(&inode->vfs_inode);
        inode_inc_iversion(&dir->vfs_inode);
-       inode->vfs_inode.i_ctime = dir->vfs_inode.i_mtime =
-               dir->vfs_inode.i_ctime = current_time(&inode->vfs_inode);
+       inode->vfs_inode.i_ctime = current_time(&inode->vfs_inode);
+       dir->vfs_inode.i_mtime = inode->vfs_inode.i_ctime;
+       dir->vfs_inode.i_ctime = inode->vfs_inode.i_ctime;
        ret = btrfs_update_inode(trans, root, dir);
 out:
        return ret;
@@ -4420,7 +4516,8 @@ static int btrfs_unlink_subvol(struct btrfs_trans_handle *trans,
 
        btrfs_i_size_write(BTRFS_I(dir), dir->i_size - name_len * 2);
        inode_inc_iversion(dir);
-       dir->i_mtime = dir->i_ctime = current_time(dir);
+       dir->i_mtime = current_time(dir);
+       dir->i_ctime = dir->i_mtime;
        ret = btrfs_update_inode_fallback(trans, root, BTRFS_I(dir));
        if (ret)
                btrfs_abort_transaction(trans, ret);
@@ -4859,7 +4956,6 @@ again:
                else
                        memzero_page(page, (block_start - page_offset(page)) + offset,
                                     len);
-               flush_dcache_page(page);
        }
        btrfs_page_clear_checked(fs_info, page, block_start,
                                 block_end + 1 - block_start);
@@ -5062,9 +5158,10 @@ static int btrfs_setsize(struct inode *inode, struct iattr *attr)
         */
        if (newsize != oldsize) {
                inode_inc_iversion(inode);
-               if (!(mask & (ATTR_CTIME | ATTR_MTIME)))
-                       inode->i_ctime = inode->i_mtime =
-                               current_time(inode);
+               if (!(mask & (ATTR_CTIME | ATTR_MTIME))) {
+                       inode->i_mtime = current_time(inode);
+                       inode->i_ctime = inode->i_mtime;
+               }
        }
 
        if (newsize > oldsize) {
@@ -5372,7 +5469,7 @@ void btrfs_evict_inode(struct inode *inode)
        if (!rsv)
                goto no_delete;
        rsv->size = btrfs_calc_metadata_size(fs_info, 1);
-       rsv->failfast = 1;
+       rsv->failfast = true;
 
        btrfs_i_size_write(BTRFS_I(inode), 0);
 
@@ -5764,14 +5861,14 @@ struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry)
                if (ret != -ENOENT)
                        inode = ERR_PTR(ret);
                else
-                       inode = new_simple_dir(dir->i_sb, &location, sub_root);
+                       inode = new_simple_dir(dir->i_sb, &location, root);
        } else {
                inode = btrfs_iget(dir->i_sb, location.objectid, sub_root);
-       }
-       if (root != sub_root)
                btrfs_put_root(sub_root);
 
-       if (!IS_ERR(inode) && root != sub_root) {
+               if (IS_ERR(inode))
+                       return inode;
+
                down_read(&fs_info->cleanup_work_sem);
                if (!sb_rdonly(inode->i_sb))
                        ret = btrfs_orphan_cleanup(sub_root);
@@ -6367,7 +6464,13 @@ int btrfs_create_new_inode(struct btrfs_trans_handle *trans,
        }
 
        btrfs_mark_buffer_dirty(path->nodes[0]);
-       btrfs_release_path(path);
+       /*
+        * We don't need the path anymore, plus inheriting properties, adding
+        * ACLs, security xattrs, orphan item or adding the link, will result in
+        * allocating yet another path. So just free our path.
+        */
+       btrfs_free_path(path);
+       path = NULL;
 
        if (args->subvol) {
                struct inode *parent;
@@ -6424,8 +6527,7 @@ int btrfs_create_new_inode(struct btrfs_trans_handle *trans,
                goto discard;
        }
 
-       ret = 0;
-       goto out;
+       return 0;
 
 discard:
        /*
@@ -7507,7 +7609,8 @@ static int btrfs_get_blocks_direct_write(struct extent_map **map,
                btrfs_dec_nocow_writers(bg);
                if (type == BTRFS_ORDERED_PREALLOC) {
                        free_extent_map(em);
-                       *map = em = em2;
+                       *map = em2;
+                       em = em2;
                }
 
                if (IS_ERR(em2)) {
@@ -7589,8 +7692,26 @@ static int btrfs_dio_iomap_begin(struct inode *inode, loff_t start,
        const u64 data_alloc_len = length;
        bool unlock_extents = false;
 
+       /*
+        * We could potentially fault if we have a buffer > PAGE_SIZE, and if
+        * we're NOWAIT we may submit a bio for a partial range and return
+        * EIOCBQUEUED, which would result in an errant short read.
+        *
+        * The best way to handle this would be to allow for partial completions
+        * of iocb's, so we could submit the partial bio, return and fault in
+        * the rest of the pages, and then submit the io for the rest of the
+        * range.  However we don't have that currently, so simply return
+        * -EAGAIN at this point so that the normal path is used.
+        */
+       if (!write && (flags & IOMAP_NOWAIT) && length > PAGE_SIZE)
+               return -EAGAIN;
+
+       /*
+        * Cap the size of reads to that usually seen in buffered I/O as we need
+        * to allocate a contiguous array for the checksums.
+        */
        if (!write)
-               len = min_t(u64, len, fs_info->sectorsize);
+               len = min_t(u64, len, fs_info->sectorsize * BTRFS_MAX_BIO_SECTORS);
 
        lockstart = start;
        lockend = start + len - 1;
@@ -7681,7 +7802,19 @@ static int btrfs_dio_iomap_begin(struct inode *inode, loff_t start,
        if (test_bit(EXTENT_FLAG_COMPRESSED, &em->flags) ||
            em->block_start == EXTENT_MAP_INLINE) {
                free_extent_map(em);
-               ret = -ENOTBLK;
+               /*
+                * If we are in a NOWAIT context, return -EAGAIN in order to
+                * fallback to buffered IO. This is not only because we can
+                * block with buffered IO (no support for NOWAIT semantics at
+                * the moment) but also to avoid returning short reads to user
+                * space - this happens if we were able to read some data from
+                * previous non-compressed extents and then when we fallback to
+                * buffered IO, at btrfs_file_read_iter() by calling
+                * filemap_read(), we fail to fault in pages for the read buffer,
+                * in which case filemap_read() returns a short read (the number
+                * of bytes previously read is > 0, so it does not return -EFAULT).
+                */
+               ret = (flags & IOMAP_NOWAIT) ? -EAGAIN : -ENOTBLK;
                goto unlock_err;
        }
 
@@ -7813,8 +7946,8 @@ static int btrfs_dio_iomap_end(struct inode *inode, loff_t pos, loff_t length,
                pos += submitted;
                length -= submitted;
                if (write)
-                       __endio_write_update_ordered(BTRFS_I(inode), pos,
-                                       length, false);
+                       btrfs_mark_ordered_io_finished(BTRFS_I(inode), NULL,
+                                                      pos, length, false);
                else
                        unlock_extent(&BTRFS_I(inode)->io_tree, pos,
                                      pos + length - 1);
@@ -7836,10 +7969,9 @@ static void btrfs_dio_private_put(struct btrfs_dio_private *dip)
                return;
 
        if (btrfs_op(&dip->bio) == BTRFS_MAP_WRITE) {
-               __endio_write_update_ordered(BTRFS_I(dip->inode),
-                                            dip->file_offset,
-                                            dip->bytes,
-                                            !dip->bio.bi_status);
+               btrfs_mark_ordered_io_finished(BTRFS_I(dip->inode), NULL,
+                                              dip->file_offset, dip->bytes,
+                                              !dip->bio.bi_status);
        } else {
                unlock_extent(&BTRFS_I(dip->inode)->io_tree,
                              dip->file_offset,
@@ -7859,12 +7991,8 @@ static void submit_dio_repair_bio(struct inode *inode, struct bio *bio,
 
        BUG_ON(bio_op(bio) == REQ_OP_WRITE);
 
-       if (btrfs_bio_wq_end_io(fs_info, bio, BTRFS_WQ_ENDIO_DATA))
-               return;
-
        refcount_inc(&dip->refs);
-       if (btrfs_map_bio(fs_info, bio, mirror_num))
-               refcount_dec(&dip->refs);
+       btrfs_submit_bio(fs_info, bio, mirror_num);
 }
 
 static blk_status_t btrfs_check_read_dio_bio(struct btrfs_dio_private *dip,
@@ -7873,56 +8001,35 @@ static blk_status_t btrfs_check_read_dio_bio(struct btrfs_dio_private *dip,
 {
        struct inode *inode = dip->inode;
        struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
-       const u32 sectorsize = fs_info->sectorsize;
        struct extent_io_tree *failure_tree = &BTRFS_I(inode)->io_failure_tree;
        struct extent_io_tree *io_tree = &BTRFS_I(inode)->io_tree;
        const bool csum = !(BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM);
-       struct bio_vec bvec;
-       struct bvec_iter iter;
-       u32 bio_offset = 0;
        blk_status_t err = BLK_STS_OK;
+       struct bvec_iter iter;
+       struct bio_vec bv;
+       u32 offset;
+
+       btrfs_bio_for_each_sector(fs_info, bv, bbio, iter, offset) {
+               u64 start = bbio->file_offset + offset;
+
+               if (uptodate &&
+                   (!csum || !btrfs_check_data_csum(inode, bbio, offset, bv.bv_page,
+                                              bv.bv_offset))) {
+                       clean_io_failure(fs_info, failure_tree, io_tree, start,
+                                        bv.bv_page, btrfs_ino(BTRFS_I(inode)),
+                                        bv.bv_offset);
+               } else {
+                       int ret;
 
-       __bio_for_each_segment(bvec, &bbio->bio, iter, bbio->iter) {
-               unsigned int i, nr_sectors, pgoff;
-
-               nr_sectors = BTRFS_BYTES_TO_BLKS(fs_info, bvec.bv_len);
-               pgoff = bvec.bv_offset;
-               for (i = 0; i < nr_sectors; i++) {
-                       u64 start = bbio->file_offset + bio_offset;
-
-                       ASSERT(pgoff < PAGE_SIZE);
-                       if (uptodate &&
-                           (!csum || !check_data_csum(inode, bbio,
-                                                      bio_offset, bvec.bv_page,
-                                                      pgoff, start))) {
-                               clean_io_failure(fs_info, failure_tree, io_tree,
-                                                start, bvec.bv_page,
-                                                btrfs_ino(BTRFS_I(inode)),
-                                                pgoff);
-                       } else {
-                               int ret;
-
-                               ret = btrfs_repair_one_sector(inode, &bbio->bio,
-                                               bio_offset, bvec.bv_page, pgoff,
-                                               start, bbio->mirror_num,
-                                               submit_dio_repair_bio);
-                               if (ret)
-                                       err = errno_to_blk_status(ret);
-                       }
-                       ASSERT(bio_offset + sectorsize > bio_offset);
-                       bio_offset += sectorsize;
-                       pgoff += sectorsize;
+                       ret = btrfs_repair_one_sector(inode, bbio, offset,
+                                       bv.bv_page, bv.bv_offset,
+                                       submit_dio_repair_bio);
+                       if (ret)
+                               err = errno_to_blk_status(ret);
                }
        }
-       return err;
-}
 
-static void __endio_write_update_ordered(struct btrfs_inode *inode,
-                                        const u64 offset, const u64 bytes,
-                                        const bool uptodate)
-{
-       btrfs_mark_ordered_io_finished(inode, NULL, offset, bytes,
-                                      finish_ordered_fn, uptodate);
+       return err;
 }
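
btrfs_bio_for_each_sector() folds the removed two-level loop (per bio_vec, then per sector within it) into one iterator yielding a bio_vec plus its byte offset from the start of the bio. A sketch of what such a macro has to do, modeled on the removed loop (an assumption; the in-tree helper additionally clamps bv.bv_len to one sector):

	#define btrfs_bio_for_each_sector(fs_info, bv, bbio, iter, bio_offset)	\
		for ((iter) = (bbio)->iter, (bio_offset) = 0;			\
		     (iter).bi_size &&						\
		     (((bv) = bio_iter_iovec(&(bbio)->bio, (iter))), 1);	\
		     (bio_offset) += (fs_info)->sectorsize,			\
		     bio_advance_iter_single(&(bbio)->bio, &(iter),		\
					     (fs_info)->sectorsize))
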
 
 static blk_status_t btrfs_submit_bio_start_direct_io(struct inode *inode,
@@ -7957,51 +8064,43 @@ static void btrfs_end_dio_bio(struct bio *bio)
        btrfs_dio_private_put(dip);
 }
 
-static inline blk_status_t btrfs_submit_dio_bio(struct bio *bio,
-               struct inode *inode, u64 file_offset, int async_submit)
+static void btrfs_submit_dio_bio(struct bio *bio, struct inode *inode,
+                                u64 file_offset, int async_submit)
 {
        struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
        struct btrfs_dio_private *dip = bio->bi_private;
-       bool write = btrfs_op(bio) == BTRFS_MAP_WRITE;
        blk_status_t ret;
 
-       /* Check btrfs_submit_bio_hook() for rules about async submit. */
-       if (async_submit)
-               async_submit = !atomic_read(&BTRFS_I(inode)->sync_writers);
-
-       if (!write) {
-               ret = btrfs_bio_wq_end_io(fs_info, bio, BTRFS_WQ_ENDIO_DATA);
-               if (ret)
-                       goto err;
-       }
+       /* Save the original iter for read repair */
+       if (btrfs_op(bio) == BTRFS_MAP_READ)
+               btrfs_bio(bio)->iter = bio->bi_iter;
 
        if (BTRFS_I(inode)->flags & BTRFS_INODE_NODATASUM)
                goto map;
 
-       if (write && async_submit) {
-               ret = btrfs_wq_submit_bio(inode, bio, 0, file_offset,
-                                         btrfs_submit_bio_start_direct_io);
-               goto err;
-       } else if (write) {
+       if (btrfs_op(bio) == BTRFS_MAP_WRITE) {
+               /* Check btrfs_submit_data_write_bio() for async submit rules */
+               if (async_submit && !atomic_read(&BTRFS_I(inode)->sync_writers) &&
+                   btrfs_wq_submit_bio(inode, bio, 0, file_offset,
+                                       btrfs_submit_bio_start_direct_io))
+                       return;
+
                /*
                 * If we aren't doing async submit, calculate the csum of the
                 * bio now.
                 */
                ret = btrfs_csum_one_bio(BTRFS_I(inode), bio, file_offset, false);
-               if (ret)
-                       goto err;
+               if (ret) {
+                       bio->bi_status = ret;
+                       bio_endio(bio);
+                       return;
+               }
        } else {
-               u64 csum_offset;
-
-               csum_offset = file_offset - dip->file_offset;
-               csum_offset >>= fs_info->sectorsize_bits;
-               csum_offset *= fs_info->csum_size;
-               btrfs_bio(bio)->csum = dip->csums + csum_offset;
+               btrfs_bio(bio)->csum = btrfs_csum_ptr(fs_info, dip->csums,
+                                                     file_offset - dip->file_offset);
        }
 map:
-       ret = btrfs_map_bio(fs_info, bio, 0);
-err:
-       return ret;
+       btrfs_submit_bio(fs_info, bio, 0);
 }
 
 static void btrfs_submit_direct(const struct iomap_iter *iter,
@@ -8114,14 +8213,7 @@ static void btrfs_submit_direct(const struct iomap_iter *iter,
                                async_submit = 1;
                }
 
-               status = btrfs_submit_dio_bio(bio, inode, file_offset,
-                                               async_submit);
-               if (status) {
-                       bio_put(bio);
-                       if (submit_len > 0)
-                               refcount_dec(&dip->refs);
-                       goto out_err_em;
-               }
+               btrfs_submit_dio_bio(bio, inode, file_offset, async_submit);
 
                dio_data->submitted += clone_len;
                clone_offset += clone_len;
@@ -8154,7 +8246,8 @@ ssize_t btrfs_dio_rw(struct kiocb *iocb, struct iov_iter *iter, size_t done_befo
        struct btrfs_dio_data data;
 
        return iomap_dio_rw(iocb, iter, &btrfs_dio_iomap_ops, &btrfs_dio_ops,
-                           IOMAP_DIO_PARTIAL, &data, done_before);
+                           IOMAP_DIO_PARTIAL | IOMAP_DIO_NOSYNC,
+                           &data, done_before);
 }
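IOMAP_DIO_NOSYNC makes iomap_dio_rw() skip generic_write_sync(), leaving O_DSYNC handling to the filesystem. A simplified sketch of the caller-side pattern this enables; the lock flags and call sequence here are illustrative, not taken from this hunk:

    /* Hedged sketch: sync only after the filesystem's own locks are
     * dropped, which is the deadlock IOMAP_DIO_NOSYNC exists to avoid. */
    ssize_t ret = btrfs_dio_rw(iocb, from, 0);

    btrfs_inode_unlock(inode, ilock_flags);         /* release fs locks first */
    if (ret > 0)
            ret = generic_write_sync(iocb, ret);    /* then honor O_DSYNC */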
 
 static int btrfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
@@ -8169,31 +8262,6 @@ static int btrfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
        return extent_fiemap(BTRFS_I(inode), fieinfo, start, len);
 }
 
-static int btrfs_writepage(struct page *page, struct writeback_control *wbc)
-{
-       struct inode *inode = page->mapping->host;
-       int ret;
-
-       if (current->flags & PF_MEMALLOC) {
-               redirty_page_for_writepage(wbc, page);
-               unlock_page(page);
-               return 0;
-       }
-
-       /*
-        * If we are under memory pressure we will call this directly from the
-        * VM, we need to make sure we have the inode referenced for the ordered
-        * extent.  If not just return like we didn't do anything.
-        */
-       if (!igrab(inode)) {
-               redirty_page_for_writepage(wbc, page);
-               return AOP_WRITEPAGE_ACTIVATE;
-       }
-       ret = extent_write_full_page(page, wbc);
-       btrfs_add_delayed_iput(inode);
-       return ret;
-}
-
 static int btrfs_writepages(struct address_space *mapping,
                            struct writeback_control *wbc)
 {
@@ -8257,30 +8325,24 @@ static bool btrfs_release_folio(struct folio *folio, gfp_t gfp_flags)
 }
 
 #ifdef CONFIG_MIGRATION
-static int btrfs_migratepage(struct address_space *mapping,
-                            struct page *newpage, struct page *page,
+static int btrfs_migrate_folio(struct address_space *mapping,
+                            struct folio *dst, struct folio *src,
                             enum migrate_mode mode)
 {
-       int ret;
+       int ret = filemap_migrate_folio(mapping, dst, src, mode);
 
-       ret = migrate_page_move_mapping(mapping, newpage, page, 0);
        if (ret != MIGRATEPAGE_SUCCESS)
                return ret;
 
-       if (page_has_private(page))
-               attach_page_private(newpage, detach_page_private(page));
-
-       if (PageOrdered(page)) {
-               ClearPageOrdered(page);
-               SetPageOrdered(newpage);
+       if (folio_test_ordered(src)) {
+               folio_clear_ordered(src);
+               folio_set_ordered(dst);
        }
 
-       if (mode != MIGRATE_SYNC_NO_COPY)
-               migrate_page_copy(newpage, page);
-       else
-               migrate_page_states(newpage, page);
        return MIGRATEPAGE_SUCCESS;
 }
+#else
+#define btrfs_migrate_folio NULL
 #endif
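filemap_migrate_folio() bundles exactly the steps the deleted body open-coded: move the mapping entry, transfer folio private data, then copy either contents plus flags or flags alone depending on the migrate mode. A reconstruction along the lines of the removed code (the real helper lives in mm):

    /* Sketch mirroring the deleted open-coded sequence, using the folio
     * equivalents of the removed page-based calls. */
    int filemap_migrate_folio(struct address_space *mapping,
                              struct folio *dst, struct folio *src,
                              enum migrate_mode mode)
    {
            int ret = folio_migrate_mapping(mapping, dst, src, 0);

            if (ret != MIGRATEPAGE_SUCCESS)
                    return ret;
            if (folio_test_private(src))
                    folio_attach_private(dst, folio_detach_private(src));
            if (mode != MIGRATE_SYNC_NO_COPY)
                    folio_migrate_copy(dst, src);   /* data + flags */
            else
                    folio_migrate_flags(dst, src);  /* flags only */
            return MIGRATEPAGE_SUCCESS;
    }

The #else definition of btrfs_migrate_folio as NULL lets the address_space_operations initializer further down set .migrate_folio unconditionally, with no #ifdef at the use site.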
 
 static void btrfs_invalidate_folio(struct folio *folio, size_t offset,
@@ -8497,7 +8559,7 @@ vm_fault_t btrfs_page_mkwrite(struct vm_fault *vmf)
         * Reserving delalloc space after obtaining the page lock can lead to
         * deadlock. For example, if a dirty page is locked by this function
         * and the call to btrfs_delalloc_reserve_space() ends up triggering
-        * dirty page write out, then the btrfs_writepage() function could
+        * dirty page write out, then the btrfs_writepages() function could
         * end up waiting indefinitely to get a lock on the page currently
         * being processed by btrfs_page_mkwrite() function.
         */
@@ -8588,10 +8650,9 @@ again:
        else
                zero_start = PAGE_SIZE;
 
-       if (zero_start != PAGE_SIZE) {
+       if (zero_start != PAGE_SIZE)
                memzero_page(page, zero_start, PAGE_SIZE - zero_start);
-               flush_dcache_page(page);
-       }
+
        btrfs_page_clear_checked(fs_info, page, page_start, PAGE_SIZE);
        btrfs_page_set_dirty(fs_info, page, page_start, end + 1 - page_start);
        btrfs_page_set_uptodate(fs_info, page, page_start, end + 1 - page_start);
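The flush_dcache_page() call could be dropped because memzero_page() already flushes after zeroing; its generic implementation is essentially:

    /* Sketch of the include/linux/highmem.h helper: the explicit flush
     * in the caller duplicated the one done here. */
    static inline void memzero_page(struct page *page, size_t offset, size_t len)
    {
            char *addr = kmap_local_page(page);

            memset(addr + offset, 0, len);
            flush_dcache_page(page);
            kunmap_local(addr);
    }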
@@ -8674,7 +8735,7 @@ static int btrfs_truncate(struct inode *inode, bool skip_writeback)
        if (!rsv)
                return -ENOMEM;
        rsv->size = min_size;
-       rsv->failfast = 1;
+       rsv->failfast = true;
 
        /*
         * 1 for the truncate slack space
@@ -9195,8 +9256,10 @@ static int btrfs_rename_exchange(struct inode *old_dir,
        inode_inc_iversion(new_dir);
        inode_inc_iversion(old_inode);
        inode_inc_iversion(new_inode);
-       old_dir->i_ctime = old_dir->i_mtime = ctime;
-       new_dir->i_ctime = new_dir->i_mtime = ctime;
+       old_dir->i_mtime = ctime;
+       old_dir->i_ctime = ctime;
+       new_dir->i_mtime = ctime;
+       new_dir->i_ctime = ctime;
        old_inode->i_ctime = ctime;
        new_inode->i_ctime = ctime;
 
@@ -9459,9 +9522,11 @@ static int btrfs_rename(struct user_namespace *mnt_userns,
        inode_inc_iversion(old_dir);
        inode_inc_iversion(new_dir);
        inode_inc_iversion(old_inode);
-       old_dir->i_ctime = old_dir->i_mtime =
-       new_dir->i_ctime = new_dir->i_mtime =
-       old_inode->i_ctime = current_time(old_dir);
+       old_dir->i_mtime = current_time(old_dir);
+       old_dir->i_ctime = old_dir->i_mtime;
+       new_dir->i_mtime = old_dir->i_mtime;
+       new_dir->i_ctime = old_dir->i_mtime;
+       old_inode->i_ctime = old_dir->i_mtime;
 
        if (old_dentry->d_parent != new_dentry->d_parent)
                btrfs_record_unlink_dir(trans, BTRFS_I(old_dir),
@@ -9549,15 +9614,21 @@ static int btrfs_rename2(struct user_namespace *mnt_userns, struct inode *old_di
                         struct dentry *old_dentry, struct inode *new_dir,
                         struct dentry *new_dentry, unsigned int flags)
 {
+       int ret;
+
        if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE | RENAME_WHITEOUT))
                return -EINVAL;
 
        if (flags & RENAME_EXCHANGE)
-               return btrfs_rename_exchange(old_dir, old_dentry, new_dir,
-                                         new_dentry);
+               ret = btrfs_rename_exchange(old_dir, old_dentry, new_dir,
+                                           new_dentry);
+       else
+               ret = btrfs_rename(mnt_userns, old_dir, old_dentry, new_dir,
+                                  new_dentry, flags);
+
+       btrfs_btree_balance_dirty(BTRFS_I(new_dir)->root->fs_info);
 
-       return btrfs_rename(mnt_userns, old_dir, old_dentry, new_dir,
-                           new_dentry, flags);
+       return ret;
 }
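Funneling both rename flavors through one exit point means btrfs_btree_balance_dirty() runs exactly once per rename, throttling the task when delayed items and dirty btree pages accumulate. The same pattern, as a hypothetical caller:

    /* Hypothetical metadata-heavy operation following the pattern above:
     * do the transactional work, then poke the balance-dirty throttle. */
    static int my_metadata_op(struct btrfs_fs_info *fs_info)
    {
            int ret = do_transactional_work(fs_info);   /* hypothetical */

            btrfs_btree_balance_dirty(fs_info);
            return ret;
    }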
 
 struct btrfs_delalloc_work {
@@ -10177,9 +10248,8 @@ void btrfs_set_range_writeback(struct btrfs_inode *inode, u64 start, u64 end)
        }
 }
 
-static int btrfs_encoded_io_compression_from_extent(
-                               struct btrfs_fs_info *fs_info,
-                               int compress_type)
+int btrfs_encoded_io_compression_from_extent(struct btrfs_fs_info *fs_info,
+                                            int compress_type)
 {
        switch (compress_type) {
        case BTRFS_COMPRESS_NONE:
@@ -10302,7 +10372,6 @@ static blk_status_t submit_encoded_read_bio(struct btrfs_inode *inode,
                                            struct bio *bio, int mirror_num)
 {
        struct btrfs_encoded_read_private *priv = bio->bi_private;
-       struct btrfs_bio *bbio = btrfs_bio(bio);
        struct btrfs_fs_info *fs_info = inode->root->fs_info;
        blk_status_t ret;
 
@@ -10312,19 +10381,9 @@ static blk_status_t submit_encoded_read_bio(struct btrfs_inode *inode,
                        return ret;
        }
 
-       ret = btrfs_bio_wq_end_io(fs_info, bio, BTRFS_WQ_ENDIO_DATA);
-       if (ret) {
-               btrfs_bio_free_csum(bbio);
-               return ret;
-       }
-
        atomic_inc(&priv->pending);
-       ret = btrfs_map_bio(fs_info, bio, mirror_num);
-       if (ret) {
-               atomic_dec(&priv->pending);
-               btrfs_bio_free_csum(bbio);
-       }
-       return ret;
+       btrfs_submit_bio(fs_info, bio, mirror_num);
+       return BLK_STS_OK;
 }
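Since btrfs_submit_bio() cannot fail, the error-rollback branch (decrement the counter, free the csums) is gone; completion is reported only through the end_io handler, which pairs with the atomic_inc() above. A simplified sketch of the other half of that scheme, with the pending/wait field names assumed from this file's private struct:

    /* Simplified end_io counterpart of the atomic_inc() above; status
     * recording elided. */
    static void encoded_read_endio_sketch(struct bio *bio)
    {
            struct btrfs_encoded_read_private *priv = bio->bi_private;

            if (atomic_dec_return(&priv->pending) == 0)
                    wake_up(&priv->wait);   /* last bio wakes the submitter */
            bio_put(bio);
    }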
 
 static blk_status_t btrfs_encoded_read_verify_csum(struct btrfs_bio *bbio)
@@ -10336,7 +10395,6 @@ static blk_status_t btrfs_encoded_read_verify_csum(struct btrfs_bio *bbio)
        u32 sectorsize = fs_info->sectorsize;
        struct bio_vec *bvec;
        struct bvec_iter_all iter_all;
-       u64 start = priv->file_offset;
        u32 bio_offset = 0;
 
        if (priv->skip_csum || !uptodate)
@@ -10349,10 +10407,9 @@ static blk_status_t btrfs_encoded_read_verify_csum(struct btrfs_bio *bbio)
                pgoff = bvec->bv_offset;
                for (i = 0; i < nr_sectors; i++) {
                        ASSERT(pgoff < PAGE_SIZE);
-                       if (check_data_csum(&inode->vfs_inode, bbio, bio_offset,
-                                           bvec->bv_page, pgoff, start))
+                       if (btrfs_check_data_csum(&inode->vfs_inode, bbio, bio_offset,
+                                                 bvec->bv_page, pgoff))
                                return BLK_STS_IOERR;
-                       start += sectorsize;
                        bio_offset += sectorsize;
                        pgoff += sectorsize;
                }
@@ -10384,11 +10441,9 @@ static void btrfs_encoded_read_endio(struct bio *bio)
        bio_put(bio);
 }
 
-static int btrfs_encoded_read_regular_fill_pages(struct btrfs_inode *inode,
-                                                u64 file_offset,
-                                                u64 disk_bytenr,
-                                                u64 disk_io_size,
-                                                struct page **pages)
+int btrfs_encoded_read_regular_fill_pages(struct btrfs_inode *inode,
+                                         u64 file_offset, u64 disk_bytenr,
+                                         u64 disk_io_size, struct page **pages)
 {
        struct btrfs_fs_info *fs_info = inode->root->fs_info;
        struct btrfs_encoded_read_private priv = {
@@ -10619,7 +10674,8 @@ ssize_t btrfs_encoded_read(struct kiocb *iocb, struct iov_iter *iter,
                        ret = -ENOBUFS;
                        goto out_em;
                }
-               disk_io_size = count = em->block_len;
+               disk_io_size = em->block_len;
+               count = em->block_len;
                encoded->unencoded_len = em->ram_bytes;
                encoded->unencoded_offset = iocb->ki_pos - em->orig_start;
                ret = btrfs_encoded_io_compression_from_extent(fs_info,
@@ -10782,15 +10838,15 @@ ssize_t btrfs_do_encoded_write(struct kiocb *iocb, struct iov_iter *from,
                        ret = -ENOMEM;
                        goto out_pages;
                }
-               kaddr = kmap(pages[i]);
+               kaddr = kmap_local_page(pages[i]);
                if (copy_from_iter(kaddr, bytes, from) != bytes) {
-                       kunmap(pages[i]);
+                       kunmap_local(kaddr);
                        ret = -EFAULT;
                        goto out_pages;
                }
                if (bytes < PAGE_SIZE)
                        memset(kaddr + bytes, 0, PAGE_SIZE - bytes);
-               kunmap(pages[i]);
+               kunmap_local(kaddr);
        }
 
        for (;;) {
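kmap() uses a global mapping table, can sleep, and must be undone with kunmap(); kmap_local_page() is a cheap, CPU-local mapping whose one rule is strictly nested (LIFO) release, which the straight-line map/copy/unmap sequence above satisfies. The nesting contract, illustrated:

    /* Illustration of the LIFO rule on kmap_local_page() mappings. */
    void *a = kmap_local_page(page_a);
    void *b = kmap_local_page(page_b);

    memcpy(a, b, PAGE_SIZE);    /* both mappings valid here */
    kunmap_local(b);            /* release in reverse order of mapping */
    kunmap_local(a);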
@@ -11419,15 +11475,12 @@ static const struct file_operations btrfs_dir_file_operations = {
  */
 static const struct address_space_operations btrfs_aops = {
        .read_folio     = btrfs_read_folio,
-       .writepage      = btrfs_writepage,
        .writepages     = btrfs_writepages,
        .readahead      = btrfs_readahead,
        .direct_IO      = noop_direct_IO,
        .invalidate_folio = btrfs_invalidate_folio,
        .release_folio  = btrfs_release_folio,
-#ifdef CONFIG_MIGRATION
-       .migratepage    = btrfs_migratepage,
-#endif
+       .migrate_folio  = btrfs_migrate_folio,
        .dirty_folio    = filemap_dirty_folio,
        .error_remove_page = generic_error_remove_page,
        .swap_activate  = btrfs_swap_activate,