5.15-stable patches

author Greg Kroah-Hartman <gregkh@linuxfoundation.org>

Mon, 14 Mar 2022 09:51:12 +0000 (10:51 +0100)

committer Greg Kroah-Hartman <gregkh@linuxfoundation.org>

Mon, 14 Mar 2022 09:51:12 +0000 (10:51 +0100)
author Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Mon, 14 Mar 2022 09:51:12 +0000 (10:51 +0100)
committer Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Mon, 14 Mar 2022 09:51:12 +0000 (10:51 +0100)
diff --git a/queue-5.15/block-drop-unused-includes-in-linux-genhd.h.patch b/queue-5.15/block-drop-unused-includes-in-linux-genhd.h.patch

new file mode 100644 (file)

index 0000000..b181afa
--- /dev/null
+++ b/queue-5.15/block-drop-unused-includes-in-linux-genhd.h.patch
@@ -0,0 +1,234 @@
+From b81e0c2372e65e5627864ba034433b64b2fc73f5 Mon Sep 17 00:00:00 2001
+From: Christoph Hellwig <hch@lst.de>
+Date: Mon, 20 Sep 2021 14:33:25 +0200
+Subject: block: drop unused includes in <linux/genhd.h>
+
+From: Christoph Hellwig <hch@lst.de>
+
+commit b81e0c2372e65e5627864ba034433b64b2fc73f5 upstream.
+
+Drop various includes not actually used in genhd.h itself, and
+move the remaining includes closer together.
+
+Signed-off-by: Christoph Hellwig <hch@lst.de>
+Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
+Link: https://lore.kernel.org/r/20210920123328.1399408-15-hch@lst.de
+Signed-off-by: Jens Axboe <axboe@kernel.dk>
+Reported-by: Sudip Mukherjee <sudipm.mukherjee@gmail.com>
+Reported-by: "H. Nikolaus Schaller" <hns@goldelico.com>
+Reported-by: Guenter Roeck <linux@roeck-us.net>
+Cc: "Maciej W. Rozycki" <macro@orcam.me.uk>
+[ resolves MIPS build failure by luck, root cause needs to be fixed in
+  Linus's tree properly, but this is needed for now to fix the build - gregkh ]
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/um/drivers/ubd_kern.c      |    1 +
+ block/genhd.c                   |    1 +
+ block/holder.c                  |    1 +
+ block/partitions/core.c         |    1 +
+ drivers/block/amiflop.c         |    1 +
+ drivers/block/ataflop.c         |    1 +
+ drivers/block/floppy.c          |    1 +
+ drivers/block/swim.c            |    1 +
+ drivers/block/xen-blkfront.c    |    1 +
+ drivers/md/md.c                 |    1 +
+ drivers/s390/block/dasd_genhd.c |    1 +
+ drivers/scsi/sd.c               |    1 +
+ drivers/scsi/sg.c               |    1 +
+ drivers/scsi/sr.c               |    1 +
+ drivers/scsi/st.c               |    1 +
+ include/linux/genhd.h           |   14 ++------------
+ include/linux/part_stat.h       |    1 +
+ 17 files changed, 18 insertions(+), 12 deletions(-)
+
+--- a/arch/um/drivers/ubd_kern.c
++++ b/arch/um/drivers/ubd_kern.c
+@@ -27,6 +27,7 @@
+ #include <linux/blk-mq.h>
+ #include <linux/ata.h>
+ #include <linux/hdreg.h>
++#include <linux/major.h>
+ #include <linux/cdrom.h>
+ #include <linux/proc_fs.h>
+ #include <linux/seq_file.h>
+--- a/block/genhd.c
++++ b/block/genhd.c
+@@ -19,6 +19,7 @@
+ #include <linux/seq_file.h>
+ #include <linux/slab.h>
+ #include <linux/kmod.h>
++#include <linux/major.h>
+ #include <linux/mutex.h>
+ #include <linux/idr.h>
+ #include <linux/log2.h>
+--- a/block/holder.c
++++ b/block/holder.c
+@@ -1,5 +1,6 @@
+ // SPDX-License-Identifier: GPL-2.0-only
+ #include <linux/genhd.h>
++#include <linux/slab.h>
+ 
+ struct bd_holder_disk {
+       struct list_head        list;
+--- a/block/partitions/core.c
++++ b/block/partitions/core.c
+@@ -5,6 +5,7 @@
+  * Copyright (C) 2020 Christoph Hellwig
+  */
+ #include <linux/fs.h>
++#include <linux/major.h>
+ #include <linux/slab.h>
+ #include <linux/ctype.h>
+ #include <linux/genhd.h>
+--- a/drivers/block/amiflop.c
++++ b/drivers/block/amiflop.c
+@@ -61,6 +61,7 @@
+ #include <linux/hdreg.h>
+ #include <linux/delay.h>
+ #include <linux/init.h>
++#include <linux/major.h>
+ #include <linux/mutex.h>
+ #include <linux/fs.h>
+ #include <linux/blk-mq.h>
+--- a/drivers/block/ataflop.c
++++ b/drivers/block/ataflop.c
+@@ -68,6 +68,7 @@
+ #include <linux/delay.h>
+ #include <linux/init.h>
+ #include <linux/blk-mq.h>
++#include <linux/major.h>
+ #include <linux/mutex.h>
+ #include <linux/completion.h>
+ #include <linux/wait.h>
+--- a/drivers/block/floppy.c
++++ b/drivers/block/floppy.c
+@@ -184,6 +184,7 @@ static int print_unex = 1;
+ #include <linux/ioport.h>
+ #include <linux/interrupt.h>
+ #include <linux/init.h>
++#include <linux/major.h>
+ #include <linux/platform_device.h>
+ #include <linux/mod_devicetable.h>
+ #include <linux/mutex.h>
+--- a/drivers/block/swim.c
++++ b/drivers/block/swim.c
+@@ -16,6 +16,7 @@
+ #include <linux/fd.h>
+ #include <linux/slab.h>
+ #include <linux/blk-mq.h>
++#include <linux/major.h>
+ #include <linux/mutex.h>
+ #include <linux/hdreg.h>
+ #include <linux/kernel.h>
+--- a/drivers/block/xen-blkfront.c
++++ b/drivers/block/xen-blkfront.c
+@@ -42,6 +42,7 @@
+ #include <linux/cdrom.h>
+ #include <linux/module.h>
+ #include <linux/slab.h>
++#include <linux/major.h>
+ #include <linux/mutex.h>
+ #include <linux/scatterlist.h>
+ #include <linux/bitmap.h>
+--- a/drivers/md/md.c
++++ b/drivers/md/md.c
+@@ -51,6 +51,7 @@
+ #include <linux/hdreg.h>
+ #include <linux/proc_fs.h>
+ #include <linux/random.h>
++#include <linux/major.h>
+ #include <linux/module.h>
+ #include <linux/reboot.h>
+ #include <linux/file.h>
+--- a/drivers/s390/block/dasd_genhd.c
++++ b/drivers/s390/block/dasd_genhd.c
+@@ -14,6 +14,7 @@
+ #define KMSG_COMPONENT "dasd"
+ 
+ #include <linux/interrupt.h>
++#include <linux/major.h>
+ #include <linux/fs.h>
+ #include <linux/blkpg.h>
+ 
+--- a/drivers/scsi/sd.c
++++ b/drivers/scsi/sd.c
+@@ -48,6 +48,7 @@
+ #include <linux/blkpg.h>
+ #include <linux/blk-pm.h>
+ #include <linux/delay.h>
++#include <linux/major.h>
+ #include <linux/mutex.h>
+ #include <linux/string_helpers.h>
+ #include <linux/async.h>
+--- a/drivers/scsi/sg.c
++++ b/drivers/scsi/sg.c
+@@ -31,6 +31,7 @@ static int sg_version_num = 30536;   /* 2
+ #include <linux/errno.h>
+ #include <linux/mtio.h>
+ #include <linux/ioctl.h>
++#include <linux/major.h>
+ #include <linux/slab.h>
+ #include <linux/fcntl.h>
+ #include <linux/init.h>
+--- a/drivers/scsi/sr.c
++++ b/drivers/scsi/sr.c
+@@ -44,6 +44,7 @@
+ #include <linux/cdrom.h>
+ #include <linux/interrupt.h>
+ #include <linux/init.h>
++#include <linux/major.h>
+ #include <linux/blkdev.h>
+ #include <linux/blk-pm.h>
+ #include <linux/mutex.h>
+--- a/drivers/scsi/st.c
++++ b/drivers/scsi/st.c
+@@ -32,6 +32,7 @@ static const char *verstr = "20160209";
+ #include <linux/slab.h>
+ #include <linux/errno.h>
+ #include <linux/mtio.h>
++#include <linux/major.h>
+ #include <linux/cdrom.h>
+ #include <linux/ioctl.h>
+ #include <linux/fcntl.h>
+--- a/include/linux/genhd.h
++++ b/include/linux/genhd.h
+@@ -12,12 +12,10 @@
+ 
+ #include <linux/types.h>
+ #include <linux/kdev_t.h>
+-#include <linux/rcupdate.h>
+-#include <linux/slab.h>
+-#include <linux/percpu-refcount.h>
+ #include <linux/uuid.h>
+ #include <linux/blk_types.h>
+-#include <asm/local.h>
++#include <linux/device.h>
++#include <linux/xarray.h>
+ 
+ extern const struct device_type disk_type;
+ extern struct device_type part_type;
+@@ -26,14 +24,6 @@ extern struct class block_class;
+ #define DISK_MAX_PARTS                        256
+ #define DISK_NAME_LEN                 32
+ 
+-#include <linux/major.h>
+-#include <linux/device.h>
+-#include <linux/smp.h>
+-#include <linux/string.h>
+-#include <linux/fs.h>
+-#include <linux/workqueue.h>
+-#include <linux/xarray.h>
+-
+ #define PARTITION_META_INFO_VOLNAMELTH        64
+ /*
+  * Enough for the string representation of any kind of UUID plus NULL.
+--- a/include/linux/part_stat.h
++++ b/include/linux/part_stat.h
+@@ -3,6 +3,7 @@
+ #define _LINUX_PART_STAT_H
+ 
+ #include <linux/genhd.h>
++#include <asm/local.h>
+ 
+ struct disk_stats {
+       u64 nsecs[NR_STAT_GROUPS];
diff --git a/queue-5.15/btrfs-make-send-work-with-concurrent-block-group-relocation.patch b/queue-5.15/btrfs-make-send-work-with-concurrent-block-group-relocation.patch

new file mode 100644 (file)

index 0000000..64c647e
--- /dev/null
+++ b/queue-5.15/btrfs-make-send-work-with-concurrent-block-group-relocation.patch
@@ -0,0 +1,998 @@
+From d96b34248c2f4ea8cd09286090f2f6f77102eaab Mon Sep 17 00:00:00 2001
+From: Filipe Manana <fdmanana@suse.com>
+Date: Mon, 22 Nov 2021 12:03:38 +0000
+Subject: btrfs: make send work with concurrent block group relocation
+
+From: Filipe Manana <fdmanana@suse.com>
+
+commit d96b34248c2f4ea8cd09286090f2f6f77102eaab upstream.
+
+We don't allow send and balance/relocation to run in parallel in order
+to prevent send failing or silently producing some bad stream. This is
+because while send is using an extent (especially metadata) or about to
+read a metadata extent and expecting it belongs to a specific parent
+node, relocation can run, the transaction used for the relocation is
+committed and the extent gets reallocated while send is still using the
+extent, so it ends up with a different content than expected. This can
+result in just failing to read a metadata extent due to failure of the
+validation checks (parent transid, level, etc), failure to find a
+backreference for a data extent, and other unexpected failures. Besides
+reallocation, there's also a similar problem of an extent getting
+discarded when it's unpinned after the transaction used for block group
+relocation is committed.
+
+The restriction between balance and send was added in commit 9e967495e0e0
+("Btrfs: prevent send failures and crashes due to concurrent relocation"),
+kernel 5.3, while the more general restriction between send and relocation
+was added in commit 1cea5cf0e664 ("btrfs: ensure relocation never runs
+while we have send operations running"), kernel 5.14.
+
+Both send and relocation can be very long running operations. Relocation
+because it has to do a lot of IO and expensive backreference lookups in
+case there are many snapshots, and send due to read IO when operating on
+very large trees. This makes it inconvenient for users and tools to deal
+with scheduling both operations.
+
+For zoned filesystem we also have automatic block group relocation, so
+send can fail with -EAGAIN when users least expect it or send can end up
+delaying the block group relocation for too long. In the future we might
+also get the automatic block group relocation for non zoned filesystems.
+
+This change makes it possible for send and relocation to run in parallel.
+This is achieved the following way:
+
+1) For all tree searches, send acquires a read lock on the commit root
+   semaphore;
+
+2) After each tree search, and before releasing the commit root semaphore,
+   the leaf is cloned and placed in the search path (struct btrfs_path);
+
+3) After releasing the commit root semaphore, the changed_cb() callback
+   is invoked, which operates on the leaf and writes commands to the pipe
+   (or file in case send/receive is not used with a pipe). It's important
+   here to not hold a lock on the commit root semaphore, because if we did
+   we could deadlock when sending and receiving to the same filesystem
+   using a pipe - the send task blocks on the pipe because it's full, the
+   receive task, which is the only consumer of the pipe, triggers a
+   transaction commit when attempting to create a subvolume or reserve
+   space for a write operation for example, but the transaction commit
+   blocks trying to write lock the commit root semaphore, resulting in a
+   deadlock;
+
+4) Before moving to the next key, or advancing to the next change in case
+   of an incremental send, check if a transaction used for relocation was
+   committed (or is about to finish its commit). If so, release the search
+   path(s) and restart the search, to where we were before, so that we
+   don't operate on stale extent buffers. The search restarts are always
+   possible because both the send and parent roots are RO, and no one can
+   add, remove or update keys (change their offset) in RO trees - the
+   only exception is deduplication, but that is still not allowed to run
+   in parallel with send;
+
+5) Periodically check if there is contention on the commit root semaphore,
+   which means there is a transaction commit trying to write lock it, and
+   release the semaphore and reschedule if there is contention, so as to
+   avoid causing any significant delays to transaction commits.
+
+This leaves some room for optimizations for send to have less path
+releases and re searching the trees when there's relocation running, but
+for now it's kept simple as it performs quite well (on very large trees
+with resulting send streams in the order of a few hundred gigabytes).
+
+Test case btrfs/187, from fstests, stresses relocation, send and
+deduplication attempting to run in parallel, but without verifying if send
+succeeds and if it produces correct streams. A new test case will be added
+that exercises relocation happening in parallel with send and then checks
+that send succeeds and the resulting streams are correct.
+
+A final note is that for now this still leaves the mutual exclusion
+between send operations and deduplication on files belonging to a root
+used by send operations. A solution for that will be slightly more complex
+but it will eventually be built on top of this change.
+
+Signed-off-by: Filipe Manana <fdmanana@suse.com>
+Signed-off-by: David Sterba <dsterba@suse.com>
+Signed-off-by: Anand Jain <anand.jain@oracle.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ fs/btrfs/block-group.c |    9 -
+ fs/btrfs/ctree.c       |   98 ++++++++++---
+ fs/btrfs/ctree.h       |   14 -
+ fs/btrfs/disk-io.c     |    4 
+ fs/btrfs/relocation.c  |   13 -
+ fs/btrfs/send.c        |  357 ++++++++++++++++++++++++++++++++++++++++++-------
+ fs/btrfs/transaction.c |    4 
+ 7 files changed, 395 insertions(+), 104 deletions(-)
+
+--- a/fs/btrfs/block-group.c
++++ b/fs/btrfs/block-group.c
+@@ -1491,7 +1491,6 @@ void btrfs_reclaim_bgs_work(struct work_
+               container_of(work, struct btrfs_fs_info, reclaim_bgs_work);
+       struct btrfs_block_group *bg;
+       struct btrfs_space_info *space_info;
+-      LIST_HEAD(again_list);
+ 
+       if (!test_bit(BTRFS_FS_OPEN, &fs_info->flags))
+               return;
+@@ -1562,18 +1561,14 @@ void btrfs_reclaim_bgs_work(struct work_
+                               div64_u64(zone_unusable * 100, bg->length));
+               trace_btrfs_reclaim_block_group(bg);
+               ret = btrfs_relocate_chunk(fs_info, bg->start);
+-              if (ret && ret != -EAGAIN)
++              if (ret)
+                       btrfs_err(fs_info, "error relocating chunk %llu",
+                                 bg->start);
+ 
+ next:
++              btrfs_put_block_group(bg);
+               spin_lock(&fs_info->unused_bgs_lock);
+-              if (ret == -EAGAIN && list_empty(&bg->bg_list))
+-                      list_add_tail(&bg->bg_list, &again_list);
+-              else
+-                      btrfs_put_block_group(bg);
+       }
+-      list_splice_tail(&again_list, &fs_info->reclaim_bgs);
+       spin_unlock(&fs_info->unused_bgs_lock);
+       mutex_unlock(&fs_info->reclaim_bgs_lock);
+       btrfs_exclop_finish(fs_info);
+--- a/fs/btrfs/ctree.c
++++ b/fs/btrfs/ctree.c
+@@ -1566,32 +1566,13 @@ static struct extent_buffer *btrfs_searc
+                                                       struct btrfs_path *p,
+                                                       int write_lock_level)
+ {
+-      struct btrfs_fs_info *fs_info = root->fs_info;
+       struct extent_buffer *b;
+       int root_lock = 0;
+       int level = 0;
+ 
+       if (p->search_commit_root) {
+-              /*
+-               * The commit roots are read only so we always do read locks,
+-               * and we always must hold the commit_root_sem when doing
+-               * searches on them, the only exception is send where we don't
+-               * want to block transaction commits for a long time, so
+-               * we need to clone the commit root in order to avoid races
+-               * with transaction commits that create a snapshot of one of
+-               * the roots used by a send operation.
+-               */
+-              if (p->need_commit_sem) {
+-                      down_read(&fs_info->commit_root_sem);
+-                      b = btrfs_clone_extent_buffer(root->commit_root);
+-                      up_read(&fs_info->commit_root_sem);
+-                      if (!b)
+-                              return ERR_PTR(-ENOMEM);
+-
+-              } else {
+-                      b = root->commit_root;
+-                      atomic_inc(&b->refs);
+-              }
++              b = root->commit_root;
++              atomic_inc(&b->refs);
+               level = btrfs_header_level(b);
+               /*
+                * Ensure that all callers have set skip_locking when
+@@ -1657,6 +1638,42 @@ out:
+       return b;
+ }
+ 
++/*
++ * Replace the extent buffer at the lowest level of the path with a cloned
++ * version. The purpose is to be able to use it safely, after releasing the
++ * commit root semaphore, even if relocation is happening in parallel, the
++ * transaction used for relocation is committed and the extent buffer is
++ * reallocated in the next transaction.
++ *
++ * This is used in a context where the caller does not prevent transaction
++ * commits from happening, either by holding a transaction handle or holding
++ * some lock, while it's doing searches through a commit root.
++ * At the moment it's only used for send operations.
++ */
++static int finish_need_commit_sem_search(struct btrfs_path *path)
++{
++      const int i = path->lowest_level;
++      const int slot = path->slots[i];
++      struct extent_buffer *lowest = path->nodes[i];
++      struct extent_buffer *clone;
++
++      ASSERT(path->need_commit_sem);
++
++      if (!lowest)
++              return 0;
++
++      lockdep_assert_held_read(&lowest->fs_info->commit_root_sem);
++
++      clone = btrfs_clone_extent_buffer(lowest);
++      if (!clone)
++              return -ENOMEM;
++
++      btrfs_release_path(path);
++      path->nodes[i] = clone;
++      path->slots[i] = slot;
++
++      return 0;
++}
+ 
+ /*
+  * btrfs_search_slot - look for a key in a tree and perform necessary
+@@ -1693,6 +1710,7 @@ int btrfs_search_slot(struct btrfs_trans
+                     const struct btrfs_key *key, struct btrfs_path *p,
+                     int ins_len, int cow)
+ {
++      struct btrfs_fs_info *fs_info = root->fs_info;
+       struct extent_buffer *b;
+       int slot;
+       int ret;
+@@ -1734,6 +1752,11 @@ int btrfs_search_slot(struct btrfs_trans
+ 
+       min_write_lock_level = write_lock_level;
+ 
++      if (p->need_commit_sem) {
++              ASSERT(p->search_commit_root);
++              down_read(&fs_info->commit_root_sem);
++      }
++
+ again:
+       prev_cmp = -1;
+       b = btrfs_search_slot_get_root(root, p, write_lock_level);
+@@ -1928,6 +1951,16 @@ cow_done:
+ done:
+       if (ret < 0 && !p->skip_release_on_error)
+               btrfs_release_path(p);
++
++      if (p->need_commit_sem) {
++              int ret2;
++
++              ret2 = finish_need_commit_sem_search(p);
++              up_read(&fs_info->commit_root_sem);
++              if (ret2)
++                      ret = ret2;
++      }
++
+       return ret;
+ }
+ ALLOW_ERROR_INJECTION(btrfs_search_slot, ERRNO);
+@@ -4396,7 +4429,9 @@ int btrfs_next_old_leaf(struct btrfs_roo
+       int level;
+       struct extent_buffer *c;
+       struct extent_buffer *next;
++      struct btrfs_fs_info *fs_info = root->fs_info;
+       struct btrfs_key key;
++      bool need_commit_sem = false;
+       u32 nritems;
+       int ret;
+       int i;
+@@ -4413,14 +4448,20 @@ again:
+ 
+       path->keep_locks = 1;
+ 
+-      if (time_seq)
++      if (time_seq) {
+               ret = btrfs_search_old_slot(root, &key, path, time_seq);
+-      else
++      } else {
++              if (path->need_commit_sem) {
++                      path->need_commit_sem = 0;
++                      need_commit_sem = true;
++                      down_read(&fs_info->commit_root_sem);
++              }
+               ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
++      }
+       path->keep_locks = 0;
+ 
+       if (ret < 0)
+-              return ret;
++              goto done;
+ 
+       nritems = btrfs_header_nritems(path->nodes[0]);
+       /*
+@@ -4543,6 +4584,15 @@ again:
+       ret = 0;
+ done:
+       unlock_up(path, 0, 1, 0, NULL);
++      if (need_commit_sem) {
++              int ret2;
++
++              path->need_commit_sem = 1;
++              ret2 = finish_need_commit_sem_search(path);
++              up_read(&fs_info->commit_root_sem);
++              if (ret2)
++                      ret = ret2;
++      }
+ 
+       return ret;
+ }
+--- a/fs/btrfs/ctree.h
++++ b/fs/btrfs/ctree.h
+@@ -568,7 +568,6 @@ enum {
+       /*
+        * Indicate that relocation of a chunk has started, it's set per chunk
+        * and is toggled between chunks.
+-       * Set, tested and cleared while holding fs_info::send_reloc_lock.
+        */
+       BTRFS_FS_RELOC_RUNNING,
+ 
+@@ -668,6 +667,12 @@ struct btrfs_fs_info {
+ 
+       u64 generation;
+       u64 last_trans_committed;
++      /*
++       * Generation of the last transaction used for block group relocation
++       * since the filesystem was last mounted (or 0 if none happened yet).
++       * Must be written and read while holding btrfs_fs_info::commit_root_sem.
++       */
++      u64 last_reloc_trans;
+       u64 avg_delayed_ref_runtime;
+ 
+       /*
+@@ -997,13 +1002,6 @@ struct btrfs_fs_info {
+ 
+       struct crypto_shash *csum_shash;
+ 
+-      spinlock_t send_reloc_lock;
+-      /*
+-       * Number of send operations in progress.
+-       * Updated while holding fs_info::send_reloc_lock.
+-       */
+-      int send_in_progress;
+-
+       /* Type of exclusive operation running, protected by super_lock */
+       enum btrfs_exclusive_operation exclusive_operation;
+ 
+--- a/fs/btrfs/disk-io.c
++++ b/fs/btrfs/disk-io.c
+@@ -2859,6 +2859,7 @@ static int __cold init_tree_roots(struct
+               /* All successful */
+               fs_info->generation = generation;
+               fs_info->last_trans_committed = generation;
++              fs_info->last_reloc_trans = 0;
+ 
+               /* Always begin writing backup roots after the one being used */
+               if (backup_index < 0) {
+@@ -2992,9 +2993,6 @@ void btrfs_init_fs_info(struct btrfs_fs_
+       spin_lock_init(&fs_info->swapfile_pins_lock);
+       fs_info->swapfile_pins = RB_ROOT;
+ 
+-      spin_lock_init(&fs_info->send_reloc_lock);
+-      fs_info->send_in_progress = 0;
+-
+       fs_info->bg_reclaim_threshold = BTRFS_DEFAULT_RECLAIM_THRESH;
+       INIT_WORK(&fs_info->reclaim_bgs_work, btrfs_reclaim_bgs_work);
+ }
+--- a/fs/btrfs/relocation.c
++++ b/fs/btrfs/relocation.c
+@@ -3854,25 +3854,14 @@ out:
+  *   0             success
+  *   -EINPROGRESS  operation is already in progress, that's probably a bug
+  *   -ECANCELED    cancellation request was set before the operation started
+- *   -EAGAIN       can not start because there are ongoing send operations
+  */
+ static int reloc_chunk_start(struct btrfs_fs_info *fs_info)
+ {
+-      spin_lock(&fs_info->send_reloc_lock);
+-      if (fs_info->send_in_progress) {
+-              btrfs_warn_rl(fs_info,
+-"cannot run relocation while send operations are in progress (%d in progress)",
+-                            fs_info->send_in_progress);
+-              spin_unlock(&fs_info->send_reloc_lock);
+-              return -EAGAIN;
+-      }
+       if (test_and_set_bit(BTRFS_FS_RELOC_RUNNING, &fs_info->flags)) {
+               /* This should not happen */
+-              spin_unlock(&fs_info->send_reloc_lock);
+               btrfs_err(fs_info, "reloc already running, cannot start");
+               return -EINPROGRESS;
+       }
+-      spin_unlock(&fs_info->send_reloc_lock);
+ 
+       if (atomic_read(&fs_info->reloc_cancel_req) > 0) {
+               btrfs_info(fs_info, "chunk relocation canceled on start");
+@@ -3894,9 +3883,7 @@ static void reloc_chunk_end(struct btrfs
+       /* Requested after start, clear bit first so any waiters can continue */
+       if (atomic_read(&fs_info->reloc_cancel_req) > 0)
+               btrfs_info(fs_info, "chunk relocation canceled during operation");
+-      spin_lock(&fs_info->send_reloc_lock);
+       clear_and_wake_up_bit(BTRFS_FS_RELOC_RUNNING, &fs_info->flags);
+-      spin_unlock(&fs_info->send_reloc_lock);
+       atomic_set(&fs_info->reloc_cancel_req, 0);
+ }
+ 
+--- a/fs/btrfs/send.c
++++ b/fs/btrfs/send.c
+@@ -24,6 +24,7 @@
+ #include "transaction.h"
+ #include "compression.h"
+ #include "xattr.h"
++#include "print-tree.h"
+ 
+ /*
+  * Maximum number of references an extent can have in order for us to attempt to
+@@ -96,6 +97,15 @@ struct send_ctx {
+       struct btrfs_key *cmp_key;
+ 
+       /*
++       * Keep track of the generation of the last transaction that was used
++       * for relocating a block group. This is periodically checked in order
++       * to detect if a relocation happened since the last check, so that we
++       * don't operate on stale extent buffers for nodes (level >= 1) or on
++       * stale disk_bytenr values of file extent items.
++       */
++      u64 last_reloc_trans;
++
++      /*
+        * infos of the currently processed inode. In case of deleted inodes,
+        * these are the values from the deleted inode.
+        */
+@@ -1415,6 +1425,26 @@ static int find_extent_clone(struct send
+       if (ret < 0)
+               goto out;
+ 
++      down_read(&fs_info->commit_root_sem);
++      if (fs_info->last_reloc_trans > sctx->last_reloc_trans) {
++              /*
++               * A transaction commit for a transaction in which block group
++               * relocation was done just happened.
++               * The disk_bytenr of the file extent item we processed is
++               * possibly stale, referring to the extent's location before
++               * relocation. So act as if we haven't found any clone sources
++               * and fallback to write commands, which will read the correct
++               * data from the new extent location. Otherwise we will fail
++               * below because we haven't found our own back reference or we
++               * could be getting incorrect sources in case the old extent
++               * was already reallocated after the relocation.
++               */
++              up_read(&fs_info->commit_root_sem);
++              ret = -ENOENT;
++              goto out;
++      }
++      up_read(&fs_info->commit_root_sem);
++
+       if (!backref_ctx.found_itself) {
+               /* found a bug in backref code? */
+               ret = -EIO;
+@@ -6596,6 +6626,50 @@ static int changed_cb(struct btrfs_path
+ {
+       int ret = 0;
+ 
++      /*
++       * We can not hold the commit root semaphore here. This is because in
++       * the case of sending and receiving to the same filesystem, using a
++       * pipe, could result in a deadlock:
++       *
++       * 1) The task running send blocks on the pipe because it's full;
++       *
++       * 2) The task running receive, which is the only consumer of the pipe,
++       *    is waiting for a transaction commit (for example due to a space
++       *    reservation when doing a write or triggering a transaction commit
++       *    when creating a subvolume);
++       *
++       * 3) The transaction is waiting to write lock the commit root semaphore,
++       *    but can not acquire it since it's being held at 1).
++       *
++       * Down this call chain we write to the pipe through kernel_write().
++       * The same type of problem can also happen when sending to a file that
++       * is stored in the same filesystem - when reserving space for a write
++       * into the file, we can trigger a transaction commit.
++       *
++       * Our caller has supplied us with clones of leaves from the send and
++       * parent roots, so we're safe here from a concurrent relocation and
++       * further reallocation of metadata extents while we are here. Below we
++       * also assert that the leaves are clones.
++       */
++      lockdep_assert_not_held(&sctx->send_root->fs_info->commit_root_sem);
++
++      /*
++       * We always have a send root, so left_path is never NULL. We will not
++       * have a leaf when we have reached the end of the send root but have
++       * not yet reached the end of the parent root.
++       */
++      if (left_path->nodes[0])
++              ASSERT(test_bit(EXTENT_BUFFER_UNMAPPED,
++                              &left_path->nodes[0]->bflags));
++      /*
++       * When doing a full send we don't have a parent root, so right_path is
++       * NULL. When doing an incremental send, we may have reached the end of
++       * the parent root already, so we don't have a leaf at right_path.
++       */
++      if (right_path && right_path->nodes[0])
++              ASSERT(test_bit(EXTENT_BUFFER_UNMAPPED,
++                              &right_path->nodes[0]->bflags));
++
+       if (result == BTRFS_COMPARE_TREE_SAME) {
+               if (key->type == BTRFS_INODE_REF_KEY ||
+                   key->type == BTRFS_INODE_EXTREF_KEY) {
+@@ -6642,14 +6716,46 @@ out:
+       return ret;
+ }
+ 
++static int search_key_again(const struct send_ctx *sctx,
++                          struct btrfs_root *root,
++                          struct btrfs_path *path,
++                          const struct btrfs_key *key)
++{
++      int ret;
++
++      if (!path->need_commit_sem)
++              lockdep_assert_held_read(&root->fs_info->commit_root_sem);
++
++      /*
++       * Roots used for send operations are readonly and no one can add,
++       * update or remove keys from them, so we should be able to find our
++       * key again. The only exception is deduplication, which can operate on
++       * readonly roots and add, update or remove keys to/from them - but at
++       * the moment we don't allow it to run in parallel with send.
++       */
++      ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
++      ASSERT(ret <= 0);
++      if (ret > 0) {
++              btrfs_print_tree(path->nodes[path->lowest_level], false);
++              btrfs_err(root->fs_info,
++"send: key (%llu %u %llu) not found in %s root %llu, lowest_level %d, slot %d",
++                        key->objectid, key->type, key->offset,
++                        (root == sctx->parent_root ? "parent" : "send"),
++                        root->root_key.objectid, path->lowest_level,
++                        path->slots[path->lowest_level]);
++              return -EUCLEAN;
++      }
++
++      return ret;
++}
++
+ static int full_send_tree(struct send_ctx *sctx)
+ {
+       int ret;
+       struct btrfs_root *send_root = sctx->send_root;
+       struct btrfs_key key;
++      struct btrfs_fs_info *fs_info = send_root->fs_info;
+       struct btrfs_path *path;
+-      struct extent_buffer *eb;
+-      int slot;
+ 
+       path = alloc_path_for_send();
+       if (!path)
+@@ -6660,6 +6766,10 @@ static int full_send_tree(struct send_ct
+       key.type = BTRFS_INODE_ITEM_KEY;
+       key.offset = 0;
+ 
++      down_read(&fs_info->commit_root_sem);
++      sctx->last_reloc_trans = fs_info->last_reloc_trans;
++      up_read(&fs_info->commit_root_sem);
++
+       ret = btrfs_search_slot_for_read(send_root, &key, path, 1, 0);
+       if (ret < 0)
+               goto out;
+@@ -6667,15 +6777,35 @@ static int full_send_tree(struct send_ct
+               goto out_finish;
+ 
+       while (1) {
+-              eb = path->nodes[0];
+-              slot = path->slots[0];
+-              btrfs_item_key_to_cpu(eb, &key, slot);
++              btrfs_item_key_to_cpu(path->nodes[0], &key, path->slots[0]);
+ 
+               ret = changed_cb(path, NULL, &key,
+                                BTRFS_COMPARE_TREE_NEW, sctx);
+               if (ret < 0)
+                       goto out;
+ 
++              down_read(&fs_info->commit_root_sem);
++              if (fs_info->last_reloc_trans > sctx->last_reloc_trans) {
++                      sctx->last_reloc_trans = fs_info->last_reloc_trans;
++                      up_read(&fs_info->commit_root_sem);
++                      /*
++                       * A transaction used for relocating a block group was
++                       * committed or is about to finish its commit. Release
++                       * our path (leaf) and restart the search, so that we
++                       * avoid operating on any file extent items that are
++                       * stale, with a disk_bytenr that reflects a pre
++                       * relocation value. This way we avoid, as much as
++                       * possible, falling back to regular writes when checking
++                       * if we can clone file ranges.
++                       */
++                      btrfs_release_path(path);
++                      ret = search_key_again(sctx, send_root, path, &key);
++                      if (ret < 0)
++                              goto out;
++              } else {
++                      up_read(&fs_info->commit_root_sem);
++              }
++
+               ret = btrfs_next_item(send_root, path);
+               if (ret < 0)
+                       goto out;
+@@ -6693,6 +6823,20 @@ out:
+       return ret;
+ }
+ 
++static int replace_node_with_clone(struct btrfs_path *path, int level)
++{
++      struct extent_buffer *clone;
++
++      clone = btrfs_clone_extent_buffer(path->nodes[level]);
++      if (!clone)
++              return -ENOMEM;
++
++      free_extent_buffer(path->nodes[level]);
++      path->nodes[level] = clone;
++
++      return 0;
++}
++
+ static int tree_move_down(struct btrfs_path *path, int *level, u64 reada_min_gen)
+ {
+       struct extent_buffer *eb;
+@@ -6702,6 +6846,8 @@ static int tree_move_down(struct btrfs_p
+       u64 reada_max;
+       u64 reada_done = 0;
+ 
++      lockdep_assert_held_read(&parent->fs_info->commit_root_sem);
++
+       BUG_ON(*level == 0);
+       eb = btrfs_read_node_slot(parent, slot);
+       if (IS_ERR(eb))
+@@ -6725,6 +6871,10 @@ static int tree_move_down(struct btrfs_p
+       path->nodes[*level - 1] = eb;
+       path->slots[*level - 1] = 0;
+       (*level)--;
++
++      if (*level == 0)
++              return replace_node_with_clone(path, 0);
++
+       return 0;
+ }
+ 
+@@ -6738,8 +6888,10 @@ static int tree_move_next_or_upnext(stru
+       path->slots[*level]++;
+ 
+       while (path->slots[*level] >= nritems) {
+-              if (*level == root_level)
++              if (*level == root_level) {
++                      path->slots[*level] = nritems - 1;
+                       return -1;
++              }
+ 
+               /* move upnext */
+               path->slots[*level] = 0;
+@@ -6771,14 +6923,20 @@ static int tree_advance(struct btrfs_pat
+       } else {
+               ret = tree_move_down(path, level, reada_min_gen);
+       }
+-      if (ret >= 0) {
+-              if (*level == 0)
+-                      btrfs_item_key_to_cpu(path->nodes[*level], key,
+-                                      path->slots[*level]);
+-              else
+-                      btrfs_node_key_to_cpu(path->nodes[*level], key,
+-                                      path->slots[*level]);
+-      }
++
++      /*
++       * Even if we have reached the end of a tree, ret is -1, update the key
++       * anyway, so that in case we need to restart due to a block group
++       * relocation, we can assert that the last key of the root node still
++       * exists in the tree.
++       */
++      if (*level == 0)
++              btrfs_item_key_to_cpu(path->nodes[*level], key,
++                                    path->slots[*level]);
++      else
++              btrfs_node_key_to_cpu(path->nodes[*level], key,
++                                    path->slots[*level]);
++
+       return ret;
+ }
+ 
+@@ -6808,6 +6966,97 @@ static int tree_compare_item(struct btrf
+ }
+ 
+ /*
++ * A transaction used for relocating a block group was committed or is about to
++ * finish its commit. Release our paths and restart the search, so that we are
++ * not using stale extent buffers:
++ *
++ * 1) For levels > 0, we are only holding references of extent buffers, without
++ *    any locks on them, which does not prevent them from having been relocated
++ *    and reallocated after the last time we released the commit root semaphore.
++ *    The exceptions are the root nodes, for which we always have a clone, see
++ *    the comment at btrfs_compare_trees();
++ *
++ * 2) For leaves, level 0, we are holding copies (clones) of extent buffers, so
++ *    we are safe from the concurrent relocation and reallocation. However they
++ *    can have file extent items with a pre relocation disk_bytenr value, so we
++ *    restart the search from the current commit roots and clone the new leaves so
++ *    that we get the post relocation disk_bytenr values. Not doing so, could
++ *    make us clone the wrong data in case there are new extents using the old
++ *    disk_bytenr that happen to be shared.
++ */
++static int restart_after_relocation(struct btrfs_path *left_path,
++                                  struct btrfs_path *right_path,
++                                  const struct btrfs_key *left_key,
++                                  const struct btrfs_key *right_key,
++                                  int left_level,
++                                  int right_level,
++                                  const struct send_ctx *sctx)
++{
++      int root_level;
++      int ret;
++
++      lockdep_assert_held_read(&sctx->send_root->fs_info->commit_root_sem);
++
++      btrfs_release_path(left_path);
++      btrfs_release_path(right_path);
++
++      /*
++       * Since keys can not be added or removed to/from our roots because they
++       * are readonly and we do not allow deduplication to run in parallel
++       * (which can add, remove or change keys), the layout of the trees should
++       * not change.
++       */
++      left_path->lowest_level = left_level;
++      ret = search_key_again(sctx, sctx->send_root, left_path, left_key);
++      if (ret < 0)
++              return ret;
++
++      right_path->lowest_level = right_level;
++      ret = search_key_again(sctx, sctx->parent_root, right_path, right_key);
++      if (ret < 0)
++              return ret;
++
++      /*
++       * If the lowest level nodes are leaves, clone them so that they can be
++       * safely used by changed_cb() while not under the protection of the
++       * commit root semaphore, even if relocation and reallocation happens in
++       * parallel.
++       */
++      if (left_level == 0) {
++              ret = replace_node_with_clone(left_path, 0);
++              if (ret < 0)
++                      return ret;
++      }
++
++      if (right_level == 0) {
++              ret = replace_node_with_clone(right_path, 0);
++              if (ret < 0)
++                      return ret;
++      }
++
++      /*
++       * Now clone the root nodes (unless they happen to be the leaves we have
++       * already cloned). This is to protect against concurrent snapshotting of
++       * the send and parent roots (see the comment at btrfs_compare_trees()).
++       */
++      root_level = btrfs_header_level(sctx->send_root->commit_root);
++      if (root_level > 0) {
++              ret = replace_node_with_clone(left_path, root_level);
++              if (ret < 0)
++                      return ret;
++      }
++
++      root_level = btrfs_header_level(sctx->parent_root->commit_root);
++      if (root_level > 0) {
++              ret = replace_node_with_clone(right_path, root_level);
++              if (ret < 0)
++                      return ret;
++      }
++
++      return 0;
++}
++
++/*
+  * This function compares two trees and calls the provided callback for
+  * every changed/new/deleted item it finds.
+  * If shared tree blocks are encountered, whole subtrees are skipped, making
+@@ -6835,10 +7084,10 @@ static int btrfs_compare_trees(struct bt
+       int right_root_level;
+       int left_level;
+       int right_level;
+-      int left_end_reached;
+-      int right_end_reached;
+-      int advance_left;
+-      int advance_right;
++      int left_end_reached = 0;
++      int right_end_reached = 0;
++      int advance_left = 0;
++      int advance_right = 0;
+       u64 left_blockptr;
+       u64 right_blockptr;
+       u64 left_gen;
+@@ -6906,12 +7155,18 @@ static int btrfs_compare_trees(struct bt
+       down_read(&fs_info->commit_root_sem);
+       left_level = btrfs_header_level(left_root->commit_root);
+       left_root_level = left_level;
++      /*
++       * We clone the root node of the send and parent roots to prevent races
++       * with snapshot creation of these roots. Snapshot creation COWs the
++       * root node of a tree, so after the transaction is committed the old
++       * extent can be reallocated while this send operation is still ongoing.
++       * So we clone them, under the commit root semaphore, to be race free.
++       */
+       left_path->nodes[left_level] =
+                       btrfs_clone_extent_buffer(left_root->commit_root);
+       if (!left_path->nodes[left_level]) {
+-              up_read(&fs_info->commit_root_sem);
+               ret = -ENOMEM;
+-              goto out;
++              goto out_unlock;
+       }
+ 
+       right_level = btrfs_header_level(right_root->commit_root);
+@@ -6919,9 +7174,8 @@ static int btrfs_compare_trees(struct bt
+       right_path->nodes[right_level] =
+                       btrfs_clone_extent_buffer(right_root->commit_root);
+       if (!right_path->nodes[right_level]) {
+-              up_read(&fs_info->commit_root_sem);
+               ret = -ENOMEM;
+-              goto out;
++              goto out_unlock;
+       }
+       /*
+        * Our right root is the parent root, while the left root is the "send"
+@@ -6931,7 +7185,6 @@ static int btrfs_compare_trees(struct bt
+        * will need to read them at some point.
+        */
+       reada_min_gen = btrfs_header_generation(right_root->commit_root);
+-      up_read(&fs_info->commit_root_sem);
+ 
+       if (left_level == 0)
+               btrfs_item_key_to_cpu(left_path->nodes[left_level],
+@@ -6946,11 +7199,26 @@ static int btrfs_compare_trees(struct bt
+               btrfs_node_key_to_cpu(right_path->nodes[right_level],
+                               &right_key, right_path->slots[right_level]);
+ 
+-      left_end_reached = right_end_reached = 0;
+-      advance_left = advance_right = 0;
++      sctx->last_reloc_trans = fs_info->last_reloc_trans;
+ 
+       while (1) {
+-              cond_resched();
++              if (need_resched() ||
++                  rwsem_is_contended(&fs_info->commit_root_sem)) {
++                      up_read(&fs_info->commit_root_sem);
++                      cond_resched();
++                      down_read(&fs_info->commit_root_sem);
++              }
++
++              if (fs_info->last_reloc_trans > sctx->last_reloc_trans) {
++                      ret = restart_after_relocation(left_path, right_path,
++                                                     &left_key, &right_key,
++                                                     left_level, right_level,
++                                                     sctx);
++                      if (ret < 0)
++                              goto out_unlock;
++                      sctx->last_reloc_trans = fs_info->last_reloc_trans;
++              }
++
+               if (advance_left && !left_end_reached) {
+                       ret = tree_advance(left_path, &left_level,
+                                       left_root_level,
+@@ -6959,7 +7227,7 @@ static int btrfs_compare_trees(struct bt
+                       if (ret == -1)
+                               left_end_reached = ADVANCE;
+                       else if (ret < 0)
+-                              goto out;
++                              goto out_unlock;
+                       advance_left = 0;
+               }
+               if (advance_right && !right_end_reached) {
+@@ -6970,54 +7238,55 @@ static int btrfs_compare_trees(struct bt
+                       if (ret == -1)
+                               right_end_reached = ADVANCE;
+                       else if (ret < 0)
+-                              goto out;
++                              goto out_unlock;
+                       advance_right = 0;
+               }
+ 
+               if (left_end_reached && right_end_reached) {
+                       ret = 0;
+-                      goto out;
++                      goto out_unlock;
+               } else if (left_end_reached) {
+                       if (right_level == 0) {
++                              up_read(&fs_info->commit_root_sem);
+                               ret = changed_cb(left_path, right_path,
+                                               &right_key,
+                                               BTRFS_COMPARE_TREE_DELETED,
+                                               sctx);
+                               if (ret < 0)
+                                       goto out;
++                              down_read(&fs_info->commit_root_sem);
+                       }
+                       advance_right = ADVANCE;
+                       continue;
+               } else if (right_end_reached) {
+                       if (left_level == 0) {
++                              up_read(&fs_info->commit_root_sem);
+                               ret = changed_cb(left_path, right_path,
+                                               &left_key,
+                                               BTRFS_COMPARE_TREE_NEW,
+                                               sctx);
+                               if (ret < 0)
+                                       goto out;
++                              down_read(&fs_info->commit_root_sem);
+                       }
+                       advance_left = ADVANCE;
+                       continue;
+               }
+ 
+               if (left_level == 0 && right_level == 0) {
++                      up_read(&fs_info->commit_root_sem);
+                       cmp = btrfs_comp_cpu_keys(&left_key, &right_key);
+                       if (cmp < 0) {
+                               ret = changed_cb(left_path, right_path,
+                                               &left_key,
+                                               BTRFS_COMPARE_TREE_NEW,
+                                               sctx);
+-                              if (ret < 0)
+-                                      goto out;
+                               advance_left = ADVANCE;
+                       } else if (cmp > 0) {
+                               ret = changed_cb(left_path, right_path,
+                                               &right_key,
+                                               BTRFS_COMPARE_TREE_DELETED,
+                                               sctx);
+-                              if (ret < 0)
+-                                      goto out;
+                               advance_right = ADVANCE;
+                       } else {
+                               enum btrfs_compare_tree_result result;
+@@ -7031,11 +7300,13 @@ static int btrfs_compare_trees(struct bt
+                                       result = BTRFS_COMPARE_TREE_SAME;
+                               ret = changed_cb(left_path, right_path,
+                                                &left_key, result, sctx);
+-                              if (ret < 0)
+-                                      goto out;
+                               advance_left = ADVANCE;
+                               advance_right = ADVANCE;
+                       }
++
++                      if (ret < 0)
++                              goto out;
++                      down_read(&fs_info->commit_root_sem);
+               } else if (left_level == right_level) {
+                       cmp = btrfs_comp_cpu_keys(&left_key, &right_key);
+                       if (cmp < 0) {
+@@ -7075,6 +7346,8 @@ static int btrfs_compare_trees(struct bt
+               }
+       }
+ 
++out_unlock:
++      up_read(&fs_info->commit_root_sem);
+ out:
+       btrfs_free_path(left_path);
+       btrfs_free_path(right_path);
+@@ -7413,21 +7686,7 @@ long btrfs_ioctl_send(struct file *mnt_f
+       if (ret)
+               goto out;
+ 
+-      spin_lock(&fs_info->send_reloc_lock);
+-      if (test_bit(BTRFS_FS_RELOC_RUNNING, &fs_info->flags)) {
+-              spin_unlock(&fs_info->send_reloc_lock);
+-              btrfs_warn_rl(fs_info,
+-              "cannot run send because a relocation operation is in progress");
+-              ret = -EAGAIN;
+-              goto out;
+-      }
+-      fs_info->send_in_progress++;
+-      spin_unlock(&fs_info->send_reloc_lock);
+-
+       ret = send_subvol(sctx);
+-      spin_lock(&fs_info->send_reloc_lock);
+-      fs_info->send_in_progress--;
+-      spin_unlock(&fs_info->send_reloc_lock);
+       if (ret < 0)
+               goto out;
+ 
+--- a/fs/btrfs/transaction.c
++++ b/fs/btrfs/transaction.c
+@@ -163,6 +163,10 @@ static noinline void switch_commit_roots
+       struct btrfs_caching_control *caching_ctl, *next;
+ 
+       down_write(&fs_info->commit_root_sem);
++
++      if (test_bit(BTRFS_FS_RELOC_RUNNING, &fs_info->flags))
++              fs_info->last_reloc_trans = trans->transid;
++
+       list_for_each_entry_safe(root, tmp, &cur_trans->switch_commits,
+                                dirty_list) {
+               list_del_init(&root->dirty_list);
diff --git a/queue-5.15/drm-i915-workaround-broken-bios-dbuf-configuration-on-tgl-rkl.patch b/queue-5.15/drm-i915-workaround-broken-bios-dbuf-configuration-on-tgl-rkl.patch

new file mode 100644 (file)

index 0000000..18745b1
--- /dev/null
+++ b/queue-5.15/drm-i915-workaround-broken-bios-dbuf-configuration-on-tgl-rkl.patch
@@ -0,0 +1,165 @@
+From 4e6f55120c7eccf6f9323bb681632e23cbcb3f3c Mon Sep 17 00:00:00 2001
+From: =?UTF-8?q?Ville=20Syrj=C3=A4l=C3=A4?= <ville.syrjala@linux.intel.com>
+Date: Fri, 4 Feb 2022 16:18:18 +0200
+Subject: drm/i915: Workaround broken BIOS DBUF configuration on TGL/RKL
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+From: Ville Syrjälä <ville.syrjala@linux.intel.com>
+
+commit 4e6f55120c7eccf6f9323bb681632e23cbcb3f3c upstream.
+
+On TGL/RKL the BIOS likes to use some kind of bogus DBUF layout
+that doesn't match what the spec recommends. With a single active
+pipe that is not going to be a problem, but with multiple pipes
+active skl_commit_modeset_enables() goes into an infinite loop
+since it can't figure out any order in which it can commit the
+pipes without causing DBUF overlaps between the planes.
+
+We'd need some kind of extra DBUF defrag stage in between to
+make the transition possible. But that is clearly way too complex
+a solution, so in the name of simplicity let's just sanitize the
+DBUF state by simply turning off all planes when we detect a
+pipe encroaching on its neighbours' DBUF slices. We only have
+to disable the primary planes as all other planes should have
+already been disabled (if they somehow were enabled) by
+earlier sanitization steps.
+
+And for good measure let's also sanitize in case the DBUF
+allocations of the pipes already seem to overlap each other.
+
+Cc: <stable@vger.kernel.org> # v5.14+
+Closes: https://gitlab.freedesktop.org/drm/intel/-/issues/4762
+Signed-off-by: Ville Syrjälä <ville.syrjala@linux.intel.com>
+Link: https://patchwork.freedesktop.org/patch/msgid/20220204141818.1900-3-ville.syrjala@linux.intel.com
+Reviewed-by: Stanislav Lisovskiy <stanislav.lisovskiy@intel.com>
+(cherry picked from commit 15512021eb3975a8c2366e3883337e252bb0eee5)
+Signed-off-by: Tvrtko Ursulin <tvrtko.ursulin@intel.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ drivers/gpu/drm/i915/display/intel_display.c |    5 +
+ drivers/gpu/drm/i915/display/intel_display.h |    2 
+ drivers/gpu/drm/i915/intel_pm.c              |   68 +++++++++++++++++++++++++++
+ drivers/gpu/drm/i915/intel_pm.h              |    1 
+ 4 files changed, 74 insertions(+), 2 deletions(-)
+
+--- a/drivers/gpu/drm/i915/display/intel_display.c
++++ b/drivers/gpu/drm/i915/display/intel_display.c
+@@ -1658,8 +1658,8 @@ static void fixup_plane_bitmasks(struct
+       }
+ }
+ 
+-static void intel_plane_disable_noatomic(struct intel_crtc *crtc,
+-                                       struct intel_plane *plane)
++void intel_plane_disable_noatomic(struct intel_crtc *crtc,
++                                struct intel_plane *plane)
+ {
+       struct drm_i915_private *dev_priv = to_i915(crtc->base.dev);
+       struct intel_crtc_state *crtc_state =
+@@ -13217,6 +13217,7 @@ intel_modeset_setup_hw_state(struct drm_
+               vlv_wm_sanitize(dev_priv);
+       } else if (DISPLAY_VER(dev_priv) >= 9) {
+               skl_wm_get_hw_state(dev_priv);
++              skl_wm_sanitize(dev_priv);
+       } else if (HAS_PCH_SPLIT(dev_priv)) {
+               ilk_wm_get_hw_state(dev_priv);
+       }
+--- a/drivers/gpu/drm/i915/display/intel_display.h
++++ b/drivers/gpu/drm/i915/display/intel_display.h
+@@ -629,6 +629,8 @@ void intel_plane_unpin_fb(struct intel_p
+ struct intel_encoder *
+ intel_get_crtc_new_encoder(const struct intel_atomic_state *state,
+                          const struct intel_crtc_state *crtc_state);
++void intel_plane_disable_noatomic(struct intel_crtc *crtc,
++                                struct intel_plane *plane);
+ 
+ unsigned int intel_surf_alignment(const struct drm_framebuffer *fb,
+                                 int color_plane);
+--- a/drivers/gpu/drm/i915/intel_pm.c
++++ b/drivers/gpu/drm/i915/intel_pm.c
+@@ -6681,6 +6681,74 @@ void skl_wm_get_hw_state(struct drm_i915
+       dbuf_state->enabled_slices = dev_priv->dbuf.enabled_slices;
+ }
+ 
++static bool skl_dbuf_is_misconfigured(struct drm_i915_private *i915)
++{
++      const struct intel_dbuf_state *dbuf_state =
++              to_intel_dbuf_state(i915->dbuf.obj.state);
++      struct skl_ddb_entry entries[I915_MAX_PIPES] = {};
++      struct intel_crtc *crtc;
++
++      for_each_intel_crtc(&i915->drm, crtc) {
++              const struct intel_crtc_state *crtc_state =
++                      to_intel_crtc_state(crtc->base.state);
++
++              entries[crtc->pipe] = crtc_state->wm.skl.ddb;
++      }
++
++      for_each_intel_crtc(&i915->drm, crtc) {
++              const struct intel_crtc_state *crtc_state =
++                      to_intel_crtc_state(crtc->base.state);
++              u8 slices;
++
++              slices = skl_compute_dbuf_slices(crtc, dbuf_state->active_pipes,
++                                               dbuf_state->joined_mbus);
++              if (dbuf_state->slices[crtc->pipe] & ~slices)
++                      return true;
++
++              if (skl_ddb_allocation_overlaps(&crtc_state->wm.skl.ddb, entries,
++                                              I915_MAX_PIPES, crtc->pipe))
++                      return true;
++      }
++
++      return false;
++}
++
++void skl_wm_sanitize(struct drm_i915_private *i915)
++{
++      struct intel_crtc *crtc;
++
++      /*
++       * On TGL/RKL (at least) the BIOS likes to assign the planes
++       * to the wrong DBUF slices. This will cause an infinite loop
++       * in skl_commit_modeset_enables() as it can't find a way to
++       * transition between the old bogus DBUF layout to the new
++       * proper DBUF layout without DBUF allocation overlaps between
++       * the planes (which cannot be allowed or else the hardware
++       * may hang). If we detect a bogus DBUF layout just turn off
++       * all the planes so that skl_commit_modeset_enables() can
++       * simply ignore them.
++       */
++      if (!skl_dbuf_is_misconfigured(i915))
++              return;
++
++      drm_dbg_kms(&i915->drm, "BIOS has misprogrammed the DBUF, disabling all planes\n");
++
++      for_each_intel_crtc(&i915->drm, crtc) {
++              struct intel_plane *plane = to_intel_plane(crtc->base.primary);
++              const struct intel_plane_state *plane_state =
++                      to_intel_plane_state(plane->base.state);
++              struct intel_crtc_state *crtc_state =
++                      to_intel_crtc_state(crtc->base.state);
++
++              if (plane_state->uapi.visible)
++                      intel_plane_disable_noatomic(crtc, plane);
++
++              drm_WARN_ON(&i915->drm, crtc_state->active_planes != 0);
++
++              memset(&crtc_state->wm.skl.ddb, 0, sizeof(crtc_state->wm.skl.ddb));
++      }
++}
++
+ static void ilk_pipe_wm_get_hw_state(struct intel_crtc *crtc)
+ {
+       struct drm_device *dev = crtc->base.dev;
+--- a/drivers/gpu/drm/i915/intel_pm.h
++++ b/drivers/gpu/drm/i915/intel_pm.h
+@@ -48,6 +48,7 @@ void skl_pipe_wm_get_hw_state(struct int
+                             struct skl_pipe_wm *out);
+ void g4x_wm_sanitize(struct drm_i915_private *dev_priv);
+ void vlv_wm_sanitize(struct drm_i915_private *dev_priv);
++void skl_wm_sanitize(struct drm_i915_private *dev_priv);
+ bool intel_can_enable_sagv(struct drm_i915_private *dev_priv,
+                          const struct intel_bw_state *bw_state);
+ void intel_sagv_pre_plane_update(struct intel_atomic_state *state);
diff --git a/queue-5.15/riscv-dts-k210-fix-broken-irqs-on-hart1.patch b/queue-5.15/riscv-dts-k210-fix-broken-irqs-on-hart1.patch

new file mode 100644 (file)

index 0000000..8f40889
--- /dev/null
+++ b/queue-5.15/riscv-dts-k210-fix-broken-irqs-on-hart1.patch
@@ -0,0 +1,49 @@
+From 74583f1b92cb3bbba1a3741cea237545c56f506c Mon Sep 17 00:00:00 2001
+From: Niklas Cassel <niklas.cassel@wdc.com>
+Date: Tue, 1 Mar 2022 00:44:18 +0000
+Subject: riscv: dts: k210: fix broken IRQs on hart1
+
+From: Niklas Cassel <niklas.cassel@wdc.com>
+
+commit 74583f1b92cb3bbba1a3741cea237545c56f506c upstream.
+
+Commit 67d96729a9e7 ("riscv: Update Canaan Kendryte K210 device tree")
+incorrectly removed two entries from the PLIC interrupt-controller node's
+interrupts-extended property.
+
+The PLIC driver cannot know the mapping between hart contexts and hart ids,
+so this information has to be provided by device tree, as specified by the
+PLIC device tree binding.
+
+The PLIC driver uses the interrupts-extended property, and initializes the
+hart context registers in the exact same order as provided by the
+interrupts-extended property.
+
+In other words, if we don't specify the S-mode interrupts, the PLIC driver
+will simply initialize the hart0 S-mode hart context with the hart1 M-mode
+configuration. It is therefore essential to specify the S-mode IRQs even
+though the system itself will only ever be running in M-mode.
+
+Re-add the S-mode interrupts, so that we get working IRQs on hart1 again.
+
+Cc: <stable@vger.kernel.org>
+Fixes: 67d96729a9e7 ("riscv: Update Canaan Kendryte K210 device tree")
+Signed-off-by: Niklas Cassel <niklas.cassel@wdc.com>
+Signed-off-by: Palmer Dabbelt <palmer@rivosinc.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ arch/riscv/boot/dts/canaan/k210.dtsi |    3 ++-
+ 1 file changed, 2 insertions(+), 1 deletion(-)
+
+--- a/arch/riscv/boot/dts/canaan/k210.dtsi
++++ b/arch/riscv/boot/dts/canaan/k210.dtsi
+@@ -113,7 +113,8 @@
+                       compatible = "canaan,k210-plic", "sifive,plic-1.0.0";
+                       reg = <0xC000000 0x4000000>;
+                       interrupt-controller;
+-                      interrupts-extended = <&cpu0_intc 11 &cpu1_intc 11>;
++                      interrupts-extended = <&cpu0_intc 11>, <&cpu0_intc 9>,
++                                            <&cpu1_intc 11>, <&cpu1_intc 9>;
+                       riscv,ndev = <65>;
+               };
+ 
diff --git a/queue-5.15/series b/queue-5.15/series

index 4f51c409f12f9ec62f986c8b16f6dfcc212f437c..8452a6cad4d07e518b61de174c232affeb8887cb 100644 (file)
--- a/queue-5.15/series
+++ b/queue-5.15/series
@@ -103,3 +103,7 @@ x86-boot-add-setup_indirect-support-in-early_memremap_is_setup_data.patch
  x86-sgx-free-backing-memory-after-faulting-the-enclave-page.patch
  x86-traps-mark-do_int3-nokprobe_symbol.patch
  drm-panel-select-drm_dp_helper-for-drm_panel_edp.patch
+btrfs-make-send-work-with-concurrent-block-group-relocation.patch
+drm-i915-workaround-broken-bios-dbuf-configuration-on-tgl-rkl.patch
+riscv-dts-k210-fix-broken-irqs-on-hart1.patch
+block-drop-unused-includes-in-linux-genhd.h.patch
author	Greg Kroah-Hartman <gregkh@linuxfoundation.org>
	Mon, 14 Mar 2022 09:51:12 +0000 (10:51 +0100)
committer	Greg Kroah-Hartman <gregkh@linuxfoundation.org>
	Mon, 14 Mar 2022 09:51:12 +0000 (10:51 +0100)
queue-5.15/block-drop-unused-includes-in-linux-genhd.h.patch	[new file with mode: 0644]	patch \| blob
queue-5.15/btrfs-make-send-work-with-concurrent-block-group-relocation.patch	[new file with mode: 0644]	patch \| blob
queue-5.15/drm-i915-workaround-broken-bios-dbuf-configuration-on-tgl-rkl.patch	[new file with mode: 0644]	patch \| blob
queue-5.15/riscv-dts-k210-fix-broken-irqs-on-hart1.patch	[new file with mode: 0644]	patch \| blob
queue-5.15/series		patch \| blob \| blame \| history