--- /dev/null
+From debd1c065d2037919a7da67baf55cc683fee09f0 Mon Sep 17 00:00:00 2001
+From: Nikolay Borisov <nborisov@suse.com>
+Date: Fri, 17 May 2019 10:44:25 +0300
+Subject: btrfs: Ensure replaced device doesn't have pending chunk allocation
+
+From: Nikolay Borisov <nborisov@suse.com>
+
+commit debd1c065d2037919a7da67baf55cc683fee09f0 upstream.
+
+Recent FITRIM work, namely bbbf7243d62d ("btrfs: combine device update
+operations during transaction commit") combined the way certain
+operations are recoded in a transaction. As a result an ASSERT was added
+in dev_replace_finish to ensure the new code works correctly.
+Unfortunately I got reports that it's possible to trigger the assert,
+meaning that during a device replace it's possible to have an unfinished
+chunk allocation on the source device.
+
+This is supposed to be prevented by the fact that a transaction is
+committed before finishing the replace oepration and alter acquiring the
+chunk mutex. This is not sufficient since by the time the transaction is
+committed and the chunk mutex acquired it's possible to allocate a chunk
+depending on the workload being executed on the replaced device. This
+bug has been present ever since device replace was introduced but there
+was never code which checks for it.
+
+The correct way to fix is to ensure that there is no pending device
+modification operation when the chunk mutex is acquire and if there is
+repeat transaction commit. Unfortunately it's not possible to just
+exclude the source device from btrfs_fs_devices::dev_alloc_list since
+this causes ENOSPC to be hit in transaction commit.
+
+Fixing that in another way would need to add special cases to handle the
+last writes and forbid new ones. The looped transaction fix is more
+obvious, and can be easily backported. The runtime of dev-replace is
+long so there's no noticeable delay caused by that.
+
+Reported-by: David Sterba <dsterba@suse.com>
+Fixes: 391cd9df81ac ("Btrfs: fix unprotected alloc list insertion during the finishing procedure of replace")
+CC: stable@vger.kernel.org # 4.4+
+Signed-off-by: Nikolay Borisov <nborisov@suse.com>
+Reviewed-by: David Sterba <dsterba@suse.com>
+Signed-off-by: David Sterba <dsterba@suse.com>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+
+---
+ fs/btrfs/dev-replace.c | 29 +++++++++++++++++++----------
+ fs/btrfs/volumes.c | 2 ++
+ fs/btrfs/volumes.h | 5 +++++
+ 3 files changed, 26 insertions(+), 10 deletions(-)
+
+--- a/fs/btrfs/dev-replace.c
++++ b/fs/btrfs/dev-replace.c
+@@ -511,18 +511,27 @@ static int btrfs_dev_replace_finishing(s
+ }
+ btrfs_wait_ordered_roots(root->fs_info, -1, 0, (u64)-1);
+
+- trans = btrfs_start_transaction(root, 0);
+- if (IS_ERR(trans)) {
+- mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
+- return PTR_ERR(trans);
++ while (1) {
++ trans = btrfs_start_transaction(root, 0);
++ if (IS_ERR(trans)) {
++ mutex_unlock(&dev_replace->lock_finishing_cancel_unmount);
++ return PTR_ERR(trans);
++ }
++ ret = btrfs_commit_transaction(trans, root);
++ WARN_ON(ret);
++ mutex_lock(&uuid_mutex);
++ /* keep away write_all_supers() during the finishing procedure */
++ mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
++ mutex_lock(&root->fs_info->chunk_mutex);
++ if (src_device->has_pending_chunks) {
++ mutex_unlock(&root->fs_info->chunk_mutex);
++ mutex_unlock(&root->fs_info->fs_devices->device_list_mutex);
++ mutex_unlock(&uuid_mutex);
++ } else {
++ break;
++ }
+ }
+- ret = btrfs_commit_transaction(trans, root);
+- WARN_ON(ret);
+
+- mutex_lock(&uuid_mutex);
+- /* keep away write_all_supers() during the finishing procedure */
+- mutex_lock(&root->fs_info->fs_devices->device_list_mutex);
+- mutex_lock(&root->fs_info->chunk_mutex);
+ btrfs_dev_replace_lock(dev_replace, 1);
+ dev_replace->replace_state =
+ scrub_ret ? BTRFS_IOCTL_DEV_REPLACE_STATE_CANCELED
+--- a/fs/btrfs/volumes.c
++++ b/fs/btrfs/volumes.c
+@@ -4876,6 +4876,7 @@ static int __btrfs_alloc_chunk(struct bt
+ for (i = 0; i < map->num_stripes; i++) {
+ num_bytes = map->stripes[i].dev->bytes_used + stripe_size;
+ btrfs_device_set_bytes_used(map->stripes[i].dev, num_bytes);
++ map->stripes[i].dev->has_pending_chunks = true;
+ }
+
+ spin_lock(&extent_root->fs_info->free_chunk_lock);
+@@ -7250,6 +7251,7 @@ void btrfs_update_commit_device_bytes_us
+ for (i = 0; i < map->num_stripes; i++) {
+ dev = map->stripes[i].dev;
+ dev->commit_bytes_used = dev->bytes_used;
++ dev->has_pending_chunks = false;
+ }
+ }
+ unlock_chunks(root);
+--- a/fs/btrfs/volumes.h
++++ b/fs/btrfs/volumes.h
+@@ -62,6 +62,11 @@ struct btrfs_device {
+
+ spinlock_t io_lock ____cacheline_aligned;
+ int running_pending;
++ /* When true means this device has pending chunk alloc in
++ * current transaction. Protected by chunk_mutex.
++ */
++ bool has_pending_chunks;
++
+ /* regular prio bios */
+ struct btrfs_pending_bios pending_bios;
+ /* WRITE_SYNC bios */