From: Greg Kroah-Hartman Date: Sun, 11 Nov 2018 20:56:40 +0000 (-0800) Subject: 4.18-stable patches X-Git-Tag: v4.19.2~14 X-Git-Url: http://git.ipfire.org/?a=commitdiff_plain;h=151bc69c04e10baf1d9ac5645495f9744131885f;p=thirdparty%2Fkernel%2Fstable-queue.git 4.18-stable patches added patches: bpf-wait-for-running-bpf-programs-when-updating-map-in-map.patch btrfs-don-t-run-delayed_iputs-in-commit.patch btrfs-don-t-use-ctl-free_space-for-max_extent_size.patch btrfs-fix-assertion-on-fsync-of-regular-file-when-using-no-holes-feature.patch btrfs-fix-deadlock-when-writing-out-free-space-caches.patch btrfs-fix-insert_reserved-error-handling.patch btrfs-fix-null-pointer-dereference-on-compressed-write-path-error.patch btrfs-fix-use-after-free-during-inode-eviction.patch btrfs-fix-use-after-free-when-dumping-free-space.patch btrfs-move-the-dio_sem-higher-up-the-callchain.patch btrfs-only-free-reserved-extent-if-we-didn-t-insert-it.patch btrfs-reset-max_extent_size-properly.patch btrfs-set-max_extent_size-properly.patch net-sched-remove-tca_options-from-policy.patch userns-also-map-extents-in-the-reverse-map-to-kernel-ids.patch --- diff --git a/queue-4.18/bpf-wait-for-running-bpf-programs-when-updating-map-in-map.patch b/queue-4.18/bpf-wait-for-running-bpf-programs-when-updating-map-in-map.patch new file mode 100644 index 00000000000..f1f6aa0ae54 --- /dev/null +++ b/queue-4.18/bpf-wait-for-running-bpf-programs-when-updating-map-in-map.patch @@ -0,0 +1,65 @@ +From 1ae80cf31938c8f77c37a29bbe29e7f1cd492be8 Mon Sep 17 00:00:00 2001 +From: Daniel Colascione +Date: Fri, 12 Oct 2018 03:54:27 -0700 +Subject: bpf: wait for running BPF programs when updating map-in-map + +From: Daniel Colascione + +commit 1ae80cf31938c8f77c37a29bbe29e7f1cd492be8 upstream. + +The map-in-map frequently serves as a mechanism for atomic +snapshotting of state that a BPF program might record. 
The current +implementation is dangerous to use in this way, however, since +userspace has no way of knowing when all programs that might have +retrieved the "old" value of the map may have completed. + +This change ensures that map update operations on map-in-map map types +always wait for all references to the old map to drop before returning +to userspace. + +Signed-off-by: Daniel Colascione +Reviewed-by: Joel Fernandes (Google) +Signed-off-by: Alexei Starovoitov +Signed-off-by: Chenbo Feng +Signed-off-by: Greg Kroah-Hartman + +--- + kernel/bpf/syscall.c | 13 +++++++++++++ + 1 file changed, 13 insertions(+) + +--- a/kernel/bpf/syscall.c ++++ b/kernel/bpf/syscall.c +@@ -683,6 +683,17 @@ err_put: + return err; + } + ++static void maybe_wait_bpf_programs(struct bpf_map *map) ++{ ++ /* Wait for any running BPF programs to complete so that ++ * userspace, when we return to it, knows that all programs ++ * that could be running use the new map value. ++ */ ++ if (map->map_type == BPF_MAP_TYPE_HASH_OF_MAPS || ++ map->map_type == BPF_MAP_TYPE_ARRAY_OF_MAPS) ++ synchronize_rcu(); ++} ++ + #define BPF_MAP_UPDATE_ELEM_LAST_FIELD flags + + static int map_update_elem(union bpf_attr *attr) +@@ -769,6 +780,7 @@ static int map_update_elem(union bpf_att + } + __this_cpu_dec(bpf_prog_active); + preempt_enable(); ++ maybe_wait_bpf_programs(map); + out: + free_value: + kfree(value); +@@ -821,6 +833,7 @@ static int map_delete_elem(union bpf_att + rcu_read_unlock(); + __this_cpu_dec(bpf_prog_active); + preempt_enable(); ++ maybe_wait_bpf_programs(map); + out: + kfree(key); + err_put: diff --git a/queue-4.18/btrfs-don-t-run-delayed_iputs-in-commit.patch b/queue-4.18/btrfs-don-t-run-delayed_iputs-in-commit.patch new file mode 100644 index 00000000000..3b5eeafb30c --- /dev/null +++ b/queue-4.18/btrfs-don-t-run-delayed_iputs-in-commit.patch @@ -0,0 +1,52 @@ +From 30928e9baac238a7330085a1c5747f0b5df444b4 Mon Sep 17 00:00:00 2001 +From: Josef Bacik +Date: Thu, 11 Oct 2018 15:54:31 -0400 
+Subject: btrfs: don't run delayed_iputs in commit + +From: Josef Bacik + +commit 30928e9baac238a7330085a1c5747f0b5df444b4 upstream. + +This could result in a really bad case where we do something like + +evict + evict_refill_and_join + btrfs_commit_transaction + btrfs_run_delayed_iputs + evict + evict_refill_and_join + btrfs_commit_transaction +... forever + +We have plenty of other places where we run delayed iputs that are much +safer, let those do the work. + +CC: stable@vger.kernel.org # 4.4+ +Reviewed-by: Filipe Manana +Signed-off-by: Josef Bacik +Reviewed-by: David Sterba +Signed-off-by: David Sterba +Signed-off-by: Greg Kroah-Hartman + +--- + fs/btrfs/transaction.c | 9 --------- + 1 file changed, 9 deletions(-) + +--- a/fs/btrfs/transaction.c ++++ b/fs/btrfs/transaction.c +@@ -2281,15 +2281,6 @@ int btrfs_commit_transaction(struct btrf + + kmem_cache_free(btrfs_trans_handle_cachep, trans); + +- /* +- * If fs has been frozen, we can not handle delayed iputs, otherwise +- * it'll result in deadlock about SB_FREEZE_FS. +- */ +- if (current != fs_info->transaction_kthread && +- current != fs_info->cleaner_kthread && +- !test_bit(BTRFS_FS_FROZEN, &fs_info->flags)) +- btrfs_run_delayed_iputs(fs_info); +- + return ret; + + scrub_continue: diff --git a/queue-4.18/btrfs-don-t-use-ctl-free_space-for-max_extent_size.patch b/queue-4.18/btrfs-don-t-use-ctl-free_space-for-max_extent_size.patch new file mode 100644 index 00000000000..493b8e9407b --- /dev/null +++ b/queue-4.18/btrfs-don-t-use-ctl-free_space-for-max_extent_size.patch @@ -0,0 +1,56 @@ +From fb5c39d7a887108087de6ff93d3f326b01b4ef41 Mon Sep 17 00:00:00 2001 +From: Josef Bacik +Date: Thu, 11 Oct 2018 15:54:09 -0400 +Subject: btrfs: don't use ctl->free_space for max_extent_size + +From: Josef Bacik + +commit fb5c39d7a887108087de6ff93d3f326b01b4ef41 upstream. + +max_extent_size is supposed to be the largest contiguous range for the +space info, and ctl->free_space is the total free space in the block +group. 
We need to keep track of these separately and _only_ use the +max_free_space if we don't have a max_extent_size, as that means our +original request was too large to search any of the block groups for and +therefore wouldn't have a max_extent_size set. + +CC: stable@vger.kernel.org # 4.14+ +Reviewed-by: Filipe Manana +Signed-off-by: Josef Bacik +Signed-off-by: David Sterba +Signed-off-by: Greg Kroah-Hartman + +--- + fs/btrfs/extent-tree.c | 7 +++++-- + 1 file changed, 5 insertions(+), 2 deletions(-) + +--- a/fs/btrfs/extent-tree.c ++++ b/fs/btrfs/extent-tree.c +@@ -7409,6 +7409,7 @@ static noinline int find_free_extent(str + struct btrfs_block_group_cache *block_group = NULL; + u64 search_start = 0; + u64 max_extent_size = 0; ++ u64 max_free_space = 0; + u64 empty_cluster = 0; + struct btrfs_space_info *space_info; + int loop = 0; +@@ -7704,8 +7705,8 @@ unclustered_alloc: + spin_lock(&ctl->tree_lock); + if (ctl->free_space < + num_bytes + empty_cluster + empty_size) { +- if (ctl->free_space > max_extent_size) +- max_extent_size = ctl->free_space; ++ max_free_space = max(max_free_space, ++ ctl->free_space); + spin_unlock(&ctl->tree_lock); + goto loop; + } +@@ -7874,6 +7875,8 @@ loop: + } + out: + if (ret == -ENOSPC) { ++ if (!max_extent_size) ++ max_extent_size = max_free_space; + spin_lock(&space_info->lock); + space_info->max_extent_size = max_extent_size; + spin_unlock(&space_info->lock); diff --git a/queue-4.18/btrfs-fix-assertion-on-fsync-of-regular-file-when-using-no-holes-feature.patch b/queue-4.18/btrfs-fix-assertion-on-fsync-of-regular-file-when-using-no-holes-feature.patch new file mode 100644 index 00000000000..36605a884fa --- /dev/null +++ b/queue-4.18/btrfs-fix-assertion-on-fsync-of-regular-file-when-using-no-holes-feature.patch @@ -0,0 +1,59 @@ +From 7ed586d0a8241e81d58c656c5b315f781fa6fc97 Mon Sep 17 00:00:00 2001 +From: Filipe Manana +Date: Mon, 15 Oct 2018 09:51:00 +0100 +Subject: Btrfs: fix assertion on fsync of regular file when using no-holes 
feature + +From: Filipe Manana + +commit 7ed586d0a8241e81d58c656c5b315f781fa6fc97 upstream. + +When using the NO_HOLES feature and logging a regular file, we were +expecting that if we find an inline extent, that either its size in RAM +(uncompressed and unencoded) matches the size of the file or if it does +not, that it matches the sector size and it represents compressed data. +This assertion does not cover a case where the length of the inline extent +is smaller than the sector size and also smaller than the file's size, such +case is possible through fallocate. Example: + + $ mkfs.btrfs -f -O no-holes /dev/sdb + $ mount /dev/sdb /mnt + + $ xfs_io -f -c "pwrite -S 0xb60 0 21" /mnt/foobar + $ xfs_io -c "falloc 40 40" /mnt/foobar + $ xfs_io -c "fsync" /mnt/foobar + +In the above example we trigger the assertion because the inline extent's +length is 21 bytes while the file size is 80 bytes. The fallocate() call +merely updated the file's size and did not touch the existing inline +extent, as expected. + +So fix this by adjusting the assertion so that an inline extent length +smaller than the file size is valid if the file size is smaller than the +filesystem's sector size. + +A test case for fstests follows soon. 
+ +Reported-by: Anatoly Trosinenko +Fixes: a89ca6f24ffe ("Btrfs: fix fsync after truncate when no_holes feature is enabled") +CC: stable@vger.kernel.org # 4.14+ +Link: https://lore.kernel.org/linux-btrfs/CAE5jQCfRSBC7n4pUTFJcmHh109=gwyT9mFkCOL+NKfzswmR=_Q@mail.gmail.com/ +Signed-off-by: Filipe Manana +Signed-off-by: David Sterba +Signed-off-by: Greg Kroah-Hartman + +--- + fs/btrfs/tree-log.c | 3 ++- + 1 file changed, 2 insertions(+), 1 deletion(-) + +--- a/fs/btrfs/tree-log.c ++++ b/fs/btrfs/tree-log.c +@@ -4803,7 +4803,8 @@ static int btrfs_log_trailing_hole(struc + ASSERT(len == i_size || + (len == fs_info->sectorsize && + btrfs_file_extent_compression(leaf, extent) != +- BTRFS_COMPRESS_NONE)); ++ BTRFS_COMPRESS_NONE) || ++ (len < i_size && i_size < fs_info->sectorsize)); + return 0; + } + diff --git a/queue-4.18/btrfs-fix-deadlock-when-writing-out-free-space-caches.patch b/queue-4.18/btrfs-fix-deadlock-when-writing-out-free-space-caches.patch new file mode 100644 index 00000000000..19b56a350ba --- /dev/null +++ b/queue-4.18/btrfs-fix-deadlock-when-writing-out-free-space-caches.patch @@ -0,0 +1,185 @@ +From 5ce555578e0919237fa4bda92b4670e2dd176f85 Mon Sep 17 00:00:00 2001 +From: Filipe Manana +Date: Fri, 12 Oct 2018 10:03:55 +0100 +Subject: Btrfs: fix deadlock when writing out free space caches + +From: Filipe Manana + +commit 5ce555578e0919237fa4bda92b4670e2dd176f85 upstream. 
+ +When writing out a block group free space cache we can end deadlocking +with ourselves on an extent buffer lock resulting in a warning like the +following: + + [245043.379979] WARNING: CPU: 4 PID: 2608 at fs/btrfs/locking.c:251 btrfs_tree_lock+0x1be/0x1d0 [btrfs] + [245043.392792] CPU: 4 PID: 2608 Comm: btrfs-transacti Tainted: G + W I 4.16.8 #1 + [245043.395489] RIP: 0010:btrfs_tree_lock+0x1be/0x1d0 [btrfs] + [245043.396791] RSP: 0018:ffffc9000424b840 EFLAGS: 00010246 + [245043.398093] RAX: 0000000000000a30 RBX: ffff8807e20a3d20 RCX: 0000000000000001 + [245043.399414] RDX: 0000000000000001 RSI: 0000000000000002 RDI: ffff8807e20a3d20 + [245043.400732] RBP: 0000000000000001 R08: ffff88041f39a700 R09: ffff880000000000 + [245043.402021] R10: 0000000000000040 R11: ffff8807e20a3d20 R12: ffff8807cb220630 + [245043.403296] R13: 0000000000000001 R14: ffff8807cb220628 R15: ffff88041fbdf000 + [245043.404780] FS: 0000000000000000(0000) GS:ffff88082fc80000(0000) knlGS:0000000000000000 + [245043.406050] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 + [245043.407321] CR2: 00007fffdbdb9f10 CR3: 0000000001c09005 CR4: 00000000000206e0 + [245043.408670] Call Trace: + [245043.409977] btrfs_search_slot+0x761/0xa60 [btrfs] + [245043.411278] btrfs_insert_empty_items+0x62/0xb0 [btrfs] + [245043.412572] btrfs_insert_item+0x5b/0xc0 [btrfs] + [245043.413922] btrfs_create_pending_block_groups+0xfb/0x1e0 [btrfs] + [245043.415216] do_chunk_alloc+0x1e5/0x2a0 [btrfs] + [245043.416487] find_free_extent+0xcd0/0xf60 [btrfs] + [245043.417813] btrfs_reserve_extent+0x96/0x1e0 [btrfs] + [245043.419105] btrfs_alloc_tree_block+0xfb/0x4a0 [btrfs] + [245043.420378] __btrfs_cow_block+0x127/0x550 [btrfs] + [245043.421652] btrfs_cow_block+0xee/0x190 [btrfs] + [245043.422979] btrfs_search_slot+0x227/0xa60 [btrfs] + [245043.424279] ? btrfs_update_inode_item+0x59/0x100 [btrfs] + [245043.425538] ? 
iput+0x72/0x1e0 + [245043.426798] write_one_cache_group.isra.49+0x20/0x90 [btrfs] + [245043.428131] btrfs_start_dirty_block_groups+0x102/0x420 [btrfs] + [245043.429419] btrfs_commit_transaction+0x11b/0x880 [btrfs] + [245043.430712] ? start_transaction+0x8e/0x410 [btrfs] + [245043.432006] transaction_kthread+0x184/0x1a0 [btrfs] + [245043.433341] kthread+0xf0/0x130 + [245043.434628] ? btrfs_cleanup_transaction+0x4e0/0x4e0 [btrfs] + [245043.435928] ? kthread_create_worker_on_cpu+0x40/0x40 + [245043.437236] ret_from_fork+0x1f/0x30 + [245043.441054] ---[ end trace 15abaa2aaf36827f ]--- + +This is because at write_one_cache_group() when we are COWing a leaf from +the extent tree we end up allocating a new block group (chunk) and, +because we have hit a threshold on the number of bytes reserved for system +chunks, we attempt to finalize the creation of new block groups from the +current transaction, by calling btrfs_create_pending_block_groups(). +However here we also need to modify the extent tree in order to insert +a block group item, and if the location for this new block group item +happens to be in the same leaf that we were COWing earlier, we deadlock +since btrfs_search_slot() tries to write lock the extent buffer that we +locked before at write_one_cache_group(). + +We have already hit similar cases in the past and commit d9a0540a79f8 +("Btrfs: fix deadlock when finalizing block group creation") fixed some +of those cases by delaying the creation of pending block groups at the +known specific spots that could lead to a deadlock. This change reworks +that commit to be more generic so that we don't have to add similar logic +to every possible path that can lead to a deadlock. 
This is done by +making __btrfs_cow_block() disallow the creation of new block groups +(setting the transaction's can_flush_pending_bgs to false) before it +attempts to allocate a new extent buffer for either the extent, chunk or +device trees, since those are the trees that pending block group creation +modifies. Once the new extent buffer is allocated, it allows creation of +pending block groups to happen again. + +This change depends on a recent patch from Josef which is not yet in +Linus' tree, named "btrfs: make sure we create all new block groups" in +order to avoid occasional warnings at btrfs_trans_release_chunk_metadata(). + +Fixes: d9a0540a79f8 ("Btrfs: fix deadlock when finalizing block group creation") +CC: stable@vger.kernel.org # 4.4+ +Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=199753 +Link: https://lore.kernel.org/linux-btrfs/CAJtFHUTHna09ST-_EEiyWmDH6gAqS6wa=zMNMBsifj8ABu99cw@mail.gmail.com/ +Reported-by: E V +Reviewed-by: Josef Bacik +Signed-off-by: Filipe Manana +Signed-off-by: David Sterba +Signed-off-by: Greg Kroah-Hartman + +--- + fs/btrfs/ctree.c | 17 +++++++++++++++++ + fs/btrfs/extent-tree.c | 16 ++++++---------- + 2 files changed, 23 insertions(+), 10 deletions(-) + +--- a/fs/btrfs/ctree.c ++++ b/fs/btrfs/ctree.c +@@ -1054,9 +1054,26 @@ static noinline int __btrfs_cow_block(st + if ((root->root_key.objectid == BTRFS_TREE_RELOC_OBJECTID) && parent) + parent_start = parent->start; + ++ /* ++ * If we are COWing a node/leaf from the extent, chunk or device trees, ++ * make sure that we do not finish block group creation of pending block ++ * groups. We do this to avoid a deadlock. ++ * COWing can result in allocation of a new chunk, and flushing pending ++ * block groups (btrfs_create_pending_block_groups()) can be triggered ++ * when finishing allocation of a new chunk. 
Creation of a pending block ++ * group modifies the extent, chunk and device trees, therefore we could ++ * deadlock with ourselves since we are holding a lock on an extent ++ * buffer that btrfs_create_pending_block_groups() may try to COW later. ++ */ ++ if (root == fs_info->extent_root || ++ root == fs_info->chunk_root || ++ root == fs_info->dev_root) ++ trans->can_flush_pending_bgs = false; ++ + cow = btrfs_alloc_tree_block(trans, root, parent_start, + root->root_key.objectid, &disk_key, level, + search_start, empty_size); ++ trans->can_flush_pending_bgs = true; + if (IS_ERR(cow)) + return PTR_ERR(cow); + +--- a/fs/btrfs/extent-tree.c ++++ b/fs/btrfs/extent-tree.c +@@ -3034,7 +3034,6 @@ int btrfs_run_delayed_refs(struct btrfs_ + struct btrfs_delayed_ref_head *head; + int ret; + int run_all = count == (unsigned long)-1; +- bool can_flush_pending_bgs = trans->can_flush_pending_bgs; + + /* We'll clean this up in btrfs_cleanup_transaction */ + if (trans->aborted) +@@ -3051,7 +3050,6 @@ again: + #ifdef SCRAMBLE_DELAYED_REFS + delayed_refs->run_delayed_start = find_middle(&delayed_refs->root); + #endif +- trans->can_flush_pending_bgs = false; + ret = __btrfs_run_delayed_refs(trans, count); + if (ret < 0) { + btrfs_abort_transaction(trans, ret); +@@ -3082,7 +3080,6 @@ again: + goto again; + } + out: +- trans->can_flush_pending_bgs = can_flush_pending_bgs; + return 0; + } + +@@ -4685,11 +4682,9 @@ out: + * the block groups that were made dirty during the lifetime of the + * transaction. 
+ */ +- if (trans->can_flush_pending_bgs && +- trans->chunk_bytes_reserved >= (u64)SZ_2M) { ++ if (trans->chunk_bytes_reserved >= (u64)SZ_2M) + btrfs_create_pending_block_groups(trans); +- btrfs_trans_release_chunk_metadata(trans); +- } ++ + return ret; + } + +@@ -10209,9 +10204,10 @@ void btrfs_create_pending_block_groups(s + struct btrfs_block_group_item item; + struct btrfs_key key; + int ret = 0; +- bool can_flush_pending_bgs = trans->can_flush_pending_bgs; + +- trans->can_flush_pending_bgs = false; ++ if (!trans->can_flush_pending_bgs) ++ return; ++ + while (!list_empty(&trans->new_bgs)) { + block_group = list_first_entry(&trans->new_bgs, + struct btrfs_block_group_cache, +@@ -10237,7 +10233,7 @@ void btrfs_create_pending_block_groups(s + next: + list_del_init(&block_group->bg_list); + } +- trans->can_flush_pending_bgs = can_flush_pending_bgs; ++ btrfs_trans_release_chunk_metadata(trans); + } + + int btrfs_make_block_group(struct btrfs_trans_handle *trans, diff --git a/queue-4.18/btrfs-fix-insert_reserved-error-handling.patch b/queue-4.18/btrfs-fix-insert_reserved-error-handling.patch new file mode 100644 index 00000000000..c438e0c3399 --- /dev/null +++ b/queue-4.18/btrfs-fix-insert_reserved-error-handling.patch @@ -0,0 +1,62 @@ +From 80ee54bfe8a3850015585ebc84e8d207fcae6831 Mon Sep 17 00:00:00 2001 +From: Josef Bacik +Date: Thu, 11 Oct 2018 15:54:22 -0400 +Subject: btrfs: fix insert_reserved error handling + +From: Josef Bacik + +commit 80ee54bfe8a3850015585ebc84e8d207fcae6831 upstream. + +We were not handling the reserved byte accounting properly for data +references. Metadata was fine, if it errored out the error paths would +free the bytes_reserved count and pin the extent, but it even missed one +of the error cases. So instead move this handling up into +run_one_delayed_ref so we are sure that both cases are properly cleaned +up in case of a transaction abort. 
+ +CC: stable@vger.kernel.org # 4.18+ +Reviewed-by: Nikolay Borisov +Signed-off-by: Josef Bacik +Reviewed-by: David Sterba +Signed-off-by: David Sterba +Signed-off-by: Greg Kroah-Hartman + +--- + fs/btrfs/extent-tree.c | 12 ++++-------- + 1 file changed, 4 insertions(+), 8 deletions(-) + +--- a/fs/btrfs/extent-tree.c ++++ b/fs/btrfs/extent-tree.c +@@ -2490,6 +2490,9 @@ static int run_one_delayed_ref(struct bt + insert_reserved); + else + BUG(); ++ if (ret && insert_reserved) ++ btrfs_pin_extent(trans->fs_info, node->bytenr, ++ node->num_bytes, 1); + return ret; + } + +@@ -8158,21 +8161,14 @@ static int alloc_reserved_tree_block(str + } + + path = btrfs_alloc_path(); +- if (!path) { +- btrfs_free_and_pin_reserved_extent(fs_info, +- extent_key.objectid, +- fs_info->nodesize); ++ if (!path) + return -ENOMEM; +- } + + path->leave_spinning = 1; + ret = btrfs_insert_empty_item(trans, fs_info->extent_root, path, + &extent_key, size); + if (ret) { + btrfs_free_path(path); +- btrfs_free_and_pin_reserved_extent(fs_info, +- extent_key.objectid, +- fs_info->nodesize); + return ret; + } + diff --git a/queue-4.18/btrfs-fix-null-pointer-dereference-on-compressed-write-path-error.patch b/queue-4.18/btrfs-fix-null-pointer-dereference-on-compressed-write-path-error.patch new file mode 100644 index 00000000000..7e527e56faf --- /dev/null +++ b/queue-4.18/btrfs-fix-null-pointer-dereference-on-compressed-write-path-error.patch @@ -0,0 +1,46 @@ +From 3527a018c00e5dbada2f9d7ed5576437b6dd5cfb Mon Sep 17 00:00:00 2001 +From: Filipe Manana +Date: Sat, 13 Oct 2018 00:37:25 +0100 +Subject: Btrfs: fix null pointer dereference on compressed write path error + +From: Filipe Manana + +commit 3527a018c00e5dbada2f9d7ed5576437b6dd5cfb upstream. + +At inode.c:compress_file_range(), under the "free_pages_out" label, we can +end up dereferencing the "pages" pointer when it has a NULL value. This +case happens when "start" has a value of 0 and we fail to allocate memory +for the "pages" pointer. 
When that happens we jump to the "cont" label and +then enter the "if (start == 0)" branch where we immediately call the +cow_file_range_inline() function. If that function returns 0 (success +creating an inline extent) or an error (like -ENOMEM for example) we jump +to the "free_pages_out" label and then access "pages[i]" leading to a NULL +pointer dereference, since "nr_pages" has a value greater than zero at +that point. + +Fix this by setting "nr_pages" to 0 when we fail to allocate memory for +the "pages" pointer. + +Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=201119 +Fixes: 771ed689d2cd ("Btrfs: Optimize compressed writeback and reads") +CC: stable@vger.kernel.org # 4.4+ +Reviewed-by: Liu Bo +Signed-off-by: Filipe Manana +Reviewed-by: David Sterba +Signed-off-by: David Sterba +Signed-off-by: Greg Kroah-Hartman + +--- + fs/btrfs/inode.c | 1 + + 1 file changed, 1 insertion(+) + +--- a/fs/btrfs/inode.c ++++ b/fs/btrfs/inode.c +@@ -507,6 +507,7 @@ again: + pages = kcalloc(nr_pages, sizeof(struct page *), GFP_NOFS); + if (!pages) { + /* just bail out to the uncompressed code */ ++ nr_pages = 0; + goto cont; + } + diff --git a/queue-4.18/btrfs-fix-use-after-free-during-inode-eviction.patch b/queue-4.18/btrfs-fix-use-after-free-during-inode-eviction.patch new file mode 100644 index 00000000000..c812b89fdde --- /dev/null +++ b/queue-4.18/btrfs-fix-use-after-free-during-inode-eviction.patch @@ -0,0 +1,61 @@ +From 421f0922a2cfb0c75acd9746454aaa576c711a65 Mon Sep 17 00:00:00 2001 +From: Filipe Manana +Date: Fri, 12 Oct 2018 13:02:48 +0100 +Subject: Btrfs: fix use-after-free during inode eviction + +From: Filipe Manana + +commit 421f0922a2cfb0c75acd9746454aaa576c711a65 upstream. + +At inode.c:evict_inode_truncate_pages(), when we iterate over the +inode's extent states, we access an extent state record's "state" field +after we unlocked the inode's io tree lock. 
This can lead to a +use-after-free issue because after we unlock the io tree that extent +state record might have been freed due to being merged into another +adjacent extent state record (a previous inflight bio for a read +operation finished in the meanwhile which unlocked a range in the io +tree and caused a merge of extent state records, as explained in the +comment before the while loop added in commit 6ca0709756710 ("Btrfs: fix +hang during inode eviction due to concurrent readahead")). + +Fix this by keeping a copy of the extent state's flags in a local +variable and using it after unlocking the io tree. + +Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=201189 +Fixes: b9d0b38928e2 ("btrfs: Add handler for invalidate page") +CC: stable@vger.kernel.org # 4.4+ +Reviewed-by: Qu Wenruo +Signed-off-by: Filipe Manana +Reviewed-by: David Sterba +Signed-off-by: David Sterba +Signed-off-by: Greg Kroah-Hartman + +--- + fs/btrfs/inode.c | 4 +++- + 1 file changed, 3 insertions(+), 1 deletion(-) + +--- a/fs/btrfs/inode.c ++++ b/fs/btrfs/inode.c +@@ -5302,11 +5302,13 @@ static void evict_inode_truncate_pages(s + struct extent_state *cached_state = NULL; + u64 start; + u64 end; ++ unsigned state_flags; + + node = rb_first(&io_tree->state); + state = rb_entry(node, struct extent_state, rb_node); + start = state->start; + end = state->end; ++ state_flags = state->state; + spin_unlock(&io_tree->lock); + + lock_extent_bits(io_tree, start, end, &cached_state); +@@ -5319,7 +5321,7 @@ static void evict_inode_truncate_pages(s + * + * Note, end is the bytenr of last byte, so we need + 1 here. 
+ */ +- if (state->state & EXTENT_DELALLOC) ++ if (state_flags & EXTENT_DELALLOC) + btrfs_qgroup_free_data(inode, NULL, start, end - start + 1); + + clear_extent_bit(io_tree, start, end, diff --git a/queue-4.18/btrfs-fix-use-after-free-when-dumping-free-space.patch b/queue-4.18/btrfs-fix-use-after-free-when-dumping-free-space.patch new file mode 100644 index 00000000000..6e72383221f --- /dev/null +++ b/queue-4.18/btrfs-fix-use-after-free-when-dumping-free-space.patch @@ -0,0 +1,221 @@ +From 9084cb6a24bf5838a665af92ded1af8363f9e563 Mon Sep 17 00:00:00 2001 +From: Filipe Manana +Date: Mon, 22 Oct 2018 10:43:06 +0100 +Subject: Btrfs: fix use-after-free when dumping free space + +From: Filipe Manana + +commit 9084cb6a24bf5838a665af92ded1af8363f9e563 upstream. + +We were iterating a block group's free space cache rbtree without locking +first the lock that protects it (the free_space_ctl->free_space_offset +rbtree is protected by the free_space_ctl->tree_lock spinlock). + +KASAN reported an use-after-free problem when iterating such a rbtree due +to a concurrent rbtree delete: + +[ 9520.359168] ================================================================== +[ 9520.359656] BUG: KASAN: use-after-free in rb_next+0x13/0x90 +[ 9520.359949] Read of size 8 at addr ffff8800b7ada500 by task btrfs-transacti/1721 +[ 9520.360357] +[ 9520.360530] CPU: 4 PID: 1721 Comm: btrfs-transacti Tainted: G L 4.19.0-rc8-nbor #555 +[ 9520.360990] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.10.2-1ubuntu1 04/01/2014 +[ 9520.362682] Call Trace: +[ 9520.362887] dump_stack+0xa4/0xf5 +[ 9520.363146] print_address_description+0x78/0x280 +[ 9520.363412] kasan_report+0x263/0x390 +[ 9520.363650] ? 
rb_next+0x13/0x90 +[ 9520.363873] __asan_load8+0x54/0x90 +[ 9520.364102] rb_next+0x13/0x90 +[ 9520.364380] btrfs_dump_free_space+0x146/0x160 [btrfs] +[ 9520.364697] dump_space_info+0x2cd/0x310 [btrfs] +[ 9520.364997] btrfs_reserve_extent+0x1ee/0x1f0 [btrfs] +[ 9520.365310] __btrfs_prealloc_file_range+0x1cc/0x620 [btrfs] +[ 9520.365646] ? btrfs_update_time+0x180/0x180 [btrfs] +[ 9520.365923] ? _raw_spin_unlock+0x27/0x40 +[ 9520.366204] ? btrfs_alloc_data_chunk_ondemand+0x2c0/0x5c0 [btrfs] +[ 9520.366549] btrfs_prealloc_file_range_trans+0x23/0x30 [btrfs] +[ 9520.366880] cache_save_setup+0x42e/0x580 [btrfs] +[ 9520.367220] ? btrfs_check_data_free_space+0xd0/0xd0 [btrfs] +[ 9520.367518] ? lock_downgrade+0x2f0/0x2f0 +[ 9520.367799] ? btrfs_write_dirty_block_groups+0x11f/0x6e0 [btrfs] +[ 9520.368104] ? kasan_check_read+0x11/0x20 +[ 9520.368349] ? do_raw_spin_unlock+0xa8/0x140 +[ 9520.368638] btrfs_write_dirty_block_groups+0x2af/0x6e0 [btrfs] +[ 9520.368978] ? btrfs_start_dirty_block_groups+0x870/0x870 [btrfs] +[ 9520.369282] ? do_raw_spin_unlock+0xa8/0x140 +[ 9520.369534] ? _raw_spin_unlock+0x27/0x40 +[ 9520.369811] ? btrfs_run_delayed_refs+0x1b8/0x230 [btrfs] +[ 9520.370137] commit_cowonly_roots+0x4b9/0x610 [btrfs] +[ 9520.370560] ? commit_fs_roots+0x350/0x350 [btrfs] +[ 9520.370926] ? btrfs_run_delayed_refs+0x1b8/0x230 [btrfs] +[ 9520.371285] btrfs_commit_transaction+0x5e5/0x10e0 [btrfs] +[ 9520.371612] ? btrfs_apply_pending_changes+0x90/0x90 [btrfs] +[ 9520.371943] ? start_transaction+0x168/0x6c0 [btrfs] +[ 9520.372257] transaction_kthread+0x21c/0x240 [btrfs] +[ 9520.372537] kthread+0x1d2/0x1f0 +[ 9520.372793] ? btrfs_cleanup_transaction+0xb50/0xb50 [btrfs] +[ 9520.373090] ? 
kthread_park+0xb0/0xb0 +[ 9520.373329] ret_from_fork+0x3a/0x50 +[ 9520.373567] +[ 9520.373738] Allocated by task 1804: +[ 9520.373974] kasan_kmalloc+0xff/0x180 +[ 9520.374208] kasan_slab_alloc+0x11/0x20 +[ 9520.374447] kmem_cache_alloc+0xfc/0x2d0 +[ 9520.374731] __btrfs_add_free_space+0x40/0x580 [btrfs] +[ 9520.375044] unpin_extent_range+0x4f7/0x7a0 [btrfs] +[ 9520.375383] btrfs_finish_extent_commit+0x15f/0x4d0 [btrfs] +[ 9520.375707] btrfs_commit_transaction+0xb06/0x10e0 [btrfs] +[ 9520.376027] btrfs_alloc_data_chunk_ondemand+0x237/0x5c0 [btrfs] +[ 9520.376365] btrfs_check_data_free_space+0x81/0xd0 [btrfs] +[ 9520.376689] btrfs_delalloc_reserve_space+0x25/0x80 [btrfs] +[ 9520.377018] btrfs_direct_IO+0x42e/0x6d0 [btrfs] +[ 9520.377284] generic_file_direct_write+0x11e/0x220 +[ 9520.377587] btrfs_file_write_iter+0x472/0xac0 [btrfs] +[ 9520.377875] aio_write+0x25c/0x360 +[ 9520.378106] io_submit_one+0xaa0/0xdc0 +[ 9520.378343] __se_sys_io_submit+0xfa/0x2f0 +[ 9520.378589] __x64_sys_io_submit+0x43/0x50 +[ 9520.378840] do_syscall_64+0x7d/0x240 +[ 9520.379081] entry_SYSCALL_64_after_hwframe+0x49/0xbe +[ 9520.379387] +[ 9520.379557] Freed by task 1802: +[ 9520.379782] __kasan_slab_free+0x173/0x260 +[ 9520.380028] kasan_slab_free+0xe/0x10 +[ 9520.380262] kmem_cache_free+0xc1/0x2c0 +[ 9520.380544] btrfs_find_space_for_alloc+0x4cd/0x4e0 [btrfs] +[ 9520.380866] find_free_extent+0xa99/0x17e0 [btrfs] +[ 9520.381166] btrfs_reserve_extent+0xd5/0x1f0 [btrfs] +[ 9520.381474] btrfs_get_blocks_direct+0x60b/0xbd0 [btrfs] +[ 9520.381761] __blockdev_direct_IO+0x10ee/0x58a1 +[ 9520.382059] btrfs_direct_IO+0x25a/0x6d0 [btrfs] +[ 9520.382321] generic_file_direct_write+0x11e/0x220 +[ 9520.382623] btrfs_file_write_iter+0x472/0xac0 [btrfs] +[ 9520.382904] aio_write+0x25c/0x360 +[ 9520.383172] io_submit_one+0xaa0/0xdc0 +[ 9520.383416] __se_sys_io_submit+0xfa/0x2f0 +[ 9520.383678] __x64_sys_io_submit+0x43/0x50 +[ 9520.383927] do_syscall_64+0x7d/0x240 +[ 9520.384165] 
entry_SYSCALL_64_after_hwframe+0x49/0xbe +[ 9520.384439] +[ 9520.384610] The buggy address belongs to the object at ffff8800b7ada500 + which belongs to the cache btrfs_free_space of size 72 +[ 9520.385175] The buggy address is located 0 bytes inside of + 72-byte region [ffff8800b7ada500, ffff8800b7ada548) +[ 9520.385691] The buggy address belongs to the page: +[ 9520.385957] page:ffffea0002deb680 count:1 mapcount:0 mapping:ffff880108a1d700 index:0x0 compound_mapcount: 0 +[ 9520.388030] flags: 0x8100(slab|head) +[ 9520.388281] raw: 0000000000008100 ffffea0002deb608 ffffea0002728808 ffff880108a1d700 +[ 9520.388722] raw: 0000000000000000 0000000000130013 00000001ffffffff 0000000000000000 +[ 9520.389169] page dumped because: kasan: bad access detected +[ 9520.389473] +[ 9520.389658] Memory state around the buggy address: +[ 9520.389943] ffff8800b7ada400: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc +[ 9520.390368] ffff8800b7ada480: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc +[ 9520.390796] >ffff8800b7ada500: fb fb fb fb fb fb fb fb fb fc fc fc fc fc fc fc +[ 9520.391223] ^ +[ 9520.391461] ffff8800b7ada580: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc +[ 9520.391885] ffff8800b7ada600: fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc fc +[ 9520.392313] ================================================================== +[ 9520.392772] BTRFS critical (device vdc): entry offset 2258497536, bytes 131072, bitmap no +[ 9520.393247] BUG: unable to handle kernel NULL pointer dereference at 0000000000000011 +[ 9520.393705] PGD 800000010dbab067 P4D 800000010dbab067 PUD 107551067 PMD 0 +[ 9520.394059] Oops: 0000 [#1] SMP DEBUG_PAGEALLOC KASAN PTI +[ 9520.394378] CPU: 4 PID: 1721 Comm: btrfs-transacti Tainted: G B L 4.19.0-rc8-nbor #555 +[ 9520.394858] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.10.2-1ubuntu1 04/01/2014 +[ 9520.395350] RIP: 0010:rb_next+0x3c/0x90 +[ 9520.396461] RSP: 0018:ffff8801074ff780 EFLAGS: 00010292 +[ 9520.396762] RAX: 
0000000000000000 RBX: 0000000000000001 RCX: ffffffff81b5ac4c +[ 9520.397115] RDX: 0000000000000000 RSI: 0000000000000008 RDI: 0000000000000011 +[ 9520.397468] RBP: ffff8801074ff7a0 R08: ffffed0021d64ccc R09: ffffed0021d64ccc +[ 9520.397821] R10: 0000000000000001 R11: ffffed0021d64ccb R12: ffff8800b91e0000 +[ 9520.398188] R13: ffff8800a3ceba48 R14: ffff8800b627bf80 R15: 0000000000020000 +[ 9520.398555] FS: 0000000000000000(0000) GS:ffff88010eb00000(0000) knlGS:0000000000000000 +[ 9520.399007] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 +[ 9520.399335] CR2: 0000000000000011 CR3: 0000000106b52000 CR4: 00000000000006a0 +[ 9520.399679] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 +[ 9520.400023] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 +[ 9520.400400] Call Trace: +[ 9520.400648] btrfs_dump_free_space+0x146/0x160 [btrfs] +[ 9520.400974] dump_space_info+0x2cd/0x310 [btrfs] +[ 9520.401287] btrfs_reserve_extent+0x1ee/0x1f0 [btrfs] +[ 9520.401609] __btrfs_prealloc_file_range+0x1cc/0x620 [btrfs] +[ 9520.401952] ? btrfs_update_time+0x180/0x180 [btrfs] +[ 9520.402232] ? _raw_spin_unlock+0x27/0x40 +[ 9520.402522] ? btrfs_alloc_data_chunk_ondemand+0x2c0/0x5c0 [btrfs] +[ 9520.402882] btrfs_prealloc_file_range_trans+0x23/0x30 [btrfs] +[ 9520.403261] cache_save_setup+0x42e/0x580 [btrfs] +[ 9520.403570] ? btrfs_check_data_free_space+0xd0/0xd0 [btrfs] +[ 9520.403871] ? lock_downgrade+0x2f0/0x2f0 +[ 9520.404161] ? btrfs_write_dirty_block_groups+0x11f/0x6e0 [btrfs] +[ 9520.404481] ? kasan_check_read+0x11/0x20 +[ 9520.404732] ? do_raw_spin_unlock+0xa8/0x140 +[ 9520.405026] btrfs_write_dirty_block_groups+0x2af/0x6e0 [btrfs] +[ 9520.405375] ? btrfs_start_dirty_block_groups+0x870/0x870 [btrfs] +[ 9520.405694] ? do_raw_spin_unlock+0xa8/0x140 +[ 9520.405958] ? _raw_spin_unlock+0x27/0x40 +[ 9520.406243] ? btrfs_run_delayed_refs+0x1b8/0x230 [btrfs] +[ 9520.406574] commit_cowonly_roots+0x4b9/0x610 [btrfs] +[ 9520.406899] ? 
commit_fs_roots+0x350/0x350 [btrfs] +[ 9520.407253] ? btrfs_run_delayed_refs+0x1b8/0x230 [btrfs] +[ 9520.407589] btrfs_commit_transaction+0x5e5/0x10e0 [btrfs] +[ 9520.407925] ? btrfs_apply_pending_changes+0x90/0x90 [btrfs] +[ 9520.408262] ? start_transaction+0x168/0x6c0 [btrfs] +[ 9520.408582] transaction_kthread+0x21c/0x240 [btrfs] +[ 9520.408870] kthread+0x1d2/0x1f0 +[ 9520.409138] ? btrfs_cleanup_transaction+0xb50/0xb50 [btrfs] +[ 9520.409440] ? kthread_park+0xb0/0xb0 +[ 9520.409682] ret_from_fork+0x3a/0x50 +[ 9520.410508] Dumping ftrace buffer: +[ 9520.410764] (ftrace buffer empty) +[ 9520.411007] CR2: 0000000000000011 +[ 9520.411297] ---[ end trace 01a0863445cf360a ]--- +[ 9520.411568] RIP: 0010:rb_next+0x3c/0x90 +[ 9520.412644] RSP: 0018:ffff8801074ff780 EFLAGS: 00010292 +[ 9520.412932] RAX: 0000000000000000 RBX: 0000000000000001 RCX: ffffffff81b5ac4c +[ 9520.413274] RDX: 0000000000000000 RSI: 0000000000000008 RDI: 0000000000000011 +[ 9520.413616] RBP: ffff8801074ff7a0 R08: ffffed0021d64ccc R09: ffffed0021d64ccc +[ 9520.414007] R10: 0000000000000001 R11: ffffed0021d64ccb R12: ffff8800b91e0000 +[ 9520.414349] R13: ffff8800a3ceba48 R14: ffff8800b627bf80 R15: 0000000000020000 +[ 9520.416074] FS: 0000000000000000(0000) GS:ffff88010eb00000(0000) knlGS:0000000000000000 +[ 9520.416536] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 +[ 9520.416848] CR2: 0000000000000011 CR3: 0000000106b52000 CR4: 00000000000006a0 +[ 9520.418477] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 +[ 9520.418846] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 +[ 9520.419204] Kernel panic - not syncing: Fatal exception +[ 9520.419666] Dumping ftrace buffer: +[ 9520.419930] (ftrace buffer empty) +[ 9520.420168] Kernel Offset: disabled +[ 9520.420406] ---[ end Kernel panic - not syncing: Fatal exception ]--- + +Fix this by acquiring the respective lock before iterating the rbtree. 
+ +Reported-by: Nikolay Borisov +CC: stable@vger.kernel.org # 4.4+ +Reviewed-by: Josef Bacik +Signed-off-by: Filipe Manana +Signed-off-by: David Sterba +Signed-off-by: Greg Kroah-Hartman + +--- + fs/btrfs/free-space-cache.c | 2 ++ + 1 file changed, 2 insertions(+) + +--- a/fs/btrfs/free-space-cache.c ++++ b/fs/btrfs/free-space-cache.c +@@ -2466,6 +2466,7 @@ void btrfs_dump_free_space(struct btrfs_ + struct rb_node *n; + int count = 0; + ++ spin_lock(&ctl->tree_lock); + for (n = rb_first(&ctl->free_space_offset); n; n = rb_next(n)) { + info = rb_entry(n, struct btrfs_free_space, offset_index); + if (info->bytes >= bytes && !block_group->ro) +@@ -2474,6 +2475,7 @@ void btrfs_dump_free_space(struct btrfs_ + info->offset, info->bytes, + (info->bitmap) ? "yes" : "no"); + } ++ spin_unlock(&ctl->tree_lock); + btrfs_info(fs_info, "block group has cluster?: %s", + list_empty(&block_group->cluster_list) ? "no" : "yes"); + btrfs_info(fs_info, diff --git a/queue-4.18/btrfs-move-the-dio_sem-higher-up-the-callchain.patch b/queue-4.18/btrfs-move-the-dio_sem-higher-up-the-callchain.patch new file mode 100644 index 00000000000..f5a8b6ac666 --- /dev/null +++ b/queue-4.18/btrfs-move-the-dio_sem-higher-up-the-callchain.patch @@ -0,0 +1,239 @@ +From c495144bc6962186feae31d687596d2472000e45 Mon Sep 17 00:00:00 2001 +From: Josef Bacik +Date: Fri, 12 Oct 2018 15:32:32 -0400 +Subject: btrfs: move the dio_sem higher up the callchain + +From: Josef Bacik + +commit c495144bc6962186feae31d687596d2472000e45 upstream. + +We're getting a lockdep splat because we take the dio_sem under the +log_mutex. What we really need is to protect fsync() from logging an +extent map for an extent we never waited on higher up, so just guard the +whole thing with dio_sem. 
+ +====================================================== +WARNING: possible circular locking dependency detected +4.18.0-rc4-xfstests-00025-g5de5edbaf1d4 #411 Not tainted +------------------------------------------------------ +aio-dio-invalid/30928 is trying to acquire lock: +0000000092621cfd (&mm->mmap_sem){++++}, at: get_user_pages_unlocked+0x5a/0x1e0 + +but task is already holding lock: +00000000cefe6b35 (&ei->dio_sem){++++}, at: btrfs_direct_IO+0x3be/0x400 + +which lock already depends on the new lock. + +the existing dependency chain (in reverse order) is: + +-> #5 (&ei->dio_sem){++++}: + lock_acquire+0xbd/0x220 + down_write+0x51/0xb0 + btrfs_log_changed_extents+0x80/0xa40 + btrfs_log_inode+0xbaf/0x1000 + btrfs_log_inode_parent+0x26f/0xa80 + btrfs_log_dentry_safe+0x50/0x70 + btrfs_sync_file+0x357/0x540 + do_fsync+0x38/0x60 + __ia32_sys_fdatasync+0x12/0x20 + do_fast_syscall_32+0x9a/0x2f0 + entry_SYSENTER_compat+0x84/0x96 + +-> #4 (&ei->log_mutex){+.+.}: + lock_acquire+0xbd/0x220 + __mutex_lock+0x86/0xa10 + btrfs_record_unlink_dir+0x2a/0xa0 + btrfs_unlink+0x5a/0xc0 + vfs_unlink+0xb1/0x1a0 + do_unlinkat+0x264/0x2b0 + do_fast_syscall_32+0x9a/0x2f0 + entry_SYSENTER_compat+0x84/0x96 + +-> #3 (sb_internal#2){.+.+}: + lock_acquire+0xbd/0x220 + __sb_start_write+0x14d/0x230 + start_transaction+0x3e6/0x590 + btrfs_evict_inode+0x475/0x640 + evict+0xbf/0x1b0 + btrfs_run_delayed_iputs+0x6c/0x90 + cleaner_kthread+0x124/0x1a0 + kthread+0x106/0x140 + ret_from_fork+0x3a/0x50 + +-> #2 (&fs_info->cleaner_delayed_iput_mutex){+.+.}: + lock_acquire+0xbd/0x220 + __mutex_lock+0x86/0xa10 + btrfs_alloc_data_chunk_ondemand+0x197/0x530 + btrfs_check_data_free_space+0x4c/0x90 + btrfs_delalloc_reserve_space+0x20/0x60 + btrfs_page_mkwrite+0x87/0x520 + do_page_mkwrite+0x31/0xa0 + __handle_mm_fault+0x799/0xb00 + handle_mm_fault+0x7c/0xe0 + __do_page_fault+0x1d3/0x4a0 + async_page_fault+0x1e/0x30 + +-> #1 (sb_pagefaults){.+.+}: + lock_acquire+0xbd/0x220 + __sb_start_write+0x14d/0x230 + 
btrfs_page_mkwrite+0x6a/0x520 + do_page_mkwrite+0x31/0xa0 + __handle_mm_fault+0x799/0xb00 + handle_mm_fault+0x7c/0xe0 + __do_page_fault+0x1d3/0x4a0 + async_page_fault+0x1e/0x30 + +-> #0 (&mm->mmap_sem){++++}: + __lock_acquire+0x42e/0x7a0 + lock_acquire+0xbd/0x220 + down_read+0x48/0xb0 + get_user_pages_unlocked+0x5a/0x1e0 + get_user_pages_fast+0xa4/0x150 + iov_iter_get_pages+0xc3/0x340 + do_direct_IO+0xf93/0x1d70 + __blockdev_direct_IO+0x32d/0x1c20 + btrfs_direct_IO+0x227/0x400 + generic_file_direct_write+0xcf/0x180 + btrfs_file_write_iter+0x308/0x58c + aio_write+0xf8/0x1d0 + io_submit_one+0x3a9/0x620 + __ia32_compat_sys_io_submit+0xb2/0x270 + do_int80_syscall_32+0x5b/0x1a0 + entry_INT80_compat+0x88/0xa0 + +other info that might help us debug this: + +Chain exists of: + &mm->mmap_sem --> &ei->log_mutex --> &ei->dio_sem + + Possible unsafe locking scenario: + + CPU0 CPU1 + ---- ---- + lock(&ei->dio_sem); + lock(&ei->log_mutex); + lock(&ei->dio_sem); + lock(&mm->mmap_sem); + + *** DEADLOCK *** + +1 lock held by aio-dio-invalid/30928: + #0: 00000000cefe6b35 (&ei->dio_sem){++++}, at: btrfs_direct_IO+0x3be/0x400 + +stack backtrace: +CPU: 0 PID: 30928 Comm: aio-dio-invalid Not tainted 4.18.0-rc4-xfstests-00025-g5de5edbaf1d4 #411 +Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.11.0-2.el7 04/01/2014 +Call Trace: + dump_stack+0x7c/0xbb + print_circular_bug.isra.37+0x297/0x2a4 + check_prev_add.constprop.45+0x781/0x7a0 + ? __lock_acquire+0x42e/0x7a0 + validate_chain.isra.41+0x7f0/0xb00 + __lock_acquire+0x42e/0x7a0 + lock_acquire+0xbd/0x220 + ? get_user_pages_unlocked+0x5a/0x1e0 + down_read+0x48/0xb0 + ? get_user_pages_unlocked+0x5a/0x1e0 + get_user_pages_unlocked+0x5a/0x1e0 + get_user_pages_fast+0xa4/0x150 + iov_iter_get_pages+0xc3/0x340 + do_direct_IO+0xf93/0x1d70 + ? __alloc_workqueue_key+0x358/0x490 + ? __blockdev_direct_IO+0x14b/0x1c20 + __blockdev_direct_IO+0x32d/0x1c20 + ? btrfs_run_delalloc_work+0x40/0x40 + ? can_nocow_extent+0x490/0x490 + ? 
kvm_clock_read+0x1f/0x30 + ? can_nocow_extent+0x490/0x490 + ? btrfs_run_delalloc_work+0x40/0x40 + btrfs_direct_IO+0x227/0x400 + ? btrfs_run_delalloc_work+0x40/0x40 + generic_file_direct_write+0xcf/0x180 + btrfs_file_write_iter+0x308/0x58c + aio_write+0xf8/0x1d0 + ? kvm_clock_read+0x1f/0x30 + ? __might_fault+0x3e/0x90 + io_submit_one+0x3a9/0x620 + ? io_submit_one+0xe5/0x620 + __ia32_compat_sys_io_submit+0xb2/0x270 + do_int80_syscall_32+0x5b/0x1a0 + entry_INT80_compat+0x88/0xa0 + +CC: stable@vger.kernel.org # 4.14+ +Reviewed-by: Filipe Manana +Signed-off-by: Josef Bacik +Signed-off-by: David Sterba +Signed-off-by: Greg Kroah-Hartman + +--- + fs/btrfs/file.c | 12 ++++++++++++ + fs/btrfs/tree-log.c | 2 -- + 2 files changed, 12 insertions(+), 2 deletions(-) + +--- a/fs/btrfs/file.c ++++ b/fs/btrfs/file.c +@@ -2082,6 +2082,14 @@ int btrfs_sync_file(struct file *file, l + goto out; + + inode_lock(inode); ++ ++ /* ++ * We take the dio_sem here because the tree log stuff can race with ++ * lockless dio writes and get an extent map logged for an extent we ++ * never waited on. We need it this high up for lockdep reasons. ++ */ ++ down_write(&BTRFS_I(inode)->dio_sem); ++ + atomic_inc(&root->log_batch); + full_sync = test_bit(BTRFS_INODE_NEEDS_FULL_SYNC, + &BTRFS_I(inode)->runtime_flags); +@@ -2133,6 +2141,7 @@ int btrfs_sync_file(struct file *file, l + ret = start_ordered_ops(inode, start, end); + } + if (ret) { ++ up_write(&BTRFS_I(inode)->dio_sem); + inode_unlock(inode); + goto out; + } +@@ -2188,6 +2197,7 @@ int btrfs_sync_file(struct file *file, l + * checked called fsync. 
+ */ + ret = filemap_check_wb_err(inode->i_mapping, file->f_wb_err); ++ up_write(&BTRFS_I(inode)->dio_sem); + inode_unlock(inode); + goto out; + } +@@ -2206,6 +2216,7 @@ int btrfs_sync_file(struct file *file, l + trans = btrfs_start_transaction(root, 0); + if (IS_ERR(trans)) { + ret = PTR_ERR(trans); ++ up_write(&BTRFS_I(inode)->dio_sem); + inode_unlock(inode); + goto out; + } +@@ -2227,6 +2238,7 @@ int btrfs_sync_file(struct file *file, l + * file again, but that will end up using the synchronization + * inside btrfs_sync_log to keep things safe. + */ ++ up_write(&BTRFS_I(inode)->dio_sem); + inode_unlock(inode); + + /* +--- a/fs/btrfs/tree-log.c ++++ b/fs/btrfs/tree-log.c +@@ -4524,7 +4524,6 @@ static int btrfs_log_changed_extents(str + + INIT_LIST_HEAD(&extents); + +- down_write(&inode->dio_sem); + write_lock(&tree->lock); + test_gen = root->fs_info->last_trans_committed; + logged_start = start; +@@ -4605,7 +4604,6 @@ process: + } + WARN_ON(!list_empty(&extents)); + write_unlock(&tree->lock); +- up_write(&inode->dio_sem); + + btrfs_release_path(path); + if (!ret) diff --git a/queue-4.18/btrfs-only-free-reserved-extent-if-we-didn-t-insert-it.patch b/queue-4.18/btrfs-only-free-reserved-extent-if-we-didn-t-insert-it.patch new file mode 100644 index 00000000000..af0461a2303 --- /dev/null +++ b/queue-4.18/btrfs-only-free-reserved-extent-if-we-didn-t-insert-it.patch @@ -0,0 +1,66 @@ +From 49940bdd57779c78462da7aa5a8650b2fea8c2ff Mon Sep 17 00:00:00 2001 +From: Josef Bacik +Date: Thu, 11 Oct 2018 15:54:21 -0400 +Subject: btrfs: only free reserved extent if we didn't insert it + +From: Josef Bacik + +commit 49940bdd57779c78462da7aa5a8650b2fea8c2ff upstream. + +When we insert the file extent once the ordered extent completes we free +the reserved extent reservation as it'll have been migrated to the +bytes_used counter. 
However if we error out after this step we'll still +clear the reserved extent reservation, resulting in a negative +accounting of the reserved bytes for the block group and space info. +Fix this by only doing the free if we didn't successfully insert a file +extent for this extent. + +CC: stable@vger.kernel.org # 4.14+ +Reviewed-by: Omar Sandoval +Reviewed-by: Filipe Manana +Signed-off-by: Josef Bacik +Signed-off-by: David Sterba +Signed-off-by: Greg Kroah-Hartman + +--- + fs/btrfs/inode.c | 10 +++++++++- + 1 file changed, 9 insertions(+), 1 deletion(-) + +--- a/fs/btrfs/inode.c ++++ b/fs/btrfs/inode.c +@@ -2951,6 +2951,7 @@ static int btrfs_finish_ordered_io(struc + bool truncated = false; + bool range_locked = false; + bool clear_new_delalloc_bytes = false; ++ bool clear_reserved_extent = true; + + if (!test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags) && + !test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags) && +@@ -3054,10 +3055,12 @@ static int btrfs_finish_ordered_io(struc + logical_len, logical_len, + compress_type, 0, 0, + BTRFS_FILE_EXTENT_REG); +- if (!ret) ++ if (!ret) { ++ clear_reserved_extent = false; + btrfs_release_delalloc_bytes(fs_info, + ordered_extent->start, + ordered_extent->disk_len); ++ } + } + unpin_extent_cache(&BTRFS_I(inode)->extent_tree, + ordered_extent->file_offset, ordered_extent->len, +@@ -3118,8 +3121,13 @@ out: + * wrong we need to return the space for this ordered extent + * back to the allocator. We only free the extent in the + * truncated case if we didn't write out the extent at all. ++ * ++ * If we made it past insert_reserved_file_extent before we ++ * errored out then we don't need to do this as the accounting ++ * has already been done. 
+ */ + if ((ret || !logical_len) && ++ clear_reserved_extent && + !test_bit(BTRFS_ORDERED_NOCOW, &ordered_extent->flags) && + !test_bit(BTRFS_ORDERED_PREALLOC, &ordered_extent->flags)) + btrfs_free_reserved_extent(fs_info, diff --git a/queue-4.18/btrfs-reset-max_extent_size-properly.patch b/queue-4.18/btrfs-reset-max_extent_size-properly.patch new file mode 100644 index 00000000000..226ace44322 --- /dev/null +++ b/queue-4.18/btrfs-reset-max_extent_size-properly.patch @@ -0,0 +1,42 @@ +From 21a94f7acf0f748599ea552af5d9ee7d7e41c72f Mon Sep 17 00:00:00 2001 +From: Josef Bacik +Date: Thu, 11 Oct 2018 15:54:03 -0400 +Subject: btrfs: reset max_extent_size properly + +From: Josef Bacik + +commit 21a94f7acf0f748599ea552af5d9ee7d7e41c72f upstream. + +If we use up our block group before allocating a new one we'll easily +get a max_extent_size that's set really really low, which will result in +a lot of fragmentation. We need to make sure we're resetting the +max_extent_size when we add a new chunk or add new space. 
+ +CC: stable@vger.kernel.org # 4.4+ +Reviewed-by: Filipe Manana +Signed-off-by: Josef Bacik +Signed-off-by: David Sterba +Signed-off-by: Greg Kroah-Hartman + +--- + fs/btrfs/extent-tree.c | 2 ++ + 1 file changed, 2 insertions(+) + +--- a/fs/btrfs/extent-tree.c ++++ b/fs/btrfs/extent-tree.c +@@ -4661,6 +4661,7 @@ again: + goto out; + } else { + ret = 1; ++ space_info->max_extent_size = 0; + } + + space_info->force_alloc = CHUNK_ALLOC_NO_FORCE; +@@ -6576,6 +6577,7 @@ static int btrfs_free_reserved_bytes(str + space_info->bytes_readonly += num_bytes; + cache->reserved -= num_bytes; + space_info->bytes_reserved -= num_bytes; ++ space_info->max_extent_size = 0; + + if (delalloc) + cache->delalloc_bytes -= num_bytes; diff --git a/queue-4.18/btrfs-set-max_extent_size-properly.patch b/queue-4.18/btrfs-set-max_extent_size-properly.patch new file mode 100644 index 00000000000..3d28e0e1947 --- /dev/null +++ b/queue-4.18/btrfs-set-max_extent_size-properly.patch @@ -0,0 +1,97 @@ +From ad22cf6ea47fa20fbe11ac324a0a15c0a9a4a2a9 Mon Sep 17 00:00:00 2001 +From: Josef Bacik +Date: Fri, 12 Oct 2018 15:32:33 -0400 +Subject: btrfs: set max_extent_size properly + +From: Josef Bacik + +commit ad22cf6ea47fa20fbe11ac324a0a15c0a9a4a2a9 upstream. + +We can't use entry->bytes if our entry is a bitmap entry, we need to use +entry->max_extent_size in that case. Fix up all the logic to make this +consistent. 
+ +CC: stable@vger.kernel.org # 4.4+ +Signed-off-by: Josef Bacik +Reviewed-by: David Sterba +Signed-off-by: David Sterba +Signed-off-by: Greg Kroah-Hartman + +--- + fs/btrfs/free-space-cache.c | 30 ++++++++++++++++++++---------- + 1 file changed, 20 insertions(+), 10 deletions(-) + +--- a/fs/btrfs/free-space-cache.c ++++ b/fs/btrfs/free-space-cache.c +@@ -1779,6 +1779,13 @@ static int search_bitmap(struct btrfs_fr + return -1; + } + ++static inline u64 get_max_extent_size(struct btrfs_free_space *entry) ++{ ++ if (entry->bitmap) ++ return entry->max_extent_size; ++ return entry->bytes; ++} ++ + /* Cache the size of the max extent in bytes */ + static struct btrfs_free_space * + find_free_space(struct btrfs_free_space_ctl *ctl, u64 *offset, u64 *bytes, +@@ -1800,8 +1807,8 @@ find_free_space(struct btrfs_free_space_ + for (node = &entry->offset_index; node; node = rb_next(node)) { + entry = rb_entry(node, struct btrfs_free_space, offset_index); + if (entry->bytes < *bytes) { +- if (entry->bytes > *max_extent_size) +- *max_extent_size = entry->bytes; ++ *max_extent_size = max(get_max_extent_size(entry), ++ *max_extent_size); + continue; + } + +@@ -1819,8 +1826,8 @@ find_free_space(struct btrfs_free_space_ + } + + if (entry->bytes < *bytes + align_off) { +- if (entry->bytes > *max_extent_size) +- *max_extent_size = entry->bytes; ++ *max_extent_size = max(get_max_extent_size(entry), ++ *max_extent_size); + continue; + } + +@@ -1832,8 +1839,10 @@ find_free_space(struct btrfs_free_space_ + *offset = tmp; + *bytes = size; + return entry; +- } else if (size > *max_extent_size) { +- *max_extent_size = size; ++ } else { ++ *max_extent_size = ++ max(get_max_extent_size(entry), ++ *max_extent_size); + } + continue; + } +@@ -2693,8 +2702,8 @@ static u64 btrfs_alloc_from_bitmap(struc + + err = search_bitmap(ctl, entry, &search_start, &search_bytes, true); + if (err) { +- if (search_bytes > *max_extent_size) +- *max_extent_size = search_bytes; ++ *max_extent_size = 
max(get_max_extent_size(entry), ++ *max_extent_size); + return 0; + } + +@@ -2731,8 +2740,9 @@ u64 btrfs_alloc_from_cluster(struct btrf + + entry = rb_entry(node, struct btrfs_free_space, offset_index); + while (1) { +- if (entry->bytes < bytes && entry->bytes > *max_extent_size) +- *max_extent_size = entry->bytes; ++ if (entry->bytes < bytes) ++ *max_extent_size = max(get_max_extent_size(entry), ++ *max_extent_size); + + if (entry->bytes < bytes || + (!entry->bitmap && entry->offset < min_start)) { diff --git a/queue-4.18/net-sched-remove-tca_options-from-policy.patch b/queue-4.18/net-sched-remove-tca_options-from-policy.patch new file mode 100644 index 00000000000..6f1046e2ede --- /dev/null +++ b/queue-4.18/net-sched-remove-tca_options-from-policy.patch @@ -0,0 +1,36 @@ +From e72bde6b66299602087c8c2350d36a525e75d06e Mon Sep 17 00:00:00 2001 +From: David Ahern +Date: Wed, 24 Oct 2018 08:32:49 -0700 +Subject: net: sched: Remove TCA_OPTIONS from policy + +From: David Ahern + +commit e72bde6b66299602087c8c2350d36a525e75d06e upstream. + +Marco reported an error with hfsc: +root@Calimero:~# tc qdisc add dev eth0 root handle 1:0 hfsc default 1 +Error: Attribute failed policy validation. + +Apparently a few implementations pass TCA_OPTIONS as a binary instead +of nested attribute, so drop TCA_OPTIONS from the policy. + +Fixes: 8b4c3cdd9dd8 ("net: sched: Add policy validation for tc attributes") +Reported-by: Marco Berizzi +Signed-off-by: David Ahern +Signed-off-by: David S. 
Miller +Signed-off-by: Greg Kroah-Hartman + +--- + net/sched/sch_api.c | 1 - + 1 file changed, 1 deletion(-) + +--- a/net/sched/sch_api.c ++++ b/net/sched/sch_api.c +@@ -1306,7 +1306,6 @@ check_loop_fn(struct Qdisc *q, unsigned + + const struct nla_policy rtm_tca_policy[TCA_MAX + 1] = { + [TCA_KIND] = { .type = NLA_STRING }, +- [TCA_OPTIONS] = { .type = NLA_NESTED }, + [TCA_RATE] = { .type = NLA_BINARY, + .len = sizeof(struct tc_estimator) }, + [TCA_STAB] = { .type = NLA_NESTED }, diff --git a/queue-4.18/series b/queue-4.18/series index 0d092b45cf5..802e4583898 100644 --- a/queue-4.18/series +++ b/queue-4.18/series @@ -333,3 +333,18 @@ btrfs-make-sure-we-create-all-new-block-groups.patch btrfs-fix-warning-when-replaying-log-after-fsync-of-a-tmpfile.patch btrfs-fix-wrong-dentries-after-fsync-of-file-that-got-its-parent-replaced.patch btrfs-qgroup-dirty-all-qgroups-before-rescan.patch +btrfs-fix-null-pointer-dereference-on-compressed-write-path-error.patch +btrfs-fix-assertion-on-fsync-of-regular-file-when-using-no-holes-feature.patch +btrfs-fix-deadlock-when-writing-out-free-space-caches.patch +btrfs-reset-max_extent_size-properly.patch +btrfs-set-max_extent_size-properly.patch +btrfs-don-t-use-ctl-free_space-for-max_extent_size.patch +btrfs-only-free-reserved-extent-if-we-didn-t-insert-it.patch +btrfs-fix-insert_reserved-error-handling.patch +btrfs-don-t-run-delayed_iputs-in-commit.patch +btrfs-move-the-dio_sem-higher-up-the-callchain.patch +btrfs-fix-use-after-free-during-inode-eviction.patch +btrfs-fix-use-after-free-when-dumping-free-space.patch +net-sched-remove-tca_options-from-policy.patch +userns-also-map-extents-in-the-reverse-map-to-kernel-ids.patch +bpf-wait-for-running-bpf-programs-when-updating-map-in-map.patch diff --git a/queue-4.18/userns-also-map-extents-in-the-reverse-map-to-kernel-ids.patch b/queue-4.18/userns-also-map-extents-in-the-reverse-map-to-kernel-ids.patch new file mode 100644 index 00000000000..9ada3ea8fa4 --- /dev/null +++ 
b/queue-4.18/userns-also-map-extents-in-the-reverse-map-to-kernel-ids.patch @@ -0,0 +1,65 @@ +From d2f007dbe7e4c9583eea6eb04d60001e85c6f1bd Mon Sep 17 00:00:00 2001 +From: Jann Horn +Date: Mon, 5 Nov 2018 20:55:09 +0100 +Subject: userns: also map extents in the reverse map to kernel IDs + +From: Jann Horn + +commit d2f007dbe7e4c9583eea6eb04d60001e85c6f1bd upstream. + +The current logic first clones the extent array and sorts both copies, then +maps the lower IDs of the forward mapping into the lower namespace, but +doesn't map the lower IDs of the reverse mapping. + +This means that code in a nested user namespace with >5 extents will see +incorrect IDs. It also breaks some access checks, like +inode_owner_or_capable() and privileged_wrt_inode_uidgid(), so a process +can incorrectly appear to be capable relative to an inode. + +To fix it, we have to make sure that the "lower_first" members of extents +in both arrays are translated; and we have to make sure that the reverse +map is sorted *after* the translation (since otherwise the translation can +break the sorting). + +This is CVE-2018-18955. + +Fixes: 6397fac4915a ("userns: bump idmap limits to 340") +Cc: stable@vger.kernel.org +Signed-off-by: Jann Horn +Tested-by: Eric W. Biederman +Reviewed-by: Eric W. Biederman +Signed-off-by: Eric W. Biederman +Signed-off-by: Greg Kroah-Hartman + +--- + kernel/user_namespace.c | 12 ++++++++---- + 1 file changed, 8 insertions(+), 4 deletions(-) + +--- a/kernel/user_namespace.c ++++ b/kernel/user_namespace.c +@@ -974,10 +974,6 @@ static ssize_t map_write(struct file *fi + if (!new_idmap_permitted(file, ns, cap_setid, &new_map)) + goto out; + +- ret = sort_idmaps(&new_map); +- if (ret < 0) +- goto out; +- + ret = -EPERM; + /* Map the lower ids from the parent user namespace to the + * kernel global id space. 
+@@ -1004,6 +1000,14 @@ static ssize_t map_write(struct file *fi + e->lower_first = lower_first; + } + ++ /* ++ * If we want to use binary search for lookup, this clones the extent ++ * array and sorts both copies. ++ */ ++ ret = sort_idmaps(&new_map); ++ if (ret < 0) ++ goto out; ++ + /* Install the map */ + if (new_map.nr_extents <= UID_GID_MAP_MAX_BASE_EXTENTS) { + memcpy(map->extent, new_map.extent,