From 67fc38223b665a7ee178538b63d97cc3fdee82ac Mon Sep 17 00:00:00 2001 From: Greg Kroah-Hartman Date: Thu, 3 Oct 2019 13:56:43 +0200 Subject: [PATCH] 4.19-stable patches added patches: block-fix-null-pointer-dereference-in-blk_mq_rq_timed_out.patch block-mq-deadline-fix-queue-restart-handling.patch btrfs-fix-allocation-of-free-space-cache-v1-bitmap-pages.patch btrfs-fix-race-setting-up-and-completing-qgroup-rescan-workers.patch btrfs-fix-use-after-free-when-using-the-tree-modification-log.patch btrfs-qgroup-fix-reserved-data-space-leak-if-we-have-multiple-reserve-calls.patch btrfs-qgroup-fix-the-wrong-target-io_tree-when-freeing-reserved-data-space.patch btrfs-relinquish-cpus-in-btrfs_compare_trees.patch efifb-bgrt-improve-efifb_bgrt_sanity_check.patch gfs2-clear-buf_in_tr-when-ending-a-transaction-in-sweep_bh_for_rgrps.patch i40e-check-__i40e_vf_disable-bit-in-i40e_sync_filters_subtask.patch memcg-kmem-do-not-fail-__gfp_nofail-charges.patch memcg-oom-don-t-require-__gfp_fs-when-invoking-memcg-oom-killer.patch ovl-filter-of-trusted-xattr-results-in-audit.patch ovl-fix-dereferencing-possible-err_ptr.patch smb3-allow-disabling-requesting-leases.patch --- ...r-dereference-in-blk_mq_rq_timed_out.patch | 137 ++++++++++++ ...-deadline-fix-queue-restart-handling.patch | 107 ++++++++++ ...-of-free-space-cache-v1-bitmap-pages.patch | 189 ++++++++++++++++ ...and-completing-qgroup-rescan-workers.patch | 202 ++++++++++++++++++ ...when-using-the-tree-modification-log.patch | 99 +++++++++ ...ak-if-we-have-multiple-reserve-calls.patch | 92 ++++++++ ...ree-when-freeing-reserved-data-space.patch | 81 +++++++ ...linquish-cpus-in-btrfs_compare_trees.patch | 69 ++++++ ...bgrt-improve-efifb_bgrt_sanity_check.patch | 71 ++++++ ...-a-transaction-in-sweep_bh_for_rgrps.patch | 39 ++++ ...ble-bit-in-i40e_sync_filters_subtask.patch | 74 +++++++ ...mem-do-not-fail-__gfp_nofail-charges.patch | 87 ++++++++ ...fp_fs-when-invoking-memcg-oom-killer.patch | 182 ++++++++++++++++ 
...er-of-trusted-xattr-results-in-audit.patch | 41 ++++ ...l-fix-dereferencing-possible-err_ptr.patch | 35 +++ queue-4.19/series | 16 ++ ...b3-allow-disabling-requesting-leases.patch | 118 ++++++++++ 17 files changed, 1639 insertions(+) create mode 100644 queue-4.19/block-fix-null-pointer-dereference-in-blk_mq_rq_timed_out.patch create mode 100644 queue-4.19/block-mq-deadline-fix-queue-restart-handling.patch create mode 100644 queue-4.19/btrfs-fix-allocation-of-free-space-cache-v1-bitmap-pages.patch create mode 100644 queue-4.19/btrfs-fix-race-setting-up-and-completing-qgroup-rescan-workers.patch create mode 100644 queue-4.19/btrfs-fix-use-after-free-when-using-the-tree-modification-log.patch create mode 100644 queue-4.19/btrfs-qgroup-fix-reserved-data-space-leak-if-we-have-multiple-reserve-calls.patch create mode 100644 queue-4.19/btrfs-qgroup-fix-the-wrong-target-io_tree-when-freeing-reserved-data-space.patch create mode 100644 queue-4.19/btrfs-relinquish-cpus-in-btrfs_compare_trees.patch create mode 100644 queue-4.19/efifb-bgrt-improve-efifb_bgrt_sanity_check.patch create mode 100644 queue-4.19/gfs2-clear-buf_in_tr-when-ending-a-transaction-in-sweep_bh_for_rgrps.patch create mode 100644 queue-4.19/i40e-check-__i40e_vf_disable-bit-in-i40e_sync_filters_subtask.patch create mode 100644 queue-4.19/memcg-kmem-do-not-fail-__gfp_nofail-charges.patch create mode 100644 queue-4.19/memcg-oom-don-t-require-__gfp_fs-when-invoking-memcg-oom-killer.patch create mode 100644 queue-4.19/ovl-filter-of-trusted-xattr-results-in-audit.patch create mode 100644 queue-4.19/ovl-fix-dereferencing-possible-err_ptr.patch create mode 100644 queue-4.19/smb3-allow-disabling-requesting-leases.patch diff --git a/queue-4.19/block-fix-null-pointer-dereference-in-blk_mq_rq_timed_out.patch b/queue-4.19/block-fix-null-pointer-dereference-in-blk_mq_rq_timed_out.patch new file mode 100644 index 00000000000..4c10ff8d9b8 --- /dev/null +++ 
b/queue-4.19/block-fix-null-pointer-dereference-in-blk_mq_rq_timed_out.patch @@ -0,0 +1,137 @@ +From 8d6996630c03d7ceeabe2611378fea5ca1c3f1b3 Mon Sep 17 00:00:00 2001 +From: Yufen Yu +Date: Fri, 27 Sep 2019 16:19:55 +0800 +Subject: block: fix null pointer dereference in blk_mq_rq_timed_out() + +From: Yufen Yu + +commit 8d6996630c03d7ceeabe2611378fea5ca1c3f1b3 upstream. + +We got a null pointer deference BUG_ON in blk_mq_rq_timed_out() +as following: + +[ 108.825472] BUG: kernel NULL pointer dereference, address: 0000000000000040 +[ 108.827059] PGD 0 P4D 0 +[ 108.827313] Oops: 0000 [#1] SMP PTI +[ 108.827657] CPU: 6 PID: 198 Comm: kworker/6:1H Not tainted 5.3.0-rc8+ #431 +[ 108.829503] Workqueue: kblockd blk_mq_timeout_work +[ 108.829913] RIP: 0010:blk_mq_check_expired+0x258/0x330 +[ 108.838191] Call Trace: +[ 108.838406] bt_iter+0x74/0x80 +[ 108.838665] blk_mq_queue_tag_busy_iter+0x204/0x450 +[ 108.839074] ? __switch_to_asm+0x34/0x70 +[ 108.839405] ? blk_mq_stop_hw_queue+0x40/0x40 +[ 108.839823] ? blk_mq_stop_hw_queue+0x40/0x40 +[ 108.840273] ? syscall_return_via_sysret+0xf/0x7f +[ 108.840732] blk_mq_timeout_work+0x74/0x200 +[ 108.841151] process_one_work+0x297/0x680 +[ 108.841550] worker_thread+0x29c/0x6f0 +[ 108.841926] ? rescuer_thread+0x580/0x580 +[ 108.842344] kthread+0x16a/0x1a0 +[ 108.842666] ? kthread_flush_work+0x170/0x170 +[ 108.843100] ret_from_fork+0x35/0x40 + +The bug is caused by the race between timeout handle and completion for +flush request. + +When timeout handle function blk_mq_rq_timed_out() try to read +'req->q->mq_ops', the 'req' have completed and reinitiated by next +flush request, which would call blk_rq_init() to clear 'req' as 0. + +After commit 12f5b93145 ("blk-mq: Remove generation seqeunce"), +normal requests lifetime are protected by refcount. Until 'rq->ref' +drop to zero, the request can really be free. Thus, these requests +cannot been reused before timeout handle finish. 
+ +However, flush request has defined .end_io and rq->end_io() is still +called even if 'rq->ref' doesn't drop to zero. After that, the 'flush_rq' +can be reused by the next flush request handle, resulting in null +pointer deference BUG ON. + +We fix this problem by covering flush request with 'rq->ref'. +If the refcount is not zero, flush_end_io() return and wait the +last holder recall it. To record the request status, we add a new +entry 'rq_status', which will be used in flush_end_io(). + +Cc: Christoph Hellwig +Cc: Keith Busch +Cc: Bart Van Assche +Cc: stable@vger.kernel.org # v4.18+ +Reviewed-by: Ming Lei +Reviewed-by: Bob Liu +Signed-off-by: Yufen Yu +Signed-off-by: Greg Kroah-Hartman + +------- +v2: + - move rq_status from struct request to struct blk_flush_queue +v3: + - remove unnecessary '{}' pair. +v4: + - let spinlock to protect 'fq->rq_status' +v5: + - move rq_status after flush_running_idx member of struct blk_flush_queue +Signed-off-by: Jens Axboe + +--- + block/blk-flush.c | 10 ++++++++++ + block/blk-mq.c | 5 ++++- + block/blk.h | 7 +++++++ + 3 files changed, 21 insertions(+), 1 deletion(-) + +--- a/block/blk-flush.c ++++ b/block/blk-flush.c +@@ -232,6 +232,16 @@ static void flush_end_io(struct request + + /* release the tag's ownership to the req cloned from */ + spin_lock_irqsave(&fq->mq_flush_lock, flags); ++ ++ if (!refcount_dec_and_test(&flush_rq->ref)) { ++ fq->rq_status = error; ++ spin_unlock_irqrestore(&fq->mq_flush_lock, flags); ++ return; ++ } ++ ++ if (fq->rq_status != BLK_STS_OK) ++ error = fq->rq_status; ++ + hctx = blk_mq_map_queue(q, flush_rq->mq_ctx->cpu); + if (!q->elevator) { + blk_mq_tag_set_rq(hctx, flush_rq->tag, fq->orig_rq); +--- a/block/blk-mq.c ++++ b/block/blk-mq.c +@@ -844,7 +844,10 @@ static void blk_mq_check_expired(struct + */ + if (blk_mq_req_expired(rq, next)) + blk_mq_rq_timed_out(rq, reserved); +- if (refcount_dec_and_test(&rq->ref)) ++ ++ if (is_flush_rq(rq, hctx)) ++ rq->end_io(rq, 0); ++ else if 
(refcount_dec_and_test(&rq->ref)) + __blk_mq_free_request(rq); + } + +--- a/block/blk.h ++++ b/block/blk.h +@@ -23,6 +23,7 @@ struct blk_flush_queue { + unsigned int flush_queue_delayed:1; + unsigned int flush_pending_idx:1; + unsigned int flush_running_idx:1; ++ blk_status_t rq_status; + unsigned long flush_pending_since; + struct list_head flush_queue[2]; + struct list_head flush_data_in_flight; +@@ -123,6 +124,12 @@ static inline void __blk_get_queue(struc + kobject_get(&q->kobj); + } + ++static inline bool ++is_flush_rq(struct request *req, struct blk_mq_hw_ctx *hctx) ++{ ++ return hctx->fq->flush_rq == req; ++} ++ + struct blk_flush_queue *blk_alloc_flush_queue(struct request_queue *q, + int node, int cmd_size, gfp_t flags); + void blk_free_flush_queue(struct blk_flush_queue *q); diff --git a/queue-4.19/block-mq-deadline-fix-queue-restart-handling.patch b/queue-4.19/block-mq-deadline-fix-queue-restart-handling.patch new file mode 100644 index 00000000000..a87d442c445 --- /dev/null +++ b/queue-4.19/block-mq-deadline-fix-queue-restart-handling.patch @@ -0,0 +1,107 @@ +From cb8acabbe33b110157955a7425ee876fb81e6bbc Mon Sep 17 00:00:00 2001 +From: Damien Le Moal +Date: Wed, 28 Aug 2019 13:40:20 +0900 +Subject: block: mq-deadline: Fix queue restart handling + +From: Damien Le Moal + +commit cb8acabbe33b110157955a7425ee876fb81e6bbc upstream. + +Commit 7211aef86f79 ("block: mq-deadline: Fix write completion +handling") added a call to blk_mq_sched_mark_restart_hctx() in +dd_dispatch_request() to make sure that write request dispatching does +not stall when all target zones are locked. This fix left a subtle race +when a write completion happens during a dispatch execution on another +CPU: + +CPU 0: Dispatch CPU1: write completion + +dd_dispatch_request() + lock(&dd->lock); + ... + lock(&dd->zone_lock); dd_finish_request() + rq = find request lock(&dd->zone_lock); + unlock(&dd->zone_lock); + zone write unlock + unlock(&dd->zone_lock); + ... 
+ __blk_mq_free_request + check restart flag (not set) + -> queue not run + ... + if (!rq && have writes) + blk_mq_sched_mark_restart_hctx() + unlock(&dd->lock) + +Since the dispatch context finishes after the write request completion +handling, marking the queue as needing a restart is not seen from +__blk_mq_free_request() and blk_mq_sched_restart() not executed leading +to the dispatch stall under 100% write workloads. + +Fix this by moving the call to blk_mq_sched_mark_restart_hctx() from +dd_dispatch_request() into dd_finish_request() under the zone lock to +ensure full mutual exclusion between write request dispatch selection +and zone unlock on write request completion. + +Fixes: 7211aef86f79 ("block: mq-deadline: Fix write completion handling") +Cc: stable@vger.kernel.org +Reported-by: Hans Holmberg +Reviewed-by: Hans Holmberg +Reviewed-by: Christoph Hellwig +Signed-off-by: Damien Le Moal +Signed-off-by: Jens Axboe +Signed-off-by: Greg Kroah-Hartman + +--- + block/mq-deadline.c | 19 +++++++++---------- + 1 file changed, 9 insertions(+), 10 deletions(-) + +--- a/block/mq-deadline.c ++++ b/block/mq-deadline.c +@@ -376,13 +376,6 @@ done: + * hardware queue, but we may return a request that is for a + * different hardware queue. This is because mq-deadline has shared + * state for all hardware queues, in terms of sorting, FIFOs, etc. +- * +- * For a zoned block device, __dd_dispatch_request() may return NULL +- * if all the queued write requests are directed at zones that are already +- * locked due to on-going write requests. In this case, make sure to mark +- * the queue as needing a restart to ensure that the queue is run again +- * and the pending writes dispatched once the target zones for the ongoing +- * write requests are unlocked in dd_finish_request(). 
+ */ + static struct request *dd_dispatch_request(struct blk_mq_hw_ctx *hctx) + { +@@ -391,9 +384,6 @@ static struct request *dd_dispatch_reque + + spin_lock(&dd->lock); + rq = __dd_dispatch_request(dd); +- if (!rq && blk_queue_is_zoned(hctx->queue) && +- !list_empty(&dd->fifo_list[WRITE])) +- blk_mq_sched_mark_restart_hctx(hctx); + spin_unlock(&dd->lock); + + return rq; +@@ -559,6 +549,13 @@ static void dd_prepare_request(struct re + * spinlock so that the zone is never unlocked while deadline_fifo_request() + * or deadline_next_request() are executing. This function is called for + * all requests, whether or not these requests complete successfully. ++ * ++ * For a zoned block device, __dd_dispatch_request() may have stopped ++ * dispatching requests if all the queued requests are write requests directed ++ * at zones that are already locked due to on-going write requests. To ensure ++ * write request dispatch progress in this case, mark the queue as needing a ++ * restart to ensure that the queue is run again after completion of the ++ * request and zones being unlocked. 
+ */ + static void dd_finish_request(struct request *rq) + { +@@ -570,6 +567,8 @@ static void dd_finish_request(struct req + + spin_lock_irqsave(&dd->zone_lock, flags); + blk_req_zone_write_unlock(rq); ++ if (!list_empty(&dd->fifo_list[WRITE])) ++ blk_mq_sched_mark_restart_hctx(rq->mq_hctx); + spin_unlock_irqrestore(&dd->zone_lock, flags); + } + } diff --git a/queue-4.19/btrfs-fix-allocation-of-free-space-cache-v1-bitmap-pages.patch b/queue-4.19/btrfs-fix-allocation-of-free-space-cache-v1-bitmap-pages.patch new file mode 100644 index 00000000000..4dab52cf2b2 --- /dev/null +++ b/queue-4.19/btrfs-fix-allocation-of-free-space-cache-v1-bitmap-pages.patch @@ -0,0 +1,189 @@ +From 3acd48507dc43eeeb0a1fe965b8bad91cab904a7 Mon Sep 17 00:00:00 2001 +From: Christophe Leroy +Date: Wed, 21 Aug 2019 15:05:55 +0000 +Subject: btrfs: fix allocation of free space cache v1 bitmap pages + +From: Christophe Leroy + +commit 3acd48507dc43eeeb0a1fe965b8bad91cab904a7 upstream. + +Various notifications of type "BUG kmalloc-4096 () : Redzone +overwritten" have been observed recently in various parts of the kernel. +After some time, it has been made a relation with the use of BTRFS +filesystem and with SLUB_DEBUG turned on. + +[ 22.809700] BUG kmalloc-4096 (Tainted: G W ): Redzone overwritten + +[ 22.810286] INFO: 0xbe1a5921-0xfbfc06cd. 
First byte 0x0 instead of 0xcc +[ 22.810866] INFO: Allocated in __load_free_space_cache+0x588/0x780 [btrfs] age=22 cpu=0 pid=224 +[ 22.811193] __slab_alloc.constprop.26+0x44/0x70 +[ 22.811345] kmem_cache_alloc_trace+0xf0/0x2ec +[ 22.811588] __load_free_space_cache+0x588/0x780 [btrfs] +[ 22.811848] load_free_space_cache+0xf4/0x1b0 [btrfs] +[ 22.812090] cache_block_group+0x1d0/0x3d0 [btrfs] +[ 22.812321] find_free_extent+0x680/0x12a4 [btrfs] +[ 22.812549] btrfs_reserve_extent+0xec/0x220 [btrfs] +[ 22.812785] btrfs_alloc_tree_block+0x178/0x5f4 [btrfs] +[ 22.813032] __btrfs_cow_block+0x150/0x5d4 [btrfs] +[ 22.813262] btrfs_cow_block+0x194/0x298 [btrfs] +[ 22.813484] commit_cowonly_roots+0x44/0x294 [btrfs] +[ 22.813718] btrfs_commit_transaction+0x63c/0xc0c [btrfs] +[ 22.813973] close_ctree+0xf8/0x2a4 [btrfs] +[ 22.814107] generic_shutdown_super+0x80/0x110 +[ 22.814250] kill_anon_super+0x18/0x30 +[ 22.814437] btrfs_kill_super+0x18/0x90 [btrfs] +[ 22.814590] INFO: Freed in proc_cgroup_show+0xc0/0x248 age=41 cpu=0 pid=83 +[ 22.814841] proc_cgroup_show+0xc0/0x248 +[ 22.814967] proc_single_show+0x54/0x98 +[ 22.815086] seq_read+0x278/0x45c +[ 22.815190] __vfs_read+0x28/0x17c +[ 22.815289] vfs_read+0xa8/0x14c +[ 22.815381] ksys_read+0x50/0x94 +[ 22.815475] ret_from_syscall+0x0/0x38 + +Commit 69d2480456d1 ("btrfs: use copy_page for copying pages instead of +memcpy") changed the way bitmap blocks are copied. But allthough bitmaps +have the size of a page, they were allocated with kzalloc(). + +Most of the time, kzalloc() allocates aligned blocks of memory, so +copy_page() can be used. But when some debug options like SLAB_DEBUG are +activated, kzalloc() may return unaligned pointer. + +On powerpc, memcpy(), copy_page() and other copying functions use +'dcbz' instruction which provides an entire zeroed cacheline to avoid +memory read when the intention is to overwrite a full line. 
Functions +like memcpy() are writen to care about partial cachelines at the start +and end of the destination, but copy_page() assumes it gets pages. As +pages are naturally cache aligned, copy_page() doesn't care about +partial lines. This means that when copy_page() is called with a +misaligned pointer, a few leading bytes are zeroed. + +To fix it, allocate bitmaps through kmem_cache instead of using kzalloc() +The cache pool is created with PAGE_SIZE alignment constraint. + +Reported-by: Erhard F. +Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=204371 +Fixes: 69d2480456d1 ("btrfs: use copy_page for copying pages instead of memcpy") +Cc: stable@vger.kernel.org # 4.19+ +Signed-off-by: Christophe Leroy +Reviewed-by: David Sterba +[ rename to btrfs_free_space_bitmap ] +Signed-off-by: David Sterba +Signed-off-by: Greg Kroah-Hartman + +--- + fs/btrfs/ctree.h | 1 + + fs/btrfs/free-space-cache.c | 18 +++++++++++------- + fs/btrfs/inode.c | 8 ++++++++ + 3 files changed, 20 insertions(+), 7 deletions(-) + +--- a/fs/btrfs/ctree.h ++++ b/fs/btrfs/ctree.h +@@ -39,6 +39,7 @@ extern struct kmem_cache *btrfs_trans_ha + extern struct kmem_cache *btrfs_bit_radix_cachep; + extern struct kmem_cache *btrfs_path_cachep; + extern struct kmem_cache *btrfs_free_space_cachep; ++extern struct kmem_cache *btrfs_free_space_bitmap_cachep; + struct btrfs_ordered_sum; + + #ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS +--- a/fs/btrfs/free-space-cache.c ++++ b/fs/btrfs/free-space-cache.c +@@ -763,7 +763,8 @@ static int __load_free_space_cache(struc + } else { + ASSERT(num_bitmaps); + num_bitmaps--; +- e->bitmap = kzalloc(PAGE_SIZE, GFP_NOFS); ++ e->bitmap = kmem_cache_zalloc( ++ btrfs_free_space_bitmap_cachep, GFP_NOFS); + if (!e->bitmap) { + kmem_cache_free( + btrfs_free_space_cachep, e); +@@ -1864,7 +1865,7 @@ static void free_bitmap(struct btrfs_fre + struct btrfs_free_space *bitmap_info) + { + unlink_free_space(ctl, bitmap_info); +- kfree(bitmap_info->bitmap); ++ 
kmem_cache_free(btrfs_free_space_bitmap_cachep, bitmap_info->bitmap); + kmem_cache_free(btrfs_free_space_cachep, bitmap_info); + ctl->total_bitmaps--; + ctl->op->recalc_thresholds(ctl); +@@ -2118,7 +2119,8 @@ new_bitmap: + } + + /* allocate the bitmap */ +- info->bitmap = kzalloc(PAGE_SIZE, GFP_NOFS); ++ info->bitmap = kmem_cache_zalloc(btrfs_free_space_bitmap_cachep, ++ GFP_NOFS); + spin_lock(&ctl->tree_lock); + if (!info->bitmap) { + ret = -ENOMEM; +@@ -2130,7 +2132,8 @@ new_bitmap: + out: + if (info) { + if (info->bitmap) +- kfree(info->bitmap); ++ kmem_cache_free(btrfs_free_space_bitmap_cachep, ++ info->bitmap); + kmem_cache_free(btrfs_free_space_cachep, info); + } + +@@ -2786,7 +2789,8 @@ out: + if (entry->bytes == 0) { + ctl->free_extents--; + if (entry->bitmap) { +- kfree(entry->bitmap); ++ kmem_cache_free(btrfs_free_space_bitmap_cachep, ++ entry->bitmap); + ctl->total_bitmaps--; + ctl->op->recalc_thresholds(ctl); + } +@@ -3594,7 +3598,7 @@ again: + } + + if (!map) { +- map = kzalloc(PAGE_SIZE, GFP_NOFS); ++ map = kmem_cache_zalloc(btrfs_free_space_bitmap_cachep, GFP_NOFS); + if (!map) { + kmem_cache_free(btrfs_free_space_cachep, info); + return -ENOMEM; +@@ -3624,7 +3628,7 @@ again: + if (info) + kmem_cache_free(btrfs_free_space_cachep, info); + if (map) +- kfree(map); ++ kmem_cache_free(btrfs_free_space_bitmap_cachep, map); + return 0; + } + +--- a/fs/btrfs/inode.c ++++ b/fs/btrfs/inode.c +@@ -72,6 +72,7 @@ static struct kmem_cache *btrfs_inode_ca + struct kmem_cache *btrfs_trans_handle_cachep; + struct kmem_cache *btrfs_path_cachep; + struct kmem_cache *btrfs_free_space_cachep; ++struct kmem_cache *btrfs_free_space_bitmap_cachep; + + #define S_SHIFT 12 + static const unsigned char btrfs_type_by_mode[S_IFMT >> S_SHIFT] = { +@@ -9361,6 +9362,7 @@ void __cold btrfs_destroy_cachep(void) + kmem_cache_destroy(btrfs_trans_handle_cachep); + kmem_cache_destroy(btrfs_path_cachep); + kmem_cache_destroy(btrfs_free_space_cachep); ++ 
kmem_cache_destroy(btrfs_free_space_bitmap_cachep); + } + + int __init btrfs_init_cachep(void) +@@ -9390,6 +9392,12 @@ int __init btrfs_init_cachep(void) + if (!btrfs_free_space_cachep) + goto fail; + ++ btrfs_free_space_bitmap_cachep = kmem_cache_create("btrfs_free_space_bitmap", ++ PAGE_SIZE, PAGE_SIZE, ++ SLAB_RED_ZONE, NULL); ++ if (!btrfs_free_space_bitmap_cachep) ++ goto fail; ++ + return 0; + fail: + btrfs_destroy_cachep(); diff --git a/queue-4.19/btrfs-fix-race-setting-up-and-completing-qgroup-rescan-workers.patch b/queue-4.19/btrfs-fix-race-setting-up-and-completing-qgroup-rescan-workers.patch new file mode 100644 index 00000000000..6ecc58f48e4 --- /dev/null +++ b/queue-4.19/btrfs-fix-race-setting-up-and-completing-qgroup-rescan-workers.patch @@ -0,0 +1,202 @@ +From 13fc1d271a2e3ab8a02071e711add01fab9271f6 Mon Sep 17 00:00:00 2001 +From: Filipe Manana +Date: Tue, 24 Sep 2019 10:49:54 +0100 +Subject: Btrfs: fix race setting up and completing qgroup rescan workers + +From: Filipe Manana + +commit 13fc1d271a2e3ab8a02071e711add01fab9271f6 upstream. + +There is a race between setting up a qgroup rescan worker and completing +a qgroup rescan worker that can lead to callers of the qgroup rescan wait +ioctl to either not wait for the rescan worker to complete or to hang +forever due to missing wake ups. The following diagram shows a sequence +of steps that illustrates the race. 
+ + CPU 1 CPU 2 CPU 3 + + btrfs_ioctl_quota_rescan() + btrfs_qgroup_rescan() + qgroup_rescan_init() + mutex_lock(&fs_info->qgroup_rescan_lock) + spin_lock(&fs_info->qgroup_lock) + + fs_info->qgroup_flags |= + BTRFS_QGROUP_STATUS_FLAG_RESCAN + + init_completion( + &fs_info->qgroup_rescan_completion) + + fs_info->qgroup_rescan_running = true + + mutex_unlock(&fs_info->qgroup_rescan_lock) + spin_unlock(&fs_info->qgroup_lock) + + btrfs_init_work() + --> starts the worker + + btrfs_qgroup_rescan_worker() + mutex_lock(&fs_info->qgroup_rescan_lock) + + fs_info->qgroup_flags &= + ~BTRFS_QGROUP_STATUS_FLAG_RESCAN + + mutex_unlock(&fs_info->qgroup_rescan_lock) + + starts transaction, updates qgroup status + item, etc + + btrfs_ioctl_quota_rescan() + btrfs_qgroup_rescan() + qgroup_rescan_init() + mutex_lock(&fs_info->qgroup_rescan_lock) + spin_lock(&fs_info->qgroup_lock) + + fs_info->qgroup_flags |= + BTRFS_QGROUP_STATUS_FLAG_RESCAN + + init_completion( + &fs_info->qgroup_rescan_completion) + + fs_info->qgroup_rescan_running = true + + mutex_unlock(&fs_info->qgroup_rescan_lock) + spin_unlock(&fs_info->qgroup_lock) + + btrfs_init_work() + --> starts another worker + + mutex_lock(&fs_info->qgroup_rescan_lock) + + fs_info->qgroup_rescan_running = false + + mutex_unlock(&fs_info->qgroup_rescan_lock) + + complete_all(&fs_info->qgroup_rescan_completion) + +Before the rescan worker started by the task at CPU 3 completes, if +another task calls btrfs_ioctl_quota_rescan(), it will get -EINPROGRESS +because the flag BTRFS_QGROUP_STATUS_FLAG_RESCAN is set at +fs_info->qgroup_flags, which is expected and correct behaviour. + +However if other task calls btrfs_ioctl_quota_rescan_wait() before the +rescan worker started by the task at CPU 3 completes, it will return +immediately without waiting for the new rescan worker to complete, +because fs_info->qgroup_rescan_running is set to false by CPU 2. + +This race is making test case btrfs/171 (from fstests) to fail often: + + btrfs/171 9s ... 
- output mismatch (see /home/fdmanana/git/hub/xfstests/results//btrfs/171.out.bad) +# --- tests/btrfs/171.out 2018-09-16 21:30:48.505104287 +0100 +# +++ /home/fdmanana/git/hub/xfstests/results//btrfs/171.out.bad 2019-09-19 02:01:36.938486039 +0100 +# @@ -1,2 +1,3 @@ +# QA output created by 171 +# +ERROR: quota rescan failed: Operation now in progress +# Silence is golden +# ... +# (Run 'diff -u /home/fdmanana/git/hub/xfstests/tests/btrfs/171.out /home/fdmanana/git/hub/xfstests/results//btrfs/171.out.bad' to see the entire diff) + +That is because the test calls the btrfs-progs commands "qgroup quota +rescan -w", "qgroup assign" and "qgroup remove" in a sequence that makes +calls to the rescan start ioctl fail with -EINPROGRESS (note the "btrfs" +commands 'qgroup assign' and 'qgroup remove' often call the rescan start +ioctl after calling the qgroup assign ioctl, +btrfs_ioctl_qgroup_assign()), since previous waits didn't actually wait +for a rescan worker to complete. + +Another problem the race can cause is missing wake ups for waiters, +since the call to complete_all() happens outside a critical section and +after clearing the flag BTRFS_QGROUP_STATUS_FLAG_RESCAN. In the sequence +diagram above, if we have a waiter for the first rescan task (executed +by CPU 2), then fs_info->qgroup_rescan_completion.wait is not empty, and +if after the rescan worker clears BTRFS_QGROUP_STATUS_FLAG_RESCAN and +before it calls complete_all() against +fs_info->qgroup_rescan_completion, the task at CPU 3 calls +init_completion() against fs_info->qgroup_rescan_completion which +re-initializes its wait queue to an empty queue, therefore causing the +rescan worker at CPU 2 to call complete_all() against an empty queue, +never waking up the task waiting for that rescan worker. 
+ +Fix this by clearing BTRFS_QGROUP_STATUS_FLAG_RESCAN and setting +fs_info->qgroup_rescan_running to false in the same critical section, +delimited by the mutex fs_info->qgroup_rescan_lock, as well as doing the +call to complete_all() in that same critical section. This gives the +protection needed to avoid rescan wait ioctl callers not waiting for a +running rescan worker and the lost wake ups problem, since setting that +rescan flag and boolean as well as initializing the wait queue is done +already in a critical section delimited by that mutex (at +qgroup_rescan_init()). + +Fixes: 57254b6ebce4ce ("Btrfs: add ioctl to wait for qgroup rescan completion") +Fixes: d2c609b834d62f ("btrfs: properly track when rescan worker is running") +CC: stable@vger.kernel.org # 4.4+ +Reviewed-by: Josef Bacik +Signed-off-by: Filipe Manana +Signed-off-by: David Sterba +Signed-off-by: Greg Kroah-Hartman + +--- + fs/btrfs/qgroup.c | 33 +++++++++++++++++++-------------- + 1 file changed, 19 insertions(+), 14 deletions(-) + +--- a/fs/btrfs/qgroup.c ++++ b/fs/btrfs/qgroup.c +@@ -2796,9 +2796,6 @@ out: + btrfs_free_path(path); + + mutex_lock(&fs_info->qgroup_rescan_lock); +- if (!btrfs_fs_closing(fs_info)) +- fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_RESCAN; +- + if (err > 0 && + fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT) { + fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT; +@@ -2814,16 +2811,30 @@ out: + trans = btrfs_start_transaction(fs_info->quota_root, 1); + if (IS_ERR(trans)) { + err = PTR_ERR(trans); ++ trans = NULL; + btrfs_err(fs_info, + "fail to start transaction for status update: %d", + err); +- goto done; + } +- ret = update_qgroup_status_item(trans); +- if (ret < 0) { +- err = ret; +- btrfs_err(fs_info, "fail to update qgroup status: %d", err); ++ ++ mutex_lock(&fs_info->qgroup_rescan_lock); ++ if (!btrfs_fs_closing(fs_info)) ++ fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_RESCAN; ++ if (trans) { ++ ret = 
update_qgroup_status_item(trans); ++ if (ret < 0) { ++ err = ret; ++ btrfs_err(fs_info, "fail to update qgroup status: %d", ++ err); ++ } + } ++ fs_info->qgroup_rescan_running = false; ++ complete_all(&fs_info->qgroup_rescan_completion); ++ mutex_unlock(&fs_info->qgroup_rescan_lock); ++ ++ if (!trans) ++ return; ++ + btrfs_end_transaction(trans); + + if (btrfs_fs_closing(fs_info)) { +@@ -2834,12 +2845,6 @@ out: + } else { + btrfs_err(fs_info, "qgroup scan failed with %d", err); + } +- +-done: +- mutex_lock(&fs_info->qgroup_rescan_lock); +- fs_info->qgroup_rescan_running = false; +- mutex_unlock(&fs_info->qgroup_rescan_lock); +- complete_all(&fs_info->qgroup_rescan_completion); + } + + /* diff --git a/queue-4.19/btrfs-fix-use-after-free-when-using-the-tree-modification-log.patch b/queue-4.19/btrfs-fix-use-after-free-when-using-the-tree-modification-log.patch new file mode 100644 index 00000000000..b7fd260f455 --- /dev/null +++ b/queue-4.19/btrfs-fix-use-after-free-when-using-the-tree-modification-log.patch @@ -0,0 +1,99 @@ +From efad8a853ad2057f96664328a0d327a05ce39c76 Mon Sep 17 00:00:00 2001 +From: Filipe Manana +Date: Mon, 12 Aug 2019 19:14:29 +0100 +Subject: Btrfs: fix use-after-free when using the tree modification log + +From: Filipe Manana + +commit efad8a853ad2057f96664328a0d327a05ce39c76 upstream. + +At ctree.c:get_old_root(), we are accessing a root's header owner field +after we have freed the respective extent buffer. This results in an +use-after-free that can lead to crashes, and when CONFIG_DEBUG_PAGEALLOC +is set, results in a stack trace like the following: + + [ 3876.799331] stack segment: 0000 [#1] SMP DEBUG_PAGEALLOC PTI + [ 3876.799363] CPU: 0 PID: 15436 Comm: pool Not tainted 5.3.0-rc3-btrfs-next-54 #1 + [ 3876.799385] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.12.0-0-ga698c8995f-prebuilt.qemu.org 04/01/2014 + [ 3876.799433] RIP: 0010:btrfs_search_old_slot+0x652/0xd80 [btrfs] + (...) 
+ [ 3876.799502] RSP: 0018:ffff9f08c1a2f9f0 EFLAGS: 00010286 + [ 3876.799518] RAX: ffff8dd300000000 RBX: ffff8dd85a7a9348 RCX: 000000038da26000 + [ 3876.799538] RDX: 0000000000000000 RSI: ffffe522ce368980 RDI: 0000000000000246 + [ 3876.799559] RBP: dae1922adadad000 R08: 0000000008020000 R09: ffffe522c0000000 + [ 3876.799579] R10: ffff8dd57fd788c8 R11: 000000007511b030 R12: ffff8dd781ddc000 + [ 3876.799599] R13: ffff8dd9e6240578 R14: ffff8dd6896f7a88 R15: ffff8dd688cf90b8 + [ 3876.799620] FS: 00007f23ddd97700(0000) GS:ffff8dda20200000(0000) knlGS:0000000000000000 + [ 3876.799643] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 + [ 3876.799660] CR2: 00007f23d4024000 CR3: 0000000710bb0005 CR4: 00000000003606f0 + [ 3876.799682] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 + [ 3876.799703] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 + [ 3876.799723] Call Trace: + [ 3876.799735] ? do_raw_spin_unlock+0x49/0xc0 + [ 3876.799749] ? _raw_spin_unlock+0x24/0x30 + [ 3876.799779] resolve_indirect_refs+0x1eb/0xc80 [btrfs] + [ 3876.799810] find_parent_nodes+0x38d/0x1180 [btrfs] + [ 3876.799841] btrfs_check_shared+0x11a/0x1d0 [btrfs] + [ 3876.799870] ? extent_fiemap+0x598/0x6e0 [btrfs] + [ 3876.799895] extent_fiemap+0x598/0x6e0 [btrfs] + [ 3876.799913] do_vfs_ioctl+0x45a/0x700 + [ 3876.799926] ksys_ioctl+0x70/0x80 + [ 3876.799938] ? trace_hardirqs_off_thunk+0x1a/0x20 + [ 3876.799953] __x64_sys_ioctl+0x16/0x20 + [ 3876.799965] do_syscall_64+0x62/0x220 + [ 3876.799977] entry_SYSCALL_64_after_hwframe+0x49/0xbe + [ 3876.799993] RIP: 0033:0x7f23e0013dd7 + (...) 
+ [ 3876.800056] RSP: 002b:00007f23ddd96ca8 EFLAGS: 00000246 ORIG_RAX: 0000000000000010 + [ 3876.800078] RAX: ffffffffffffffda RBX: 00007f23d80210f8 RCX: 00007f23e0013dd7 + [ 3876.800099] RDX: 00007f23d80210f8 RSI: 00000000c020660b RDI: 0000000000000003 + [ 3876.800626] RBP: 000055fa2a2a2440 R08: 0000000000000000 R09: 00007f23ddd96d7c + [ 3876.801143] R10: 00007f23d8022000 R11: 0000000000000246 R12: 00007f23ddd96d80 + [ 3876.801662] R13: 00007f23ddd96d78 R14: 00007f23d80210f0 R15: 00007f23ddd96d80 + (...) + [ 3876.805107] ---[ end trace e53161e179ef04f9 ]--- + +Fix that by saving the root's header owner field into a local variable +before freeing the root's extent buffer, and then use that local variable +when needed. + +Fixes: 30b0463a9394d9 ("Btrfs: fix accessing the root pointer in tree mod log functions") +CC: stable@vger.kernel.org # 3.10+ +Reviewed-by: Nikolay Borisov +Reviewed-by: Anand Jain +Signed-off-by: Filipe Manana +Reviewed-by: David Sterba +Signed-off-by: David Sterba +Signed-off-by: Greg Kroah-Hartman + +--- + fs/btrfs/ctree.c | 4 +++- + 1 file changed, 3 insertions(+), 1 deletion(-) + +--- a/fs/btrfs/ctree.c ++++ b/fs/btrfs/ctree.c +@@ -1374,6 +1374,7 @@ get_old_root(struct btrfs_root *root, u6 + struct tree_mod_elem *tm; + struct extent_buffer *eb = NULL; + struct extent_buffer *eb_root; ++ u64 eb_root_owner = 0; + struct extent_buffer *old; + struct tree_mod_root *old_root = NULL; + u64 old_generation = 0; +@@ -1411,6 +1412,7 @@ get_old_root(struct btrfs_root *root, u6 + free_extent_buffer(old); + } + } else if (old_root) { ++ eb_root_owner = btrfs_header_owner(eb_root); + btrfs_tree_read_unlock(eb_root); + free_extent_buffer(eb_root); + eb = alloc_dummy_extent_buffer(fs_info, logical); +@@ -1428,7 +1430,7 @@ get_old_root(struct btrfs_root *root, u6 + if (old_root) { + btrfs_set_header_bytenr(eb, eb->start); + btrfs_set_header_backref_rev(eb, BTRFS_MIXED_BACKREF_REV); +- btrfs_set_header_owner(eb, btrfs_header_owner(eb_root)); ++ 
btrfs_set_header_owner(eb, eb_root_owner); + btrfs_set_header_level(eb, old_root->level); + btrfs_set_header_generation(eb, old_generation); + } diff --git a/queue-4.19/btrfs-qgroup-fix-reserved-data-space-leak-if-we-have-multiple-reserve-calls.patch b/queue-4.19/btrfs-qgroup-fix-reserved-data-space-leak-if-we-have-multiple-reserve-calls.patch new file mode 100644 index 00000000000..9d39f1a7aec --- /dev/null +++ b/queue-4.19/btrfs-qgroup-fix-reserved-data-space-leak-if-we-have-multiple-reserve-calls.patch @@ -0,0 +1,92 @@ +From d4e204948fe3e0dc8e1fbf3f8f3290c9c2823be3 Mon Sep 17 00:00:00 2001 +From: Qu Wenruo +Date: Mon, 16 Sep 2019 20:02:39 +0800 +Subject: btrfs: qgroup: Fix reserved data space leak if we have multiple reserve calls + +From: Qu Wenruo + +commit d4e204948fe3e0dc8e1fbf3f8f3290c9c2823be3 upstream. + +[BUG] +The following script can cause btrfs qgroup data space leak: + + mkfs.btrfs -f $dev + mount $dev -o nospace_cache $mnt + + btrfs subv create $mnt/subv + btrfs quota en $mnt + btrfs quota rescan -w $mnt + btrfs qgroup limit 128m $mnt/subv + + for (( i = 0; i < 3; i++)); do + # Create 3 64M holes for latter fallocate to fail + truncate -s 192m $mnt/subv/file + xfs_io -c "pwrite 64m 4k" $mnt/subv/file > /dev/null + xfs_io -c "pwrite 128m 4k" $mnt/subv/file > /dev/null + sync + + # it's supposed to fail, and each failure will leak at least 64M + # data space + xfs_io -f -c "falloc 0 192m" $mnt/subv/file &> /dev/null + rm $mnt/subv/file + sync + done + + # Shouldn't fail after we removed the file + xfs_io -f -c "falloc 0 64m" $mnt/subv/file + +[CAUSE] +Btrfs qgroup data reserve code allow multiple reservations to happen on +a single extent_changeset: +E.g: + btrfs_qgroup_reserve_data(inode, &data_reserved, 0, SZ_1M); + btrfs_qgroup_reserve_data(inode, &data_reserved, SZ_1M, SZ_2M); + btrfs_qgroup_reserve_data(inode, &data_reserved, 0, SZ_4M); + +Btrfs qgroup code has its internal tracking to make sure we don't +double-reserve in above example. 
+ +The only pattern utilizing this feature is in the main while loop of +btrfs_fallocate() function. + +However btrfs_qgroup_reserve_data()'s error handling has a bug in that +on error it clears all ranges in the io_tree with EXTENT_QGROUP_RESERVED +flag but doesn't free previously reserved bytes. + +This bug has a twofold effect: +- Clearing EXTENT_QGROUP_RESERVED ranges + This is the correct behavior, but it prevents + btrfs_qgroup_check_reserved_leak() from catching the leakage as the + detector is purely EXTENT_QGROUP_RESERVED flag based. + +- Leak the previously reserved data bytes. + +The bug manifests when N calls to btrfs_qgroup_reserve_data are made and +the last one fails, leaking space reserved in the previous ones. + +[FIX] +Also free previously reserved data bytes when btrfs_qgroup_reserve_data +fails. + +Fixes: 524725537023 ("btrfs: qgroup: Introduce btrfs_qgroup_reserve_data function") +CC: stable@vger.kernel.org # 4.4+ +Signed-off-by: Qu Wenruo +Signed-off-by: David Sterba +Signed-off-by: Greg Kroah-Hartman + +--- + fs/btrfs/qgroup.c | 3 +++ + 1 file changed, 3 insertions(+) + +--- a/fs/btrfs/qgroup.c ++++ b/fs/btrfs/qgroup.c +@@ -3067,6 +3067,9 @@ cleanup: + while ((unode = ulist_next(&reserved->range_changed, &uiter))) + clear_extent_bit(&BTRFS_I(inode)->io_tree, unode->val, + unode->aux, EXTENT_QGROUP_RESERVED, 0, 0, NULL); ++ /* Also free data bytes of already reserved one */ ++ btrfs_qgroup_free_refroot(root->fs_info, root->root_key.objectid, ++ orig_reserved, BTRFS_QGROUP_RSV_DATA); + extent_changeset_release(reserved); + return ret; + } diff --git a/queue-4.19/btrfs-qgroup-fix-the-wrong-target-io_tree-when-freeing-reserved-data-space.patch b/queue-4.19/btrfs-qgroup-fix-the-wrong-target-io_tree-when-freeing-reserved-data-space.patch new file mode 100644 index 00000000000..4d909687464 --- /dev/null +++ b/queue-4.19/btrfs-qgroup-fix-the-wrong-target-io_tree-when-freeing-reserved-data-space.patch @@ -0,0 +1,81 @@ +From 
bab32fc069ce8829c416e8737c119f62a57970f9 Mon Sep 17 00:00:00 2001 +From: Qu Wenruo +Date: Mon, 16 Sep 2019 20:02:38 +0800 +Subject: btrfs: qgroup: Fix the wrong target io_tree when freeing reserved data space + +From: Qu Wenruo + +commit bab32fc069ce8829c416e8737c119f62a57970f9 upstream. + +[BUG] +Under the following case with qgroup enabled, if some error happened +after we have reserved delalloc space, then in error handling path, we +could cause qgroup data space leakage: + +From btrfs_truncate_block() in inode.c: + + ret = btrfs_delalloc_reserve_space(inode, &data_reserved, + block_start, blocksize); + if (ret) + goto out; + + again: + page = find_or_create_page(mapping, index, mask); + if (!page) { + btrfs_delalloc_release_space(inode, data_reserved, + block_start, blocksize, true); + btrfs_delalloc_release_extents(BTRFS_I(inode), blocksize, true); + ret = -ENOMEM; + goto out; + } + +[CAUSE] +In the above case, btrfs_delalloc_reserve_space() will call +btrfs_qgroup_reserve_data() and mark the io_tree range with +EXTENT_QGROUP_RESERVED flag. + +In the error handling path, we have the following call stack: +btrfs_delalloc_release_space() +|- btrfs_free_reserved_data_space() + |- btrfs_qgroup_free_data() + |- __btrfs_qgroup_release_data(reserved=@reserved, free=1) + |- qgroup_free_reserved_data(reserved=@reserved) + |- clear_record_extent_bits(); + |- freed += changeset.bytes_changed; + +However due to a completion bug, qgroup_free_reserved_data() will clear +EXTENT_QGROUP_RESERVED flag in BTRFS_I(inode)->io_failure_tree, rather +than the correct BTRFS_I(inode)->io_tree. +Since io_failure_tree is never marked with that flag, +btrfs_qgroup_free_data() will not free any data reserved space at all, +causing a leakage. + +This type of error handling can only be triggered by errors outside of +qgroup code. So EDQUOT error from qgroup can't trigger it. + +[FIX] +Fix the wrong target io_tree. 
+ +Reported-by: Josef Bacik +Fixes: bc42bda22345 ("btrfs: qgroup: Fix qgroup reserved space underflow by only freeing reserved ranges") +CC: stable@vger.kernel.org # 4.14+ +Reviewed-by: Nikolay Borisov +Signed-off-by: Qu Wenruo +Signed-off-by: David Sterba +Signed-off-by: Greg Kroah-Hartman + +--- + fs/btrfs/qgroup.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +--- a/fs/btrfs/qgroup.c ++++ b/fs/btrfs/qgroup.c +@@ -3111,7 +3111,7 @@ static int qgroup_free_reserved_data(str + * EXTENT_QGROUP_RESERVED, we won't double free. + * So not need to rush. + */ +- ret = clear_record_extent_bits(&BTRFS_I(inode)->io_failure_tree, ++ ret = clear_record_extent_bits(&BTRFS_I(inode)->io_tree, + free_start, free_start + free_len - 1, + EXTENT_QGROUP_RESERVED, &changeset); + if (ret < 0) diff --git a/queue-4.19/btrfs-relinquish-cpus-in-btrfs_compare_trees.patch b/queue-4.19/btrfs-relinquish-cpus-in-btrfs_compare_trees.patch new file mode 100644 index 00000000000..305fe056244 --- /dev/null +++ b/queue-4.19/btrfs-relinquish-cpus-in-btrfs_compare_trees.patch @@ -0,0 +1,69 @@ +From 6af112b11a4bc1b560f60a618ac9c1dcefe9836e Mon Sep 17 00:00:00 2001 +From: Nikolay Borisov +Date: Wed, 4 Sep 2019 19:33:58 +0300 +Subject: btrfs: Relinquish CPUs in btrfs_compare_trees + +From: Nikolay Borisov + +commit 6af112b11a4bc1b560f60a618ac9c1dcefe9836e upstream. + +When doing any form of incremental send the parent and the child trees +need to be compared via btrfs_compare_trees. This can result in long +loop chains without ever relinquishing the CPU. This causes softlockup +detector to trigger when comparing trees with a lot of items. Example +report: + +watchdog: BUG: soft lockup - CPU#0 stuck for 24s! 
[snapperd:16153] +CPU: 0 PID: 16153 Comm: snapperd Not tainted 5.2.9-1-default #1 openSUSE Tumbleweed (unreleased) +Hardware name: QEMU KVM Virtual Machine, BIOS 0.0.0 02/06/2015 +pstate: 40000005 (nZcv daif -PAN -UAO) +pc : __ll_sc_arch_atomic_sub_return+0x14/0x20 +lr : btrfs_release_extent_buffer_pages+0xe0/0x1e8 [btrfs] +sp : ffff00001273b7e0 +Call trace: + __ll_sc_arch_atomic_sub_return+0x14/0x20 + release_extent_buffer+0xdc/0x120 [btrfs] + free_extent_buffer.part.0+0xb0/0x118 [btrfs] + free_extent_buffer+0x24/0x30 [btrfs] + btrfs_release_path+0x4c/0xa0 [btrfs] + btrfs_free_path.part.0+0x20/0x40 [btrfs] + btrfs_free_path+0x24/0x30 [btrfs] + get_inode_info+0xa8/0xf8 [btrfs] + finish_inode_if_needed+0xe0/0x6d8 [btrfs] + changed_cb+0x9c/0x410 [btrfs] + btrfs_compare_trees+0x284/0x648 [btrfs] + send_subvol+0x33c/0x520 [btrfs] + btrfs_ioctl_send+0x8a0/0xaf0 [btrfs] + btrfs_ioctl+0x199c/0x2288 [btrfs] + do_vfs_ioctl+0x4b0/0x820 + ksys_ioctl+0x84/0xb8 + __arm64_sys_ioctl+0x28/0x38 + el0_svc_common.constprop.0+0x7c/0x188 + el0_svc_handler+0x34/0x90 + el0_svc+0x8/0xc + +Fix this by adding a call to cond_resched at the beginning of the main +loop in btrfs_compare_trees. 
+ +Fixes: 7069830a9e38 ("Btrfs: add btrfs_compare_trees function") +CC: stable@vger.kernel.org # 4.4+ +Reviewed-by: Johannes Thumshirn +Signed-off-by: Nikolay Borisov +Reviewed-by: David Sterba +Signed-off-by: David Sterba +Signed-off-by: Greg Kroah-Hartman + +--- + fs/btrfs/ctree.c | 1 + + 1 file changed, 1 insertion(+) + +--- a/fs/btrfs/ctree.c ++++ b/fs/btrfs/ctree.c +@@ -5516,6 +5516,7 @@ int btrfs_compare_trees(struct btrfs_roo + advance_left = advance_right = 0; + + while (1) { ++ cond_resched(); + if (advance_left && !left_end_reached) { + ret = tree_advance(fs_info, left_path, &left_level, + left_root_level, diff --git a/queue-4.19/efifb-bgrt-improve-efifb_bgrt_sanity_check.patch b/queue-4.19/efifb-bgrt-improve-efifb_bgrt_sanity_check.patch new file mode 100644 index 00000000000..67845f9585f --- /dev/null +++ b/queue-4.19/efifb-bgrt-improve-efifb_bgrt_sanity_check.patch @@ -0,0 +1,71 @@ +From 51677dfcc17f88ed754143df670ff064eae67f84 Mon Sep 17 00:00:00 2001 +From: Hans de Goede +Date: Sun, 21 Jul 2019 15:19:18 +0200 +Subject: efifb: BGRT: Improve efifb_bgrt_sanity_check + +From: Hans de Goede + +commit 51677dfcc17f88ed754143df670ff064eae67f84 upstream. + +For various reasons, at least with x86 EFI firmwares, the xoffset and +yoffset in the BGRT info are not always reliable. + +Extensive testing has shown that when the info is correct, the +BGRT image is always exactly centered horizontally (the yoffset variable +is more variable and not always predictable). + +This commit simplifies / improves the bgrt_sanity_check to simply +check that the BGRT image is exactly centered horizontally and skips +(re)drawing it when it is not. + +This fixes the BGRT image sometimes being drawn in the wrong place. 
+ +Cc: stable@vger.kernel.org +Fixes: 88fe4ceb2447 ("efifb: BGRT: Do not copy the boot graphics for non native resolutions") +Signed-off-by: Hans de Goede +Cc: Peter Jones , +Signed-off-by: Bartlomiej Zolnierkiewicz +Link: https://patchwork.freedesktop.org/patch/msgid/20190721131918.10115-1-hdegoede@redhat.com +Signed-off-by: Greg Kroah-Hartman + +--- + drivers/video/fbdev/efifb.c | 27 ++++++--------------------- + 1 file changed, 6 insertions(+), 21 deletions(-) + +--- a/drivers/video/fbdev/efifb.c ++++ b/drivers/video/fbdev/efifb.c +@@ -122,28 +122,13 @@ static void efifb_copy_bmp(u8 *src, u32 + */ + static bool efifb_bgrt_sanity_check(struct screen_info *si, u32 bmp_width) + { +- static const int default_resolutions[][2] = { +- { 800, 600 }, +- { 1024, 768 }, +- { 1280, 1024 }, +- }; +- u32 i, right_margin; ++ /* ++ * All x86 firmwares horizontally center the image (the yoffset ++ * calculations differ between boards, but xoffset is predictable). ++ */ ++ u32 expected_xoffset = (si->lfb_width - bmp_width) / 2; + +- for (i = 0; i < ARRAY_SIZE(default_resolutions); i++) { +- if (default_resolutions[i][0] == si->lfb_width && +- default_resolutions[i][1] == si->lfb_height) +- break; +- } +- /* If not a default resolution used for textmode, this should be fine */ +- if (i >= ARRAY_SIZE(default_resolutions)) +- return true; +- +- /* If the right margin is 5 times smaller then the left one, reject */ +- right_margin = si->lfb_width - (bgrt_tab.image_offset_x + bmp_width); +- if (right_margin < (bgrt_tab.image_offset_x / 5)) +- return false; +- +- return true; ++ return bgrt_tab.image_offset_x == expected_xoffset; + } + #else + static bool efifb_bgrt_sanity_check(struct screen_info *si, u32 bmp_width) diff --git a/queue-4.19/gfs2-clear-buf_in_tr-when-ending-a-transaction-in-sweep_bh_for_rgrps.patch b/queue-4.19/gfs2-clear-buf_in_tr-when-ending-a-transaction-in-sweep_bh_for_rgrps.patch new file mode 100644 index 00000000000..ef33ff61ad6 --- /dev/null +++ 
b/queue-4.19/gfs2-clear-buf_in_tr-when-ending-a-transaction-in-sweep_bh_for_rgrps.patch @@ -0,0 +1,39 @@ +From f0b444b349e33ae0d3dd93e25ca365482a5d17d4 Mon Sep 17 00:00:00 2001 +From: Bob Peterson +Date: Thu, 12 Sep 2019 13:54:27 -0400 +Subject: gfs2: clear buf_in_tr when ending a transaction in sweep_bh_for_rgrps + +From: Bob Peterson + +commit f0b444b349e33ae0d3dd93e25ca365482a5d17d4 upstream. + +In function sweep_bh_for_rgrps, which is a helper for punch_hole, +it uses variable buf_in_tr to keep track of when it needs to commit +pending block frees on a partial delete that overflows the +transaction created for the delete. The problem is that the +variable was initialized at the start of function sweep_bh_for_rgrps +but it was never cleared, even when starting a new transaction. + +This patch reinitializes the variable when the transaction is +ended, so the next transaction starts out with it cleared. + +Fixes: d552a2b9b33e ("GFS2: Non-recursive delete") +Cc: stable@vger.kernel.org # v4.12+ +Signed-off-by: Bob Peterson +Signed-off-by: Andreas Gruenbacher +Signed-off-by: Greg Kroah-Hartman + +--- + fs/gfs2/bmap.c | 1 + + 1 file changed, 1 insertion(+) + +--- a/fs/gfs2/bmap.c ++++ b/fs/gfs2/bmap.c +@@ -1630,6 +1630,7 @@ out_unlock: + brelse(dibh); + up_write(&ip->i_rw_mutex); + gfs2_trans_end(sdp); ++ buf_in_tr = false; + } + gfs2_glock_dq_uninit(rd_gh); + cond_resched(); diff --git a/queue-4.19/i40e-check-__i40e_vf_disable-bit-in-i40e_sync_filters_subtask.patch b/queue-4.19/i40e-check-__i40e_vf_disable-bit-in-i40e_sync_filters_subtask.patch new file mode 100644 index 00000000000..5738d7eaefc --- /dev/null +++ b/queue-4.19/i40e-check-__i40e_vf_disable-bit-in-i40e_sync_filters_subtask.patch @@ -0,0 +1,74 @@ +From a7542b87607560d0b89e7ff81d870bd6ff8835cb Mon Sep 17 00:00:00 2001 +From: Stefan Assmann +Date: Wed, 21 Aug 2019 16:09:29 +0200 +Subject: i40e: check __I40E_VF_DISABLE bit in i40e_sync_filters_subtask + +From: Stefan Assmann + +commit 
a7542b87607560d0b89e7ff81d870bd6ff8835cb upstream. + +While testing VF spawn/destroy the following panic occurred. + +BUG: unable to handle kernel NULL pointer dereference at 0000000000000029 +[...] +Workqueue: i40e i40e_service_task [i40e] +RIP: 0010:i40e_sync_vsi_filters+0x6fd/0xc60 [i40e] +[...] +Call Trace: + ? __switch_to_asm+0x35/0x70 + ? __switch_to_asm+0x41/0x70 + ? __switch_to_asm+0x35/0x70 + ? _cond_resched+0x15/0x30 + i40e_sync_filters_subtask+0x56/0x70 [i40e] + i40e_service_task+0x382/0x11b0 [i40e] + ? __switch_to_asm+0x41/0x70 + ? __switch_to_asm+0x41/0x70 + process_one_work+0x1a7/0x3b0 + worker_thread+0x30/0x390 + ? create_worker+0x1a0/0x1a0 + kthread+0x112/0x130 + ? kthread_bind+0x30/0x30 + ret_from_fork+0x35/0x40 + +Investigation revealed a race where pf->vf[vsi->vf_id].trusted may get +accessed by the watchdog via i40e_sync_filters_subtask() although +i40e_free_vfs() already free'd pf->vf. +To avoid this the call to i40e_sync_vsi_filters() in +i40e_sync_filters_subtask() needs to be guarded by __I40E_VF_DISABLE, +which is also used by i40e_free_vfs(). + +Note: put the __I40E_VF_DISABLE check after the +__I40E_MACVLAN_SYNC_PENDING check as the latter is more likely to +trigger. 
+ +CC: stable@vger.kernel.org +Signed-off-by: Stefan Assmann +Tested-by: Andrew Bowers +Signed-off-by: Jeff Kirsher +Signed-off-by: Greg Kroah-Hartman + +--- + drivers/net/ethernet/intel/i40e/i40e_main.c | 5 +++++ + 1 file changed, 5 insertions(+) + +--- a/drivers/net/ethernet/intel/i40e/i40e_main.c ++++ b/drivers/net/ethernet/intel/i40e/i40e_main.c +@@ -2566,6 +2566,10 @@ static void i40e_sync_filters_subtask(st + return; + if (!test_and_clear_bit(__I40E_MACVLAN_SYNC_PENDING, pf->state)) + return; ++ if (test_and_set_bit(__I40E_VF_DISABLE, pf->state)) { ++ set_bit(__I40E_MACVLAN_SYNC_PENDING, pf->state); ++ return; ++ } + + for (v = 0; v < pf->num_alloc_vsi; v++) { + if (pf->vsi[v] && +@@ -2580,6 +2584,7 @@ static void i40e_sync_filters_subtask(st + } + } + } ++ clear_bit(__I40E_VF_DISABLE, pf->state); + } + + /** diff --git a/queue-4.19/memcg-kmem-do-not-fail-__gfp_nofail-charges.patch b/queue-4.19/memcg-kmem-do-not-fail-__gfp_nofail-charges.patch new file mode 100644 index 00000000000..a8a974a8f88 --- /dev/null +++ b/queue-4.19/memcg-kmem-do-not-fail-__gfp_nofail-charges.patch @@ -0,0 +1,87 @@ +From e55d9d9bfb69405bd7615c0f8d229d8fafb3e9b8 Mon Sep 17 00:00:00 2001 +From: Michal Hocko +Date: Wed, 25 Sep 2019 16:45:53 -0700 +Subject: memcg, kmem: do not fail __GFP_NOFAIL charges + +From: Michal Hocko + +commit e55d9d9bfb69405bd7615c0f8d229d8fafb3e9b8 upstream. + +Thomas has noticed the following NULL ptr dereference when using cgroup +v1 kmem limit: +BUG: unable to handle kernel NULL pointer dereference at 0000000000000008 +PGD 0 +P4D 0 +Oops: 0000 [#1] PREEMPT SMP PTI +CPU: 3 PID: 16923 Comm: gtk-update-icon Not tainted 4.19.51 #42 +Hardware name: Gigabyte Technology Co., Ltd. 
Z97X-Gaming G1/Z97X-Gaming G1, BIOS F9 07/31/2015 +RIP: 0010:create_empty_buffers+0x24/0x100 +Code: cd 0f 1f 44 00 00 0f 1f 44 00 00 41 54 49 89 d4 ba 01 00 00 00 55 53 48 89 fb e8 97 fe ff ff 48 89 c5 48 89 c2 eb 03 48 89 ca <48> 8b 4a 08 4c 09 22 48 85 c9 75 f1 48 89 6a 08 48 8b 43 18 48 8d +RSP: 0018:ffff927ac1b37bf8 EFLAGS: 00010286 +RAX: 0000000000000000 RBX: fffff2d4429fd740 RCX: 0000000100097149 +RDX: 0000000000000000 RSI: 0000000000000082 RDI: ffff9075a99fbe00 +RBP: 0000000000000000 R08: fffff2d440949cc8 R09: 00000000000960c0 +R10: 0000000000000002 R11: 0000000000000000 R12: 0000000000000000 +R13: ffff907601f18360 R14: 0000000000002000 R15: 0000000000001000 +FS: 00007fb55b288bc0(0000) GS:ffff90761f8c0000(0000) knlGS:0000000000000000 +CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 +CR2: 0000000000000008 CR3: 000000007aebc002 CR4: 00000000001606e0 +Call Trace: + create_page_buffers+0x4d/0x60 + __block_write_begin_int+0x8e/0x5a0 + ? ext4_inode_attach_jinode.part.82+0xb0/0xb0 + ? jbd2__journal_start+0xd7/0x1f0 + ext4_da_write_begin+0x112/0x3d0 + generic_perform_write+0xf1/0x1b0 + ? file_update_time+0x70/0x140 + __generic_file_write_iter+0x141/0x1a0 + ext4_file_write_iter+0xef/0x3b0 + __vfs_write+0x17e/0x1e0 + vfs_write+0xa5/0x1a0 + ksys_write+0x57/0xd0 + do_syscall_64+0x55/0x160 + entry_SYSCALL_64_after_hwframe+0x44/0xa9 + +Tetsuo then noticed that this is because the __memcg_kmem_charge_memcg +fails __GFP_NOFAIL charge when the kmem limit is reached. This is a wrong +behavior because nofail allocations are not allowed to fail. Normal +charge path simply forces the charge even if that means to cross the +limit. Kmem accounting should be doing the same. 
+ +Link: http://lkml.kernel.org/r/20190906125608.32129-1-mhocko@kernel.org +Signed-off-by: Michal Hocko +Reported-by: Thomas Lindroth +Debugged-by: Tetsuo Handa +Cc: Johannes Weiner +Cc: Vladimir Davydov +Cc: Andrey Ryabinin +Cc: Thomas Lindroth +Cc: Shakeel Butt +Cc: +Signed-off-by: Andrew Morton +Signed-off-by: Linus Torvalds +Signed-off-by: Greg Kroah-Hartman + +--- + mm/memcontrol.c | 10 ++++++++++ + 1 file changed, 10 insertions(+) + +--- a/mm/memcontrol.c ++++ b/mm/memcontrol.c +@@ -2637,6 +2637,16 @@ int memcg_kmem_charge_memcg(struct page + + if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && + !page_counter_try_charge(&memcg->kmem, nr_pages, &counter)) { ++ ++ /* ++ * Enforce __GFP_NOFAIL allocation because callers are not ++ * prepared to see failures and likely do not have any failure ++ * handling code. ++ */ ++ if (gfp & __GFP_NOFAIL) { ++ page_counter_charge(&memcg->kmem, nr_pages); ++ return 0; ++ } + cancel_charge(memcg, nr_pages); + return -ENOMEM; + } diff --git a/queue-4.19/memcg-oom-don-t-require-__gfp_fs-when-invoking-memcg-oom-killer.patch b/queue-4.19/memcg-oom-don-t-require-__gfp_fs-when-invoking-memcg-oom-killer.patch new file mode 100644 index 00000000000..86d1712ae48 --- /dev/null +++ b/queue-4.19/memcg-oom-don-t-require-__gfp_fs-when-invoking-memcg-oom-killer.patch @@ -0,0 +1,182 @@ +From f9c645621a28e37813a1de96d9cbd89cde94a1e4 Mon Sep 17 00:00:00 2001 +From: Tetsuo Handa +Date: Mon, 23 Sep 2019 15:37:08 -0700 +Subject: memcg, oom: don't require __GFP_FS when invoking memcg OOM killer + +From: Tetsuo Handa + +commit f9c645621a28e37813a1de96d9cbd89cde94a1e4 upstream. + +Masoud Sharbiani noticed that commit 29ef680ae7c21110 ("memcg, oom: move +out_of_memory back to the charge path") broke memcg OOM called from +__xfs_filemap_fault() path. 
It turned out that try_charge() is retrying +forever without making forward progress because mem_cgroup_oom(GFP_NOFS) +cannot invoke the OOM killer due to commit 3da88fb3bacfaa33 ("mm, oom: +move GFP_NOFS check to out_of_memory"). + +Allowing forced charge due to being unable to invoke memcg OOM killer will +lead to global OOM situation. Also, just returning -ENOMEM will be risky +because OOM path is lost and some paths (e.g. get_user_pages()) will leak +-ENOMEM. Therefore, invoking memcg OOM killer (despite GFP_NOFS) will be +the only choice we can choose for now. + +Until 29ef680ae7c21110, we were able to invoke memcg OOM killer when +GFP_KERNEL reclaim failed [1]. But since 29ef680ae7c21110, we need to +invoke memcg OOM killer when GFP_NOFS reclaim failed [2]. Although in the +past we did invoke memcg OOM killer for GFP_NOFS [3], we might get +pre-mature memcg OOM reports due to this patch. + +[1] + + leaker invoked oom-killer: gfp_mask=0x6200ca(GFP_HIGHUSER_MOVABLE), nodemask=(null), order=0, oom_score_adj=0 + CPU: 0 PID: 2746 Comm: leaker Not tainted 4.18.0+ #19 + Hardware name: VMware, Inc. VMware Virtual Platform/440BX Desktop Reference Platform, BIOS 6.00 04/13/2018 + Call Trace: + dump_stack+0x63/0x88 + dump_header+0x67/0x27a + ? mem_cgroup_scan_tasks+0x91/0xf0 + oom_kill_process+0x210/0x410 + out_of_memory+0x10a/0x2c0 + mem_cgroup_out_of_memory+0x46/0x80 + mem_cgroup_oom_synchronize+0x2e4/0x310 + ? high_work_func+0x20/0x20 + pagefault_out_of_memory+0x31/0x76 + mm_fault_error+0x55/0x115 + ? handle_mm_fault+0xfd/0x220 + __do_page_fault+0x433/0x4e0 + do_page_fault+0x22/0x30 + ? 
page_fault+0x8/0x30 + page_fault+0x1e/0x30 + RIP: 0033:0x4009f0 + Code: 03 00 00 00 e8 71 fd ff ff 48 83 f8 ff 49 89 c6 74 74 48 89 c6 bf c0 0c 40 00 31 c0 e8 69 fd ff ff 45 85 ff 7e 21 31 c9 66 90 <41> 0f be 14 0e 01 d3 f7 c1 ff 0f 00 00 75 05 41 c6 04 0e 2a 48 83 + RSP: 002b:00007ffe29ae96f0 EFLAGS: 00010206 + RAX: 000000000000001b RBX: 0000000000000000 RCX: 0000000001ce1000 + RDX: 0000000000000000 RSI: 000000007fffffe5 RDI: 0000000000000000 + RBP: 000000000000000c R08: 0000000000000000 R09: 00007f94be09220d + R10: 0000000000000002 R11: 0000000000000246 R12: 00000000000186a0 + R13: 0000000000000003 R14: 00007f949d845000 R15: 0000000002800000 + Task in /leaker killed as a result of limit of /leaker + memory: usage 524288kB, limit 524288kB, failcnt 158965 + memory+swap: usage 0kB, limit 9007199254740988kB, failcnt 0 + kmem: usage 2016kB, limit 9007199254740988kB, failcnt 0 + Memory cgroup stats for /leaker: cache:844KB rss:521136KB rss_huge:0KB shmem:0KB mapped_file:0KB dirty:132KB writeback:0KB inactive_anon:0KB active_anon:521224KB inactive_file:1012KB active_file:8KB unevictable:0KB + Memory cgroup out of memory: Kill process 2746 (leaker) score 998 or sacrifice child + Killed process 2746 (leaker) total-vm:536704kB, anon-rss:521176kB, file-rss:1208kB, shmem-rss:0kB + oom_reaper: reaped process 2746 (leaker), now anon-rss:0kB, file-rss:0kB, shmem-rss:0kB + +[2] + + leaker invoked oom-killer: gfp_mask=0x600040(GFP_NOFS), nodemask=(null), order=0, oom_score_adj=0 + CPU: 1 PID: 2746 Comm: leaker Not tainted 4.18.0+ #20 + Hardware name: VMware, Inc. VMware Virtual Platform/440BX Desktop Reference Platform, BIOS 6.00 04/13/2018 + Call Trace: + dump_stack+0x63/0x88 + dump_header+0x67/0x27a + ? mem_cgroup_scan_tasks+0x91/0xf0 + oom_kill_process+0x210/0x410 + out_of_memory+0x109/0x2d0 + mem_cgroup_out_of_memory+0x46/0x80 + try_charge+0x58d/0x650 + ? 
__radix_tree_replace+0x81/0x100 + mem_cgroup_try_charge+0x7a/0x100 + __add_to_page_cache_locked+0x92/0x180 + add_to_page_cache_lru+0x4d/0xf0 + iomap_readpages_actor+0xde/0x1b0 + ? iomap_zero_range_actor+0x1d0/0x1d0 + iomap_apply+0xaf/0x130 + iomap_readpages+0x9f/0x150 + ? iomap_zero_range_actor+0x1d0/0x1d0 + xfs_vm_readpages+0x18/0x20 [xfs] + read_pages+0x60/0x140 + __do_page_cache_readahead+0x193/0x1b0 + ondemand_readahead+0x16d/0x2c0 + page_cache_async_readahead+0x9a/0xd0 + filemap_fault+0x403/0x620 + ? alloc_set_pte+0x12c/0x540 + ? _cond_resched+0x14/0x30 + __xfs_filemap_fault+0x66/0x180 [xfs] + xfs_filemap_fault+0x27/0x30 [xfs] + __do_fault+0x19/0x40 + __handle_mm_fault+0x8e8/0xb60 + handle_mm_fault+0xfd/0x220 + __do_page_fault+0x238/0x4e0 + do_page_fault+0x22/0x30 + ? page_fault+0x8/0x30 + page_fault+0x1e/0x30 + RIP: 0033:0x4009f0 + Code: 03 00 00 00 e8 71 fd ff ff 48 83 f8 ff 49 89 c6 74 74 48 89 c6 bf c0 0c 40 00 31 c0 e8 69 fd ff ff 45 85 ff 7e 21 31 c9 66 90 <41> 0f be 14 0e 01 d3 f7 c1 ff 0f 00 00 75 05 41 c6 04 0e 2a 48 83 + RSP: 002b:00007ffda45c9290 EFLAGS: 00010206 + RAX: 000000000000001b RBX: 0000000000000000 RCX: 0000000001a1e000 + RDX: 0000000000000000 RSI: 000000007fffffe5 RDI: 0000000000000000 + RBP: 000000000000000c R08: 0000000000000000 R09: 00007f6d061ff20d + R10: 0000000000000002 R11: 0000000000000246 R12: 00000000000186a0 + R13: 0000000000000003 R14: 00007f6ce59b2000 R15: 0000000002800000 + Task in /leaker killed as a result of limit of /leaker + memory: usage 524288kB, limit 524288kB, failcnt 7221 + memory+swap: usage 0kB, limit 9007199254740988kB, failcnt 0 + kmem: usage 1944kB, limit 9007199254740988kB, failcnt 0 + Memory cgroup stats for /leaker: cache:3632KB rss:518232KB rss_huge:0KB shmem:0KB mapped_file:0KB dirty:0KB writeback:0KB inactive_anon:0KB active_anon:518408KB inactive_file:3908KB active_file:12KB unevictable:0KB + Memory cgroup out of memory: Kill process 2746 (leaker) score 992 or sacrifice child + Killed process 2746 
(leaker) total-vm:536704kB, anon-rss:518264kB, file-rss:1188kB, shmem-rss:0kB + oom_reaper: reaped process 2746 (leaker), now anon-rss:0kB, file-rss:0kB, shmem-rss:0kB + +[3] + + leaker invoked oom-killer: gfp_mask=0x50, order=0, oom_score_adj=0 + leaker cpuset=/ mems_allowed=0 + CPU: 1 PID: 3206 Comm: leaker Not tainted 3.10.0-957.27.2.el7.x86_64 #1 + Hardware name: VMware, Inc. VMware Virtual Platform/440BX Desktop Reference Platform, BIOS 6.00 04/13/2018 + Call Trace: + [] dump_stack+0x19/0x1b + [] dump_header+0x90/0x229 + [] ? find_lock_task_mm+0x56/0xc0 + [] ? try_get_mem_cgroup_from_mm+0x28/0x60 + [] oom_kill_process+0x254/0x3d0 + [] mem_cgroup_oom_synchronize+0x546/0x570 + [] ? mem_cgroup_charge_common+0xc0/0xc0 + [] pagefault_out_of_memory+0x14/0x90 + [] mm_fault_error+0x6a/0x157 + [] __do_page_fault+0x3c8/0x4f0 + [] do_page_fault+0x35/0x90 + [] page_fault+0x28/0x30 + Task in /leaker killed as a result of limit of /leaker + memory: usage 524288kB, limit 524288kB, failcnt 20628 + memory+swap: usage 524288kB, limit 9007199254740988kB, failcnt 0 + kmem: usage 0kB, limit 9007199254740988kB, failcnt 0 + Memory cgroup stats for /leaker: cache:840KB rss:523448KB rss_huge:0KB mapped_file:0KB swap:0KB inactive_anon:0KB active_anon:523448KB inactive_file:464KB active_file:376KB unevictable:0KB + Memory cgroup out of memory: Kill process 3206 (leaker) score 970 or sacrifice child + Killed process 3206 (leaker) total-vm:536692kB, anon-rss:523304kB, file-rss:412kB, shmem-rss:0kB + +Bisected by Masoud Sharbiani. 
+ +Link: http://lkml.kernel.org/r/cbe54ed1-b6ba-a056-8899-2dc42526371d@i-love.sakura.ne.jp +Fixes: 3da88fb3bacfaa33 ("mm, oom: move GFP_NOFS check to out_of_memory") [necessary after 29ef680ae7c21110] +Signed-off-by: Tetsuo Handa +Reported-by: Masoud Sharbiani +Tested-by: Masoud Sharbiani +Acked-by: Michal Hocko +Cc: David Rientjes +Cc: [4.19+] +Signed-off-by: Andrew Morton +Signed-off-by: Linus Torvalds +Signed-off-by: Greg Kroah-Hartman + +--- + mm/oom_kill.c | 5 +++-- + 1 file changed, 3 insertions(+), 2 deletions(-) + +--- a/mm/oom_kill.c ++++ b/mm/oom_kill.c +@@ -1089,9 +1089,10 @@ bool out_of_memory(struct oom_control *o + * The OOM killer does not compensate for IO-less reclaim. + * pagefault_out_of_memory lost its gfp context so we have to + * make sure exclude 0 mask - all other users should have at least +- * ___GFP_DIRECT_RECLAIM to get here. ++ * ___GFP_DIRECT_RECLAIM to get here. But mem_cgroup_oom() has to ++ * invoke the OOM killer even if it is a GFP_NOFS allocation. + */ +- if (oc->gfp_mask && !(oc->gfp_mask & __GFP_FS)) ++ if (oc->gfp_mask && !(oc->gfp_mask & __GFP_FS) && !is_memcg_oom(oc)) + return true; + + /* diff --git a/queue-4.19/ovl-filter-of-trusted-xattr-results-in-audit.patch b/queue-4.19/ovl-filter-of-trusted-xattr-results-in-audit.patch new file mode 100644 index 00000000000..33f56d626b6 --- /dev/null +++ b/queue-4.19/ovl-filter-of-trusted-xattr-results-in-audit.patch @@ -0,0 +1,41 @@ +From 5c2e9f346b815841f9bed6029ebcb06415caf640 Mon Sep 17 00:00:00 2001 +From: Mark Salyzyn +Date: Thu, 29 Aug 2019 11:30:14 -0700 +Subject: ovl: filter of trusted xattr results in audit + +From: Mark Salyzyn + +commit 5c2e9f346b815841f9bed6029ebcb06415caf640 upstream. + +When filtering xattr list for reading, presence of trusted xattr +results in a security audit log. However, if there is other content +no errno will be set, and if there isn't, the errno will be -ENODATA +and not -EPERM as is usually associated with a lack of capability. 
+The check does not block the request to list the xattrs present. + +Switch to ns_capable_noaudit to reflect a more appropriate check. + +Signed-off-by: Mark Salyzyn +Cc: linux-security-module@vger.kernel.org +Cc: kernel-team@android.com +Cc: stable@vger.kernel.org # v3.18+ +Fixes: a082c6f680da ("ovl: filter trusted xattr for non-admin") +Signed-off-by: Miklos Szeredi +Signed-off-by: Greg Kroah-Hartman + +--- + fs/overlayfs/inode.c | 3 ++- + 1 file changed, 2 insertions(+), 1 deletion(-) + +--- a/fs/overlayfs/inode.c ++++ b/fs/overlayfs/inode.c +@@ -386,7 +386,8 @@ static bool ovl_can_list(const char *s) + return true; + + /* Never list trusted.overlay, list other trusted for superuser only */ +- return !ovl_is_private_xattr(s) && capable(CAP_SYS_ADMIN); ++ return !ovl_is_private_xattr(s) && ++ ns_capable_noaudit(&init_user_ns, CAP_SYS_ADMIN); + } + + ssize_t ovl_listxattr(struct dentry *dentry, char *list, size_t size) diff --git a/queue-4.19/ovl-fix-dereferencing-possible-err_ptr.patch b/queue-4.19/ovl-fix-dereferencing-possible-err_ptr.patch new file mode 100644 index 00000000000..eb3aaf20433 --- /dev/null +++ b/queue-4.19/ovl-fix-dereferencing-possible-err_ptr.patch @@ -0,0 +1,35 @@ +From 97f024b9171e74c4443bbe8a8dce31b917f97ac5 Mon Sep 17 00:00:00 2001 +From: Ding Xiang +Date: Mon, 9 Sep 2019 16:29:56 +0800 +Subject: ovl: Fix dereferencing possible ERR_PTR() + +From: Ding Xiang + +commit 97f024b9171e74c4443bbe8a8dce31b917f97ac5 upstream. + +if ovl_encode_real_fh() fails, no memory was allocated +and the error in the error-valued pointer should be returned. 
+ +Fixes: 9b6faee07470 ("ovl: check ERR_PTR() return value from ovl_encode_fh()") +Signed-off-by: Ding Xiang +Cc: # v4.16+ +Signed-off-by: Miklos Szeredi +Signed-off-by: Greg Kroah-Hartman + +--- + fs/overlayfs/export.c | 3 +-- + 1 file changed, 1 insertion(+), 2 deletions(-) + +--- a/fs/overlayfs/export.c ++++ b/fs/overlayfs/export.c +@@ -230,9 +230,8 @@ static int ovl_d_to_fh(struct dentry *de + /* Encode an upper or lower file handle */ + fh = ovl_encode_real_fh(enc_lower ? ovl_dentry_lower(dentry) : + ovl_dentry_upper(dentry), !enc_lower); +- err = PTR_ERR(fh); + if (IS_ERR(fh)) +- goto fail; ++ return PTR_ERR(fh); + + err = -EOVERFLOW; + if (fh->len > buflen) diff --git a/queue-4.19/series b/queue-4.19/series index e121591af7f..2f4f1087cfd 100644 --- a/queue-4.19/series +++ b/queue-4.19/series @@ -179,3 +179,19 @@ arm64-tlb-ensure-we-execute-an-isb-following-walk-cache-invalidation.patch arm64-dts-rockchip-limit-clock-rate-of-mmc-controllers-for-rk3328.patch alarmtimer-use-eopnotsupp-instead-of-enotsupp.patch regulator-defer-init-completion-for-a-while-after-late_initcall.patch +efifb-bgrt-improve-efifb_bgrt_sanity_check.patch +gfs2-clear-buf_in_tr-when-ending-a-transaction-in-sweep_bh_for_rgrps.patch +memcg-oom-don-t-require-__gfp_fs-when-invoking-memcg-oom-killer.patch +memcg-kmem-do-not-fail-__gfp_nofail-charges.patch +i40e-check-__i40e_vf_disable-bit-in-i40e_sync_filters_subtask.patch +block-mq-deadline-fix-queue-restart-handling.patch +block-fix-null-pointer-dereference-in-blk_mq_rq_timed_out.patch +smb3-allow-disabling-requesting-leases.patch +ovl-fix-dereferencing-possible-err_ptr.patch +ovl-filter-of-trusted-xattr-results-in-audit.patch +btrfs-fix-allocation-of-free-space-cache-v1-bitmap-pages.patch +btrfs-fix-use-after-free-when-using-the-tree-modification-log.patch +btrfs-relinquish-cpus-in-btrfs_compare_trees.patch +btrfs-qgroup-fix-the-wrong-target-io_tree-when-freeing-reserved-data-space.patch 
+btrfs-qgroup-fix-reserved-data-space-leak-if-we-have-multiple-reserve-calls.patch +btrfs-fix-race-setting-up-and-completing-qgroup-rescan-workers.patch diff --git a/queue-4.19/smb3-allow-disabling-requesting-leases.patch b/queue-4.19/smb3-allow-disabling-requesting-leases.patch new file mode 100644 index 00000000000..2f9f61c69b4 --- /dev/null +++ b/queue-4.19/smb3-allow-disabling-requesting-leases.patch @@ -0,0 +1,118 @@ +From 3e7a02d47872081f4b6234a9f72500f1d10f060c Mon Sep 17 00:00:00 2001 +From: Steve French +Date: Wed, 11 Sep 2019 21:46:20 -0500 +Subject: smb3: allow disabling requesting leases + +From: Steve French + +commit 3e7a02d47872081f4b6234a9f72500f1d10f060c upstream. + +In some cases to work around server bugs or performance +problems it can be helpful to be able to disable requesting +SMB2.1/SMB3 leases on a particular mount (not to all servers +and all shares we are mounted to). Add new mount parm +"nolease" which turns off requesting leases on directory +or file opens. Currently the only way to disable leases is +globally through a module load parameter. This is more +granular. 
+ +Suggested-by: Pavel Shilovsky +Signed-off-by: Steve French +Reviewed-by: Ronnie Sahlberg +Reviewed-by: Pavel Shilovsky +CC: Stable +Signed-off-by: Greg Kroah-Hartman + +--- + fs/cifs/cifsfs.c | 2 ++ + fs/cifs/cifsglob.h | 2 ++ + fs/cifs/connect.c | 9 ++++++++- + fs/cifs/smb2pdu.c | 2 +- + 4 files changed, 13 insertions(+), 2 deletions(-) + +--- a/fs/cifs/cifsfs.c ++++ b/fs/cifs/cifsfs.c +@@ -428,6 +428,8 @@ cifs_show_options(struct seq_file *s, st + cifs_show_security(s, tcon->ses); + cifs_show_cache_flavor(s, cifs_sb); + ++ if (tcon->no_lease) ++ seq_puts(s, ",nolease"); + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MULTIUSER) + seq_puts(s, ",multiuser"); + else if (tcon->ses->user_name) +--- a/fs/cifs/cifsglob.h ++++ b/fs/cifs/cifsglob.h +@@ -543,6 +543,7 @@ struct smb_vol { + bool noblocksnd:1; + bool noautotune:1; + bool nostrictsync:1; /* do not force expensive SMBflush on every sync */ ++ bool no_lease:1; /* disable requesting leases */ + bool fsc:1; /* enable fscache */ + bool mfsymlinks:1; /* use Minshall+French Symlinks */ + bool multiuser:1; +@@ -1004,6 +1005,7 @@ struct cifs_tcon { + bool need_reopen_files:1; /* need to reopen tcon file handles */ + bool use_resilient:1; /* use resilient instead of durable handles */ + bool use_persistent:1; /* use persistent instead of durable handles */ ++ bool no_lease:1; /* Do not request leases on files or directories */ + __le32 capabilities; + __u32 share_flags; + __u32 maximal_access; +--- a/fs/cifs/connect.c ++++ b/fs/cifs/connect.c +@@ -70,7 +70,7 @@ enum { + Opt_user_xattr, Opt_nouser_xattr, + Opt_forceuid, Opt_noforceuid, + Opt_forcegid, Opt_noforcegid, +- Opt_noblocksend, Opt_noautotune, ++ Opt_noblocksend, Opt_noautotune, Opt_nolease, + Opt_hard, Opt_soft, Opt_perm, Opt_noperm, + Opt_mapposix, Opt_nomapposix, + Opt_mapchars, Opt_nomapchars, Opt_sfu, +@@ -129,6 +129,7 @@ static const match_table_t cifs_mount_op + { Opt_noforcegid, "noforcegid" }, + { Opt_noblocksend, "noblocksend" }, + { Opt_noautotune, 
"noautotune" }, ++ { Opt_nolease, "nolease" }, + { Opt_hard, "hard" }, + { Opt_soft, "soft" }, + { Opt_perm, "perm" }, +@@ -1542,6 +1543,9 @@ cifs_parse_mount_options(const char *mou + case Opt_noautotune: + vol->noautotune = 1; + break; ++ case Opt_nolease: ++ vol->no_lease = 1; ++ break; + case Opt_hard: + vol->retry = 1; + break; +@@ -3023,6 +3027,8 @@ static int match_tcon(struct cifs_tcon * + return 0; + if (tcon->snapshot_time != volume_info->snapshot_time) + return 0; ++ if (tcon->no_lease != volume_info->no_lease) ++ return 0; + return 1; + } + +@@ -3231,6 +3237,7 @@ cifs_get_tcon(struct cifs_ses *ses, stru + tcon->nocase = volume_info->nocase; + tcon->nohandlecache = volume_info->nohandlecache; + tcon->local_lease = volume_info->local_lease; ++ tcon->no_lease = volume_info->no_lease; + INIT_LIST_HEAD(&tcon->pending_opens); + + spin_lock(&cifs_tcp_ses_lock); +--- a/fs/cifs/smb2pdu.c ++++ b/fs/cifs/smb2pdu.c +@@ -2192,7 +2192,7 @@ SMB2_open_init(struct cifs_tcon *tcon, s + iov[1].iov_len = uni_path_len; + iov[1].iov_base = path; + +- if (!server->oplocks) ++ if ((!server->oplocks) || (tcon->no_lease)) + *oplock = SMB2_OPLOCK_LEVEL_NONE; + + if (!(server->capabilities & SMB2_GLOBAL_CAP_LEASING) || -- 2.47.3