From: Greg Kroah-Hartman Date: Thu, 3 Oct 2019 11:57:01 +0000 (+0200) Subject: 5.2-stable patches X-Git-Tag: v4.4.195~29 X-Git-Url: http://git.ipfire.org/?a=commitdiff_plain;h=fca490d000898b46a4b05bf60fad3233dfce9053;p=thirdparty%2Fkernel%2Fstable-queue.git 5.2-stable patches added patches: ath10k-fix-channel-info-parsing-for-non-tlv-target.patch block-fix-null-pointer-dereference-in-blk_mq_rq_timed_out.patch block-mq-deadline-fix-queue-restart-handling.patch btrfs-adjust-dirty_metadata_bytes-after-writeback-failure-of-extent-buffer.patch btrfs-fix-allocation-of-free-space-cache-v1-bitmap-pages.patch btrfs-fix-race-setting-up-and-completing-qgroup-rescan-workers.patch btrfs-fix-use-after-free-when-using-the-tree-modification-log.patch btrfs-qgroup-fix-reserved-data-space-leak-if-we-have-multiple-reserve-calls.patch btrfs-qgroup-fix-the-wrong-target-io_tree-when-freeing-reserved-data-space.patch btrfs-relinquish-cpus-in-btrfs_compare_trees.patch efifb-bgrt-improve-efifb_bgrt_sanity_check.patch gfs2-clear-buf_in_tr-when-ending-a-transaction-in-sweep_bh_for_rgrps.patch i40e-check-__i40e_vf_disable-bit-in-i40e_sync_filters_subtask.patch keys-trusted-correctly-initialize-digests-and-fix-locking-issue.patch lib-lzo-lzo1x_compress.c-fix-alignment-bug-in-lzo-rle.patch memcg-kmem-do-not-fail-__gfp_nofail-charges.patch memcg-oom-don-t-require-__gfp_fs-when-invoking-memcg-oom-killer.patch mm-compaction.c-clear-total_-migrate-free-_scanned-before-scanning-a-new-zone.patch mt76-round-up-length-on-mt76_wr_copy.patch ovl-filter-of-trusted-xattr-results-in-audit.patch ovl-fix-dereferencing-possible-err_ptr.patch rtw88-pci-rearrange-the-memory-usage-for-skb-in-rx-isr.patch rtw88-pci-use-dma-sync-instead-of-remapping-in-rx-isr.patch smb3-allow-disabling-requesting-leases.patch smb3-fix-leak-in-open-on-server-perf-counter.patch z3fold-fix-memory-leak-in-kmem-cache.patch z3fold-fix-retry-mechanism-in-page-reclaim.patch --- diff --git 
a/queue-5.2/ath10k-fix-channel-info-parsing-for-non-tlv-target.patch b/queue-5.2/ath10k-fix-channel-info-parsing-for-non-tlv-target.patch new file mode 100644 index 00000000000..935fa8fe0ea --- /dev/null +++ b/queue-5.2/ath10k-fix-channel-info-parsing-for-non-tlv-target.patch @@ -0,0 +1,95 @@ +From 6be6c04bcc2e8770b8637632789ff15765124894 Mon Sep 17 00:00:00 2001 +From: Rakesh Pillai +Date: Fri, 8 Mar 2019 16:56:06 +0530 +Subject: ath10k: fix channel info parsing for non tlv target + +From: Rakesh Pillai + +commit 6be6c04bcc2e8770b8637632789ff15765124894 upstream. + +The tlv targets such as WCN3990 send more data in the chan info event, which is +not sent by the non tlv targets. There is a minimum size check in the wmi event +for non-tlv targets and hence we cannot update the common channel info +structure as it was done in commit 13104929d2ec ("ath10k: fill the channel +survey results for WCN3990 correctly"). This broke channel survey results on +10.x firmware versions. + +If the common channel info structure is updated, the size check for chan info +event for non-tlv targets will fail and return -EPROTO and we see the below +error messages + + ath10k_pci 0000:01:00.0: failed to parse chan info event: -71 + +Add tlv specific channel info structure and restore the original size of the +common channel info structure to mitigate this issue. 
+ +Tested HW: WCN3990 + QCA9887 +Tested FW: WLAN.HL.3.1-00784-QCAHLSWMTPLZ-1 + 10.2.4-1.0-00037 + +Fixes: 13104929d2ec ("ath10k: fill the channel survey results for WCN3990 correctly") +Cc: stable@vger.kernel.org # 5.0 +Signed-off-by: Rakesh Pillai +Signed-off-by: Kalle Valo +Signed-off-by: Greg Kroah-Hartman + +--- + drivers/net/wireless/ath/ath10k/wmi-tlv.c | 2 +- + drivers/net/wireless/ath/ath10k/wmi-tlv.h | 16 ++++++++++++++++ + drivers/net/wireless/ath/ath10k/wmi.h | 8 -------- + 3 files changed, 17 insertions(+), 9 deletions(-) + +--- a/drivers/net/wireless/ath/ath10k/wmi-tlv.c ++++ b/drivers/net/wireless/ath/ath10k/wmi-tlv.c +@@ -810,7 +810,7 @@ static int ath10k_wmi_tlv_op_pull_ch_inf + struct wmi_ch_info_ev_arg *arg) + { + const void **tb; +- const struct wmi_chan_info_event *ev; ++ const struct wmi_tlv_chan_info_event *ev; + int ret; + + tb = ath10k_wmi_tlv_parse_alloc(ar, skb->data, skb->len, GFP_ATOMIC); +--- a/drivers/net/wireless/ath/ath10k/wmi-tlv.h ++++ b/drivers/net/wireless/ath/ath10k/wmi-tlv.h +@@ -1607,6 +1607,22 @@ struct chan_info_params { + + #define WMI_TLV_FLAG_MGMT_BUNDLE_TX_COMPL BIT(9) + ++struct wmi_tlv_chan_info_event { ++ __le32 err_code; ++ __le32 freq; ++ __le32 cmd_flags; ++ __le32 noise_floor; ++ __le32 rx_clear_count; ++ __le32 cycle_count; ++ __le32 chan_tx_pwr_range; ++ __le32 chan_tx_pwr_tp; ++ __le32 rx_frame_count; ++ __le32 my_bss_rx_cycle_count; ++ __le32 rx_11b_mode_data_duration; ++ __le32 tx_frame_cnt; ++ __le32 mac_clk_mhz; ++} __packed; ++ + struct wmi_tlv_mgmt_tx_compl_ev { + __le32 desc_id; + __le32 status; +--- a/drivers/net/wireless/ath/ath10k/wmi.h ++++ b/drivers/net/wireless/ath/ath10k/wmi.h +@@ -6524,14 +6524,6 @@ struct wmi_chan_info_event { + __le32 noise_floor; + __le32 rx_clear_count; + __le32 cycle_count; +- __le32 chan_tx_pwr_range; +- __le32 chan_tx_pwr_tp; +- __le32 rx_frame_count; +- __le32 my_bss_rx_cycle_count; +- __le32 rx_11b_mode_data_duration; +- __le32 tx_frame_cnt; +- __le32 mac_clk_mhz; +- + } 
__packed; + + struct wmi_10_4_chan_info_event { diff --git a/queue-5.2/block-fix-null-pointer-dereference-in-blk_mq_rq_timed_out.patch b/queue-5.2/block-fix-null-pointer-dereference-in-blk_mq_rq_timed_out.patch new file mode 100644 index 00000000000..c0de834306a --- /dev/null +++ b/queue-5.2/block-fix-null-pointer-dereference-in-blk_mq_rq_timed_out.patch @@ -0,0 +1,137 @@ +From 8d6996630c03d7ceeabe2611378fea5ca1c3f1b3 Mon Sep 17 00:00:00 2001 +From: Yufen Yu +Date: Fri, 27 Sep 2019 16:19:55 +0800 +Subject: block: fix null pointer dereference in blk_mq_rq_timed_out() + +From: Yufen Yu + +commit 8d6996630c03d7ceeabe2611378fea5ca1c3f1b3 upstream. + +We got a null pointer deference BUG_ON in blk_mq_rq_timed_out() +as following: + +[ 108.825472] BUG: kernel NULL pointer dereference, address: 0000000000000040 +[ 108.827059] PGD 0 P4D 0 +[ 108.827313] Oops: 0000 [#1] SMP PTI +[ 108.827657] CPU: 6 PID: 198 Comm: kworker/6:1H Not tainted 5.3.0-rc8+ #431 +[ 108.829503] Workqueue: kblockd blk_mq_timeout_work +[ 108.829913] RIP: 0010:blk_mq_check_expired+0x258/0x330 +[ 108.838191] Call Trace: +[ 108.838406] bt_iter+0x74/0x80 +[ 108.838665] blk_mq_queue_tag_busy_iter+0x204/0x450 +[ 108.839074] ? __switch_to_asm+0x34/0x70 +[ 108.839405] ? blk_mq_stop_hw_queue+0x40/0x40 +[ 108.839823] ? blk_mq_stop_hw_queue+0x40/0x40 +[ 108.840273] ? syscall_return_via_sysret+0xf/0x7f +[ 108.840732] blk_mq_timeout_work+0x74/0x200 +[ 108.841151] process_one_work+0x297/0x680 +[ 108.841550] worker_thread+0x29c/0x6f0 +[ 108.841926] ? rescuer_thread+0x580/0x580 +[ 108.842344] kthread+0x16a/0x1a0 +[ 108.842666] ? kthread_flush_work+0x170/0x170 +[ 108.843100] ret_from_fork+0x35/0x40 + +The bug is caused by the race between timeout handle and completion for +flush request. + +When timeout handle function blk_mq_rq_timed_out() try to read +'req->q->mq_ops', the 'req' have completed and reinitiated by next +flush request, which would call blk_rq_init() to clear 'req' as 0. 
+ +After commit 12f5b93145 ("blk-mq: Remove generation seqeunce"), +normal requests lifetime are protected by refcount. Until 'rq->ref' +drop to zero, the request can really be free. Thus, these requests +cannot been reused before timeout handle finish. + +However, flush request has defined .end_io and rq->end_io() is still +called even if 'rq->ref' doesn't drop to zero. After that, the 'flush_rq' +can be reused by the next flush request handle, resulting in null +pointer deference BUG ON. + +We fix this problem by covering flush request with 'rq->ref'. +If the refcount is not zero, flush_end_io() return and wait the +last holder recall it. To record the request status, we add a new +entry 'rq_status', which will be used in flush_end_io(). + +Cc: Christoph Hellwig +Cc: Keith Busch +Cc: Bart Van Assche +Cc: stable@vger.kernel.org # v4.18+ +Reviewed-by: Ming Lei +Reviewed-by: Bob Liu +Signed-off-by: Yufen Yu +Signed-off-by: Greg Kroah-Hartman + +------- +v2: + - move rq_status from struct request to struct blk_flush_queue +v3: + - remove unnecessary '{}' pair. 
+v4: + - let spinlock to protect 'fq->rq_status' +v5: + - move rq_status after flush_running_idx member of struct blk_flush_queue +Signed-off-by: Jens Axboe + +--- + block/blk-flush.c | 10 ++++++++++ + block/blk-mq.c | 5 ++++- + block/blk.h | 7 +++++++ + 3 files changed, 21 insertions(+), 1 deletion(-) + +--- a/block/blk-flush.c ++++ b/block/blk-flush.c +@@ -214,6 +214,16 @@ static void flush_end_io(struct request + + /* release the tag's ownership to the req cloned from */ + spin_lock_irqsave(&fq->mq_flush_lock, flags); ++ ++ if (!refcount_dec_and_test(&flush_rq->ref)) { ++ fq->rq_status = error; ++ spin_unlock_irqrestore(&fq->mq_flush_lock, flags); ++ return; ++ } ++ ++ if (fq->rq_status != BLK_STS_OK) ++ error = fq->rq_status; ++ + hctx = flush_rq->mq_hctx; + if (!q->elevator) { + blk_mq_tag_set_rq(hctx, flush_rq->tag, fq->orig_rq); +--- a/block/blk-mq.c ++++ b/block/blk-mq.c +@@ -910,7 +910,10 @@ static bool blk_mq_check_expired(struct + */ + if (blk_mq_req_expired(rq, next)) + blk_mq_rq_timed_out(rq, reserved); +- if (refcount_dec_and_test(&rq->ref)) ++ ++ if (is_flush_rq(rq, hctx)) ++ rq->end_io(rq, 0); ++ else if (refcount_dec_and_test(&rq->ref)) + __blk_mq_free_request(rq); + + return true; +--- a/block/blk.h ++++ b/block/blk.h +@@ -19,6 +19,7 @@ struct blk_flush_queue { + unsigned int flush_queue_delayed:1; + unsigned int flush_pending_idx:1; + unsigned int flush_running_idx:1; ++ blk_status_t rq_status; + unsigned long flush_pending_since; + struct list_head flush_queue[2]; + struct list_head flush_data_in_flight; +@@ -47,6 +48,12 @@ static inline void __blk_get_queue(struc + kobject_get(&q->kobj); + } + ++static inline bool ++is_flush_rq(struct request *req, struct blk_mq_hw_ctx *hctx) ++{ ++ return hctx->fq->flush_rq == req; ++} ++ + struct blk_flush_queue *blk_alloc_flush_queue(struct request_queue *q, + int node, int cmd_size, gfp_t flags); + void blk_free_flush_queue(struct blk_flush_queue *q); diff --git 
a/queue-5.2/block-mq-deadline-fix-queue-restart-handling.patch b/queue-5.2/block-mq-deadline-fix-queue-restart-handling.patch new file mode 100644 index 00000000000..04f7b614afe --- /dev/null +++ b/queue-5.2/block-mq-deadline-fix-queue-restart-handling.patch @@ -0,0 +1,107 @@ +From cb8acabbe33b110157955a7425ee876fb81e6bbc Mon Sep 17 00:00:00 2001 +From: Damien Le Moal +Date: Wed, 28 Aug 2019 13:40:20 +0900 +Subject: block: mq-deadline: Fix queue restart handling + +From: Damien Le Moal + +commit cb8acabbe33b110157955a7425ee876fb81e6bbc upstream. + +Commit 7211aef86f79 ("block: mq-deadline: Fix write completion +handling") added a call to blk_mq_sched_mark_restart_hctx() in +dd_dispatch_request() to make sure that write request dispatching does +not stall when all target zones are locked. This fix left a subtle race +when a write completion happens during a dispatch execution on another +CPU: + +CPU 0: Dispatch CPU1: write completion + +dd_dispatch_request() + lock(&dd->lock); + ... + lock(&dd->zone_lock); dd_finish_request() + rq = find request lock(&dd->zone_lock); + unlock(&dd->zone_lock); + zone write unlock + unlock(&dd->zone_lock); + ... + __blk_mq_free_request + check restart flag (not set) + -> queue not run + ... + if (!rq && have writes) + blk_mq_sched_mark_restart_hctx() + unlock(&dd->lock) + +Since the dispatch context finishes after the write request completion +handling, marking the queue as needing a restart is not seen from +__blk_mq_free_request() and blk_mq_sched_restart() not executed leading +to the dispatch stall under 100% write workloads. + +Fix this by moving the call to blk_mq_sched_mark_restart_hctx() from +dd_dispatch_request() into dd_finish_request() under the zone lock to +ensure full mutual exclusion between write request dispatch selection +and zone unlock on write request completion. 
+ +Fixes: 7211aef86f79 ("block: mq-deadline: Fix write completion handling") +Cc: stable@vger.kernel.org +Reported-by: Hans Holmberg +Reviewed-by: Hans Holmberg +Reviewed-by: Christoph Hellwig +Signed-off-by: Damien Le Moal +Signed-off-by: Jens Axboe +Signed-off-by: Greg Kroah-Hartman + +--- + block/mq-deadline.c | 19 +++++++++---------- + 1 file changed, 9 insertions(+), 10 deletions(-) + +--- a/block/mq-deadline.c ++++ b/block/mq-deadline.c +@@ -377,13 +377,6 @@ done: + * hardware queue, but we may return a request that is for a + * different hardware queue. This is because mq-deadline has shared + * state for all hardware queues, in terms of sorting, FIFOs, etc. +- * +- * For a zoned block device, __dd_dispatch_request() may return NULL +- * if all the queued write requests are directed at zones that are already +- * locked due to on-going write requests. In this case, make sure to mark +- * the queue as needing a restart to ensure that the queue is run again +- * and the pending writes dispatched once the target zones for the ongoing +- * write requests are unlocked in dd_finish_request(). + */ + static struct request *dd_dispatch_request(struct blk_mq_hw_ctx *hctx) + { +@@ -392,9 +385,6 @@ static struct request *dd_dispatch_reque + + spin_lock(&dd->lock); + rq = __dd_dispatch_request(dd); +- if (!rq && blk_queue_is_zoned(hctx->queue) && +- !list_empty(&dd->fifo_list[WRITE])) +- blk_mq_sched_mark_restart_hctx(hctx); + spin_unlock(&dd->lock); + + return rq; +@@ -560,6 +550,13 @@ static void dd_prepare_request(struct re + * spinlock so that the zone is never unlocked while deadline_fifo_request() + * or deadline_next_request() are executing. This function is called for + * all requests, whether or not these requests complete successfully. ++ * ++ * For a zoned block device, __dd_dispatch_request() may have stopped ++ * dispatching requests if all the queued requests are write requests directed ++ * at zones that are already locked due to on-going write requests. 
To ensure ++ * write request dispatch progress in this case, mark the queue as needing a ++ * restart to ensure that the queue is run again after completion of the ++ * request and zones being unlocked. + */ + static void dd_finish_request(struct request *rq) + { +@@ -571,6 +568,8 @@ static void dd_finish_request(struct req + + spin_lock_irqsave(&dd->zone_lock, flags); + blk_req_zone_write_unlock(rq); ++ if (!list_empty(&dd->fifo_list[WRITE])) ++ blk_mq_sched_mark_restart_hctx(rq->mq_hctx); + spin_unlock_irqrestore(&dd->zone_lock, flags); + } + } diff --git a/queue-5.2/btrfs-adjust-dirty_metadata_bytes-after-writeback-failure-of-extent-buffer.patch b/queue-5.2/btrfs-adjust-dirty_metadata_bytes-after-writeback-failure-of-extent-buffer.patch new file mode 100644 index 00000000000..42fc068bb68 --- /dev/null +++ b/queue-5.2/btrfs-adjust-dirty_metadata_bytes-after-writeback-failure-of-extent-buffer.patch @@ -0,0 +1,50 @@ +From eb5b64f142504a597d67e2109d603055ff765e52 Mon Sep 17 00:00:00 2001 +From: Dennis Zhou +Date: Fri, 13 Sep 2019 14:54:07 +0100 +Subject: btrfs: adjust dirty_metadata_bytes after writeback failure of extent buffer + +From: Dennis Zhou + +commit eb5b64f142504a597d67e2109d603055ff765e52 upstream. + +Before, if a eb failed to write out, we would end up triggering a +BUG_ON(). As of f4340622e0226 ("btrfs: extent_io: Move the BUG_ON() in +flush_write_bio() one level up"), we no longer BUG_ON(), so we should +make life consistent and add back the unwritten bytes to +dirty_metadata_bytes. 
+ +Fixes: f4340622e022 ("btrfs: extent_io: Move the BUG_ON() in flush_write_bio() one level up") +CC: stable@vger.kernel.org # 5.2+ +Reviewed-by: Filipe Manana +Signed-off-by: Dennis Zhou +Signed-off-by: David Sterba +Signed-off-by: Greg Kroah-Hartman + +--- + fs/btrfs/extent_io.c | 9 +++++++++ + 1 file changed, 9 insertions(+) + +--- a/fs/btrfs/extent_io.c ++++ b/fs/btrfs/extent_io.c +@@ -3708,12 +3708,21 @@ err_unlock: + static void set_btree_ioerr(struct page *page) + { + struct extent_buffer *eb = (struct extent_buffer *)page->private; ++ struct btrfs_fs_info *fs_info; + + SetPageError(page); + if (test_and_set_bit(EXTENT_BUFFER_WRITE_ERR, &eb->bflags)) + return; + + /* ++ * If we error out, we should add back the dirty_metadata_bytes ++ * to make it consistent. ++ */ ++ fs_info = eb->fs_info; ++ percpu_counter_add_batch(&fs_info->dirty_metadata_bytes, ++ eb->len, fs_info->dirty_metadata_batch); ++ ++ /* + * If writeback for a btree extent that doesn't belong to a log tree + * failed, increment the counter transaction->eb_write_errors. + * We do this because while the transaction is running and before it's diff --git a/queue-5.2/btrfs-fix-allocation-of-free-space-cache-v1-bitmap-pages.patch b/queue-5.2/btrfs-fix-allocation-of-free-space-cache-v1-bitmap-pages.patch new file mode 100644 index 00000000000..a40de5d7019 --- /dev/null +++ b/queue-5.2/btrfs-fix-allocation-of-free-space-cache-v1-bitmap-pages.patch @@ -0,0 +1,191 @@ +From 3acd48507dc43eeeb0a1fe965b8bad91cab904a7 Mon Sep 17 00:00:00 2001 +From: Christophe Leroy +Date: Wed, 21 Aug 2019 15:05:55 +0000 +Subject: btrfs: fix allocation of free space cache v1 bitmap pages + +From: Christophe Leroy + +commit 3acd48507dc43eeeb0a1fe965b8bad91cab904a7 upstream. + +Various notifications of type "BUG kmalloc-4096 () : Redzone +overwritten" have been observed recently in various parts of the kernel. +After some time, it has been made a relation with the use of BTRFS +filesystem and with SLUB_DEBUG turned on. 
+ +[ 22.809700] BUG kmalloc-4096 (Tainted: G W ): Redzone overwritten + +[ 22.810286] INFO: 0xbe1a5921-0xfbfc06cd. First byte 0x0 instead of 0xcc +[ 22.810866] INFO: Allocated in __load_free_space_cache+0x588/0x780 [btrfs] age=22 cpu=0 pid=224 +[ 22.811193] __slab_alloc.constprop.26+0x44/0x70 +[ 22.811345] kmem_cache_alloc_trace+0xf0/0x2ec +[ 22.811588] __load_free_space_cache+0x588/0x780 [btrfs] +[ 22.811848] load_free_space_cache+0xf4/0x1b0 [btrfs] +[ 22.812090] cache_block_group+0x1d0/0x3d0 [btrfs] +[ 22.812321] find_free_extent+0x680/0x12a4 [btrfs] +[ 22.812549] btrfs_reserve_extent+0xec/0x220 [btrfs] +[ 22.812785] btrfs_alloc_tree_block+0x178/0x5f4 [btrfs] +[ 22.813032] __btrfs_cow_block+0x150/0x5d4 [btrfs] +[ 22.813262] btrfs_cow_block+0x194/0x298 [btrfs] +[ 22.813484] commit_cowonly_roots+0x44/0x294 [btrfs] +[ 22.813718] btrfs_commit_transaction+0x63c/0xc0c [btrfs] +[ 22.813973] close_ctree+0xf8/0x2a4 [btrfs] +[ 22.814107] generic_shutdown_super+0x80/0x110 +[ 22.814250] kill_anon_super+0x18/0x30 +[ 22.814437] btrfs_kill_super+0x18/0x90 [btrfs] +[ 22.814590] INFO: Freed in proc_cgroup_show+0xc0/0x248 age=41 cpu=0 pid=83 +[ 22.814841] proc_cgroup_show+0xc0/0x248 +[ 22.814967] proc_single_show+0x54/0x98 +[ 22.815086] seq_read+0x278/0x45c +[ 22.815190] __vfs_read+0x28/0x17c +[ 22.815289] vfs_read+0xa8/0x14c +[ 22.815381] ksys_read+0x50/0x94 +[ 22.815475] ret_from_syscall+0x0/0x38 + +Commit 69d2480456d1 ("btrfs: use copy_page for copying pages instead of +memcpy") changed the way bitmap blocks are copied. But allthough bitmaps +have the size of a page, they were allocated with kzalloc(). + +Most of the time, kzalloc() allocates aligned blocks of memory, so +copy_page() can be used. But when some debug options like SLAB_DEBUG are +activated, kzalloc() may return unaligned pointer. 
+ +On powerpc, memcpy(), copy_page() and other copying functions use +'dcbz' instruction which provides an entire zeroed cacheline to avoid +memory read when the intention is to overwrite a full line. Functions +like memcpy() are writen to care about partial cachelines at the start +and end of the destination, but copy_page() assumes it gets pages. As +pages are naturally cache aligned, copy_page() doesn't care about +partial lines. This means that when copy_page() is called with a +misaligned pointer, a few leading bytes are zeroed. + +To fix it, allocate bitmaps through kmem_cache instead of using kzalloc() +The cache pool is created with PAGE_SIZE alignment constraint. + +Reported-by: Erhard F. +Bugzilla: https://bugzilla.kernel.org/show_bug.cgi?id=204371 +Fixes: 69d2480456d1 ("btrfs: use copy_page for copying pages instead of memcpy") +Cc: stable@vger.kernel.org # 4.19+ +Signed-off-by: Christophe Leroy +Reviewed-by: David Sterba +[ rename to btrfs_free_space_bitmap ] +Signed-off-by: David Sterba +Signed-off-by: Greg Kroah-Hartman + +--- + fs/btrfs/ctree.h | 1 + + fs/btrfs/free-space-cache.c | 20 +++++++++++++------- + fs/btrfs/inode.c | 8 ++++++++ + 3 files changed, 22 insertions(+), 7 deletions(-) + +--- a/fs/btrfs/ctree.h ++++ b/fs/btrfs/ctree.h +@@ -40,6 +40,7 @@ extern struct kmem_cache *btrfs_trans_ha + extern struct kmem_cache *btrfs_bit_radix_cachep; + extern struct kmem_cache *btrfs_path_cachep; + extern struct kmem_cache *btrfs_free_space_cachep; ++extern struct kmem_cache *btrfs_free_space_bitmap_cachep; + struct btrfs_ordered_sum; + struct btrfs_ref; + +--- a/fs/btrfs/free-space-cache.c ++++ b/fs/btrfs/free-space-cache.c +@@ -764,7 +764,8 @@ static int __load_free_space_cache(struc + } else { + ASSERT(num_bitmaps); + num_bitmaps--; +- e->bitmap = kzalloc(PAGE_SIZE, GFP_NOFS); ++ e->bitmap = kmem_cache_zalloc( ++ btrfs_free_space_bitmap_cachep, GFP_NOFS); + if (!e->bitmap) { + kmem_cache_free( + btrfs_free_space_cachep, e); +@@ -1881,7 +1882,7 @@ 
static void free_bitmap(struct btrfs_fre + struct btrfs_free_space *bitmap_info) + { + unlink_free_space(ctl, bitmap_info); +- kfree(bitmap_info->bitmap); ++ kmem_cache_free(btrfs_free_space_bitmap_cachep, bitmap_info->bitmap); + kmem_cache_free(btrfs_free_space_cachep, bitmap_info); + ctl->total_bitmaps--; + ctl->op->recalc_thresholds(ctl); +@@ -2135,7 +2136,8 @@ new_bitmap: + } + + /* allocate the bitmap */ +- info->bitmap = kzalloc(PAGE_SIZE, GFP_NOFS); ++ info->bitmap = kmem_cache_zalloc(btrfs_free_space_bitmap_cachep, ++ GFP_NOFS); + spin_lock(&ctl->tree_lock); + if (!info->bitmap) { + ret = -ENOMEM; +@@ -2146,7 +2148,9 @@ new_bitmap: + + out: + if (info) { +- kfree(info->bitmap); ++ if (info->bitmap) ++ kmem_cache_free(btrfs_free_space_bitmap_cachep, ++ info->bitmap); + kmem_cache_free(btrfs_free_space_cachep, info); + } + +@@ -2802,7 +2806,8 @@ out: + if (entry->bytes == 0) { + ctl->free_extents--; + if (entry->bitmap) { +- kfree(entry->bitmap); ++ kmem_cache_free(btrfs_free_space_bitmap_cachep, ++ entry->bitmap); + ctl->total_bitmaps--; + ctl->op->recalc_thresholds(ctl); + } +@@ -3606,7 +3611,7 @@ again: + } + + if (!map) { +- map = kzalloc(PAGE_SIZE, GFP_NOFS); ++ map = kmem_cache_zalloc(btrfs_free_space_bitmap_cachep, GFP_NOFS); + if (!map) { + kmem_cache_free(btrfs_free_space_cachep, info); + return -ENOMEM; +@@ -3635,7 +3640,8 @@ again: + + if (info) + kmem_cache_free(btrfs_free_space_cachep, info); +- kfree(map); ++ if (map) ++ kmem_cache_free(btrfs_free_space_bitmap_cachep, map); + return 0; + } + +--- a/fs/btrfs/inode.c ++++ b/fs/btrfs/inode.c +@@ -73,6 +73,7 @@ static struct kmem_cache *btrfs_inode_ca + struct kmem_cache *btrfs_trans_handle_cachep; + struct kmem_cache *btrfs_path_cachep; + struct kmem_cache *btrfs_free_space_cachep; ++struct kmem_cache *btrfs_free_space_bitmap_cachep; + + static int btrfs_setsize(struct inode *inode, struct iattr *attr); + static int btrfs_truncate(struct inode *inode, bool skip_writeback); +@@ -9361,6 +9362,7 @@ 
void __cold btrfs_destroy_cachep(void) + kmem_cache_destroy(btrfs_trans_handle_cachep); + kmem_cache_destroy(btrfs_path_cachep); + kmem_cache_destroy(btrfs_free_space_cachep); ++ kmem_cache_destroy(btrfs_free_space_bitmap_cachep); + } + + int __init btrfs_init_cachep(void) +@@ -9390,6 +9392,12 @@ int __init btrfs_init_cachep(void) + if (!btrfs_free_space_cachep) + goto fail; + ++ btrfs_free_space_bitmap_cachep = kmem_cache_create("btrfs_free_space_bitmap", ++ PAGE_SIZE, PAGE_SIZE, ++ SLAB_RED_ZONE, NULL); ++ if (!btrfs_free_space_bitmap_cachep) ++ goto fail; ++ + return 0; + fail: + btrfs_destroy_cachep(); diff --git a/queue-5.2/btrfs-fix-race-setting-up-and-completing-qgroup-rescan-workers.patch b/queue-5.2/btrfs-fix-race-setting-up-and-completing-qgroup-rescan-workers.patch new file mode 100644 index 00000000000..6aa18c6dda4 --- /dev/null +++ b/queue-5.2/btrfs-fix-race-setting-up-and-completing-qgroup-rescan-workers.patch @@ -0,0 +1,202 @@ +From 13fc1d271a2e3ab8a02071e711add01fab9271f6 Mon Sep 17 00:00:00 2001 +From: Filipe Manana +Date: Tue, 24 Sep 2019 10:49:54 +0100 +Subject: Btrfs: fix race setting up and completing qgroup rescan workers + +From: Filipe Manana + +commit 13fc1d271a2e3ab8a02071e711add01fab9271f6 upstream. + +There is a race between setting up a qgroup rescan worker and completing +a qgroup rescan worker that can lead to callers of the qgroup rescan wait +ioctl to either not wait for the rescan worker to complete or to hang +forever due to missing wake ups. The following diagram shows a sequence +of steps that illustrates the race. 
+ + CPU 1 CPU 2 CPU 3 + + btrfs_ioctl_quota_rescan() + btrfs_qgroup_rescan() + qgroup_rescan_init() + mutex_lock(&fs_info->qgroup_rescan_lock) + spin_lock(&fs_info->qgroup_lock) + + fs_info->qgroup_flags |= + BTRFS_QGROUP_STATUS_FLAG_RESCAN + + init_completion( + &fs_info->qgroup_rescan_completion) + + fs_info->qgroup_rescan_running = true + + mutex_unlock(&fs_info->qgroup_rescan_lock) + spin_unlock(&fs_info->qgroup_lock) + + btrfs_init_work() + --> starts the worker + + btrfs_qgroup_rescan_worker() + mutex_lock(&fs_info->qgroup_rescan_lock) + + fs_info->qgroup_flags &= + ~BTRFS_QGROUP_STATUS_FLAG_RESCAN + + mutex_unlock(&fs_info->qgroup_rescan_lock) + + starts transaction, updates qgroup status + item, etc + + btrfs_ioctl_quota_rescan() + btrfs_qgroup_rescan() + qgroup_rescan_init() + mutex_lock(&fs_info->qgroup_rescan_lock) + spin_lock(&fs_info->qgroup_lock) + + fs_info->qgroup_flags |= + BTRFS_QGROUP_STATUS_FLAG_RESCAN + + init_completion( + &fs_info->qgroup_rescan_completion) + + fs_info->qgroup_rescan_running = true + + mutex_unlock(&fs_info->qgroup_rescan_lock) + spin_unlock(&fs_info->qgroup_lock) + + btrfs_init_work() + --> starts another worker + + mutex_lock(&fs_info->qgroup_rescan_lock) + + fs_info->qgroup_rescan_running = false + + mutex_unlock(&fs_info->qgroup_rescan_lock) + + complete_all(&fs_info->qgroup_rescan_completion) + +Before the rescan worker started by the task at CPU 3 completes, if +another task calls btrfs_ioctl_quota_rescan(), it will get -EINPROGRESS +because the flag BTRFS_QGROUP_STATUS_FLAG_RESCAN is set at +fs_info->qgroup_flags, which is expected and correct behaviour. + +However if other task calls btrfs_ioctl_quota_rescan_wait() before the +rescan worker started by the task at CPU 3 completes, it will return +immediately without waiting for the new rescan worker to complete, +because fs_info->qgroup_rescan_running is set to false by CPU 2. + +This race is making test case btrfs/171 (from fstests) to fail often: + + btrfs/171 9s ... 
- output mismatch (see /home/fdmanana/git/hub/xfstests/results//btrfs/171.out.bad) +# --- tests/btrfs/171.out 2018-09-16 21:30:48.505104287 +0100 +# +++ /home/fdmanana/git/hub/xfstests/results//btrfs/171.out.bad 2019-09-19 02:01:36.938486039 +0100 +# @@ -1,2 +1,3 @@ +# QA output created by 171 +# +ERROR: quota rescan failed: Operation now in progress +# Silence is golden +# ... +# (Run 'diff -u /home/fdmanana/git/hub/xfstests/tests/btrfs/171.out /home/fdmanana/git/hub/xfstests/results//btrfs/171.out.bad' to see the entire diff) + +That is because the test calls the btrfs-progs commands "qgroup quota +rescan -w", "qgroup assign" and "qgroup remove" in a sequence that makes +calls to the rescan start ioctl fail with -EINPROGRESS (note the "btrfs" +commands 'qgroup assign' and 'qgroup remove' often call the rescan start +ioctl after calling the qgroup assign ioctl, +btrfs_ioctl_qgroup_assign()), since previous waits didn't actually wait +for a rescan worker to complete. + +Another problem the race can cause is missing wake ups for waiters, +since the call to complete_all() happens outside a critical section and +after clearing the flag BTRFS_QGROUP_STATUS_FLAG_RESCAN. In the sequence +diagram above, if we have a waiter for the first rescan task (executed +by CPU 2), then fs_info->qgroup_rescan_completion.wait is not empty, and +if after the rescan worker clears BTRFS_QGROUP_STATUS_FLAG_RESCAN and +before it calls complete_all() against +fs_info->qgroup_rescan_completion, the task at CPU 3 calls +init_completion() against fs_info->qgroup_rescan_completion which +re-initilizes its wait queue to an empty queue, therefore causing the +rescan worker at CPU 2 to call complete_all() against an empty queue, +never waking up the task waiting for that rescan worker. 
+ +Fix this by clearing BTRFS_QGROUP_STATUS_FLAG_RESCAN and setting +fs_info->qgroup_rescan_running to false in the same critical section, +delimited by the mutex fs_info->qgroup_rescan_lock, as well as doing the +call to complete_all() in that same critical section. This gives the +protection needed to avoid rescan wait ioctl callers not waiting for a +running rescan worker and the lost wake ups problem, since setting that +rescan flag and boolean as well as initializing the wait queue is done +already in a critical section delimited by that mutex (at +qgroup_rescan_init()). + +Fixes: 57254b6ebce4ce ("Btrfs: add ioctl to wait for qgroup rescan completion") +Fixes: d2c609b834d62f ("btrfs: properly track when rescan worker is running") +CC: stable@vger.kernel.org # 4.4+ +Reviewed-by: Josef Bacik +Signed-off-by: Filipe Manana +Signed-off-by: David Sterba +Signed-off-by: Greg Kroah-Hartman + +--- + fs/btrfs/qgroup.c | 33 +++++++++++++++++++-------------- + 1 file changed, 19 insertions(+), 14 deletions(-) + +--- a/fs/btrfs/qgroup.c ++++ b/fs/btrfs/qgroup.c +@@ -3154,9 +3154,6 @@ out: + btrfs_free_path(path); + + mutex_lock(&fs_info->qgroup_rescan_lock); +- if (!btrfs_fs_closing(fs_info)) +- fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_RESCAN; +- + if (err > 0 && + fs_info->qgroup_flags & BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT) { + fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_INCONSISTENT; +@@ -3172,16 +3169,30 @@ out: + trans = btrfs_start_transaction(fs_info->quota_root, 1); + if (IS_ERR(trans)) { + err = PTR_ERR(trans); ++ trans = NULL; + btrfs_err(fs_info, + "fail to start transaction for status update: %d", + err); +- goto done; + } +- ret = update_qgroup_status_item(trans); +- if (ret < 0) { +- err = ret; +- btrfs_err(fs_info, "fail to update qgroup status: %d", err); ++ ++ mutex_lock(&fs_info->qgroup_rescan_lock); ++ if (!btrfs_fs_closing(fs_info)) ++ fs_info->qgroup_flags &= ~BTRFS_QGROUP_STATUS_FLAG_RESCAN; ++ if (trans) { ++ ret = 
update_qgroup_status_item(trans); ++ if (ret < 0) { ++ err = ret; ++ btrfs_err(fs_info, "fail to update qgroup status: %d", ++ err); ++ } + } ++ fs_info->qgroup_rescan_running = false; ++ complete_all(&fs_info->qgroup_rescan_completion); ++ mutex_unlock(&fs_info->qgroup_rescan_lock); ++ ++ if (!trans) ++ return; ++ + btrfs_end_transaction(trans); + + if (btrfs_fs_closing(fs_info)) { +@@ -3192,12 +3203,6 @@ out: + } else { + btrfs_err(fs_info, "qgroup scan failed with %d", err); + } +- +-done: +- mutex_lock(&fs_info->qgroup_rescan_lock); +- fs_info->qgroup_rescan_running = false; +- mutex_unlock(&fs_info->qgroup_rescan_lock); +- complete_all(&fs_info->qgroup_rescan_completion); + } + + /* diff --git a/queue-5.2/btrfs-fix-use-after-free-when-using-the-tree-modification-log.patch b/queue-5.2/btrfs-fix-use-after-free-when-using-the-tree-modification-log.patch new file mode 100644 index 00000000000..f0b689367de --- /dev/null +++ b/queue-5.2/btrfs-fix-use-after-free-when-using-the-tree-modification-log.patch @@ -0,0 +1,99 @@ +From efad8a853ad2057f96664328a0d327a05ce39c76 Mon Sep 17 00:00:00 2001 +From: Filipe Manana +Date: Mon, 12 Aug 2019 19:14:29 +0100 +Subject: Btrfs: fix use-after-free when using the tree modification log + +From: Filipe Manana + +commit efad8a853ad2057f96664328a0d327a05ce39c76 upstream. + +At ctree.c:get_old_root(), we are accessing a root's header owner field +after we have freed the respective extent buffer. This results in an +use-after-free that can lead to crashes, and when CONFIG_DEBUG_PAGEALLOC +is set, results in a stack trace like the following: + + [ 3876.799331] stack segment: 0000 [#1] SMP DEBUG_PAGEALLOC PTI + [ 3876.799363] CPU: 0 PID: 15436 Comm: pool Not tainted 5.3.0-rc3-btrfs-next-54 #1 + [ 3876.799385] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS rel-1.12.0-0-ga698c8995f-prebuilt.qemu.org 04/01/2014 + [ 3876.799433] RIP: 0010:btrfs_search_old_slot+0x652/0xd80 [btrfs] + (...) 
+ [ 3876.799502] RSP: 0018:ffff9f08c1a2f9f0 EFLAGS: 00010286 + [ 3876.799518] RAX: ffff8dd300000000 RBX: ffff8dd85a7a9348 RCX: 000000038da26000 + [ 3876.799538] RDX: 0000000000000000 RSI: ffffe522ce368980 RDI: 0000000000000246 + [ 3876.799559] RBP: dae1922adadad000 R08: 0000000008020000 R09: ffffe522c0000000 + [ 3876.799579] R10: ffff8dd57fd788c8 R11: 000000007511b030 R12: ffff8dd781ddc000 + [ 3876.799599] R13: ffff8dd9e6240578 R14: ffff8dd6896f7a88 R15: ffff8dd688cf90b8 + [ 3876.799620] FS: 00007f23ddd97700(0000) GS:ffff8dda20200000(0000) knlGS:0000000000000000 + [ 3876.799643] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 + [ 3876.799660] CR2: 00007f23d4024000 CR3: 0000000710bb0005 CR4: 00000000003606f0 + [ 3876.799682] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 + [ 3876.799703] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 + [ 3876.799723] Call Trace: + [ 3876.799735] ? do_raw_spin_unlock+0x49/0xc0 + [ 3876.799749] ? _raw_spin_unlock+0x24/0x30 + [ 3876.799779] resolve_indirect_refs+0x1eb/0xc80 [btrfs] + [ 3876.799810] find_parent_nodes+0x38d/0x1180 [btrfs] + [ 3876.799841] btrfs_check_shared+0x11a/0x1d0 [btrfs] + [ 3876.799870] ? extent_fiemap+0x598/0x6e0 [btrfs] + [ 3876.799895] extent_fiemap+0x598/0x6e0 [btrfs] + [ 3876.799913] do_vfs_ioctl+0x45a/0x700 + [ 3876.799926] ksys_ioctl+0x70/0x80 + [ 3876.799938] ? trace_hardirqs_off_thunk+0x1a/0x20 + [ 3876.799953] __x64_sys_ioctl+0x16/0x20 + [ 3876.799965] do_syscall_64+0x62/0x220 + [ 3876.799977] entry_SYSCALL_64_after_hwframe+0x49/0xbe + [ 3876.799993] RIP: 0033:0x7f23e0013dd7 + (...) 
+ [ 3876.800056] RSP: 002b:00007f23ddd96ca8 EFLAGS: 00000246 ORIG_RAX: 0000000000000010 + [ 3876.800078] RAX: ffffffffffffffda RBX: 00007f23d80210f8 RCX: 00007f23e0013dd7 + [ 3876.800099] RDX: 00007f23d80210f8 RSI: 00000000c020660b RDI: 0000000000000003 + [ 3876.800626] RBP: 000055fa2a2a2440 R08: 0000000000000000 R09: 00007f23ddd96d7c + [ 3876.801143] R10: 00007f23d8022000 R11: 0000000000000246 R12: 00007f23ddd96d80 + [ 3876.801662] R13: 00007f23ddd96d78 R14: 00007f23d80210f0 R15: 00007f23ddd96d80 + (...) + [ 3876.805107] ---[ end trace e53161e179ef04f9 ]--- + +Fix that by saving the root's header owner field into a local variable +before freeing the root's extent buffer, and then use that local variable +when needed. + +Fixes: 30b0463a9394d9 ("Btrfs: fix accessing the root pointer in tree mod log functions") +CC: stable@vger.kernel.org # 3.10+ +Reviewed-by: Nikolay Borisov +Reviewed-by: Anand Jain +Signed-off-by: Filipe Manana +Reviewed-by: David Sterba +Signed-off-by: David Sterba +Signed-off-by: Greg Kroah-Hartman + +--- + fs/btrfs/ctree.c | 4 +++- + 1 file changed, 3 insertions(+), 1 deletion(-) + +--- a/fs/btrfs/ctree.c ++++ b/fs/btrfs/ctree.c +@@ -1343,6 +1343,7 @@ get_old_root(struct btrfs_root *root, u6 + struct tree_mod_elem *tm; + struct extent_buffer *eb = NULL; + struct extent_buffer *eb_root; ++ u64 eb_root_owner = 0; + struct extent_buffer *old; + struct tree_mod_root *old_root = NULL; + u64 old_generation = 0; +@@ -1380,6 +1381,7 @@ get_old_root(struct btrfs_root *root, u6 + free_extent_buffer(old); + } + } else if (old_root) { ++ eb_root_owner = btrfs_header_owner(eb_root); + btrfs_tree_read_unlock(eb_root); + free_extent_buffer(eb_root); + eb = alloc_dummy_extent_buffer(fs_info, logical); +@@ -1396,7 +1398,7 @@ get_old_root(struct btrfs_root *root, u6 + if (old_root) { + btrfs_set_header_bytenr(eb, eb->start); + btrfs_set_header_backref_rev(eb, BTRFS_MIXED_BACKREF_REV); +- btrfs_set_header_owner(eb, btrfs_header_owner(eb_root)); ++ 
btrfs_set_header_owner(eb, eb_root_owner); + btrfs_set_header_level(eb, old_root->level); + btrfs_set_header_generation(eb, old_generation); + } diff --git a/queue-5.2/btrfs-qgroup-fix-reserved-data-space-leak-if-we-have-multiple-reserve-calls.patch b/queue-5.2/btrfs-qgroup-fix-reserved-data-space-leak-if-we-have-multiple-reserve-calls.patch new file mode 100644 index 00000000000..082bd95200e --- /dev/null +++ b/queue-5.2/btrfs-qgroup-fix-reserved-data-space-leak-if-we-have-multiple-reserve-calls.patch @@ -0,0 +1,92 @@ +From d4e204948fe3e0dc8e1fbf3f8f3290c9c2823be3 Mon Sep 17 00:00:00 2001 +From: Qu Wenruo +Date: Mon, 16 Sep 2019 20:02:39 +0800 +Subject: btrfs: qgroup: Fix reserved data space leak if we have multiple reserve calls + +From: Qu Wenruo + +commit d4e204948fe3e0dc8e1fbf3f8f3290c9c2823be3 upstream. + +[BUG] +The following script can cause btrfs qgroup data space leak: + + mkfs.btrfs -f $dev + mount $dev -o nospace_cache $mnt + + btrfs subv create $mnt/subv + btrfs quota en $mnt + btrfs quota rescan -w $mnt + btrfs qgroup limit 128m $mnt/subv + + for (( i = 0; i < 3; i++)); do + # Create 3 64M holes for latter fallocate to fail + truncate -s 192m $mnt/subv/file + xfs_io -c "pwrite 64m 4k" $mnt/subv/file > /dev/null + xfs_io -c "pwrite 128m 4k" $mnt/subv/file > /dev/null + sync + + # it's supposed to fail, and each failure will leak at least 64M + # data space + xfs_io -f -c "falloc 0 192m" $mnt/subv/file &> /dev/null + rm $mnt/subv/file + sync + done + + # Shouldn't fail after we removed the file + xfs_io -f -c "falloc 0 64m" $mnt/subv/file + +[CAUSE] +Btrfs qgroup data reserve code allow multiple reservations to happen on +a single extent_changeset: +E.g: + btrfs_qgroup_reserve_data(inode, &data_reserved, 0, SZ_1M); + btrfs_qgroup_reserve_data(inode, &data_reserved, SZ_1M, SZ_2M); + btrfs_qgroup_reserve_data(inode, &data_reserved, 0, SZ_4M); + +Btrfs qgroup code has its internal tracking to make sure we don't +double-reserve in above example. 
+ +The only pattern utilizing this feature is in the main while loop of +btrfs_fallocate() function. + +However btrfs_qgroup_reserve_data()'s error handling has a bug in that +on error it clears all ranges in the io_tree with EXTENT_QGROUP_RESERVED +flag but doesn't free previously reserved bytes. + +This bug has a two fold effect: +- Clearing EXTENT_QGROUP_RESERVED ranges + This is the correct behavior, but it prevents + btrfs_qgroup_check_reserved_leak() to catch the leakage as the + detector is purely EXTENT_QGROUP_RESERVED flag based. + +- Leak the previously reserved data bytes. + +The bug manifests when N calls to btrfs_qgroup_reserve_data are made and +the last one fails, leaking space reserved in the previous ones. + +[FIX] +Also free previously reserved data bytes when btrfs_qgroup_reserve_data +fails. + +Fixes: 524725537023 ("btrfs: qgroup: Introduce btrfs_qgroup_reserve_data function") +CC: stable@vger.kernel.org # 4.4+ +Signed-off-by: Qu Wenruo +Signed-off-by: David Sterba +Signed-off-by: Greg Kroah-Hartman + +--- + fs/btrfs/qgroup.c | 3 +++ + 1 file changed, 3 insertions(+) + +--- a/fs/btrfs/qgroup.c ++++ b/fs/btrfs/qgroup.c +@@ -3425,6 +3425,9 @@ cleanup: + while ((unode = ulist_next(&reserved->range_changed, &uiter))) + clear_extent_bit(&BTRFS_I(inode)->io_tree, unode->val, + unode->aux, EXTENT_QGROUP_RESERVED, 0, 0, NULL); ++ /* Also free data bytes of already reserved one */ ++ btrfs_qgroup_free_refroot(root->fs_info, root->root_key.objectid, ++ orig_reserved, BTRFS_QGROUP_RSV_DATA); + extent_changeset_release(reserved); + return ret; + } diff --git a/queue-5.2/btrfs-qgroup-fix-the-wrong-target-io_tree-when-freeing-reserved-data-space.patch b/queue-5.2/btrfs-qgroup-fix-the-wrong-target-io_tree-when-freeing-reserved-data-space.patch new file mode 100644 index 00000000000..cecc957915e --- /dev/null +++ b/queue-5.2/btrfs-qgroup-fix-the-wrong-target-io_tree-when-freeing-reserved-data-space.patch @@ -0,0 +1,81 @@ +From 
bab32fc069ce8829c416e8737c119f62a57970f9 Mon Sep 17 00:00:00 2001 +From: Qu Wenruo +Date: Mon, 16 Sep 2019 20:02:38 +0800 +Subject: btrfs: qgroup: Fix the wrong target io_tree when freeing reserved data space + +From: Qu Wenruo + +commit bab32fc069ce8829c416e8737c119f62a57970f9 upstream. + +[BUG] +Under the following case with qgroup enabled, if some error happened +after we have reserved delalloc space, then in error handling path, we +could cause qgroup data space leakage: + +From btrfs_truncate_block() in inode.c: + + ret = btrfs_delalloc_reserve_space(inode, &data_reserved, + block_start, blocksize); + if (ret) + goto out; + + again: + page = find_or_create_page(mapping, index, mask); + if (!page) { + btrfs_delalloc_release_space(inode, data_reserved, + block_start, blocksize, true); + btrfs_delalloc_release_extents(BTRFS_I(inode), blocksize, true); + ret = -ENOMEM; + goto out; + } + +[CAUSE] +In the above case, btrfs_delalloc_reserve_space() will call +btrfs_qgroup_reserve_data() and mark the io_tree range with +EXTENT_QGROUP_RESERVED flag. + +In the error handling path, we have the following call stack: +btrfs_delalloc_release_space() +|- btrfs_free_reserved_data_space() + |- btrsf_qgroup_free_data() + |- __btrfs_qgroup_release_data(reserved=@reserved, free=1) + |- qgroup_free_reserved_data(reserved=@reserved) + |- clear_record_extent_bits(); + |- freed += changeset.bytes_changed; + +However due to a completion bug, qgroup_free_reserved_data() will clear +EXTENT_QGROUP_RESERVED flag in BTRFS_I(inode)->io_failure_tree, other +than the correct BTRFS_I(inode)->io_tree. +Since io_failure_tree is never marked with that flag, +btrfs_qgroup_free_data() will not free any data reserved space at all, +causing a leakage. + +This type of error handling can only be triggered by errors outside of +qgroup code. So EDQUOT error from qgroup can't trigger it. + +[FIX] +Fix the wrong target io_tree. 
+ +Reported-by: Josef Bacik +Fixes: bc42bda22345 ("btrfs: qgroup: Fix qgroup reserved space underflow by only freeing reserved ranges") +CC: stable@vger.kernel.org # 4.14+ +Reviewed-by: Nikolay Borisov +Signed-off-by: Qu Wenruo +Signed-off-by: David Sterba +Signed-off-by: Greg Kroah-Hartman + +--- + fs/btrfs/qgroup.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +--- a/fs/btrfs/qgroup.c ++++ b/fs/btrfs/qgroup.c +@@ -3469,7 +3469,7 @@ static int qgroup_free_reserved_data(str + * EXTENT_QGROUP_RESERVED, we won't double free. + * So not need to rush. + */ +- ret = clear_record_extent_bits(&BTRFS_I(inode)->io_failure_tree, ++ ret = clear_record_extent_bits(&BTRFS_I(inode)->io_tree, + free_start, free_start + free_len - 1, + EXTENT_QGROUP_RESERVED, &changeset); + if (ret < 0) diff --git a/queue-5.2/btrfs-relinquish-cpus-in-btrfs_compare_trees.patch b/queue-5.2/btrfs-relinquish-cpus-in-btrfs_compare_trees.patch new file mode 100644 index 00000000000..48d7691edc9 --- /dev/null +++ b/queue-5.2/btrfs-relinquish-cpus-in-btrfs_compare_trees.patch @@ -0,0 +1,69 @@ +From 6af112b11a4bc1b560f60a618ac9c1dcefe9836e Mon Sep 17 00:00:00 2001 +From: Nikolay Borisov +Date: Wed, 4 Sep 2019 19:33:58 +0300 +Subject: btrfs: Relinquish CPUs in btrfs_compare_trees + +From: Nikolay Borisov + +commit 6af112b11a4bc1b560f60a618ac9c1dcefe9836e upstream. + +When doing any form of incremental send the parent and the child trees +need to be compared via btrfs_compare_trees. This can result in long +loop chains without ever relinquishing the CPU. This causes softlockup +detector to trigger when comparing trees with a lot of items. Example +report: + +watchdog: BUG: soft lockup - CPU#0 stuck for 24s! 
[snapperd:16153] +CPU: 0 PID: 16153 Comm: snapperd Not tainted 5.2.9-1-default #1 openSUSE Tumbleweed (unreleased) +Hardware name: QEMU KVM Virtual Machine, BIOS 0.0.0 02/06/2015 +pstate: 40000005 (nZcv daif -PAN -UAO) +pc : __ll_sc_arch_atomic_sub_return+0x14/0x20 +lr : btrfs_release_extent_buffer_pages+0xe0/0x1e8 [btrfs] +sp : ffff00001273b7e0 +Call trace: + __ll_sc_arch_atomic_sub_return+0x14/0x20 + release_extent_buffer+0xdc/0x120 [btrfs] + free_extent_buffer.part.0+0xb0/0x118 [btrfs] + free_extent_buffer+0x24/0x30 [btrfs] + btrfs_release_path+0x4c/0xa0 [btrfs] + btrfs_free_path.part.0+0x20/0x40 [btrfs] + btrfs_free_path+0x24/0x30 [btrfs] + get_inode_info+0xa8/0xf8 [btrfs] + finish_inode_if_needed+0xe0/0x6d8 [btrfs] + changed_cb+0x9c/0x410 [btrfs] + btrfs_compare_trees+0x284/0x648 [btrfs] + send_subvol+0x33c/0x520 [btrfs] + btrfs_ioctl_send+0x8a0/0xaf0 [btrfs] + btrfs_ioctl+0x199c/0x2288 [btrfs] + do_vfs_ioctl+0x4b0/0x820 + ksys_ioctl+0x84/0xb8 + __arm64_sys_ioctl+0x28/0x38 + el0_svc_common.constprop.0+0x7c/0x188 + el0_svc_handler+0x34/0x90 + el0_svc+0x8/0xc + +Fix this by adding a call to cond_resched at the beginning of the main +loop in btrfs_compare_trees. 
+ +Fixes: 7069830a9e38 ("Btrfs: add btrfs_compare_trees function") +CC: stable@vger.kernel.org # 4.4+ +Reviewed-by: Johannes Thumshirn +Signed-off-by: Nikolay Borisov +Reviewed-by: David Sterba +Signed-off-by: David Sterba +Signed-off-by: Greg Kroah-Hartman + +--- + fs/btrfs/ctree.c | 1 + + 1 file changed, 1 insertion(+) + +--- a/fs/btrfs/ctree.c ++++ b/fs/btrfs/ctree.c +@@ -5477,6 +5477,7 @@ int btrfs_compare_trees(struct btrfs_roo + advance_left = advance_right = 0; + + while (1) { ++ cond_resched(); + if (advance_left && !left_end_reached) { + ret = tree_advance(left_path, &left_level, + left_root_level, diff --git a/queue-5.2/efifb-bgrt-improve-efifb_bgrt_sanity_check.patch b/queue-5.2/efifb-bgrt-improve-efifb_bgrt_sanity_check.patch new file mode 100644 index 00000000000..67845f9585f --- /dev/null +++ b/queue-5.2/efifb-bgrt-improve-efifb_bgrt_sanity_check.patch @@ -0,0 +1,71 @@ +From 51677dfcc17f88ed754143df670ff064eae67f84 Mon Sep 17 00:00:00 2001 +From: Hans de Goede +Date: Sun, 21 Jul 2019 15:19:18 +0200 +Subject: efifb: BGRT: Improve efifb_bgrt_sanity_check + +From: Hans de Goede + +commit 51677dfcc17f88ed754143df670ff064eae67f84 upstream. + +For various reasons, at least with x86 EFI firmwares, the xoffset and +yoffset in the BGRT info are not always reliable. + +Extensive testing has shown that when the info is correct, the +BGRT image is always exactly centered horizontally (the yoffset variable +is more variable and not always predictable). + +This commit simplifies / improves the bgrt_sanity_check to simply +check that the BGRT image is exactly centered horizontally and skips +(re)drawing it when it is not. + +This fixes the BGRT image sometimes being drawn in the wrong place. 
+ +Cc: stable@vger.kernel.org +Fixes: 88fe4ceb2447 ("efifb: BGRT: Do not copy the boot graphics for non native resolutions") +Signed-off-by: Hans de Goede +Cc: Peter Jones , +Signed-off-by: Bartlomiej Zolnierkiewicz +Link: https://patchwork.freedesktop.org/patch/msgid/20190721131918.10115-1-hdegoede@redhat.com +Signed-off-by: Greg Kroah-Hartman + +--- + drivers/video/fbdev/efifb.c | 27 ++++++--------------------- + 1 file changed, 6 insertions(+), 21 deletions(-) + +--- a/drivers/video/fbdev/efifb.c ++++ b/drivers/video/fbdev/efifb.c +@@ -122,28 +122,13 @@ static void efifb_copy_bmp(u8 *src, u32 + */ + static bool efifb_bgrt_sanity_check(struct screen_info *si, u32 bmp_width) + { +- static const int default_resolutions[][2] = { +- { 800, 600 }, +- { 1024, 768 }, +- { 1280, 1024 }, +- }; +- u32 i, right_margin; ++ /* ++ * All x86 firmwares horizontally center the image (the yoffset ++ * calculations differ between boards, but xoffset is predictable). ++ */ ++ u32 expected_xoffset = (si->lfb_width - bmp_width) / 2; + +- for (i = 0; i < ARRAY_SIZE(default_resolutions); i++) { +- if (default_resolutions[i][0] == si->lfb_width && +- default_resolutions[i][1] == si->lfb_height) +- break; +- } +- /* If not a default resolution used for textmode, this should be fine */ +- if (i >= ARRAY_SIZE(default_resolutions)) +- return true; +- +- /* If the right margin is 5 times smaller then the left one, reject */ +- right_margin = si->lfb_width - (bgrt_tab.image_offset_x + bmp_width); +- if (right_margin < (bgrt_tab.image_offset_x / 5)) +- return false; +- +- return true; ++ return bgrt_tab.image_offset_x == expected_xoffset; + } + #else + static bool efifb_bgrt_sanity_check(struct screen_info *si, u32 bmp_width) diff --git a/queue-5.2/gfs2-clear-buf_in_tr-when-ending-a-transaction-in-sweep_bh_for_rgrps.patch b/queue-5.2/gfs2-clear-buf_in_tr-when-ending-a-transaction-in-sweep_bh_for_rgrps.patch new file mode 100644 index 00000000000..57c4d3ab97f --- /dev/null +++ 
b/queue-5.2/gfs2-clear-buf_in_tr-when-ending-a-transaction-in-sweep_bh_for_rgrps.patch @@ -0,0 +1,39 @@ +From f0b444b349e33ae0d3dd93e25ca365482a5d17d4 Mon Sep 17 00:00:00 2001 +From: Bob Peterson +Date: Thu, 12 Sep 2019 13:54:27 -0400 +Subject: gfs2: clear buf_in_tr when ending a transaction in sweep_bh_for_rgrps + +From: Bob Peterson + +commit f0b444b349e33ae0d3dd93e25ca365482a5d17d4 upstream. + +In function sweep_bh_for_rgrps, which is a helper for punch_hole, +it uses variable buf_in_tr to keep track of when it needs to commit +pending block frees on a partial delete that overflows the +transaction created for the delete. The problem is that the +variable was initialized at the start of function sweep_bh_for_rgrps +but it was never cleared, even when starting a new transaction. + +This patch reinitializes the variable when the transaction is +ended, so the next transaction starts out with it cleared. + +Fixes: d552a2b9b33e ("GFS2: Non-recursive delete") +Cc: stable@vger.kernel.org # v4.12+ +Signed-off-by: Bob Peterson +Signed-off-by: Andreas Gruenbacher +Signed-off-by: Greg Kroah-Hartman + +--- + fs/gfs2/bmap.c | 1 + + 1 file changed, 1 insertion(+) + +--- a/fs/gfs2/bmap.c ++++ b/fs/gfs2/bmap.c +@@ -1670,6 +1670,7 @@ out_unlock: + brelse(dibh); + up_write(&ip->i_rw_mutex); + gfs2_trans_end(sdp); ++ buf_in_tr = false; + } + gfs2_glock_dq_uninit(rd_gh); + cond_resched(); diff --git a/queue-5.2/i40e-check-__i40e_vf_disable-bit-in-i40e_sync_filters_subtask.patch b/queue-5.2/i40e-check-__i40e_vf_disable-bit-in-i40e_sync_filters_subtask.patch new file mode 100644 index 00000000000..67aa4253d9d --- /dev/null +++ b/queue-5.2/i40e-check-__i40e_vf_disable-bit-in-i40e_sync_filters_subtask.patch @@ -0,0 +1,74 @@ +From a7542b87607560d0b89e7ff81d870bd6ff8835cb Mon Sep 17 00:00:00 2001 +From: Stefan Assmann +Date: Wed, 21 Aug 2019 16:09:29 +0200 +Subject: i40e: check __I40E_VF_DISABLE bit in i40e_sync_filters_subtask + +From: Stefan Assmann + +commit 
a7542b87607560d0b89e7ff81d870bd6ff8835cb upstream. + +While testing VF spawn/destroy the following panic occurred. + +BUG: unable to handle kernel NULL pointer dereference at 0000000000000029 +[...] +Workqueue: i40e i40e_service_task [i40e] +RIP: 0010:i40e_sync_vsi_filters+0x6fd/0xc60 [i40e] +[...] +Call Trace: + ? __switch_to_asm+0x35/0x70 + ? __switch_to_asm+0x41/0x70 + ? __switch_to_asm+0x35/0x70 + ? _cond_resched+0x15/0x30 + i40e_sync_filters_subtask+0x56/0x70 [i40e] + i40e_service_task+0x382/0x11b0 [i40e] + ? __switch_to_asm+0x41/0x70 + ? __switch_to_asm+0x41/0x70 + process_one_work+0x1a7/0x3b0 + worker_thread+0x30/0x390 + ? create_worker+0x1a0/0x1a0 + kthread+0x112/0x130 + ? kthread_bind+0x30/0x30 + ret_from_fork+0x35/0x40 + +Investigation revealed a race where pf->vf[vsi->vf_id].trusted may get +accessed by the watchdog via i40e_sync_filters_subtask() although +i40e_free_vfs() already free'd pf->vf. +To avoid this the call to i40e_sync_vsi_filters() in +i40e_sync_filters_subtask() needs to be guarded by __I40E_VF_DISABLE, +which is also used by i40e_free_vfs(). + +Note: put the __I40E_VF_DISABLE check after the +__I40E_MACVLAN_SYNC_PENDING check as the latter is more likely to +trigger. 
+ +CC: stable@vger.kernel.org +Signed-off-by: Stefan Assmann +Tested-by: Andrew Bowers +Signed-off-by: Jeff Kirsher +Signed-off-by: Greg Kroah-Hartman + +--- + drivers/net/ethernet/intel/i40e/i40e_main.c | 5 +++++ + 1 file changed, 5 insertions(+) + +--- a/drivers/net/ethernet/intel/i40e/i40e_main.c ++++ b/drivers/net/ethernet/intel/i40e/i40e_main.c +@@ -2586,6 +2586,10 @@ static void i40e_sync_filters_subtask(st + return; + if (!test_and_clear_bit(__I40E_MACVLAN_SYNC_PENDING, pf->state)) + return; ++ if (test_and_set_bit(__I40E_VF_DISABLE, pf->state)) { ++ set_bit(__I40E_MACVLAN_SYNC_PENDING, pf->state); ++ return; ++ } + + for (v = 0; v < pf->num_alloc_vsi; v++) { + if (pf->vsi[v] && +@@ -2600,6 +2604,7 @@ static void i40e_sync_filters_subtask(st + } + } + } ++ clear_bit(__I40E_VF_DISABLE, pf->state); + } + + /** diff --git a/queue-5.2/keys-trusted-correctly-initialize-digests-and-fix-locking-issue.patch b/queue-5.2/keys-trusted-correctly-initialize-digests-and-fix-locking-issue.patch new file mode 100644 index 00000000000..31073aa4139 --- /dev/null +++ b/queue-5.2/keys-trusted-correctly-initialize-digests-and-fix-locking-issue.patch @@ -0,0 +1,83 @@ +From 9f75c82246313d4c2a6bc77e947b45655b3b5ad5 Mon Sep 17 00:00:00 2001 +From: Roberto Sassu +Date: Fri, 13 Sep 2019 20:51:36 +0200 +Subject: KEYS: trusted: correctly initialize digests and fix locking issue + +From: Roberto Sassu + +commit 9f75c82246313d4c2a6bc77e947b45655b3b5ad5 upstream. + +Commit 0b6cf6b97b7e ("tpm: pass an array of tpm_extend_digest structures to +tpm_pcr_extend()") modifies tpm_pcr_extend() to accept a digest for each +PCR bank. After modification, tpm_pcr_extend() expects that digests are +passed in the same order as the algorithms set in chip->allocated_banks. 
+ +This patch fixes two issues introduced in the last iterations of the patch +set: missing initialization of the TPM algorithm ID in the tpm_digest +structures passed to tpm_pcr_extend() by the trusted key module, and +unreleased locks in the TPM driver due to returning from tpm_pcr_extend() +without calling tpm_put_ops(). + +Cc: stable@vger.kernel.org +Fixes: 0b6cf6b97b7e ("tpm: pass an array of tpm_extend_digest structures to tpm_pcr_extend()") +Signed-off-by: Roberto Sassu +Suggested-by: Jarkko Sakkinen +Reviewed-by: Jerry Snitselaar +Reviewed-by: Jarkko Sakkinen +Signed-off-by: Jarkko Sakkinen +Signed-off-by: Greg Kroah-Hartman + +--- + drivers/char/tpm/tpm-interface.c | 14 +++++++++----- + security/keys/trusted.c | 5 +++++ + 2 files changed, 14 insertions(+), 5 deletions(-) + +--- a/drivers/char/tpm/tpm-interface.c ++++ b/drivers/char/tpm/tpm-interface.c +@@ -320,18 +320,22 @@ int tpm_pcr_extend(struct tpm_chip *chip + if (!chip) + return -ENODEV; + +- for (i = 0; i < chip->nr_allocated_banks; i++) +- if (digests[i].alg_id != chip->allocated_banks[i].alg_id) +- return -EINVAL; ++ for (i = 0; i < chip->nr_allocated_banks; i++) { ++ if (digests[i].alg_id != chip->allocated_banks[i].alg_id) { ++ rc = -EINVAL; ++ goto out; ++ } ++ } + + if (chip->flags & TPM_CHIP_FLAG_TPM2) { + rc = tpm2_pcr_extend(chip, pcr_idx, digests); +- tpm_put_ops(chip); +- return rc; ++ goto out; + } + + rc = tpm1_pcr_extend(chip, pcr_idx, digests[0].digest, + "attempting extend a PCR value"); ++ ++out: + tpm_put_ops(chip); + return rc; + } +--- a/security/keys/trusted.c ++++ b/security/keys/trusted.c +@@ -1228,11 +1228,16 @@ hashalg_fail: + + static int __init init_digests(void) + { ++ int i; ++ + digests = kcalloc(chip->nr_allocated_banks, sizeof(*digests), + GFP_KERNEL); + if (!digests) + return -ENOMEM; + ++ for (i = 0; i < chip->nr_allocated_banks; i++) ++ digests[i].alg_id = chip->allocated_banks[i].alg_id; ++ + return 0; + } + diff --git 
a/queue-5.2/lib-lzo-lzo1x_compress.c-fix-alignment-bug-in-lzo-rle.patch b/queue-5.2/lib-lzo-lzo1x_compress.c-fix-alignment-bug-in-lzo-rle.patch new file mode 100644 index 00000000000..53db45da280 --- /dev/null +++ b/queue-5.2/lib-lzo-lzo1x_compress.c-fix-alignment-bug-in-lzo-rle.patch @@ -0,0 +1,54 @@ +From 09b35b4192f6682dff96a093ab1930998cdb73b4 Mon Sep 17 00:00:00 2001 +From: Dave Rodgman +Date: Wed, 25 Sep 2019 16:48:24 -0700 +Subject: lib/lzo/lzo1x_compress.c: fix alignment bug in lzo-rle + +From: Dave Rodgman + +commit 09b35b4192f6682dff96a093ab1930998cdb73b4 upstream. + +Fix an unaligned access which breaks on platforms where this is not +permitted (e.g., Sparc). + +Link: http://lkml.kernel.org/r/20190912145502.35229-1-dave.rodgman@arm.com +Signed-off-by: Dave Rodgman +Cc: Dave Rodgman +Cc: Markus F.X.J. Oberhumer +Cc: Minchan Kim +Cc: +Signed-off-by: Andrew Morton +Signed-off-by: Linus Torvalds +Signed-off-by: Greg Kroah-Hartman + +--- + lib/lzo/lzo1x_compress.c | 14 ++++++++------ + 1 file changed, 8 insertions(+), 6 deletions(-) + +--- a/lib/lzo/lzo1x_compress.c ++++ b/lib/lzo/lzo1x_compress.c +@@ -83,17 +83,19 @@ next: + ALIGN((uintptr_t)ir, 4)) && + (ir < limit) && (*ir == 0)) + ir++; +- for (; (ir + 4) <= limit; ir += 4) { +- dv = *((u32 *)ir); +- if (dv) { ++ if (IS_ALIGNED((uintptr_t)ir, 4)) { ++ for (; (ir + 4) <= limit; ir += 4) { ++ dv = *((u32 *)ir); ++ if (dv) { + # if defined(__LITTLE_ENDIAN) +- ir += __builtin_ctz(dv) >> 3; ++ ir += __builtin_ctz(dv) >> 3; + # elif defined(__BIG_ENDIAN) +- ir += __builtin_clz(dv) >> 3; ++ ir += __builtin_clz(dv) >> 3; + # else + # error "missing endian definition" + # endif +- break; ++ break; ++ } + } + } + #endif diff --git a/queue-5.2/memcg-kmem-do-not-fail-__gfp_nofail-charges.patch b/queue-5.2/memcg-kmem-do-not-fail-__gfp_nofail-charges.patch new file mode 100644 index 00000000000..93aa0ce3a58 --- /dev/null +++ b/queue-5.2/memcg-kmem-do-not-fail-__gfp_nofail-charges.patch @@ -0,0 +1,87 @@ +From 
e55d9d9bfb69405bd7615c0f8d229d8fafb3e9b8 Mon Sep 17 00:00:00 2001 +From: Michal Hocko +Date: Wed, 25 Sep 2019 16:45:53 -0700 +Subject: memcg, kmem: do not fail __GFP_NOFAIL charges + +From: Michal Hocko + +commit e55d9d9bfb69405bd7615c0f8d229d8fafb3e9b8 upstream. + +Thomas has noticed the following NULL ptr dereference when using cgroup +v1 kmem limit: +BUG: unable to handle kernel NULL pointer dereference at 0000000000000008 +PGD 0 +P4D 0 +Oops: 0000 [#1] PREEMPT SMP PTI +CPU: 3 PID: 16923 Comm: gtk-update-icon Not tainted 4.19.51 #42 +Hardware name: Gigabyte Technology Co., Ltd. Z97X-Gaming G1/Z97X-Gaming G1, BIOS F9 07/31/2015 +RIP: 0010:create_empty_buffers+0x24/0x100 +Code: cd 0f 1f 44 00 00 0f 1f 44 00 00 41 54 49 89 d4 ba 01 00 00 00 55 53 48 89 fb e8 97 fe ff ff 48 89 c5 48 89 c2 eb 03 48 89 ca <48> 8b 4a 08 4c 09 22 48 85 c9 75 f1 48 89 6a 08 48 8b 43 18 48 8d +RSP: 0018:ffff927ac1b37bf8 EFLAGS: 00010286 +RAX: 0000000000000000 RBX: fffff2d4429fd740 RCX: 0000000100097149 +RDX: 0000000000000000 RSI: 0000000000000082 RDI: ffff9075a99fbe00 +RBP: 0000000000000000 R08: fffff2d440949cc8 R09: 00000000000960c0 +R10: 0000000000000002 R11: 0000000000000000 R12: 0000000000000000 +R13: ffff907601f18360 R14: 0000000000002000 R15: 0000000000001000 +FS: 00007fb55b288bc0(0000) GS:ffff90761f8c0000(0000) knlGS:0000000000000000 +CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 +CR2: 0000000000000008 CR3: 000000007aebc002 CR4: 00000000001606e0 +Call Trace: + create_page_buffers+0x4d/0x60 + __block_write_begin_int+0x8e/0x5a0 + ? ext4_inode_attach_jinode.part.82+0xb0/0xb0 + ? jbd2__journal_start+0xd7/0x1f0 + ext4_da_write_begin+0x112/0x3d0 + generic_perform_write+0xf1/0x1b0 + ? 
file_update_time+0x70/0x140 + __generic_file_write_iter+0x141/0x1a0 + ext4_file_write_iter+0xef/0x3b0 + __vfs_write+0x17e/0x1e0 + vfs_write+0xa5/0x1a0 + ksys_write+0x57/0xd0 + do_syscall_64+0x55/0x160 + entry_SYSCALL_64_after_hwframe+0x44/0xa9 + +Tetsuo then noticed that this is because the __memcg_kmem_charge_memcg +fails __GFP_NOFAIL charge when the kmem limit is reached. This is a wrong +behavior because nofail allocations are not allowed to fail. Normal +charge path simply forces the charge even if that means to cross the +limit. Kmem accounting should be doing the same. + +Link: http://lkml.kernel.org/r/20190906125608.32129-1-mhocko@kernel.org +Signed-off-by: Michal Hocko +Reported-by: Thomas Lindroth +Debugged-by: Tetsuo Handa +Cc: Johannes Weiner +Cc: Vladimir Davydov +Cc: Andrey Ryabinin +Cc: Thomas Lindroth +Cc: Shakeel Butt +Cc: +Signed-off-by: Andrew Morton +Signed-off-by: Linus Torvalds +Signed-off-by: Greg Kroah-Hartman + +--- + mm/memcontrol.c | 10 ++++++++++ + 1 file changed, 10 insertions(+) + +--- a/mm/memcontrol.c ++++ b/mm/memcontrol.c +@@ -2719,6 +2719,16 @@ int __memcg_kmem_charge_memcg(struct pag + + if (!cgroup_subsys_on_dfl(memory_cgrp_subsys) && + !page_counter_try_charge(&memcg->kmem, nr_pages, &counter)) { ++ ++ /* ++ * Enforce __GFP_NOFAIL allocation because callers are not ++ * prepared to see failures and likely do not have any failure ++ * handling code. 
++ */ ++ if (gfp & __GFP_NOFAIL) { ++ page_counter_charge(&memcg->kmem, nr_pages); ++ return 0; ++ } + cancel_charge(memcg, nr_pages); + return -ENOMEM; + } diff --git a/queue-5.2/memcg-oom-don-t-require-__gfp_fs-when-invoking-memcg-oom-killer.patch b/queue-5.2/memcg-oom-don-t-require-__gfp_fs-when-invoking-memcg-oom-killer.patch new file mode 100644 index 00000000000..b986ebde53a --- /dev/null +++ b/queue-5.2/memcg-oom-don-t-require-__gfp_fs-when-invoking-memcg-oom-killer.patch @@ -0,0 +1,182 @@ +From f9c645621a28e37813a1de96d9cbd89cde94a1e4 Mon Sep 17 00:00:00 2001 +From: Tetsuo Handa +Date: Mon, 23 Sep 2019 15:37:08 -0700 +Subject: memcg, oom: don't require __GFP_FS when invoking memcg OOM killer + +From: Tetsuo Handa + +commit f9c645621a28e37813a1de96d9cbd89cde94a1e4 upstream. + +Masoud Sharbiani noticed that commit 29ef680ae7c21110 ("memcg, oom: move +out_of_memory back to the charge path") broke memcg OOM called from +__xfs_filemap_fault() path. It turned out that try_charge() is retrying +forever without making forward progress because mem_cgroup_oom(GFP_NOFS) +cannot invoke the OOM killer due to commit 3da88fb3bacfaa33 ("mm, oom: +move GFP_NOFS check to out_of_memory"). + +Allowing forced charge due to being unable to invoke memcg OOM killer will +lead to global OOM situation. Also, just returning -ENOMEM will be risky +because OOM path is lost and some paths (e.g. get_user_pages()) will leak +-ENOMEM. Therefore, invoking memcg OOM killer (despite GFP_NOFS) will be +the only choice we can choose for now. + +Until 29ef680ae7c21110, we were able to invoke memcg OOM killer when +GFP_KERNEL reclaim failed [1]. But since 29ef680ae7c21110, we need to +invoke memcg OOM killer when GFP_NOFS reclaim failed [2]. Although in the +past we did invoke memcg OOM killer for GFP_NOFS [3], we might get +pre-mature memcg OOM reports due to this patch. 
+ +[1] + + leaker invoked oom-killer: gfp_mask=0x6200ca(GFP_HIGHUSER_MOVABLE), nodemask=(null), order=0, oom_score_adj=0 + CPU: 0 PID: 2746 Comm: leaker Not tainted 4.18.0+ #19 + Hardware name: VMware, Inc. VMware Virtual Platform/440BX Desktop Reference Platform, BIOS 6.00 04/13/2018 + Call Trace: + dump_stack+0x63/0x88 + dump_header+0x67/0x27a + ? mem_cgroup_scan_tasks+0x91/0xf0 + oom_kill_process+0x210/0x410 + out_of_memory+0x10a/0x2c0 + mem_cgroup_out_of_memory+0x46/0x80 + mem_cgroup_oom_synchronize+0x2e4/0x310 + ? high_work_func+0x20/0x20 + pagefault_out_of_memory+0x31/0x76 + mm_fault_error+0x55/0x115 + ? handle_mm_fault+0xfd/0x220 + __do_page_fault+0x433/0x4e0 + do_page_fault+0x22/0x30 + ? page_fault+0x8/0x30 + page_fault+0x1e/0x30 + RIP: 0033:0x4009f0 + Code: 03 00 00 00 e8 71 fd ff ff 48 83 f8 ff 49 89 c6 74 74 48 89 c6 bf c0 0c 40 00 31 c0 e8 69 fd ff ff 45 85 ff 7e 21 31 c9 66 90 <41> 0f be 14 0e 01 d3 f7 c1 ff 0f 00 00 75 05 41 c6 04 0e 2a 48 83 + RSP: 002b:00007ffe29ae96f0 EFLAGS: 00010206 + RAX: 000000000000001b RBX: 0000000000000000 RCX: 0000000001ce1000 + RDX: 0000000000000000 RSI: 000000007fffffe5 RDI: 0000000000000000 + RBP: 000000000000000c R08: 0000000000000000 R09: 00007f94be09220d + R10: 0000000000000002 R11: 0000000000000246 R12: 00000000000186a0 + R13: 0000000000000003 R14: 00007f949d845000 R15: 0000000002800000 + Task in /leaker killed as a result of limit of /leaker + memory: usage 524288kB, limit 524288kB, failcnt 158965 + memory+swap: usage 0kB, limit 9007199254740988kB, failcnt 0 + kmem: usage 2016kB, limit 9007199254740988kB, failcnt 0 + Memory cgroup stats for /leaker: cache:844KB rss:521136KB rss_huge:0KB shmem:0KB mapped_file:0KB dirty:132KB writeback:0KB inactive_anon:0KB active_anon:521224KB inactive_file:1012KB active_file:8KB unevictable:0KB + Memory cgroup out of memory: Kill process 2746 (leaker) score 998 or sacrifice child + Killed process 2746 (leaker) total-vm:536704kB, anon-rss:521176kB, file-rss:1208kB, shmem-rss:0kB + 
oom_reaper: reaped process 2746 (leaker), now anon-rss:0kB, file-rss:0kB, shmem-rss:0kB + +[2] + + leaker invoked oom-killer: gfp_mask=0x600040(GFP_NOFS), nodemask=(null), order=0, oom_score_adj=0 + CPU: 1 PID: 2746 Comm: leaker Not tainted 4.18.0+ #20 + Hardware name: VMware, Inc. VMware Virtual Platform/440BX Desktop Reference Platform, BIOS 6.00 04/13/2018 + Call Trace: + dump_stack+0x63/0x88 + dump_header+0x67/0x27a + ? mem_cgroup_scan_tasks+0x91/0xf0 + oom_kill_process+0x210/0x410 + out_of_memory+0x109/0x2d0 + mem_cgroup_out_of_memory+0x46/0x80 + try_charge+0x58d/0x650 + ? __radix_tree_replace+0x81/0x100 + mem_cgroup_try_charge+0x7a/0x100 + __add_to_page_cache_locked+0x92/0x180 + add_to_page_cache_lru+0x4d/0xf0 + iomap_readpages_actor+0xde/0x1b0 + ? iomap_zero_range_actor+0x1d0/0x1d0 + iomap_apply+0xaf/0x130 + iomap_readpages+0x9f/0x150 + ? iomap_zero_range_actor+0x1d0/0x1d0 + xfs_vm_readpages+0x18/0x20 [xfs] + read_pages+0x60/0x140 + __do_page_cache_readahead+0x193/0x1b0 + ondemand_readahead+0x16d/0x2c0 + page_cache_async_readahead+0x9a/0xd0 + filemap_fault+0x403/0x620 + ? alloc_set_pte+0x12c/0x540 + ? _cond_resched+0x14/0x30 + __xfs_filemap_fault+0x66/0x180 [xfs] + xfs_filemap_fault+0x27/0x30 [xfs] + __do_fault+0x19/0x40 + __handle_mm_fault+0x8e8/0xb60 + handle_mm_fault+0xfd/0x220 + __do_page_fault+0x238/0x4e0 + do_page_fault+0x22/0x30 + ? 
page_fault+0x8/0x30 + page_fault+0x1e/0x30 + RIP: 0033:0x4009f0 + Code: 03 00 00 00 e8 71 fd ff ff 48 83 f8 ff 49 89 c6 74 74 48 89 c6 bf c0 0c 40 00 31 c0 e8 69 fd ff ff 45 85 ff 7e 21 31 c9 66 90 <41> 0f be 14 0e 01 d3 f7 c1 ff 0f 00 00 75 05 41 c6 04 0e 2a 48 83 + RSP: 002b:00007ffda45c9290 EFLAGS: 00010206 + RAX: 000000000000001b RBX: 0000000000000000 RCX: 0000000001a1e000 + RDX: 0000000000000000 RSI: 000000007fffffe5 RDI: 0000000000000000 + RBP: 000000000000000c R08: 0000000000000000 R09: 00007f6d061ff20d + R10: 0000000000000002 R11: 0000000000000246 R12: 00000000000186a0 + R13: 0000000000000003 R14: 00007f6ce59b2000 R15: 0000000002800000 + Task in /leaker killed as a result of limit of /leaker + memory: usage 524288kB, limit 524288kB, failcnt 7221 + memory+swap: usage 0kB, limit 9007199254740988kB, failcnt 0 + kmem: usage 1944kB, limit 9007199254740988kB, failcnt 0 + Memory cgroup stats for /leaker: cache:3632KB rss:518232KB rss_huge:0KB shmem:0KB mapped_file:0KB dirty:0KB writeback:0KB inactive_anon:0KB active_anon:518408KB inactive_file:3908KB active_file:12KB unevictable:0KB + Memory cgroup out of memory: Kill process 2746 (leaker) score 992 or sacrifice child + Killed process 2746 (leaker) total-vm:536704kB, anon-rss:518264kB, file-rss:1188kB, shmem-rss:0kB + oom_reaper: reaped process 2746 (leaker), now anon-rss:0kB, file-rss:0kB, shmem-rss:0kB + +[3] + + leaker invoked oom-killer: gfp_mask=0x50, order=0, oom_score_adj=0 + leaker cpuset=/ mems_allowed=0 + CPU: 1 PID: 3206 Comm: leaker Not tainted 3.10.0-957.27.2.el7.x86_64 #1 + Hardware name: VMware, Inc. VMware Virtual Platform/440BX Desktop Reference Platform, BIOS 6.00 04/13/2018 + Call Trace: + [] dump_stack+0x19/0x1b + [] dump_header+0x90/0x229 + [] ? find_lock_task_mm+0x56/0xc0 + [] ? try_get_mem_cgroup_from_mm+0x28/0x60 + [] oom_kill_process+0x254/0x3d0 + [] mem_cgroup_oom_synchronize+0x546/0x570 + [] ? 
mem_cgroup_charge_common+0xc0/0xc0 + [] pagefault_out_of_memory+0x14/0x90 + [] mm_fault_error+0x6a/0x157 + [] __do_page_fault+0x3c8/0x4f0 + [] do_page_fault+0x35/0x90 + [] page_fault+0x28/0x30 + Task in /leaker killed as a result of limit of /leaker + memory: usage 524288kB, limit 524288kB, failcnt 20628 + memory+swap: usage 524288kB, limit 9007199254740988kB, failcnt 0 + kmem: usage 0kB, limit 9007199254740988kB, failcnt 0 + Memory cgroup stats for /leaker: cache:840KB rss:523448KB rss_huge:0KB mapped_file:0KB swap:0KB inactive_anon:0KB active_anon:523448KB inactive_file:464KB active_file:376KB unevictable:0KB + Memory cgroup out of memory: Kill process 3206 (leaker) score 970 or sacrifice child + Killed process 3206 (leaker) total-vm:536692kB, anon-rss:523304kB, file-rss:412kB, shmem-rss:0kB + +Bisected by Masoud Sharbiani. + +Link: http://lkml.kernel.org/r/cbe54ed1-b6ba-a056-8899-2dc42526371d@i-love.sakura.ne.jp +Fixes: 3da88fb3bacfaa33 ("mm, oom: move GFP_NOFS check to out_of_memory") [necessary after 29ef680ae7c21110] +Signed-off-by: Tetsuo Handa +Reported-by: Masoud Sharbiani +Tested-by: Masoud Sharbiani +Acked-by: Michal Hocko +Cc: David Rientjes +Cc: [4.19+] +Signed-off-by: Andrew Morton +Signed-off-by: Linus Torvalds +Signed-off-by: Greg Kroah-Hartman + +--- + mm/oom_kill.c | 5 +++-- + 1 file changed, 3 insertions(+), 2 deletions(-) + +--- a/mm/oom_kill.c ++++ b/mm/oom_kill.c +@@ -1060,9 +1060,10 @@ bool out_of_memory(struct oom_control *o + * The OOM killer does not compensate for IO-less reclaim. + * pagefault_out_of_memory lost its gfp context so we have to + * make sure exclude 0 mask - all other users should have at least +- * ___GFP_DIRECT_RECLAIM to get here. ++ * ___GFP_DIRECT_RECLAIM to get here. But mem_cgroup_oom() has to ++ * invoke the OOM killer even if it is a GFP_NOFS allocation. 
+ */ +- if (oc->gfp_mask && !(oc->gfp_mask & __GFP_FS)) ++ if (oc->gfp_mask && !(oc->gfp_mask & __GFP_FS) && !is_memcg_oom(oc)) + return true; + + /* diff --git a/queue-5.2/mm-compaction.c-clear-total_-migrate-free-_scanned-before-scanning-a-new-zone.patch b/queue-5.2/mm-compaction.c-clear-total_-migrate-free-_scanned-before-scanning-a-new-zone.patch new file mode 100644 index 00000000000..d40636d4e58 --- /dev/null +++ b/queue-5.2/mm-compaction.c-clear-total_-migrate-free-_scanned-before-scanning-a-new-zone.patch @@ -0,0 +1,127 @@ +From a94b525241c0fff3598809131d7cfcfe1d572d8c Mon Sep 17 00:00:00 2001 +From: Yafang Shao +Date: Mon, 23 Sep 2019 15:36:54 -0700 +Subject: mm/compaction.c: clear total_{migrate,free}_scanned before scanning a new zone + +From: Yafang Shao + +commit a94b525241c0fff3598809131d7cfcfe1d572d8c upstream. + +total_{migrate,free}_scanned will be added to COMPACTMIGRATE_SCANNED and +COMPACTFREE_SCANNED in compact_zone(). We should clear them before +scanning a new zone. In the proc triggered compaction, we forgot clearing +them. 
+ +[laoar.shao@gmail.com: introduce a helper compact_zone_counters_init()] + Link: http://lkml.kernel.org/r/1563869295-25748-1-git-send-email-laoar.shao@gmail.com +[akpm@linux-foundation.org: expand compact_zone_counters_init() into its single callsite, per mhocko] +[vbabka@suse.cz: squash compact_zone() list_head init as well] + Link: http://lkml.kernel.org/r/1fb6f7da-f776-9e42-22f8-bbb79b030b98@suse.cz +[akpm@linux-foundation.org: kcompactd_do_work(): avoid unnecessary initialization of cc.zone] +Link: http://lkml.kernel.org/r/1563789275-9639-1-git-send-email-laoar.shao@gmail.com +Fixes: 7f354a548d1c ("mm, compaction: add vmstats for kcompactd work") +Signed-off-by: Yafang Shao +Signed-off-by: Vlastimil Babka +Reviewed-by: Vlastimil Babka +Cc: David Rientjes +Cc: Yafang Shao +Cc: Mel Gorman +Cc: Michal Hocko +Cc: +Signed-off-by: Andrew Morton +Signed-off-by: Linus Torvalds +Signed-off-by: Greg Kroah-Hartman + +--- + mm/compaction.c | 35 +++++++++++++---------------------- + 1 file changed, 13 insertions(+), 22 deletions(-) + +--- a/mm/compaction.c ++++ b/mm/compaction.c +@@ -2078,6 +2078,17 @@ compact_zone(struct compact_control *cc, + const bool sync = cc->mode != MIGRATE_ASYNC; + bool update_cached; + ++ /* ++ * These counters track activities during zone compaction. Initialize ++ * them before compacting a new zone. 
++ */ ++ cc->total_migrate_scanned = 0; ++ cc->total_free_scanned = 0; ++ cc->nr_migratepages = 0; ++ cc->nr_freepages = 0; ++ INIT_LIST_HEAD(&cc->freepages); ++ INIT_LIST_HEAD(&cc->migratepages); ++ + cc->migratetype = gfpflags_to_migratetype(cc->gfp_mask); + ret = compaction_suitable(cc->zone, cc->order, cc->alloc_flags, + cc->classzone_idx); +@@ -2281,10 +2292,6 @@ static enum compact_result compact_zone_ + { + enum compact_result ret; + struct compact_control cc = { +- .nr_freepages = 0, +- .nr_migratepages = 0, +- .total_migrate_scanned = 0, +- .total_free_scanned = 0, + .order = order, + .search_order = order, + .gfp_mask = gfp_mask, +@@ -2305,8 +2312,6 @@ static enum compact_result compact_zone_ + + if (capture) + current->capture_control = &capc; +- INIT_LIST_HEAD(&cc.freepages); +- INIT_LIST_HEAD(&cc.migratepages); + + ret = compact_zone(&cc, &capc); + +@@ -2408,8 +2413,6 @@ static void compact_node(int nid) + struct zone *zone; + struct compact_control cc = { + .order = -1, +- .total_migrate_scanned = 0, +- .total_free_scanned = 0, + .mode = MIGRATE_SYNC, + .ignore_skip_hint = true, + .whole_zone = true, +@@ -2423,11 +2426,7 @@ static void compact_node(int nid) + if (!populated_zone(zone)) + continue; + +- cc.nr_freepages = 0; +- cc.nr_migratepages = 0; + cc.zone = zone; +- INIT_LIST_HEAD(&cc.freepages); +- INIT_LIST_HEAD(&cc.migratepages); + + compact_zone(&cc, NULL); + +@@ -2529,8 +2528,6 @@ static void kcompactd_do_work(pg_data_t + struct compact_control cc = { + .order = pgdat->kcompactd_max_order, + .search_order = pgdat->kcompactd_max_order, +- .total_migrate_scanned = 0, +- .total_free_scanned = 0, + .classzone_idx = pgdat->kcompactd_classzone_idx, + .mode = MIGRATE_SYNC_LIGHT, + .ignore_skip_hint = false, +@@ -2554,16 +2551,10 @@ static void kcompactd_do_work(pg_data_t + COMPACT_CONTINUE) + continue; + +- cc.nr_freepages = 0; +- cc.nr_migratepages = 0; +- cc.total_migrate_scanned = 0; +- cc.total_free_scanned = 0; +- cc.zone = zone; +- 
INIT_LIST_HEAD(&cc.freepages); +- INIT_LIST_HEAD(&cc.migratepages); +- + if (kthread_should_stop()) + return; ++ ++ cc.zone = zone; + status = compact_zone(&cc, NULL); + + if (status == COMPACT_SUCCESS) { diff --git a/queue-5.2/mt76-round-up-length-on-mt76_wr_copy.patch b/queue-5.2/mt76-round-up-length-on-mt76_wr_copy.patch new file mode 100644 index 00000000000..c17d5fefa9b --- /dev/null +++ b/queue-5.2/mt76-round-up-length-on-mt76_wr_copy.patch @@ -0,0 +1,46 @@ +From 850e8f6fbd5d0003b0f1119d19a01c6fef1644e2 Mon Sep 17 00:00:00 2001 +From: Felix Fietkau +Date: Mon, 1 Jul 2019 13:15:07 +0200 +Subject: mt76: round up length on mt76_wr_copy + +From: Felix Fietkau + +commit 850e8f6fbd5d0003b0f1119d19a01c6fef1644e2 upstream. + +When beacon length is not a multiple of 4, the beacon could be sent with +the last 1-3 bytes corrupted. The skb data is guaranteed to have enough +room for reading beyond the end, because it is always followed by +skb_shared_info, so rounding up is safe. +All other callers of mt76_wr_copy have multiple-of-4 length already. 
+ +Cc: stable@vger.kernel.org +Signed-off-by: Felix Fietkau +Signed-off-by: Greg Kroah-Hartman + +--- + drivers/net/wireless/mediatek/mt76/mmio.c | 2 +- + drivers/net/wireless/mediatek/mt76/usb.c | 2 +- + 2 files changed, 2 insertions(+), 2 deletions(-) + +--- a/drivers/net/wireless/mediatek/mt76/mmio.c ++++ b/drivers/net/wireless/mediatek/mt76/mmio.c +@@ -43,7 +43,7 @@ static u32 mt76_mmio_rmw(struct mt76_dev + static void mt76_mmio_copy(struct mt76_dev *dev, u32 offset, const void *data, + int len) + { +- __iowrite32_copy(dev->mmio.regs + offset, data, len >> 2); ++ __iowrite32_copy(dev->mmio.regs + offset, data, DIV_ROUND_UP(len, 4)); + } + + static int mt76_mmio_wr_rp(struct mt76_dev *dev, u32 base, +--- a/drivers/net/wireless/mediatek/mt76/usb.c ++++ b/drivers/net/wireless/mediatek/mt76/usb.c +@@ -164,7 +164,7 @@ static void mt76u_copy(struct mt76_dev * + int i, ret; + + mutex_lock(&usb->usb_ctrl_mtx); +- for (i = 0; i < (len / 4); i++) { ++ for (i = 0; i < DIV_ROUND_UP(len, 4); i++) { + put_unaligned_le32(val[i], usb->data); + ret = __mt76u_vendor_request(dev, MT_VEND_MULTI_WRITE, + USB_DIR_OUT | USB_TYPE_VENDOR, diff --git a/queue-5.2/ovl-filter-of-trusted-xattr-results-in-audit.patch b/queue-5.2/ovl-filter-of-trusted-xattr-results-in-audit.patch new file mode 100644 index 00000000000..6217951580f --- /dev/null +++ b/queue-5.2/ovl-filter-of-trusted-xattr-results-in-audit.patch @@ -0,0 +1,41 @@ +From 5c2e9f346b815841f9bed6029ebcb06415caf640 Mon Sep 17 00:00:00 2001 +From: Mark Salyzyn +Date: Thu, 29 Aug 2019 11:30:14 -0700 +Subject: ovl: filter of trusted xattr results in audit + +From: Mark Salyzyn + +commit 5c2e9f346b815841f9bed6029ebcb06415caf640 upstream. + +When filtering xattr list for reading, presence of trusted xattr +results in a security audit log. However, if there is other content +no errno will be set, and if there isn't, the errno will be -ENODATA +and not -EPERM as is usually associated with a lack of capability. 
+The check does not block the request to list the xattrs present. + +Switch to ns_capable_noaudit to reflect a more appropriate check. + +Signed-off-by: Mark Salyzyn +Cc: linux-security-module@vger.kernel.org +Cc: kernel-team@android.com +Cc: stable@vger.kernel.org # v3.18+ +Fixes: a082c6f680da ("ovl: filter trusted xattr for non-admin") +Signed-off-by: Miklos Szeredi +Signed-off-by: Greg Kroah-Hartman + +--- + fs/overlayfs/inode.c | 3 ++- + 1 file changed, 2 insertions(+), 1 deletion(-) + +--- a/fs/overlayfs/inode.c ++++ b/fs/overlayfs/inode.c +@@ -383,7 +383,8 @@ static bool ovl_can_list(const char *s) + return true; + + /* Never list trusted.overlay, list other trusted for superuser only */ +- return !ovl_is_private_xattr(s) && capable(CAP_SYS_ADMIN); ++ return !ovl_is_private_xattr(s) && ++ ns_capable_noaudit(&init_user_ns, CAP_SYS_ADMIN); + } + + ssize_t ovl_listxattr(struct dentry *dentry, char *list, size_t size) diff --git a/queue-5.2/ovl-fix-dereferencing-possible-err_ptr.patch b/queue-5.2/ovl-fix-dereferencing-possible-err_ptr.patch new file mode 100644 index 00000000000..665e203a00b --- /dev/null +++ b/queue-5.2/ovl-fix-dereferencing-possible-err_ptr.patch @@ -0,0 +1,35 @@ +From 97f024b9171e74c4443bbe8a8dce31b917f97ac5 Mon Sep 17 00:00:00 2001 +From: Ding Xiang +Date: Mon, 9 Sep 2019 16:29:56 +0800 +Subject: ovl: Fix dereferencing possible ERR_PTR() + +From: Ding Xiang + +commit 97f024b9171e74c4443bbe8a8dce31b917f97ac5 upstream. + +if ovl_encode_real_fh() fails, no memory was allocated +and the error in the error-valued pointer should be returned. 
+ +Fixes: 9b6faee07470 ("ovl: check ERR_PTR() return value from ovl_encode_fh()") +Signed-off-by: Ding Xiang +Cc: # v4.16+ +Signed-off-by: Miklos Szeredi +Signed-off-by: Greg Kroah-Hartman + +--- + fs/overlayfs/export.c | 3 +-- + 1 file changed, 1 insertion(+), 2 deletions(-) + +--- a/fs/overlayfs/export.c ++++ b/fs/overlayfs/export.c +@@ -227,9 +227,8 @@ static int ovl_d_to_fh(struct dentry *de + /* Encode an upper or lower file handle */ + fh = ovl_encode_real_fh(enc_lower ? ovl_dentry_lower(dentry) : + ovl_dentry_upper(dentry), !enc_lower); +- err = PTR_ERR(fh); + if (IS_ERR(fh)) +- goto fail; ++ return PTR_ERR(fh); + + err = -EOVERFLOW; + if (fh->len > buflen) diff --git a/queue-5.2/rtw88-pci-rearrange-the-memory-usage-for-skb-in-rx-isr.patch b/queue-5.2/rtw88-pci-rearrange-the-memory-usage-for-skb-in-rx-isr.patch new file mode 100644 index 00000000000..b8a3aa2eea2 --- /dev/null +++ b/queue-5.2/rtw88-pci-rearrange-the-memory-usage-for-skb-in-rx-isr.patch @@ -0,0 +1,124 @@ +From ee6db78f5db9bfe426c57a1ec9713827ebccd2d4 Mon Sep 17 00:00:00 2001 +From: Jian-Hong Pan +Date: Thu, 11 Jul 2019 13:24:26 +0800 +Subject: rtw88: pci: Rearrange the memory usage for skb in RX ISR + +From: Jian-Hong Pan + +commit ee6db78f5db9bfe426c57a1ec9713827ebccd2d4 upstream. + +Testing with RTL8822BE hardware, when available memory is low, we +frequently see a kernel panic and system freeze. 
+ +First, rtw_pci_rx_isr encounters a memory allocation failure (trimmed): + +rx routine starvation +WARNING: CPU: 7 PID: 9871 at drivers/net/wireless/realtek/rtw88/pci.c:822 rtw_pci_rx_isr.constprop.25+0x35a/0x370 [rtwpci] +[ 2356.580313] RIP: 0010:rtw_pci_rx_isr.constprop.25+0x35a/0x370 [rtwpci] + +Then we see a variety of different error conditions and kernel panics, +such as this one (trimmed): + +rtw_pci 0000:02:00.0: pci bus timeout, check dma status +skbuff: skb_over_panic: text:00000000091b6e66 len:415 put:415 head:00000000d2880c6f data:000000007a02b1ea tail:0x1df end:0xc0 dev: +------------[ cut here ]------------ +kernel BUG at net/core/skbuff.c:105! +invalid opcode: 0000 [#1] SMP NOPTI +RIP: 0010:skb_panic+0x43/0x45 + +When skb allocation fails and the "rx routine starvation" is hit, the +function returns immediately without updating the RX ring. At this +point, the RX ring may continue referencing an old skb which was already +handed off to ieee80211_rx_irqsafe(). When it comes to be used again, +bad things happen. + +This patch allocates a new, data-sized skb first in RX ISR. After +copying the data in, we pass it to the upper layers. However, if skb +allocation fails, we effectively drop the frame. In both cases, the +original, full size ring skb is reused. + +In addition, to fixing the kernel crash, the RX routine should now +generally behave better under low memory conditions. 
+ +Buglink: https://bugzilla.kernel.org/show_bug.cgi?id=204053 +Signed-off-by: Jian-Hong Pan +Cc: +Signed-off-by: Kalle Valo +Signed-off-by: Greg Kroah-Hartman + +--- + drivers/net/wireless/realtek/rtw88/pci.c | 49 +++++++++++++------------------ + 1 file changed, 22 insertions(+), 27 deletions(-) + +--- a/drivers/net/wireless/realtek/rtw88/pci.c ++++ b/drivers/net/wireless/realtek/rtw88/pci.c +@@ -763,6 +763,7 @@ static void rtw_pci_rx_isr(struct rtw_de + u32 pkt_offset; + u32 pkt_desc_sz = chip->rx_pkt_desc_sz; + u32 buf_desc_sz = chip->rx_buf_desc_sz; ++ u32 new_len; + u8 *rx_desc; + dma_addr_t dma; + +@@ -790,40 +791,34 @@ static void rtw_pci_rx_isr(struct rtw_de + pkt_offset = pkt_desc_sz + pkt_stat.drv_info_sz + + pkt_stat.shift; + +- if (pkt_stat.is_c2h) { +- /* keep rx_desc, halmac needs it */ +- skb_put(skb, pkt_stat.pkt_len + pkt_offset); ++ /* allocate a new skb for this frame, ++ * discard the frame if none available ++ */ ++ new_len = pkt_stat.pkt_len + pkt_offset; ++ new = dev_alloc_skb(new_len); ++ if (WARN_ONCE(!new, "rx routine starvation\n")) ++ goto next_rp; ++ ++ /* put the DMA data including rx_desc from phy to new skb */ ++ skb_put_data(new, skb->data, new_len); + +- /* pass offset for further operation */ +- *((u32 *)skb->cb) = pkt_offset; +- skb_queue_tail(&rtwdev->c2h_queue, skb); ++ if (pkt_stat.is_c2h) { ++ /* pass rx_desc & offset for further operation */ ++ *((u32 *)new->cb) = pkt_offset; ++ skb_queue_tail(&rtwdev->c2h_queue, new); + ieee80211_queue_work(rtwdev->hw, &rtwdev->c2h_work); + } else { +- /* remove rx_desc, maybe use skb_pull? 
*/ +- skb_put(skb, pkt_stat.pkt_len); +- skb_reserve(skb, pkt_offset); +- +- /* alloc a smaller skb to mac80211 */ +- new = dev_alloc_skb(pkt_stat.pkt_len); +- if (!new) { +- new = skb; +- } else { +- skb_put_data(new, skb->data, skb->len); +- dev_kfree_skb_any(skb); +- } +- /* TODO: merge into rx.c */ +- rtw_rx_stats(rtwdev, pkt_stat.vif, skb); ++ /* remove rx_desc */ ++ skb_pull(new, pkt_offset); ++ ++ rtw_rx_stats(rtwdev, pkt_stat.vif, new); + memcpy(new->cb, &rx_status, sizeof(rx_status)); + ieee80211_rx_irqsafe(rtwdev->hw, new); + } + +- /* skb delivered to mac80211, alloc a new one in rx ring */ +- new = dev_alloc_skb(RTK_PCI_RX_BUF_SIZE); +- if (WARN(!new, "rx routine starvation\n")) +- return; +- +- ring->buf[cur_rp] = new; +- rtw_pci_reset_rx_desc(rtwdev, new, ring, cur_rp, buf_desc_sz); ++next_rp: ++ /* new skb delivered to mac80211, re-enable original skb DMA */ ++ rtw_pci_reset_rx_desc(rtwdev, skb, ring, cur_rp, buf_desc_sz); + + /* host read next element in ring */ + if (++cur_rp >= ring->r.len) diff --git a/queue-5.2/rtw88-pci-use-dma-sync-instead-of-remapping-in-rx-isr.patch b/queue-5.2/rtw88-pci-use-dma-sync-instead-of-remapping-in-rx-isr.patch new file mode 100644 index 00000000000..498152a6399 --- /dev/null +++ b/queue-5.2/rtw88-pci-use-dma-sync-instead-of-remapping-in-rx-isr.patch @@ -0,0 +1,68 @@ +From 29b68a920f6abb7b5ba21ab4b779f62d536bac9b Mon Sep 17 00:00:00 2001 +From: Jian-Hong Pan +Date: Thu, 11 Jul 2019 13:24:27 +0800 +Subject: rtw88: pci: Use DMA sync instead of remapping in RX ISR + +From: Jian-Hong Pan + +commit 29b68a920f6abb7b5ba21ab4b779f62d536bac9b upstream. + +Since each skb in RX ring is reused instead of new allocation, we can +treat the DMA in a more efficient way by DMA synchronization. 
+ +Signed-off-by: Jian-Hong Pan +Cc: +Signed-off-by: Kalle Valo +Signed-off-by: Greg Kroah-Hartman + +--- + drivers/net/wireless/realtek/rtw88/pci.c | 24 +++++++++++++++++++++--- + 1 file changed, 21 insertions(+), 3 deletions(-) + +--- a/drivers/net/wireless/realtek/rtw88/pci.c ++++ b/drivers/net/wireless/realtek/rtw88/pci.c +@@ -206,6 +206,23 @@ static int rtw_pci_reset_rx_desc(struct + return 0; + } + ++static void rtw_pci_sync_rx_desc_device(struct rtw_dev *rtwdev, dma_addr_t dma, ++ struct rtw_pci_rx_ring *rx_ring, ++ u32 idx, u32 desc_sz) ++{ ++ struct device *dev = rtwdev->dev; ++ struct rtw_pci_rx_buffer_desc *buf_desc; ++ int buf_sz = RTK_PCI_RX_BUF_SIZE; ++ ++ dma_sync_single_for_device(dev, dma, buf_sz, DMA_FROM_DEVICE); ++ ++ buf_desc = (struct rtw_pci_rx_buffer_desc *)(rx_ring->r.head + ++ idx * desc_sz); ++ memset(buf_desc, 0, sizeof(*buf_desc)); ++ buf_desc->buf_size = cpu_to_le16(RTK_PCI_RX_BUF_SIZE); ++ buf_desc->dma = cpu_to_le32(dma); ++} ++ + static int rtw_pci_init_rx_ring(struct rtw_dev *rtwdev, + struct rtw_pci_rx_ring *rx_ring, + u8 desc_size, u32 len) +@@ -782,8 +799,8 @@ static void rtw_pci_rx_isr(struct rtw_de + rtw_pci_dma_check(rtwdev, ring, cur_rp); + skb = ring->buf[cur_rp]; + dma = *((dma_addr_t *)skb->cb); +- pci_unmap_single(rtwpci->pdev, dma, RTK_PCI_RX_BUF_SIZE, +- PCI_DMA_FROMDEVICE); ++ dma_sync_single_for_cpu(rtwdev->dev, dma, RTK_PCI_RX_BUF_SIZE, ++ DMA_FROM_DEVICE); + rx_desc = skb->data; + chip->ops->query_rx_desc(rtwdev, rx_desc, &pkt_stat, &rx_status); + +@@ -818,7 +835,8 @@ static void rtw_pci_rx_isr(struct rtw_de + + next_rp: + /* new skb delivered to mac80211, re-enable original skb DMA */ +- rtw_pci_reset_rx_desc(rtwdev, skb, ring, cur_rp, buf_desc_sz); ++ rtw_pci_sync_rx_desc_device(rtwdev, dma, ring, cur_rp, ++ buf_desc_sz); + + /* host read next element in ring */ + if (++cur_rp >= ring->r.len) diff --git a/queue-5.2/series b/queue-5.2/series index 8c0209fe3eb..d3e6dad9d35 100644 --- a/queue-5.2/series +++ 
b/queue-5.2/series @@ -260,3 +260,30 @@ iommu-arm-smmu-v3-disable-detection-of-ats-and-pri.patch alarmtimer-use-eopnotsupp-instead-of-enotsupp.patch iommu-vt-d-fix-wrong-analysis-whether-devices-share-the-same-bus.patch regulator-defer-init-completion-for-a-while-after-late_initcall.patch +efifb-bgrt-improve-efifb_bgrt_sanity_check.patch +gfs2-clear-buf_in_tr-when-ending-a-transaction-in-sweep_bh_for_rgrps.patch +z3fold-fix-retry-mechanism-in-page-reclaim.patch +z3fold-fix-memory-leak-in-kmem-cache.patch +mm-compaction.c-clear-total_-migrate-free-_scanned-before-scanning-a-new-zone.patch +memcg-oom-don-t-require-__gfp_fs-when-invoking-memcg-oom-killer.patch +memcg-kmem-do-not-fail-__gfp_nofail-charges.patch +lib-lzo-lzo1x_compress.c-fix-alignment-bug-in-lzo-rle.patch +mt76-round-up-length-on-mt76_wr_copy.patch +keys-trusted-correctly-initialize-digests-and-fix-locking-issue.patch +rtw88-pci-rearrange-the-memory-usage-for-skb-in-rx-isr.patch +rtw88-pci-use-dma-sync-instead-of-remapping-in-rx-isr.patch +ath10k-fix-channel-info-parsing-for-non-tlv-target.patch +i40e-check-__i40e_vf_disable-bit-in-i40e_sync_filters_subtask.patch +block-mq-deadline-fix-queue-restart-handling.patch +block-fix-null-pointer-dereference-in-blk_mq_rq_timed_out.patch +smb3-allow-disabling-requesting-leases.patch +smb3-fix-leak-in-open-on-server-perf-counter.patch +ovl-fix-dereferencing-possible-err_ptr.patch +ovl-filter-of-trusted-xattr-results-in-audit.patch +btrfs-fix-allocation-of-free-space-cache-v1-bitmap-pages.patch +btrfs-fix-use-after-free-when-using-the-tree-modification-log.patch +btrfs-relinquish-cpus-in-btrfs_compare_trees.patch +btrfs-adjust-dirty_metadata_bytes-after-writeback-failure-of-extent-buffer.patch +btrfs-qgroup-fix-the-wrong-target-io_tree-when-freeing-reserved-data-space.patch +btrfs-qgroup-fix-reserved-data-space-leak-if-we-have-multiple-reserve-calls.patch +btrfs-fix-race-setting-up-and-completing-qgroup-rescan-workers.patch diff --git 
a/queue-5.2/smb3-allow-disabling-requesting-leases.patch b/queue-5.2/smb3-allow-disabling-requesting-leases.patch new file mode 100644 index 00000000000..afd11bdc2f7 --- /dev/null +++ b/queue-5.2/smb3-allow-disabling-requesting-leases.patch @@ -0,0 +1,118 @@ +From 3e7a02d47872081f4b6234a9f72500f1d10f060c Mon Sep 17 00:00:00 2001 +From: Steve French +Date: Wed, 11 Sep 2019 21:46:20 -0500 +Subject: smb3: allow disabling requesting leases + +From: Steve French + +commit 3e7a02d47872081f4b6234a9f72500f1d10f060c upstream. + +In some cases to work around server bugs or performance +problems it can be helpful to be able to disable requesting +SMB2.1/SMB3 leases on a particular mount (not to all servers +and all shares we are mounted to). Add new mount parm +"nolease" which turns off requesting leases on directory +or file opens. Currently the only way to disable leases is +globally through a module load parameter. This is more +granular. + +Suggested-by: Pavel Shilovsky +Signed-off-by: Steve French +Reviewed-by: Ronnie Sahlberg +Reviewed-by: Pavel Shilovsky +CC: Stable +Signed-off-by: Greg Kroah-Hartman + +--- + fs/cifs/cifsfs.c | 2 ++ + fs/cifs/cifsglob.h | 2 ++ + fs/cifs/connect.c | 9 ++++++++- + fs/cifs/smb2pdu.c | 2 +- + 4 files changed, 13 insertions(+), 2 deletions(-) + +--- a/fs/cifs/cifsfs.c ++++ b/fs/cifs/cifsfs.c +@@ -433,6 +433,8 @@ cifs_show_options(struct seq_file *s, st + cifs_show_security(s, tcon->ses); + cifs_show_cache_flavor(s, cifs_sb); + ++ if (tcon->no_lease) ++ seq_puts(s, ",nolease"); + if (cifs_sb->mnt_cifs_flags & CIFS_MOUNT_MULTIUSER) + seq_puts(s, ",multiuser"); + else if (tcon->ses->user_name) +--- a/fs/cifs/cifsglob.h ++++ b/fs/cifs/cifsglob.h +@@ -575,6 +575,7 @@ struct smb_vol { + bool noblocksnd:1; + bool noautotune:1; + bool nostrictsync:1; /* do not force expensive SMBflush on every sync */ ++ bool no_lease:1; /* disable requesting leases */ + bool fsc:1; /* enable fscache */ + bool mfsymlinks:1; /* use Minshall+French Symlinks */ + bool 
multiuser:1; +@@ -1079,6 +1080,7 @@ struct cifs_tcon { + bool need_reopen_files:1; /* need to reopen tcon file handles */ + bool use_resilient:1; /* use resilient instead of durable handles */ + bool use_persistent:1; /* use persistent instead of durable handles */ ++ bool no_lease:1; /* Do not request leases on files or directories */ + __le32 capabilities; + __u32 share_flags; + __u32 maximal_access; +--- a/fs/cifs/connect.c ++++ b/fs/cifs/connect.c +@@ -74,7 +74,7 @@ enum { + Opt_user_xattr, Opt_nouser_xattr, + Opt_forceuid, Opt_noforceuid, + Opt_forcegid, Opt_noforcegid, +- Opt_noblocksend, Opt_noautotune, ++ Opt_noblocksend, Opt_noautotune, Opt_nolease, + Opt_hard, Opt_soft, Opt_perm, Opt_noperm, + Opt_mapposix, Opt_nomapposix, + Opt_mapchars, Opt_nomapchars, Opt_sfu, +@@ -133,6 +133,7 @@ static const match_table_t cifs_mount_op + { Opt_noforcegid, "noforcegid" }, + { Opt_noblocksend, "noblocksend" }, + { Opt_noautotune, "noautotune" }, ++ { Opt_nolease, "nolease" }, + { Opt_hard, "hard" }, + { Opt_soft, "soft" }, + { Opt_perm, "perm" }, +@@ -1709,6 +1710,9 @@ cifs_parse_mount_options(const char *mou + case Opt_noautotune: + vol->noautotune = 1; + break; ++ case Opt_nolease: ++ vol->no_lease = 1; ++ break; + case Opt_hard: + vol->retry = 1; + break; +@@ -3230,6 +3234,8 @@ static int match_tcon(struct cifs_tcon * + return 0; + if (tcon->handle_timeout != volume_info->handle_timeout) + return 0; ++ if (tcon->no_lease != volume_info->no_lease) ++ return 0; + return 1; + } + +@@ -3444,6 +3450,7 @@ cifs_get_tcon(struct cifs_ses *ses, stru + tcon->nocase = volume_info->nocase; + tcon->nohandlecache = volume_info->nohandlecache; + tcon->local_lease = volume_info->local_lease; ++ tcon->no_lease = volume_info->no_lease; + INIT_LIST_HEAD(&tcon->pending_opens); + + spin_lock(&cifs_tcp_ses_lock); +--- a/fs/cifs/smb2pdu.c ++++ b/fs/cifs/smb2pdu.c +@@ -2370,7 +2370,7 @@ SMB2_open_init(struct cifs_tcon *tcon, s + iov[1].iov_len = uni_path_len; + iov[1].iov_base = path; + +- 
if (!server->oplocks) ++ if ((!server->oplocks) || (tcon->no_lease)) + *oplock = SMB2_OPLOCK_LEVEL_NONE; + + if (!(server->capabilities & SMB2_GLOBAL_CAP_LEASING) || diff --git a/queue-5.2/smb3-fix-leak-in-open-on-server-perf-counter.patch b/queue-5.2/smb3-fix-leak-in-open-on-server-perf-counter.patch new file mode 100644 index 00000000000..fe53706c23c --- /dev/null +++ b/queue-5.2/smb3-fix-leak-in-open-on-server-perf-counter.patch @@ -0,0 +1,61 @@ +From d2f15428d6a0ebfc0edc364094d7c4a2de7037ed Mon Sep 17 00:00:00 2001 +From: Steve French +Date: Sun, 22 Sep 2019 00:55:46 -0500 +Subject: smb3: fix leak in "open on server" perf counter + +From: Steve French + +commit d2f15428d6a0ebfc0edc364094d7c4a2de7037ed upstream. + +We were not bumping up the "open on server" (num_remote_opens) +counter (in some cases) on opens of the share root so +could end up showing as a negative value. + +CC: Stable +Signed-off-by: Steve French +Reviewed-by: Pavel Shilovsky +Signed-off-by: Greg Kroah-Hartman + +--- + fs/cifs/smb2ops.c | 5 +++++ + fs/cifs/smb2pdu.c | 1 + + 2 files changed, 6 insertions(+) + +--- a/fs/cifs/smb2ops.c ++++ b/fs/cifs/smb2ops.c +@@ -743,6 +743,8 @@ int open_shroot(unsigned int xid, struct + if (rc) + goto oshr_exit; + ++ atomic_inc(&tcon->num_remote_opens); ++ + o_rsp = (struct smb2_create_rsp *)rsp_iov[0].iov_base; + oparms.fid->persistent_fid = o_rsp->PersistentFileId; + oparms.fid->volatile_fid = o_rsp->VolatileFileId; +@@ -1167,6 +1169,7 @@ smb2_set_ea(const unsigned int xid, stru + + rc = compound_send_recv(xid, ses, flags, 3, rqst, + resp_buftype, rsp_iov); ++ /* no need to bump num_remote_opens because handle immediately closed */ + + sea_exit: + kfree(ea); +@@ -1488,6 +1491,8 @@ smb2_ioctl_query_info(const unsigned int + resp_buftype, rsp_iov); + if (rc) + goto iqinf_exit; ++ ++ /* No need to bump num_remote_opens since handle immediately closed */ + if (qi.flags & PASSTHRU_FSCTL) { + pqi = (struct smb_query_info __user *)arg; + io_rsp = (struct 
smb2_ioctl_rsp *)rsp_iov[1].iov_base; +--- a/fs/cifs/smb2pdu.c ++++ b/fs/cifs/smb2pdu.c +@@ -2263,6 +2263,7 @@ int smb311_posix_mkdir(const unsigned in + rqst.rq_iov = iov; + rqst.rq_nvec = n_iov; + ++ /* no need to inc num_remote_opens because we close it just below */ + trace_smb3_posix_mkdir_enter(xid, tcon->tid, ses->Suid, CREATE_NOT_FILE, + FILE_WRITE_ATTRIBUTES); + /* resource #4: response buffer */ diff --git a/queue-5.2/z3fold-fix-memory-leak-in-kmem-cache.patch b/queue-5.2/z3fold-fix-memory-leak-in-kmem-cache.patch new file mode 100644 index 00000000000..d1fc1ed3208 --- /dev/null +++ b/queue-5.2/z3fold-fix-memory-leak-in-kmem-cache.patch @@ -0,0 +1,72 @@ +From 63398413c00c7836ea87a1fa205c91d2199b25cf Mon Sep 17 00:00:00 2001 +From: Vitaly Wool +Date: Mon, 23 Sep 2019 15:36:51 -0700 +Subject: z3fold: fix memory leak in kmem cache + +From: Vitaly Wool + +commit 63398413c00c7836ea87a1fa205c91d2199b25cf upstream. + +Currently there is a leak in init_z3fold_page() -- it allocates handles +from kmem cache even for headless pages, but then they are never used and +never freed, so eventually kmem cache may get exhausted. This patch +provides a fix for that. 
+ +Link: http://lkml.kernel.org/r/20190917185352.44cf285d3ebd9e64548de5de@gmail.com +Signed-off-by: Vitaly Wool +Reported-by: Markus Linnala +Tested-by: Markus Linnala +Cc: Dan Streetman +Cc: Henry Burns +Cc: Shakeel Butt +Cc: Vlastimil Babka +Cc: +Signed-off-by: Andrew Morton +Signed-off-by: Linus Torvalds +Signed-off-by: Greg Kroah-Hartman + +--- + mm/z3fold.c | 15 +++++++++------ + 1 file changed, 9 insertions(+), 6 deletions(-) + +--- a/mm/z3fold.c ++++ b/mm/z3fold.c +@@ -297,14 +297,11 @@ static void z3fold_unregister_migration( + } + + /* Initializes the z3fold header of a newly allocated z3fold page */ +-static struct z3fold_header *init_z3fold_page(struct page *page, ++static struct z3fold_header *init_z3fold_page(struct page *page, bool headless, + struct z3fold_pool *pool, gfp_t gfp) + { + struct z3fold_header *zhdr = page_address(page); +- struct z3fold_buddy_slots *slots = alloc_slots(pool, gfp); +- +- if (!slots) +- return NULL; ++ struct z3fold_buddy_slots *slots; + + INIT_LIST_HEAD(&page->lru); + clear_bit(PAGE_HEADLESS, &page->private); +@@ -312,6 +309,12 @@ static struct z3fold_header *init_z3fold + clear_bit(NEEDS_COMPACTING, &page->private); + clear_bit(PAGE_STALE, &page->private); + clear_bit(PAGE_CLAIMED, &page->private); ++ if (headless) ++ return zhdr; ++ ++ slots = alloc_slots(pool, gfp); ++ if (!slots) ++ return NULL; + + spin_lock_init(&zhdr->page_lock); + kref_init(&zhdr->refcount); +@@ -932,7 +935,7 @@ retry: + if (!page) + return -ENOMEM; + +- zhdr = init_z3fold_page(page, pool, gfp); ++ zhdr = init_z3fold_page(page, bud == HEADLESS, pool, gfp); + if (!zhdr) { + __free_page(page); + return -ENOMEM; diff --git a/queue-5.2/z3fold-fix-retry-mechanism-in-page-reclaim.patch b/queue-5.2/z3fold-fix-retry-mechanism-in-page-reclaim.patch new file mode 100644 index 00000000000..ae1a1bdbc26 --- /dev/null +++ b/queue-5.2/z3fold-fix-retry-mechanism-in-page-reclaim.patch @@ -0,0 +1,176 @@ +From 3f9d2b5766aea06042630ac60b7316fd0cebf06f Mon Sep 17 
00:00:00 2001 +From: Vitaly Wool +Date: Mon, 23 Sep 2019 15:33:02 -0700 +Subject: z3fold: fix retry mechanism in page reclaim + +From: Vitaly Wool + +commit 3f9d2b5766aea06042630ac60b7316fd0cebf06f upstream. + +z3fold_page_reclaim()'s retry mechanism is broken: on a second iteration +it will have zhdr from the first one so that zhdr is no longer in line +with struct page. That leads to crashes when the system is stressed. + +Fix that by moving zhdr assignment up. + +While at it, protect against using already freed handles by using own +local slots structure in z3fold_page_reclaim(). + +Link: http://lkml.kernel.org/r/20190908162919.830388dc7404d1e2c80f4095@gmail.com +Signed-off-by: Vitaly Wool +Reported-by: Markus Linnala +Reported-by: Chris Murphy +Reported-by: Agustin Dall'Alba +Cc: "Maciej S. Szmigiero" +Cc: Shakeel Butt +Cc: Henry Burns +Cc: +Signed-off-by: Andrew Morton +Signed-off-by: Linus Torvalds +Signed-off-by: Greg Kroah-Hartman + +--- + mm/z3fold.c | 49 ++++++++++++++++++++++++++++++++++--------------- + 1 file changed, 34 insertions(+), 15 deletions(-) + +--- a/mm/z3fold.c ++++ b/mm/z3fold.c +@@ -368,9 +368,10 @@ static inline int __idx(struct z3fold_he + * Encodes the handle of a particular buddy within a z3fold page + * Pool lock should be held as this function accesses first_num + */ +-static unsigned long encode_handle(struct z3fold_header *zhdr, enum buddy bud) ++static unsigned long __encode_handle(struct z3fold_header *zhdr, ++ struct z3fold_buddy_slots *slots, ++ enum buddy bud) + { +- struct z3fold_buddy_slots *slots; + unsigned long h = (unsigned long)zhdr; + int idx = 0; + +@@ -387,11 +388,15 @@ static unsigned long encode_handle(struc + if (bud == LAST) + h |= (zhdr->last_chunks << BUDDY_SHIFT); + +- slots = zhdr->slots; + slots->slot[idx] = h; + return (unsigned long)&slots->slot[idx]; + } + ++static unsigned long encode_handle(struct z3fold_header *zhdr, enum buddy bud) ++{ ++ return __encode_handle(zhdr, zhdr->slots, bud); ++} ++ + /* 
Returns the z3fold page where a given handle is stored */ + static inline struct z3fold_header *handle_to_z3fold_header(unsigned long h) + { +@@ -626,6 +631,7 @@ static void do_compact_page(struct z3fol + } + + if (unlikely(PageIsolated(page) || ++ test_bit(PAGE_CLAIMED, &page->private) || + test_bit(PAGE_STALE, &page->private))) { + z3fold_page_unlock(zhdr); + return; +@@ -1102,6 +1108,7 @@ static int z3fold_reclaim_page(struct z3 + struct z3fold_header *zhdr = NULL; + struct page *page = NULL; + struct list_head *pos; ++ struct z3fold_buddy_slots slots; + unsigned long first_handle = 0, middle_handle = 0, last_handle = 0; + + spin_lock(&pool->lock); +@@ -1120,16 +1127,22 @@ static int z3fold_reclaim_page(struct z3 + /* this bit could have been set by free, in which case + * we pass over to the next page in the pool. + */ +- if (test_and_set_bit(PAGE_CLAIMED, &page->private)) ++ if (test_and_set_bit(PAGE_CLAIMED, &page->private)) { ++ page = NULL; + continue; ++ } + +- if (unlikely(PageIsolated(page))) ++ if (unlikely(PageIsolated(page))) { ++ clear_bit(PAGE_CLAIMED, &page->private); ++ page = NULL; + continue; ++ } ++ zhdr = page_address(page); + if (test_bit(PAGE_HEADLESS, &page->private)) + break; + +- zhdr = page_address(page); + if (!z3fold_page_trylock(zhdr)) { ++ clear_bit(PAGE_CLAIMED, &page->private); + zhdr = NULL; + continue; /* can't evict at this point */ + } +@@ -1147,26 +1160,30 @@ static int z3fold_reclaim_page(struct z3 + + if (!test_bit(PAGE_HEADLESS, &page->private)) { + /* +- * We need encode the handles before unlocking, since +- * we can race with free that will set +- * (first|last)_chunks to 0 ++ * We need encode the handles before unlocking, and ++ * use our local slots structure because z3fold_free ++ * can zero out zhdr->slots and we can't do much ++ * about that + */ + first_handle = 0; + last_handle = 0; + middle_handle = 0; + if (zhdr->first_chunks) +- first_handle = encode_handle(zhdr, FIRST); ++ first_handle = __encode_handle(zhdr, 
&slots, ++ FIRST); + if (zhdr->middle_chunks) +- middle_handle = encode_handle(zhdr, MIDDLE); ++ middle_handle = __encode_handle(zhdr, &slots, ++ MIDDLE); + if (zhdr->last_chunks) +- last_handle = encode_handle(zhdr, LAST); ++ last_handle = __encode_handle(zhdr, &slots, ++ LAST); + /* + * it's safe to unlock here because we hold a + * reference to this page + */ + z3fold_page_unlock(zhdr); + } else { +- first_handle = encode_handle(zhdr, HEADLESS); ++ first_handle = __encode_handle(zhdr, &slots, HEADLESS); + last_handle = middle_handle = 0; + } + +@@ -1196,9 +1213,9 @@ next: + spin_lock(&pool->lock); + list_add(&page->lru, &pool->lru); + spin_unlock(&pool->lock); ++ clear_bit(PAGE_CLAIMED, &page->private); + } else { + z3fold_page_lock(zhdr); +- clear_bit(PAGE_CLAIMED, &page->private); + if (kref_put(&zhdr->refcount, + release_z3fold_page_locked)) { + atomic64_dec(&pool->pages_nr); +@@ -1213,6 +1230,7 @@ next: + list_add(&page->lru, &pool->lru); + spin_unlock(&pool->lock); + z3fold_page_unlock(zhdr); ++ clear_bit(PAGE_CLAIMED, &page->private); + } + + /* We started off locked to we need to lock the pool back */ +@@ -1317,7 +1335,8 @@ static bool z3fold_page_isolate(struct p + VM_BUG_ON_PAGE(!PageMovable(page), page); + VM_BUG_ON_PAGE(PageIsolated(page), page); + +- if (test_bit(PAGE_HEADLESS, &page->private)) ++ if (test_bit(PAGE_HEADLESS, &page->private) || ++ test_bit(PAGE_CLAIMED, &page->private)) + return false; + + zhdr = page_address(page);