From: Greg Kroah-Hartman Date: Sun, 1 May 2022 18:30:18 +0000 (+0200) Subject: 5.15-stable patches X-Git-Tag: v5.4.192~52 X-Git-Url: http://git.ipfire.org/?a=commitdiff_plain;h=7b26ace0e3f5a469bc6429a20290849ca8156a67;p=thirdparty%2Fkernel%2Fstable-queue.git 5.15-stable patches added patches: iocost-don-t-reset-the-inuse-weight-of-under-weighted-debtors.patch riscv-patch_text-fixup-last-cpu-should-be-master.patch virtio_net-fix-wrong-buf-address-calculation-when-using-xdp.patch x86-pci-xen-disable-pci-msi-masking-for-xen_hvm-guests.patch --- diff --git a/queue-5.15/iocost-don-t-reset-the-inuse-weight-of-under-weighted-debtors.patch b/queue-5.15/iocost-don-t-reset-the-inuse-weight-of-under-weighted-debtors.patch new file mode 100644 index 00000000000..ba1baae7420 --- /dev/null +++ b/queue-5.15/iocost-don-t-reset-the-inuse-weight-of-under-weighted-debtors.patch @@ -0,0 +1,63 @@ +From 8c936f9ea11ec4e35e288810a7503b5c841a355f Mon Sep 17 00:00:00 2001 +From: Tejun Heo +Date: Tue, 26 Apr 2022 19:01:01 -1000 +Subject: iocost: don't reset the inuse weight of under-weighted debtors + +From: Tejun Heo + +commit 8c936f9ea11ec4e35e288810a7503b5c841a355f upstream. + +When an iocg is in debt, its inuse weight is owned by debt handling and +should stay at 1. This invariant was broken when determining the amount of +surpluses at the beginning of donation calculation - when an iocg's +hierarchical weight is too low, the iocg is excluded from donation +calculation and its inuse is reset to its active regardless of its +indebtedness, triggering warnings like the following: + + WARNING: CPU: 5 PID: 0 at block/blk-iocost.c:1416 iocg_kick_waitq+0x392/0x3a0 + ... + RIP: 0010:iocg_kick_waitq+0x392/0x3a0 + Code: 00 00 be ff ff ff ff 48 89 4d a8 e8 98 b2 70 00 48 8b 4d a8 85 c0 0f 85 4a fe ff ff 0f 0b e9 43 fe ff ff 0f 0b e9 4d fe ff ff <0f> 0b e9 50 fe ff ff e8 a2 ae 70 00 66 90 0f 1f 44 00 00 55 48 89 + RSP: 0018:ffffc90000200d08 EFLAGS: 00010016 + ... 
+ + ioc_timer_fn+0x2e0/0x1470 + call_timer_fn+0xa1/0x2c0 + ... + +As this happens only when an iocg's hierarchical weight is negligible, its +impact likely is limited to triggering the warnings. Fix it by skipping +resetting inuse of under-weighted debtors. + +Signed-off-by: Tejun Heo +Reported-by: Rik van Riel +Fixes: c421a3eb2e27 ("blk-iocost: revamp debt handling") +Cc: stable@vger.kernel.org # v5.10+ +Link: https://lore.kernel.org/r/YmjODd4aif9BzFuO@slm.duckdns.org +Signed-off-by: Jens Axboe +Signed-off-by: Greg Kroah-Hartman +--- + block/blk-iocost.c | 12 +++++++++++- + 1 file changed, 11 insertions(+), 1 deletion(-) + +--- a/block/blk-iocost.c ++++ b/block/blk-iocost.c +@@ -2322,7 +2322,17 @@ static void ioc_timer_fn(struct timer_li + iocg->hweight_donating = hwa; + iocg->hweight_after_donation = new_hwi; + list_add(&iocg->surplus_list, &surpluses); +- } else { ++ } else if (!iocg->abs_vdebt) { ++ /* ++ * @iocg doesn't have enough to donate. Reset ++ * its inuse to active. ++ * ++ * Don't reset debtors as their inuse's are ++ * owned by debt handling. This shouldn't affect ++ * donation calculation in any meaningful way ++ * as @iocg doesn't have a meaningful amount of ++ * share anyway. ++ */ + TRACE_IOCG_PATH(inuse_shortage, iocg, &now, + iocg->inuse, iocg->active, + iocg->hweight_inuse, new_hwi); diff --git a/queue-5.15/riscv-patch_text-fixup-last-cpu-should-be-master.patch b/queue-5.15/riscv-patch_text-fixup-last-cpu-should-be-master.patch new file mode 100644 index 00000000000..f420067b995 --- /dev/null +++ b/queue-5.15/riscv-patch_text-fixup-last-cpu-should-be-master.patch @@ -0,0 +1,39 @@ +From 8ec1442953c66a1d8462cccd8c20b7ba561f5915 Mon Sep 17 00:00:00 2001 +From: Guo Ren +Date: Wed, 6 Apr 2022 22:16:49 +0800 +Subject: riscv: patch_text: Fixup last cpu should be master + +From: Guo Ren + +commit 8ec1442953c66a1d8462cccd8c20b7ba561f5915 upstream. 
+ +These patch_text implementations are using stop_machine_cpuslocked +infrastructure with atomic cpu_count. The original idea: When the +master CPU patch_text, the others should wait for it. But current +implementation is using the first CPU as master, which couldn't +guarantee the remaining CPUs are waiting. This patch changes the +last CPU as the master to solve the potential risk. + +Signed-off-by: Guo Ren +Signed-off-by: Guo Ren +Acked-by: Palmer Dabbelt +Reviewed-by: Masami Hiramatsu +Fixes: 043cb41a85de ("riscv: introduce interfaces to patch kernel code") +Cc: stable@vger.kernel.org +Signed-off-by: Palmer Dabbelt +Signed-off-by: Greg Kroah-Hartman +--- + arch/riscv/kernel/patch.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +--- a/arch/riscv/kernel/patch.c ++++ b/arch/riscv/kernel/patch.c +@@ -104,7 +104,7 @@ static int patch_text_cb(void *data) + struct patch_insn *patch = data; + int ret = 0; + +- if (atomic_inc_return(&patch->cpu_count) == 1) { ++ if (atomic_inc_return(&patch->cpu_count) == num_online_cpus()) { + ret = + patch_text_nosync(patch->addr, &patch->insn, + GET_INSN_LENGTH(patch->insn)); diff --git a/queue-5.15/series b/queue-5.15/series index 39728e5e9d7..7e8f0ef3d3b 100644 --- a/queue-5.15/series +++ b/queue-5.15/series @@ -38,3 +38,7 @@ arm64-dts-imx8mm-venice-fix-spi2-pin-configuration.patch pinctrl-samsung-fix-missing-gpiolib-on-arm64-exynos-config.patch hex2bin-make-the-function-hex_to_bin-constant-time.patch hex2bin-fix-access-beyond-string-end.patch +riscv-patch_text-fixup-last-cpu-should-be-master.patch +x86-pci-xen-disable-pci-msi-masking-for-xen_hvm-guests.patch +iocost-don-t-reset-the-inuse-weight-of-under-weighted-debtors.patch +virtio_net-fix-wrong-buf-address-calculation-when-using-xdp.patch diff --git a/queue-5.15/virtio_net-fix-wrong-buf-address-calculation-when-using-xdp.patch b/queue-5.15/virtio_net-fix-wrong-buf-address-calculation-when-using-xdp.patch new file mode 100644 index 00000000000..bb1f3cc245e --- 
/dev/null +++ b/queue-5.15/virtio_net-fix-wrong-buf-address-calculation-when-using-xdp.patch @@ -0,0 +1,205 @@ +From acb16b395c3f3d7502443e0c799c2b42df645642 Mon Sep 17 00:00:00 2001 +From: Nikolay Aleksandrov +Date: Mon, 25 Apr 2022 13:37:03 +0300 +Subject: virtio_net: fix wrong buf address calculation when using xdp + +From: Nikolay Aleksandrov + +commit acb16b395c3f3d7502443e0c799c2b42df645642 upstream. + +We received a report[1] of kernel crashes when Cilium is used in XDP +mode with virtio_net after updating to newer kernels. After +investigating the reason it turned out that when using mergeable bufs +with an XDP program which adjusts xdp.data or xdp.data_meta page_to_skb() +calculates the build_skb address wrong because the offset can become less +than the headroom so it gets the address of the previous page (-X bytes +depending on how much lower the offset is): + page_to_skb: page addr ffff9eb2923e2000 buf ffff9eb2923e1ffc offset 252 headroom 256 + +This is a pr_err() I added in the beginning of page_to_skb which clearly +shows offset that is less than headroom by adding 4 bytes of metadata +via an xdp prog. The calculations done are: + receive_mergeable(): + headroom = VIRTIO_XDP_HEADROOM; // VIRTIO_XDP_HEADROOM == 256 bytes + offset = xdp.data - page_address(xdp_page) - + vi->hdr_len - metasize; + + page_to_skb(): + p = page_address(page) + offset; + ... + buf = p - headroom; + +Now buf goes -4 bytes from the page's starting address as can be seen +above which is set as skb->head and skb->data by build_skb later. Depending +on what's done with the skb (when it's freed most often) we get all kinds +of corruptions and BUG_ON() triggers in mm[2]. We have to recalculate +the new headroom after the xdp program has run, similar to how offset +and len are recalculated. Headroom is directly related to +data_hard_start, data and data_meta, so we use them to get the new size. 
+The result is correct (similar pr_err() in page_to_skb, one case of +xdp_page and one case of virtnet buf): + a) Case with 4 bytes of metadata + [ 115.949641] page_to_skb: page addr ffff8b4dcfad2000 offset 252 headroom 252 + [ 121.084105] page_to_skb: page addr ffff8b4dcf018000 offset 20732 headroom 252 + b) Case of pushing data +32 bytes + [ 153.181401] page_to_skb: page addr ffff8b4dd0c4d000 offset 288 headroom 288 + [ 158.480421] page_to_skb: page addr ffff8b4dd00b0000 offset 24864 headroom 288 + c) Case of pushing data -33 bytes + [ 835.906830] page_to_skb: page addr ffff8b4dd3270000 offset 223 headroom 223 + [ 840.839910] page_to_skb: page addr ffff8b4dcdd68000 offset 12511 headroom 223 + +Offset and headroom are equal because offset points to the start of +reserved bytes for the virtio_net header which are at buf start + +headroom, while data points at buf start + vnet hdr size + headroom so +when data or data_meta are adjusted by the xdp prog both the headroom size +and the offset change equally. We can use data_hard_start to compute the +new headroom after the xdp prog (linearized / page start case, the +virtnet buf case is similar just with bigger base offset): + xdp.data_hard_start = page_address + vnet_hdr + xdp.data = page_address + vnet_hdr + headroom + new headroom after xdp prog = xdp.data - xdp.data_hard_start - metasize + +An example reproducer xdp prog[3] is below. + +[1] https://github.com/cilium/cilium/issues/19453 + +[2] Two of the many traces: + [ 40.437400] BUG: Bad page state in process swapper/0 pfn:14940 + [ 40.916726] BUG: Bad page state in process systemd-resolve pfn:053b7 + [ 41.300891] kernel BUG at include/linux/mm.h:720! 
+ [ 41.301801] invalid opcode: 0000 [#1] PREEMPT SMP NOPTI + [ 41.302784] CPU: 1 PID: 1181 Comm: kubelet Kdump: loaded Tainted: G B W 5.18.0-rc1+ #37 + [ 41.304458] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.15.0-1.fc35 04/01/2014 + [ 41.306018] RIP: 0010:page_frag_free+0x79/0xe0 + [ 41.306836] Code: 00 00 75 ea 48 8b 07 a9 00 00 01 00 74 e0 48 8b 47 48 48 8d 50 ff a8 01 48 0f 45 fa eb d0 48 c7 c6 18 b8 30 a6 e8 d7 f8 fc ff <0f> 0b 48 8d 78 ff eb bc 48 8b 07 a9 00 00 01 00 74 3a 66 90 0f b6 + [ 41.310235] RSP: 0018:ffffac05c2a6bc78 EFLAGS: 00010292 + [ 41.311201] RAX: 000000000000003e RBX: 0000000000000000 RCX: 0000000000000000 + [ 41.312502] RDX: 0000000000000001 RSI: ffffffffa6423004 RDI: 00000000ffffffff + [ 41.313794] RBP: ffff993c98823600 R08: 0000000000000000 R09: 00000000ffffdfff + [ 41.315089] R10: ffffac05c2a6ba68 R11: ffffffffa698ca28 R12: ffff993c98823600 + [ 41.316398] R13: ffff993c86311ebc R14: 0000000000000000 R15: 000000000000005c + [ 41.317700] FS: 00007fe13fc56740(0000) GS:ffff993cdd900000(0000) knlGS:0000000000000000 + [ 41.319150] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 + [ 41.320152] CR2: 000000c00008a000 CR3: 0000000014908000 CR4: 0000000000350ee0 + [ 41.321387] Call Trace: + [ 41.321819] + [ 41.322193] skb_release_data+0x13f/0x1c0 + [ 41.322902] __kfree_skb+0x20/0x30 + [ 41.343870] tcp_recvmsg_locked+0x671/0x880 + [ 41.363764] tcp_recvmsg+0x5e/0x1c0 + [ 41.384102] inet_recvmsg+0x42/0x100 + [ 41.406783] ? sock_recvmsg+0x1d/0x70 + [ 41.428201] sock_read_iter+0x84/0xd0 + [ 41.445592] ? 0xffffffffa3000000 + [ 41.462442] new_sync_read+0x148/0x160 + [ 41.479314] ? 
0xffffffffa3000000 + [ 41.496937] vfs_read+0x138/0x190 + [ 41.517198] ksys_read+0x87/0xc0 + [ 41.535336] do_syscall_64+0x3b/0x90 + [ 41.551637] entry_SYSCALL_64_after_hwframe+0x44/0xae + [ 41.568050] RIP: 0033:0x48765b + [ 41.583955] Code: e8 4a 35 fe ff eb 88 cc cc cc cc cc cc cc cc e8 fb 7a fe ff 48 8b 7c 24 10 48 8b 74 24 18 48 8b 54 24 20 48 8b 44 24 08 0f 05 <48> 3d 01 f0 ff ff 76 20 48 c7 44 24 28 ff ff ff ff 48 c7 44 24 30 + [ 41.632818] RSP: 002b:000000c000a2f5b8 EFLAGS: 00000212 ORIG_RAX: 0000000000000000 + [ 41.664588] RAX: ffffffffffffffda RBX: 000000c000062000 RCX: 000000000048765b + [ 41.681205] RDX: 0000000000005e54 RSI: 000000c000e66000 RDI: 0000000000000016 + [ 41.697164] RBP: 000000c000a2f608 R08: 0000000000000001 R09: 00000000000001b4 + [ 41.713034] R10: 00000000000000b6 R11: 0000000000000212 R12: 00000000000000e9 + [ 41.728755] R13: 0000000000000001 R14: 000000c000a92000 R15: ffffffffffffffff + [ 41.744254] + [ 41.758585] Modules linked in: br_netfilter bridge veth netconsole virtio_net + + and + + [ 33.524802] BUG: Bad page state in process systemd-network pfn:11e60 + [ 33.528617] page ffffe05dc0147b00 ffffe05dc04e7a00 ffff8ae9851ec000 (1) len 82 offset 252 metasize 4 hroom 0 hdr_len 12 data ffff8ae9851ec10c data_meta ffff8ae9851ec108 data_end ffff8ae9851ec14e + [ 33.529764] page:000000003792b5ba refcount:0 mapcount:-512 mapping:0000000000000000 index:0x0 pfn:0x11e60 + [ 33.532463] flags: 0xfffffc0000000(node=0|zone=1|lastcpupid=0x1fffff) + [ 33.532468] raw: 000fffffc0000000 0000000000000000 dead000000000122 0000000000000000 + [ 33.532470] raw: 0000000000000000 0000000000000000 00000000fffffdff 0000000000000000 + [ 33.532471] page dumped because: nonzero mapcount + [ 33.532472] Modules linked in: br_netfilter bridge veth netconsole virtio_net + [ 33.532479] CPU: 0 PID: 791 Comm: systemd-network Kdump: loaded Not tainted 5.18.0-rc1+ #37 + [ 33.532482] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.15.0-1.fc35 04/01/2014 + [ 33.532484] 
Call Trace: + [ 33.532496] + [ 33.532500] dump_stack_lvl+0x45/0x5a + [ 33.532506] bad_page.cold+0x63/0x94 + [ 33.532510] free_pcp_prepare+0x290/0x420 + [ 33.532515] free_unref_page+0x1b/0x100 + [ 33.532518] skb_release_data+0x13f/0x1c0 + [ 33.532524] kfree_skb_reason+0x3e/0xc0 + [ 33.532527] ip6_mc_input+0x23c/0x2b0 + [ 33.532531] ip6_sublist_rcv_finish+0x83/0x90 + [ 33.532534] ip6_sublist_rcv+0x22b/0x2b0 + +[3] XDP program to reproduce(xdp_pass.c): + #include + #include + + SEC("xdp_pass") + int xdp_pkt_pass(struct xdp_md *ctx) + { + bpf_xdp_adjust_head(ctx, -(int)32); + return XDP_PASS; + } + + char _license[] SEC("license") = "GPL"; + + compile: clang -O2 -g -Wall -target bpf -c xdp_pass.c -o xdp_pass.o + load on virtio_net: ip link set enp1s0 xdpdrv obj xdp_pass.o sec xdp_pass + +CC: stable@vger.kernel.org +CC: Jason Wang +CC: Xuan Zhuo +CC: Daniel Borkmann +CC: "Michael S. Tsirkin" +CC: virtualization@lists.linux-foundation.org +Fixes: 8fb7da9e9907 ("virtio_net: get build_skb() buf by data ptr") +Signed-off-by: Nikolay Aleksandrov +Reviewed-by: Xuan Zhuo +Acked-by: Daniel Borkmann +Acked-by: Michael S. 
Tsirkin +Acked-by: Jason Wang +Link: https://lore.kernel.org/r/20220425103703.3067292-1-razor@blackwall.org +Signed-off-by: Paolo Abeni +Signed-off-by: Greg Kroah-Hartman +--- + drivers/net/virtio_net.c | 20 +++++++++++++++++++- + 1 file changed, 19 insertions(+), 1 deletion(-) + +--- a/drivers/net/virtio_net.c ++++ b/drivers/net/virtio_net.c +@@ -965,6 +965,24 @@ static struct sk_buff *receive_mergeable + * xdp.data_meta were adjusted + */ + len = xdp.data_end - xdp.data + vi->hdr_len + metasize; ++ ++ /* recalculate headroom if xdp.data or xdp.data_meta ++ * were adjusted, note that offset should always point ++ * to the start of the reserved bytes for virtio_net ++ * header which are followed by xdp.data, that means ++ * that offset is equal to the headroom (when buf is ++ * starting at the beginning of the page, otherwise ++ * there is a base offset inside the page) but it's used ++ * with a different starting point (buf start) than ++ * xdp.data (buf start + vnet hdr size). If xdp.data or ++ * data_meta were adjusted by the xdp prog then the ++ * headroom size has changed and so has the offset, we ++ * can use data_hard_start, which points at buf start + ++ * vnet hdr size, to calculate the new headroom and use ++ * it later to compute buf start in page_to_skb() ++ */ ++ headroom = xdp.data - xdp.data_hard_start - metasize; ++ + /* We can only create skb based on xdp_page. 
*/ + if (unlikely(xdp_page != page)) { + rcu_read_unlock(); +@@ -972,7 +990,7 @@ static struct sk_buff *receive_mergeable + head_skb = page_to_skb(vi, rq, xdp_page, offset, + len, PAGE_SIZE, false, + metasize, +- VIRTIO_XDP_HEADROOM); ++ headroom); + return head_skb; + } + break; diff --git a/queue-5.15/x86-pci-xen-disable-pci-msi-masking-for-xen_hvm-guests.patch b/queue-5.15/x86-pci-xen-disable-pci-msi-masking-for-xen_hvm-guests.patch new file mode 100644 index 00000000000..c448acb4f07 --- /dev/null +++ b/queue-5.15/x86-pci-xen-disable-pci-msi-masking-for-xen_hvm-guests.patch @@ -0,0 +1,57 @@ +From 7e0815b3e09986d2fe651199363e135b9358132a Mon Sep 17 00:00:00 2001 +From: Thomas Gleixner +Date: Thu, 28 Apr 2022 15:50:54 +0200 +Subject: x86/pci/xen: Disable PCI/MSI[-X] masking for XEN_HVM guests + +From: Thomas Gleixner + +commit 7e0815b3e09986d2fe651199363e135b9358132a upstream. + +When a XEN_HVM guest uses the XEN PIRQ/Eventchannel mechanism, then +PCI/MSI[-X] masking is solely controlled by the hypervisor, but contrary to +XEN_PV guests this does not disable PCI/MSI[-X] masking in the PCI/MSI +layer. + +This can lead to a situation where the PCI/MSI layer masks an MSI[-X] +interrupt and the hypervisor grants the write despite the fact that it +already requested the interrupt. As a consequence interrupt delivery on the +affected device is not happening ever. + +Set pci_msi_ignore_mask to prevent that like it's done for XEN_PV guests +already. 
+ +Fixes: 809f9267bbab ("xen: map MSIs into pirqs") +Reported-by: Jeremi Piotrowski +Reported-by: Dusty Mabe +Reported-by: Salvatore Bonaccorso +Signed-off-by: Thomas Gleixner +Tested-by: Noah Meyerhans +Cc: stable@vger.kernel.org +Link: https://lore.kernel.org/r/87tuaduxj5.ffs@tglx +Signed-off-by: Greg Kroah-Hartman +--- + arch/x86/pci/xen.c | 6 +++++- + 1 file changed, 5 insertions(+), 1 deletion(-) + +--- a/arch/x86/pci/xen.c ++++ b/arch/x86/pci/xen.c +@@ -472,7 +472,6 @@ static __init void xen_setup_pci_msi(voi + xen_msi_ops.setup_msi_irqs = xen_setup_msi_irqs; + } + xen_msi_ops.teardown_msi_irqs = xen_pv_teardown_msi_irqs; +- pci_msi_ignore_mask = 1; + } else if (xen_hvm_domain()) { + xen_msi_ops.setup_msi_irqs = xen_hvm_setup_msi_irqs; + xen_msi_ops.teardown_msi_irqs = xen_teardown_msi_irqs; +@@ -486,6 +485,11 @@ static __init void xen_setup_pci_msi(voi + * in allocating the native domain and never use it. + */ + x86_init.irqs.create_pci_msi_domain = xen_create_pci_msi_domain; ++ /* ++ * With XEN PIRQ/Eventchannels in use PCI/MSI[-X] masking is solely ++ * controlled by the hypervisor. ++ */ ++ pci_msi_ignore_mask = 1; + } + + #else /* CONFIG_PCI_MSI */