From: Greg Kroah-Hartman Date: Fri, 20 Oct 2023 20:50:44 +0000 (+0200) Subject: 4.19-stable patches X-Git-Tag: v4.14.328~80 X-Git-Url: http://git.ipfire.org/gitweb.cgi?a=commitdiff_plain;h=65cd27283f341aad113255be475135eac00280dd;p=thirdparty%2Fkernel%2Fstable-queue.git 4.19-stable patches added patches: i40e-prevent-crash-on-probe-if-hw-registers-have-invalid-values.patch net-ipv4-fix-return-value-check-in-esp_remove_trailer.patch net-ipv6-fix-return-value-check-in-esp_remove_trailer.patch net-pktgen-fix-interface-flags-printing.patch net-rfkill-gpio-prevent-value-glitch-during-probe.patch net-sched-sch_hfsc-upgrade-rt-to-sc-when-it-becomes-a-inner-curve.patch net-usb-smsc95xx-fix-an-error-code-in-smsc95xx_reset.patch netfilter-nft_set_rbtree-.deactivate-fails-if-element-has-expired.patch tcp-fix-excessive-tlp-and-rack-timeouts-from-hz-rounding.patch tcp-tsq-relax-tcp_small_queue_check-when-rtx-queue-contains-a-single-skb.patch xfrm-fix-a-data-race-in-xfrm_gen_index.patch xfrm-interface-use-dev_stats_inc.patch --- diff --git a/queue-4.19/i40e-prevent-crash-on-probe-if-hw-registers-have-invalid-values.patch b/queue-4.19/i40e-prevent-crash-on-probe-if-hw-registers-have-invalid-values.patch new file mode 100644 index 00000000000..7881799f759 --- /dev/null +++ b/queue-4.19/i40e-prevent-crash-on-probe-if-hw-registers-have-invalid-values.patch @@ -0,0 +1,57 @@ +From fc6f716a5069180c40a8c9b63631e97da34f64a3 Mon Sep 17 00:00:00 2001 +From: Michal Schmidt +Date: Wed, 11 Oct 2023 16:33:32 -0700 +Subject: i40e: prevent crash on probe if hw registers have invalid values + +From: Michal Schmidt + +commit fc6f716a5069180c40a8c9b63631e97da34f64a3 upstream. + +The hardware provides the indexes of the first and the last available +queue and VF. From the indexes, the driver calculates the numbers of +queues and VFs. In theory, a faulty device might say the last index is +smaller than the first index. 
In that case, the driver's calculation +would underflow, it would attempt to write to non-existent registers +outside of the ioremapped range and crash. + +I ran into this not by having a faulty device, but by an operator error. +I accidentally ran a QE test meant for i40e devices on an ice device. +The test used 'echo i40e > /sys/...ice PCI device.../driver_override', +bound the driver to the device and crashed in one of the wr32 calls in +i40e_clear_hw. + +Add checks to prevent underflows in the calculations of num_queues and +num_vfs. With this fix, the wrong device probing reports errors and +returns a failure without crashing. + +Fixes: 838d41d92a90 ("i40e: clear all queues and interrupts") +Signed-off-by: Michal Schmidt +Reviewed-by: Simon Horman +Tested-by: Pucha Himasekhar Reddy (A Contingent worker at Intel) +Link: https://lore.kernel.org/r/20231011233334.336092-2-jacob.e.keller@intel.com +Signed-off-by: Jakub Kicinski +Signed-off-by: Greg Kroah-Hartman +--- + drivers/net/ethernet/intel/i40e/i40e_common.c | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +--- a/drivers/net/ethernet/intel/i40e/i40e_common.c ++++ b/drivers/net/ethernet/intel/i40e/i40e_common.c +@@ -1332,7 +1332,7 @@ void i40e_clear_hw(struct i40e_hw *hw) + I40E_PFLAN_QALLOC_FIRSTQ_SHIFT; + j = (val & I40E_PFLAN_QALLOC_LASTQ_MASK) >> + I40E_PFLAN_QALLOC_LASTQ_SHIFT; +- if (val & I40E_PFLAN_QALLOC_VALID_MASK) ++ if (val & I40E_PFLAN_QALLOC_VALID_MASK && j >= base_queue) + num_queues = (j - base_queue) + 1; + else + num_queues = 0; +@@ -1342,7 +1342,7 @@ void i40e_clear_hw(struct i40e_hw *hw) + I40E_PF_VT_PFALLOC_FIRSTVF_SHIFT; + j = (val & I40E_PF_VT_PFALLOC_LASTVF_MASK) >> + I40E_PF_VT_PFALLOC_LASTVF_SHIFT; +- if (val & I40E_PF_VT_PFALLOC_VALID_MASK) ++ if (val & I40E_PF_VT_PFALLOC_VALID_MASK && j >= i) + num_vfs = (j - i) + 1; + else + num_vfs = 0; diff --git a/queue-4.19/net-ipv4-fix-return-value-check-in-esp_remove_trailer.patch 
b/queue-4.19/net-ipv4-fix-return-value-check-in-esp_remove_trailer.patch new file mode 100644 index 00000000000..df27eebd2ed --- /dev/null +++ b/queue-4.19/net-ipv4-fix-return-value-check-in-esp_remove_trailer.patch @@ -0,0 +1,32 @@ +From 513f61e2193350c7a345da98559b80f61aec4fa6 Mon Sep 17 00:00:00 2001 +From: Ma Ke +Date: Mon, 9 Oct 2023 09:13:37 +0800 +Subject: net: ipv4: fix return value check in esp_remove_trailer + +From: Ma Ke + +commit 513f61e2193350c7a345da98559b80f61aec4fa6 upstream. + +In esp_remove_trailer(), to avoid an unexpected result returned by +pskb_trim, we should check the return value of pskb_trim(). + +Signed-off-by: Ma Ke +Signed-off-by: Steffen Klassert +Signed-off-by: Greg Kroah-Hartman +--- + net/ipv4/esp4.c | 4 +++- + 1 file changed, 3 insertions(+), 1 deletion(-) + +--- a/net/ipv4/esp4.c ++++ b/net/ipv4/esp4.c +@@ -565,7 +565,9 @@ static inline int esp_remove_trailer(str + skb->csum = csum_block_sub(skb->csum, csumdiff, + skb->len - trimlen); + } +- pskb_trim(skb, skb->len - trimlen); ++ ret = pskb_trim(skb, skb->len - trimlen); ++ if (unlikely(ret)) ++ return ret; + + ret = nexthdr[1]; + diff --git a/queue-4.19/net-ipv6-fix-return-value-check-in-esp_remove_trailer.patch b/queue-4.19/net-ipv6-fix-return-value-check-in-esp_remove_trailer.patch new file mode 100644 index 00000000000..c36e8a9ecd0 --- /dev/null +++ b/queue-4.19/net-ipv6-fix-return-value-check-in-esp_remove_trailer.patch @@ -0,0 +1,32 @@ +From dad4e491e30b20f4dc615c9da65d2142d703b5c2 Mon Sep 17 00:00:00 2001 +From: Ma Ke +Date: Sat, 7 Oct 2023 08:59:53 +0800 +Subject: net: ipv6: fix return value check in esp_remove_trailer + +From: Ma Ke + +commit dad4e491e30b20f4dc615c9da65d2142d703b5c2 upstream. + +In esp_remove_trailer(), to avoid an unexpected result returned by +pskb_trim, we should check the return value of pskb_trim(). 
+ +Signed-off-by: Ma Ke +Signed-off-by: Steffen Klassert +Signed-off-by: Greg Kroah-Hartman +--- + net/ipv6/esp6.c | 4 +++- + 1 file changed, 3 insertions(+), 1 deletion(-) + +--- a/net/ipv6/esp6.c ++++ b/net/ipv6/esp6.c +@@ -517,7 +517,9 @@ static inline int esp_remove_trailer(str + skb->csum = csum_block_sub(skb->csum, csumdiff, + skb->len - trimlen); + } +- pskb_trim(skb, skb->len - trimlen); ++ ret = pskb_trim(skb, skb->len - trimlen); ++ if (unlikely(ret)) ++ return ret; + + ret = nexthdr[1]; + diff --git a/queue-4.19/net-pktgen-fix-interface-flags-printing.patch b/queue-4.19/net-pktgen-fix-interface-flags-printing.patch new file mode 100644 index 00000000000..923da3b487e --- /dev/null +++ b/queue-4.19/net-pktgen-fix-interface-flags-printing.patch @@ -0,0 +1,60 @@ +From 1d30162f35c7a73fc2f8cdcdcdbd690bedb99d1a Mon Sep 17 00:00:00 2001 +From: Gavrilov Ilia +Date: Mon, 16 Oct 2023 14:08:59 +0000 +Subject: net: pktgen: Fix interface flags printing + +From: Gavrilov Ilia + +commit 1d30162f35c7a73fc2f8cdcdcdbd690bedb99d1a upstream. + +Device flags are displayed incorrectly: +1) The comparison (i == F_FLOW_SEQ) is always false, because F_FLOW_SEQ +is equal to (1 << FLOW_SEQ_SHIFT) == 2048, and the maximum value +of the 'i' variable is (NR_PKT_FLAG - 1) == 17. It should be compared +with FLOW_SEQ_SHIFT. + +2) Similarly to the F_IPSEC flag. + +3) Also add spaces to the print end of the string literal "spi:%u" +to prevent the output from merging with the flag that follows. + +Found by InfoTeCS on behalf of Linux Verification Center +(linuxtesting.org) with SVACE. + +Fixes: 99c6d3d20d62 ("pktgen: Remove brute-force printing of flags") +Signed-off-by: Gavrilov Ilia +Signed-off-by: David S. 
Miller +Signed-off-by: Greg Kroah-Hartman +--- + net/core/pktgen.c | 14 +++++++------- + 1 file changed, 7 insertions(+), 7 deletions(-) + +--- a/net/core/pktgen.c ++++ b/net/core/pktgen.c +@@ -651,19 +651,19 @@ static int pktgen_if_show(struct seq_fil + seq_puts(seq, " Flags: "); + + for (i = 0; i < NR_PKT_FLAGS; i++) { +- if (i == F_FLOW_SEQ) ++ if (i == FLOW_SEQ_SHIFT) + if (!pkt_dev->cflows) + continue; + +- if (pkt_dev->flags & (1 << i)) ++ if (pkt_dev->flags & (1 << i)) { + seq_printf(seq, "%s ", pkt_flag_names[i]); +- else if (i == F_FLOW_SEQ) +- seq_puts(seq, "FLOW_RND "); +- + #ifdef CONFIG_XFRM +- if (i == F_IPSEC && pkt_dev->spi) +- seq_printf(seq, "spi:%u", pkt_dev->spi); ++ if (i == IPSEC_SHIFT && pkt_dev->spi) ++ seq_printf(seq, "spi:%u ", pkt_dev->spi); + #endif ++ } else if (i == FLOW_SEQ_SHIFT) { ++ seq_puts(seq, "FLOW_RND "); ++ } + } + + seq_puts(seq, "\n"); diff --git a/queue-4.19/net-rfkill-gpio-prevent-value-glitch-during-probe.patch b/queue-4.19/net-rfkill-gpio-prevent-value-glitch-during-probe.patch new file mode 100644 index 00000000000..0316d29b72c --- /dev/null +++ b/queue-4.19/net-rfkill-gpio-prevent-value-glitch-during-probe.patch @@ -0,0 +1,56 @@ +From b2f750c3a80b285cd60c9346f8c96bd0a2a66cde Mon Sep 17 00:00:00 2001 +From: Josua Mayer +Date: Wed, 4 Oct 2023 18:39:28 +0200 +Subject: net: rfkill: gpio: prevent value glitch during probe + +From: Josua Mayer + +commit b2f750c3a80b285cd60c9346f8c96bd0a2a66cde upstream. + +When either reset- or shutdown-gpio are initially deasserted, +e.g. after a reboot - or when the hardware does not include pull-down, +there will be a short toggle of both IOs to logical 0 and back to 1. + +It seems that the rfkill default is unblocked, so the driver should not +glitch to output low during probe. +It can lead e.g. 
to unexpected lte modem reconnect: + +[1] root@localhost:~# dmesg | grep "usb 2-1" +[ 2.136124] usb 2-1: new SuperSpeed USB device number 2 using xhci-hcd +[ 21.215278] usb 2-1: USB disconnect, device number 2 +[ 28.833977] usb 2-1: new SuperSpeed USB device number 3 using xhci-hcd + +The glitch has been discovered on an arm64 board, now that device-tree +support for the rfkill-gpio driver has finally appeared :). + +Change the flags for devm_gpiod_get_optional from GPIOD_OUT_LOW to +GPIOD_ASIS to avoid any glitches. +The rfkill driver will set the intended value during rfkill_sync_work. + +Fixes: 7176ba23f8b5 ("net: rfkill: add generic gpio rfkill driver") +Signed-off-by: Josua Mayer +Link: https://lore.kernel.org/r/20231004163928.14609-1-josua@solid-run.com +Signed-off-by: Johannes Berg +Signed-off-by: Greg Kroah-Hartman +--- + net/rfkill/rfkill-gpio.c | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +--- a/net/rfkill/rfkill-gpio.c ++++ b/net/rfkill/rfkill-gpio.c +@@ -112,13 +112,13 @@ static int rfkill_gpio_probe(struct plat + + rfkill->clk = devm_clk_get(&pdev->dev, NULL); + +- gpio = devm_gpiod_get_optional(&pdev->dev, "reset", GPIOD_OUT_LOW); ++ gpio = devm_gpiod_get_optional(&pdev->dev, "reset", GPIOD_ASIS); + if (IS_ERR(gpio)) + return PTR_ERR(gpio); + + rfkill->reset_gpio = gpio; + +- gpio = devm_gpiod_get_optional(&pdev->dev, "shutdown", GPIOD_OUT_LOW); ++ gpio = devm_gpiod_get_optional(&pdev->dev, "shutdown", GPIOD_ASIS); + if (IS_ERR(gpio)) + return PTR_ERR(gpio); + diff --git a/queue-4.19/net-sched-sch_hfsc-upgrade-rt-to-sc-when-it-becomes-a-inner-curve.patch b/queue-4.19/net-sched-sch_hfsc-upgrade-rt-to-sc-when-it-becomes-a-inner-curve.patch new file mode 100644 index 00000000000..c1e0ad0d7f7 --- /dev/null +++ b/queue-4.19/net-sched-sch_hfsc-upgrade-rt-to-sc-when-it-becomes-a-inner-curve.patch @@ -0,0 +1,90 @@ +From a13b67c9a015c4e21601ef9aa4ec9c5d972df1b4 Mon Sep 17 00:00:00 2001 +From: Pedro Tammela +Date: Tue, 17 Oct 2023 11:36:02 -0300 
+Subject: net/sched: sch_hfsc: upgrade 'rt' to 'sc' when it becomes a inner curve + +From: Pedro Tammela + +commit a13b67c9a015c4e21601ef9aa4ec9c5d972df1b4 upstream. + +Christian Theune says: + I upgraded from 6.1.38 to 6.1.55 this morning and it broke my traffic shaping script, + leaving me with a non-functional uplink on a remote router. + +A 'rt' curve cannot be used as a inner curve (parent class), but we were +allowing such configurations since the qdisc was introduced. Such +configurations would trigger a UAF as Budimir explains: + The parent will have vttree_insert() called on it in init_vf(), + but will not have vttree_remove() called on it in update_vf() + because it does not have the HFSC_FSC flag set. + +The qdisc always assumes that inner classes have the HFSC_FSC flag set. +This is by design as it doesn't make sense 'qdisc wise' for an 'rt' +curve to be an inner curve. + +Budimir's original patch disallows users to add classes with a 'rt' +parent, but this is too strict as it breaks users that have been using +'rt' as a inner class. Another approach, taken by this patch, is to +upgrade the inner 'rt' into a 'sc', warning the user in the process. +It avoids the UAF reported by Budimir while also being more permissive +to bad scripts/users/code using 'rt' as a inner class. + +Users checking the `tc class ls [...]` or `tc class get [...]` dumps would +observe the curve change and are potentially breaking with this change. 
+ +v1->v2: https://lore.kernel.org/all/20231013151057.2611860-1-pctammela@mojatatu.com/ +- Correct 'Fixes' tag and merge with revert (Jakub) + +Cc: Christian Theune +Cc: Budimir Markovic +Fixes: b3d26c5702c7 ("net/sched: sch_hfsc: Ensure inner classes have fsc curve") +Signed-off-by: Pedro Tammela +Acked-by: Jamal Hadi Salim +Link: https://lore.kernel.org/r/20231017143602.3191556-1-pctammela@mojatatu.com +Signed-off-by: Jakub Kicinski +Signed-off-by: Greg Kroah-Hartman +--- + net/sched/sch_hfsc.c | 18 ++++++++++++++---- + 1 file changed, 14 insertions(+), 4 deletions(-) + +--- a/net/sched/sch_hfsc.c ++++ b/net/sched/sch_hfsc.c +@@ -913,6 +913,14 @@ hfsc_change_usc(struct hfsc_class *cl, s + cl->cl_flags |= HFSC_USC; + } + ++static void ++hfsc_upgrade_rt(struct hfsc_class *cl) ++{ ++ cl->cl_fsc = cl->cl_rsc; ++ rtsc_init(&cl->cl_virtual, &cl->cl_fsc, cl->cl_vt, cl->cl_total); ++ cl->cl_flags |= HFSC_FSC; ++} ++ + static const struct nla_policy hfsc_policy[TCA_HFSC_MAX + 1] = { + [TCA_HFSC_RSC] = { .len = sizeof(struct tc_service_curve) }, + [TCA_HFSC_FSC] = { .len = sizeof(struct tc_service_curve) }, +@@ -1021,10 +1029,6 @@ hfsc_change_class(struct Qdisc *sch, u32 + if (parent == NULL) + return -ENOENT; + } +- if (!(parent->cl_flags & HFSC_FSC) && parent != &q->root) { +- NL_SET_ERR_MSG(extack, "Invalid parent - parent class must have FSC"); +- return -EINVAL; +- } + + if (classid == 0 || TC_H_MAJ(classid ^ sch->handle) != 0) + return -EINVAL; +@@ -1077,6 +1081,12 @@ hfsc_change_class(struct Qdisc *sch, u32 + cl->cf_tree = RB_ROOT; + + sch_tree_lock(sch); ++ /* Check if the inner class is a misconfigured 'rt' */ ++ if (!(parent->cl_flags & HFSC_FSC) && parent != &q->root) { ++ NL_SET_ERR_MSG(extack, ++ "Forced curve change on parent 'rt' to 'sc'"); ++ hfsc_upgrade_rt(parent); ++ } + qdisc_class_hash_insert(&q->clhash, &cl->cl_common); + list_add_tail(&cl->siblings, &parent->children); + if (parent->level == 0) diff --git 
a/queue-4.19/net-usb-smsc95xx-fix-an-error-code-in-smsc95xx_reset.patch b/queue-4.19/net-usb-smsc95xx-fix-an-error-code-in-smsc95xx_reset.patch new file mode 100644 index 00000000000..96aa5144177 --- /dev/null +++ b/queue-4.19/net-usb-smsc95xx-fix-an-error-code-in-smsc95xx_reset.patch @@ -0,0 +1,32 @@ +From c53647a5df9e66dd9fedf240198e1fe50d88c286 Mon Sep 17 00:00:00 2001 +From: Dan Carpenter +Date: Mon, 16 Oct 2023 20:28:10 +0300 +Subject: net: usb: smsc95xx: Fix an error code in smsc95xx_reset() + +From: Dan Carpenter + +commit c53647a5df9e66dd9fedf240198e1fe50d88c286 upstream. + +Return a negative error code instead of success. + +Fixes: 2f7ca802bdae ("net: Add SMSC LAN9500 USB2.0 10/100 ethernet adapter driver") +Signed-off-by: Dan Carpenter +Reviewed-by: Andrew Lunn +Link: https://lore.kernel.org/r/147927f0-9ada-45cc-81ff-75a19dd30b76@moroto.mountain +Signed-off-by: Jakub Kicinski +Signed-off-by: Greg Kroah-Hartman +--- + drivers/net/usb/smsc95xx.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +--- a/drivers/net/usb/smsc95xx.c ++++ b/drivers/net/usb/smsc95xx.c +@@ -1054,7 +1054,7 @@ static int smsc95xx_reset(struct usbnet + + if (timeout >= 100) { + netdev_warn(dev->net, "timeout waiting for completion of Lite Reset\n"); +- return ret; ++ return -ETIMEDOUT; + } + + ret = smsc95xx_write_reg(dev, PM_CTRL, PM_CTL_PHY_RST_); diff --git a/queue-4.19/netfilter-nft_set_rbtree-.deactivate-fails-if-element-has-expired.patch b/queue-4.19/netfilter-nft_set_rbtree-.deactivate-fails-if-element-has-expired.patch new file mode 100644 index 00000000000..b584d186b02 --- /dev/null +++ b/queue-4.19/netfilter-nft_set_rbtree-.deactivate-fails-if-element-has-expired.patch @@ -0,0 +1,34 @@ +From d111692a59c1470ae530cbb39bcf0346c950ecc7 Mon Sep 17 00:00:00 2001 +From: Pablo Neira Ayuso +Date: Tue, 17 Oct 2023 12:28:27 +0200 +Subject: netfilter: nft_set_rbtree: .deactivate fails if element has expired + +From: Pablo Neira Ayuso + +commit 
d111692a59c1470ae530cbb39bcf0346c950ecc7 upstream. + +This allows to remove an expired element which is not possible in other +existing set backends, this is more noticeable if gc-interval is high so +expired elements remain in the tree. On-demand gc also does not help in +this case, because this is delete element path. Return NULL if element +has expired. + +Fixes: 8d8540c4f5e0 ("netfilter: nft_set_rbtree: add timeout support") +Signed-off-by: Pablo Neira Ayuso +Signed-off-by: Florian Westphal +Signed-off-by: Greg Kroah-Hartman +--- + net/netfilter/nft_set_rbtree.c | 2 ++ + 1 file changed, 2 insertions(+) + +--- a/net/netfilter/nft_set_rbtree.c ++++ b/net/netfilter/nft_set_rbtree.c +@@ -326,6 +326,8 @@ static void *nft_rbtree_deactivate(const + nft_rbtree_interval_end(this)) { + parent = parent->rb_right; + continue; ++ } else if (nft_set_elem_expired(&rbe->ext)) { ++ break; + } else if (!nft_set_elem_active(&rbe->ext, genmask)) { + parent = parent->rb_left; + continue; diff --git a/queue-4.19/series b/queue-4.19/series index 3f1a5445ea7..ba401c6c59e 100644 --- a/queue-4.19/series +++ b/queue-4.19/series @@ -53,3 +53,15 @@ nfc-nci-fix-possible-null-pointer-dereference-in-send_acknowledge.patch regmap-fix-null-deref-on-lookup.patch kvm-x86-mask-lvtpc-when-handling-a-pmi.patch netfilter-nft_payload-fix-wrong-mac-header-matching.patch +xfrm-fix-a-data-race-in-xfrm_gen_index.patch +xfrm-interface-use-dev_stats_inc.patch +net-ipv4-fix-return-value-check-in-esp_remove_trailer.patch +net-ipv6-fix-return-value-check-in-esp_remove_trailer.patch +net-rfkill-gpio-prevent-value-glitch-during-probe.patch +tcp-fix-excessive-tlp-and-rack-timeouts-from-hz-rounding.patch +tcp-tsq-relax-tcp_small_queue_check-when-rtx-queue-contains-a-single-skb.patch +net-usb-smsc95xx-fix-an-error-code-in-smsc95xx_reset.patch +i40e-prevent-crash-on-probe-if-hw-registers-have-invalid-values.patch +net-sched-sch_hfsc-upgrade-rt-to-sc-when-it-becomes-a-inner-curve.patch 
+netfilter-nft_set_rbtree-.deactivate-fails-if-element-has-expired.patch +net-pktgen-fix-interface-flags-printing.patch diff --git a/queue-4.19/tcp-fix-excessive-tlp-and-rack-timeouts-from-hz-rounding.patch b/queue-4.19/tcp-fix-excessive-tlp-and-rack-timeouts-from-hz-rounding.patch new file mode 100644 index 00000000000..b628e6fc674 --- /dev/null +++ b/queue-4.19/tcp-fix-excessive-tlp-and-rack-timeouts-from-hz-rounding.patch @@ -0,0 +1,96 @@ +From 1c2709cfff1dedbb9591e989e2f001484208d914 Mon Sep 17 00:00:00 2001 +From: Neal Cardwell +Date: Sun, 15 Oct 2023 13:47:00 -0400 +Subject: tcp: fix excessive TLP and RACK timeouts from HZ rounding + +From: Neal Cardwell + +commit 1c2709cfff1dedbb9591e989e2f001484208d914 upstream. + +We discovered from packet traces of slow loss recovery on kernels with +the default HZ=250 setting (and min_rtt < 1ms) that after reordering, +when receiving a SACKed sequence range, the RACK reordering timer was +firing after about 16ms rather than the desired value of roughly +min_rtt/4 + 2ms. The problem is largely due to the RACK reorder timer +calculation adding in TCP_TIMEOUT_MIN, which is 2 jiffies. On kernels +with HZ=250, this is 2*4ms = 8ms. The TLP timer calculation has the +exact same issue. + +This commit fixes the TLP transmit timer and RACK reordering timer +floor calculation to more closely match the intended 2ms floor even on +kernels with HZ=250. It does this by adding in a new +TCP_TIMEOUT_MIN_US floor of 2000 us and then converting to jiffies, +instead of the current approach of converting to jiffies and then +adding the TCP_TIMEOUT_MIN value of 2 jiffies. + +Our testing has verified that on kernels with HZ=1000, as expected, +this does not produce significant changes in behavior, but on kernels +with the default HZ=250 the latency improvement can be large. 
For +example, our tests show that for HZ=250 kernels at low RTTs this fix +roughly halves the latency for the RACK reorder timer: instead of +mostly firing at 16ms it mostly fires at 8ms. + +Suggested-by: Eric Dumazet +Signed-off-by: Neal Cardwell +Signed-off-by: Yuchung Cheng +Fixes: bb4d991a28cc ("tcp: adjust tail loss probe timeout") +Reviewed-by: Eric Dumazet +Link: https://lore.kernel.org/r/20231015174700.2206872-1-ncardwell.sw@gmail.com +Signed-off-by: Jakub Kicinski +Signed-off-by: Greg Kroah-Hartman +--- + include/net/tcp.h | 3 +++ + net/ipv4/tcp_output.c | 9 +++++---- + net/ipv4/tcp_recovery.c | 2 +- + 3 files changed, 9 insertions(+), 5 deletions(-) + +--- a/include/net/tcp.h ++++ b/include/net/tcp.h +@@ -141,6 +141,9 @@ void tcp_time_wait(struct sock *sk, int + #define TCP_RTO_MAX ((unsigned)(120*HZ)) + #define TCP_RTO_MIN ((unsigned)(HZ/5)) + #define TCP_TIMEOUT_MIN (2U) /* Min timeout for TCP timers in jiffies */ ++ ++#define TCP_TIMEOUT_MIN_US (2*USEC_PER_MSEC) /* Min TCP timeout in microsecs */ ++ + #define TCP_TIMEOUT_INIT ((unsigned)(1*HZ)) /* RFC6298 2.1 initial RTO value */ + #define TCP_TIMEOUT_FALLBACK ((unsigned)(3*HZ)) /* RFC 1122 initial RTO value, now + * used as a fallback RTO for the +--- a/net/ipv4/tcp_output.c ++++ b/net/ipv4/tcp_output.c +@@ -2449,7 +2449,7 @@ bool tcp_schedule_loss_probe(struct sock + { + struct inet_connection_sock *icsk = inet_csk(sk); + struct tcp_sock *tp = tcp_sk(sk); +- u32 timeout, rto_delta_us; ++ u32 timeout, timeout_us, rto_delta_us; + int early_retrans; + + /* Don't do any loss probe on a Fast Open connection before 3WHS +@@ -2473,11 +2473,12 @@ bool tcp_schedule_loss_probe(struct sock + * sample is available then probe after TCP_TIMEOUT_INIT. 
+ */ + if (tp->srtt_us) { +- timeout = usecs_to_jiffies(tp->srtt_us >> 2); ++ timeout_us = tp->srtt_us >> 2; + if (tp->packets_out == 1) +- timeout += TCP_RTO_MIN; ++ timeout_us += tcp_rto_min_us(sk); + else +- timeout += TCP_TIMEOUT_MIN; ++ timeout_us += TCP_TIMEOUT_MIN_US; ++ timeout = usecs_to_jiffies(timeout_us); + } else { + timeout = TCP_TIMEOUT_INIT; + } +--- a/net/ipv4/tcp_recovery.c ++++ b/net/ipv4/tcp_recovery.c +@@ -122,7 +122,7 @@ bool tcp_rack_mark_lost(struct sock *sk) + tp->rack.advanced = 0; + tcp_rack_detect_loss(sk, &timeout); + if (timeout) { +- timeout = usecs_to_jiffies(timeout) + TCP_TIMEOUT_MIN; ++ timeout = usecs_to_jiffies(timeout + TCP_TIMEOUT_MIN_US); + inet_csk_reset_xmit_timer(sk, ICSK_TIME_REO_TIMEOUT, + timeout, inet_csk(sk)->icsk_rto); + } diff --git a/queue-4.19/tcp-tsq-relax-tcp_small_queue_check-when-rtx-queue-contains-a-single-skb.patch b/queue-4.19/tcp-tsq-relax-tcp_small_queue_check-when-rtx-queue-contains-a-single-skb.patch new file mode 100644 index 00000000000..89fce5ead65 --- /dev/null +++ b/queue-4.19/tcp-tsq-relax-tcp_small_queue_check-when-rtx-queue-contains-a-single-skb.patch @@ -0,0 +1,76 @@ +From f921a4a5bffa8a0005b190fb9421a7fc1fd716b6 Mon Sep 17 00:00:00 2001 +From: Eric Dumazet +Date: Tue, 17 Oct 2023 12:45:26 +0000 +Subject: tcp: tsq: relax tcp_small_queue_check() when rtx queue contains a single skb + +From: Eric Dumazet + +commit f921a4a5bffa8a0005b190fb9421a7fc1fd716b6 upstream. + +In commit 75eefc6c59fd ("tcp: tsq: add a shortcut in tcp_small_queue_check()") +we allowed to send an skb regardless of TSQ limits being hit if rtx queue +was empty or had a single skb, in order to better fill the pipe +when/if TX completions were slow. + +Then later, commit 75c119afe14f ("tcp: implement rb-tree based +retransmit queue") accidentally removed the special case for +one skb in rtx queue. 
+ +Stefan Wahren reported a regression in single TCP flow throughput +using a 100Mbit fec link, starting from commit 65466904b015 ("tcp: adjust +TSO packet sizes based on min_rtt"). This last commit only made the +regression more visible, because it locked the TCP flow on a particular +behavior where TSQ prevented two skbs being pushed downstream, +adding silences on the wire between each TSO packet. + +Many thanks to Stefan for his invaluable help ! + +Fixes: 75c119afe14f ("tcp: implement rb-tree based retransmit queue") +Link: https://lore.kernel.org/netdev/7f31ddc8-9971-495e-a1f6-819df542e0af@gmx.net/ +Reported-by: Stefan Wahren +Tested-by: Stefan Wahren +Signed-off-by: Eric Dumazet +Acked-by: Neal Cardwell +Link: https://lore.kernel.org/r/20231017124526.4060202-1-edumazet@google.com +Signed-off-by: Jakub Kicinski +Signed-off-by: Greg Kroah-Hartman +--- + net/ipv4/tcp_output.c | 16 ++++++++++++++-- + 1 file changed, 14 insertions(+), 2 deletions(-) + +--- a/net/ipv4/tcp_output.c ++++ b/net/ipv4/tcp_output.c +@@ -2220,6 +2220,18 @@ static int tcp_mtu_probe(struct sock *sk + return -1; + } + ++static bool tcp_rtx_queue_empty_or_single_skb(const struct sock *sk) ++{ ++ const struct rb_node *node = sk->tcp_rtx_queue.rb_node; ++ ++ /* No skb in the rtx queue. */ ++ if (!node) ++ return true; ++ ++ /* Only one skb in rtx queue. */ ++ return !node->rb_left && !node->rb_right; ++} ++ + /* TCP Small Queues : + * Control number of packets in qdisc/devices to two packets / or ~1 ms. + * (These limits are doubled for retransmits) +@@ -2242,12 +2254,12 @@ static bool tcp_small_queue_check(struct + limit <<= factor; + + if (refcount_read(&sk->sk_wmem_alloc) > limit) { +- /* Always send skb if rtx queue is empty. ++ /* Always send skb if rtx queue is empty or has one skb. + * No need to wait for TX completion to call us back, + * after softirq/tasklet schedule. + * This helps when TX completions are delayed too much. 
+ */ +- if (tcp_rtx_queue_empty(sk)) ++ if (tcp_rtx_queue_empty_or_single_skb(sk)) + return false; + + set_bit(TSQ_THROTTLED, &sk->sk_tsq_flags); diff --git a/queue-4.19/xfrm-fix-a-data-race-in-xfrm_gen_index.patch b/queue-4.19/xfrm-fix-a-data-race-in-xfrm_gen_index.patch new file mode 100644 index 00000000000..4e19eefd810 --- /dev/null +++ b/queue-4.19/xfrm-fix-a-data-race-in-xfrm_gen_index.patch @@ -0,0 +1,101 @@ +From 3e4bc23926b83c3c67e5f61ae8571602754131a6 Mon Sep 17 00:00:00 2001 +From: Eric Dumazet +Date: Fri, 8 Sep 2023 18:13:59 +0000 +Subject: xfrm: fix a data-race in xfrm_gen_index() + +From: Eric Dumazet + +commit 3e4bc23926b83c3c67e5f61ae8571602754131a6 upstream. + +xfrm_gen_index() mutual exclusion uses net->xfrm.xfrm_policy_lock. + +This means we must use a per-netns idx_generator variable, +instead of a static one. +Alternative would be to use an atomic variable. + +syzbot reported: + +BUG: KCSAN: data-race in xfrm_sk_policy_insert / xfrm_sk_policy_insert + +write to 0xffffffff87005938 of 4 bytes by task 29466 on cpu 0: +xfrm_gen_index net/xfrm/xfrm_policy.c:1385 [inline] +xfrm_sk_policy_insert+0x262/0x640 net/xfrm/xfrm_policy.c:2347 +xfrm_user_policy+0x413/0x540 net/xfrm/xfrm_state.c:2639 +do_ipv6_setsockopt+0x1317/0x2ce0 net/ipv6/ipv6_sockglue.c:943 +ipv6_setsockopt+0x57/0x130 net/ipv6/ipv6_sockglue.c:1012 +rawv6_setsockopt+0x21e/0x410 net/ipv6/raw.c:1054 +sock_common_setsockopt+0x61/0x70 net/core/sock.c:3697 +__sys_setsockopt+0x1c9/0x230 net/socket.c:2263 +__do_sys_setsockopt net/socket.c:2274 [inline] +__se_sys_setsockopt net/socket.c:2271 [inline] +__x64_sys_setsockopt+0x66/0x80 net/socket.c:2271 +do_syscall_x64 arch/x86/entry/common.c:50 [inline] +do_syscall_64+0x41/0xc0 arch/x86/entry/common.c:80 +entry_SYSCALL_64_after_hwframe+0x63/0xcd + +read to 0xffffffff87005938 of 4 bytes by task 29460 on cpu 1: +xfrm_sk_policy_insert+0x13e/0x640 +xfrm_user_policy+0x413/0x540 net/xfrm/xfrm_state.c:2639 +do_ipv6_setsockopt+0x1317/0x2ce0 
net/ipv6/ipv6_sockglue.c:943 +ipv6_setsockopt+0x57/0x130 net/ipv6/ipv6_sockglue.c:1012 +rawv6_setsockopt+0x21e/0x410 net/ipv6/raw.c:1054 +sock_common_setsockopt+0x61/0x70 net/core/sock.c:3697 +__sys_setsockopt+0x1c9/0x230 net/socket.c:2263 +__do_sys_setsockopt net/socket.c:2274 [inline] +__se_sys_setsockopt net/socket.c:2271 [inline] +__x64_sys_setsockopt+0x66/0x80 net/socket.c:2271 +do_syscall_x64 arch/x86/entry/common.c:50 [inline] +do_syscall_64+0x41/0xc0 arch/x86/entry/common.c:80 +entry_SYSCALL_64_after_hwframe+0x63/0xcd + +value changed: 0x00006ad8 -> 0x00006b18 + +Reported by Kernel Concurrency Sanitizer on: +CPU: 1 PID: 29460 Comm: syz-executor.1 Not tainted 6.5.0-rc5-syzkaller-00243-g9106536c1aa3 #0 +Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 07/26/2023 + +Fixes: 1121994c803f ("netns xfrm: policy insertion in netns") +Reported-by: syzbot +Signed-off-by: Eric Dumazet +Cc: Steffen Klassert +Cc: Herbert Xu +Acked-by: Herbert Xu +Signed-off-by: Steffen Klassert +Signed-off-by: Greg Kroah-Hartman +--- + include/net/netns/xfrm.h | 1 + + net/xfrm/xfrm_policy.c | 6 ++---- + 2 files changed, 3 insertions(+), 4 deletions(-) + +--- a/include/net/netns/xfrm.h ++++ b/include/net/netns/xfrm.h +@@ -48,6 +48,7 @@ struct netns_xfrm { + struct list_head policy_all; + struct hlist_head *policy_byidx; + unsigned int policy_idx_hmask; ++ unsigned int idx_generator; + struct hlist_head policy_inexact[XFRM_POLICY_MAX]; + struct xfrm_policy_hash policy_bydst[XFRM_POLICY_MAX]; + unsigned int policy_count[XFRM_POLICY_MAX * 2]; +--- a/net/xfrm/xfrm_policy.c ++++ b/net/xfrm/xfrm_policy.c +@@ -654,8 +654,6 @@ EXPORT_SYMBOL(xfrm_policy_hash_rebuild); + * of an absolute inpredictability of ordering of rules. This will not pass. 
*/ + static u32 xfrm_gen_index(struct net *net, int dir, u32 index) + { +- static u32 idx_generator; +- + for (;;) { + struct hlist_head *list; + struct xfrm_policy *p; +@@ -663,8 +661,8 @@ static u32 xfrm_gen_index(struct net *ne + int found; + + if (!index) { +- idx = (idx_generator | dir); +- idx_generator += 8; ++ idx = (net->xfrm.idx_generator | dir); ++ net->xfrm.idx_generator += 8; + } else { + idx = index; + index = 0; diff --git a/queue-4.19/xfrm-interface-use-dev_stats_inc.patch b/queue-4.19/xfrm-interface-use-dev_stats_inc.patch new file mode 100644 index 00000000000..3f2069d9b9a --- /dev/null +++ b/queue-4.19/xfrm-interface-use-dev_stats_inc.patch @@ -0,0 +1,182 @@ +From f7c4e3e5d4f6609b4725a97451948ca2e425379a Mon Sep 17 00:00:00 2001 +From: Eric Dumazet +Date: Tue, 5 Sep 2023 13:23:03 +0000 +Subject: xfrm: interface: use DEV_STATS_INC() + +From: Eric Dumazet + +commit f7c4e3e5d4f6609b4725a97451948ca2e425379a upstream. + +syzbot/KCSAN reported data-races in xfrm whenever dev->stats fields +are updated. + +It appears all of these updates can happen from multiple cpus. + +Adopt SMP safe DEV_STATS_INC() to update dev->stats fields. 
+ +BUG: KCSAN: data-race in xfrmi_xmit / xfrmi_xmit + +read-write to 0xffff88813726b160 of 8 bytes by task 23986 on cpu 1: +xfrmi_xmit+0x74e/0xb20 net/xfrm/xfrm_interface_core.c:583 +__netdev_start_xmit include/linux/netdevice.h:4889 [inline] +netdev_start_xmit include/linux/netdevice.h:4903 [inline] +xmit_one net/core/dev.c:3544 [inline] +dev_hard_start_xmit+0x11b/0x3f0 net/core/dev.c:3560 +__dev_queue_xmit+0xeee/0x1de0 net/core/dev.c:4340 +dev_queue_xmit include/linux/netdevice.h:3082 [inline] +neigh_connected_output+0x231/0x2a0 net/core/neighbour.c:1581 +neigh_output include/net/neighbour.h:542 [inline] +ip_finish_output2+0x74a/0x850 net/ipv4/ip_output.c:230 +ip_finish_output+0xf4/0x240 net/ipv4/ip_output.c:318 +NF_HOOK_COND include/linux/netfilter.h:293 [inline] +ip_output+0xe5/0x1b0 net/ipv4/ip_output.c:432 +dst_output include/net/dst.h:458 [inline] +ip_local_out net/ipv4/ip_output.c:127 [inline] +ip_send_skb+0x72/0xe0 net/ipv4/ip_output.c:1487 +udp_send_skb+0x6a4/0x990 net/ipv4/udp.c:963 +udp_sendmsg+0x1249/0x12d0 net/ipv4/udp.c:1246 +inet_sendmsg+0x63/0x80 net/ipv4/af_inet.c:840 +sock_sendmsg_nosec net/socket.c:730 [inline] +sock_sendmsg net/socket.c:753 [inline] +____sys_sendmsg+0x37c/0x4d0 net/socket.c:2540 +___sys_sendmsg net/socket.c:2594 [inline] +__sys_sendmmsg+0x269/0x500 net/socket.c:2680 +__do_sys_sendmmsg net/socket.c:2709 [inline] +__se_sys_sendmmsg net/socket.c:2706 [inline] +__x64_sys_sendmmsg+0x57/0x60 net/socket.c:2706 +do_syscall_x64 arch/x86/entry/common.c:50 [inline] +do_syscall_64+0x41/0xc0 arch/x86/entry/common.c:80 +entry_SYSCALL_64_after_hwframe+0x63/0xcd + +read-write to 0xffff88813726b160 of 8 bytes by task 23987 on cpu 0: +xfrmi_xmit+0x74e/0xb20 net/xfrm/xfrm_interface_core.c:583 +__netdev_start_xmit include/linux/netdevice.h:4889 [inline] +netdev_start_xmit include/linux/netdevice.h:4903 [inline] +xmit_one net/core/dev.c:3544 [inline] +dev_hard_start_xmit+0x11b/0x3f0 net/core/dev.c:3560 +__dev_queue_xmit+0xeee/0x1de0 
net/core/dev.c:4340 +dev_queue_xmit include/linux/netdevice.h:3082 [inline] +neigh_connected_output+0x231/0x2a0 net/core/neighbour.c:1581 +neigh_output include/net/neighbour.h:542 [inline] +ip_finish_output2+0x74a/0x850 net/ipv4/ip_output.c:230 +ip_finish_output+0xf4/0x240 net/ipv4/ip_output.c:318 +NF_HOOK_COND include/linux/netfilter.h:293 [inline] +ip_output+0xe5/0x1b0 net/ipv4/ip_output.c:432 +dst_output include/net/dst.h:458 [inline] +ip_local_out net/ipv4/ip_output.c:127 [inline] +ip_send_skb+0x72/0xe0 net/ipv4/ip_output.c:1487 +udp_send_skb+0x6a4/0x990 net/ipv4/udp.c:963 +udp_sendmsg+0x1249/0x12d0 net/ipv4/udp.c:1246 +inet_sendmsg+0x63/0x80 net/ipv4/af_inet.c:840 +sock_sendmsg_nosec net/socket.c:730 [inline] +sock_sendmsg net/socket.c:753 [inline] +____sys_sendmsg+0x37c/0x4d0 net/socket.c:2540 +___sys_sendmsg net/socket.c:2594 [inline] +__sys_sendmmsg+0x269/0x500 net/socket.c:2680 +__do_sys_sendmmsg net/socket.c:2709 [inline] +__se_sys_sendmmsg net/socket.c:2706 [inline] +__x64_sys_sendmmsg+0x57/0x60 net/socket.c:2706 +do_syscall_x64 arch/x86/entry/common.c:50 [inline] +do_syscall_64+0x41/0xc0 arch/x86/entry/common.c:80 +entry_SYSCALL_64_after_hwframe+0x63/0xcd + +value changed: 0x00000000000010d7 -> 0x00000000000010d8 + +Reported by Kernel Concurrency Sanitizer on: +CPU: 0 PID: 23987 Comm: syz-executor.5 Not tainted 6.5.0-syzkaller-10885-g0468be89b3fa #0 +Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 07/26/2023 + +Fixes: f203b76d7809 ("xfrm: Add virtual xfrm interfaces") +Reported-by: syzbot +Signed-off-by: Eric Dumazet +Cc: Steffen Klassert +Signed-off-by: Steffen Klassert +Signed-off-by: Greg Kroah-Hartman +--- + net/xfrm/xfrm_interface_core.c | 22 ++++++++++------------ + 1 file changed, 10 insertions(+), 12 deletions(-) + +--- a/net/xfrm/xfrm_interface_core.c ++++ b/net/xfrm/xfrm_interface_core.c +@@ -219,8 +219,8 @@ static int xfrmi_rcv_cb(struct sk_buff * + skb->dev = dev; + + if (err) { +- dev->stats.rx_errors++; +- 
dev->stats.rx_dropped++; ++ DEV_STATS_INC(dev, rx_errors); ++ DEV_STATS_INC(dev, rx_dropped); + + return 0; + } +@@ -260,7 +260,6 @@ static int + xfrmi_xmit2(struct sk_buff *skb, struct net_device *dev, struct flowi *fl) + { + struct xfrm_if *xi = netdev_priv(dev); +- struct net_device_stats *stats = &xi->dev->stats; + struct dst_entry *dst = skb_dst(skb); + unsigned int length = skb->len; + struct net_device *tdev; +@@ -286,7 +285,7 @@ xfrmi_xmit2(struct sk_buff *skb, struct + tdev = dst->dev; + + if (tdev == dev) { +- stats->collisions++; ++ DEV_STATS_INC(dev, collisions); + net_warn_ratelimited("%s: Local routing loop detected!\n", + dev->name); + goto tx_err_dst_release; +@@ -329,13 +328,13 @@ xmit: + tstats->tx_packets++; + u64_stats_update_end(&tstats->syncp); + } else { +- stats->tx_errors++; +- stats->tx_aborted_errors++; ++ DEV_STATS_INC(dev, tx_errors); ++ DEV_STATS_INC(dev, tx_aborted_errors); + } + + return 0; + tx_err_link_failure: +- stats->tx_carrier_errors++; ++ DEV_STATS_INC(dev, tx_carrier_errors); + dst_link_failure(skb); + tx_err_dst_release: + dst_release(dst); +@@ -345,7 +344,6 @@ tx_err_dst_release: + static netdev_tx_t xfrmi_xmit(struct sk_buff *skb, struct net_device *dev) + { + struct xfrm_if *xi = netdev_priv(dev); +- struct net_device_stats *stats = &xi->dev->stats; + struct dst_entry *dst = skb_dst(skb); + struct flowi fl; + int ret; +@@ -362,7 +360,7 @@ static netdev_tx_t xfrmi_xmit(struct sk_ + dst = ip6_route_output(dev_net(dev), NULL, &fl.u.ip6); + if (dst->error) { + dst_release(dst); +- stats->tx_carrier_errors++; ++ DEV_STATS_INC(dev, tx_carrier_errors); + goto tx_err; + } + skb_dst_set(skb, dst); +@@ -378,7 +376,7 @@ static netdev_tx_t xfrmi_xmit(struct sk_ + fl.u.ip4.flowi4_flags |= FLOWI_FLAG_ANYSRC; + rt = __ip_route_output_key(dev_net(dev), &fl.u.ip4); + if (IS_ERR(rt)) { +- stats->tx_carrier_errors++; ++ DEV_STATS_INC(dev, tx_carrier_errors); + goto tx_err; + } + skb_dst_set(skb, &rt->dst); +@@ -397,8 +395,8 @@ static 
netdev_tx_t xfrmi_xmit(struct sk_ + return NETDEV_TX_OK; + + tx_err: +- stats->tx_errors++; +- stats->tx_dropped++; ++ DEV_STATS_INC(dev, tx_errors); ++ DEV_STATS_INC(dev, tx_dropped); + kfree_skb(skb); + return NETDEV_TX_OK; + }