From: Sasha Levin Date: Sun, 22 Aug 2021 02:39:19 +0000 (-0400) Subject: Fixes for 5.4 X-Git-Tag: v5.13.13~21 X-Git-Url: http://git.ipfire.org/gitweb.cgi?a=commitdiff_plain;h=e8beb0b41149844a5a7bfc816052f4649c2a53ca;p=thirdparty%2Fkernel%2Fstable-queue.git Fixes for 5.4 Signed-off-by: Sasha Levin --- diff --git a/queue-5.4/bnxt-disable-napi-before-canceling-dim.patch b/queue-5.4/bnxt-disable-napi-before-canceling-dim.patch new file mode 100644 index 00000000000..59c25b885ea --- /dev/null +++ b/queue-5.4/bnxt-disable-napi-before-canceling-dim.patch @@ -0,0 +1,43 @@ +From fb5dd59dcee96c6e83b29515d03c793d670f454e Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Thu, 12 Aug 2021 14:42:40 -0700 +Subject: bnxt: disable napi before canceling DIM + +From: Jakub Kicinski + +[ Upstream commit 01cca6b9330ac7460de44eeeb3a0607f8aae69ff ] + +napi schedules DIM, napi has to be disabled first, +then DIM canceled. + +Noticed while reading the code. + +Fixes: 0bc0b97fca73 ("bnxt_en: cleanup DIM work on device shutdown") +Fixes: 6a8788f25625 ("bnxt_en: add support for software dynamic interrupt moderation") +Reviewed-by: Michael Chan +Reviewed-by: Edwin Peer +Signed-off-by: Jakub Kicinski +Signed-off-by: Sasha Levin +--- + drivers/net/ethernet/broadcom/bnxt/bnxt.c | 3 +-- + 1 file changed, 1 insertion(+), 2 deletions(-) + +diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c +index 8111aefb2411..1b5839ad97b6 100644 +--- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c ++++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c +@@ -8346,10 +8346,9 @@ static void bnxt_disable_napi(struct bnxt *bp) + for (i = 0; i < bp->cp_nr_rings; i++) { + struct bnxt_cp_ring_info *cpr = &bp->bnapi[i]->cp_ring; + ++ napi_disable(&bp->bnapi[i]->napi); + if (bp->bnapi[i]->rx_ring) + cancel_work_sync(&cpr->dim.work); +- +- napi_disable(&bp->bnapi[i]->napi); + } + } + +-- +2.30.2 + diff --git a/queue-5.4/bnxt-don-t-lock-the-tx-queue-from-napi-poll.patch b/queue-5.4/bnxt-don-t-lock-the-tx-queue-from-napi-poll.patch new file mode 100644 index 00000000000..4496a3cb684 --- /dev/null +++ b/queue-5.4/bnxt-don-t-lock-the-tx-queue-from-napi-poll.patch @@ -0,0 +1,141 @@ +From 0b25b021a2020f746ea39d86ba542c199bf63927 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Thu, 12 Aug 2021 14:42:39 -0700 +Subject: bnxt: don't lock the tx queue from napi poll + +From: Jakub Kicinski + +[ Upstream commit 3c603136c9f82833813af77185618de5af67676c ] + +We can't take the tx lock from the napi poll routine, because +netpoll can poll napi at any moment, including with the tx lock +already held. + +The tx lock is protecting against two paths - the disable +path, and (as Michael points out) the NETDEV_TX_BUSY case +which may occur if NAPI completions race with start_xmit +and both decide to re-enable the queue. + +For the disable/ifdown path use synchronize_net() to make sure +closing the device does not race we restarting the queues. +Annotate accesses to dev_state against data races. + +For the NAPI cleanup vs start_xmit path - appropriate barriers +are already in place in the main spot where Tx queue is stopped +but we need to do the same careful dance in the TX_BUSY case. 
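
As background, the "careful dance" is the standard stop/re-check/wake ordering many drivers use; a minimal sketch of the pattern (my_ring, tx_avail() and WAKE_THRESH are placeholders here, not the exact bnxt code):

    static bool try_stop_queue(struct my_ring *ring, struct netdev_queue *txq)
    {
            netif_tx_stop_queue(txq);

            /* The stop must be visible before re-reading the ring state,
             * pairing with the barrier in the completion path; otherwise a
             * racing completion can see the queue still running and skip
             * the wake.
             */
            smp_mb();
            if (tx_avail(ring) > WAKE_THRESH) {
                    /* completions freed space meanwhile: undo the stop so
                     * the caller does not have to return NETDEV_TX_BUSY
                     */
                    netif_tx_wake_queue(txq);
                    return false;
            }
            return true;
    }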
+ +Fixes: c0c050c58d84 ("bnxt_en: New Broadcom ethernet driver.") +Reviewed-by: Michael Chan +Reviewed-by: Edwin Peer +Signed-off-by: Jakub Kicinski +Signed-off-by: Sasha Levin +--- + drivers/net/ethernet/broadcom/bnxt/bnxt.c | 54 ++++++++++++++--------- + 1 file changed, 32 insertions(+), 22 deletions(-) + +diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c +index 287ea792922a..8111aefb2411 100644 +--- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c ++++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c +@@ -360,6 +360,26 @@ static u16 bnxt_xmit_get_cfa_action(struct sk_buff *skb) + return md_dst->u.port_info.port_id; + } + ++static bool bnxt_txr_netif_try_stop_queue(struct bnxt *bp, ++ struct bnxt_tx_ring_info *txr, ++ struct netdev_queue *txq) ++{ ++ netif_tx_stop_queue(txq); ++ ++ /* netif_tx_stop_queue() must be done before checking ++ * tx index in bnxt_tx_avail() below, because in ++ * bnxt_tx_int(), we update tx index before checking for ++ * netif_tx_queue_stopped(). ++ */ ++ smp_mb(); ++ if (bnxt_tx_avail(bp, txr) > bp->tx_wake_thresh) { ++ netif_tx_wake_queue(txq); ++ return false; ++ } ++ ++ return true; ++} ++ + static netdev_tx_t bnxt_start_xmit(struct sk_buff *skb, struct net_device *dev) + { + struct bnxt *bp = netdev_priv(dev); +@@ -387,8 +407,8 @@ static netdev_tx_t bnxt_start_xmit(struct sk_buff *skb, struct net_device *dev) + + free_size = bnxt_tx_avail(bp, txr); + if (unlikely(free_size < skb_shinfo(skb)->nr_frags + 2)) { +- netif_tx_stop_queue(txq); +- return NETDEV_TX_BUSY; ++ if (bnxt_txr_netif_try_stop_queue(bp, txr, txq)) ++ return NETDEV_TX_BUSY; + } + + length = skb->len; +@@ -597,16 +617,7 @@ tx_done: + if (netdev_xmit_more() && !tx_buf->is_push) + bnxt_db_write(bp, &txr->tx_db, prod); + +- netif_tx_stop_queue(txq); +- +- /* netif_tx_stop_queue() must be done before checking +- * tx index in bnxt_tx_avail() below, because in +- * bnxt_tx_int(), we update tx index before checking for +- * netif_tx_queue_stopped(). 
+- */ +- smp_mb(); +- if (bnxt_tx_avail(bp, txr) > bp->tx_wake_thresh) +- netif_tx_wake_queue(txq); ++ bnxt_txr_netif_try_stop_queue(bp, txr, txq); + } + return NETDEV_TX_OK; + +@@ -690,14 +701,9 @@ next_tx_int: + smp_mb(); + + if (unlikely(netif_tx_queue_stopped(txq)) && +- (bnxt_tx_avail(bp, txr) > bp->tx_wake_thresh)) { +- __netif_tx_lock(txq, smp_processor_id()); +- if (netif_tx_queue_stopped(txq) && +- bnxt_tx_avail(bp, txr) > bp->tx_wake_thresh && +- txr->dev_state != BNXT_DEV_STATE_CLOSING) +- netif_tx_wake_queue(txq); +- __netif_tx_unlock(txq); +- } ++ bnxt_tx_avail(bp, txr) > bp->tx_wake_thresh && ++ READ_ONCE(txr->dev_state) != BNXT_DEV_STATE_CLOSING) ++ netif_tx_wake_queue(txq); + } + + static struct page *__bnxt_alloc_rx_page(struct bnxt *bp, dma_addr_t *mapping, +@@ -8371,9 +8377,11 @@ void bnxt_tx_disable(struct bnxt *bp) + if (bp->tx_ring) { + for (i = 0; i < bp->tx_nr_rings; i++) { + txr = &bp->tx_ring[i]; +- txr->dev_state = BNXT_DEV_STATE_CLOSING; ++ WRITE_ONCE(txr->dev_state, BNXT_DEV_STATE_CLOSING); + } + } ++ /* Make sure napi polls see @dev_state change */ ++ synchronize_net(); + /* Drop carrier first to prevent TX timeout */ + netif_carrier_off(bp->dev); + /* Stop all TX queues */ +@@ -8387,8 +8395,10 @@ void bnxt_tx_enable(struct bnxt *bp) + + for (i = 0; i < bp->tx_nr_rings; i++) { + txr = &bp->tx_ring[i]; +- txr->dev_state = 0; ++ WRITE_ONCE(txr->dev_state, 0); + } ++ /* Make sure napi polls see @dev_state change */ ++ synchronize_net(); + netif_tx_wake_all_queues(bp->dev); + if (bp->link_info.link_up) + netif_carrier_on(bp->dev); +-- +2.30.2 + diff --git a/queue-5.4/bnxt_en-add-missing-dma-memory-barriers.patch b/queue-5.4/bnxt_en-add-missing-dma-memory-barriers.patch new file mode 100644 index 00000000000..9b990de7cef --- /dev/null +++ b/queue-5.4/bnxt_en-add-missing-dma-memory-barriers.patch @@ -0,0 +1,70 @@ +From a6ad1b5453c70185001ef64440f2953e322ff991 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Sun, 15 Aug 2021 16:15:37 -0400 +Subject: bnxt_en: Add missing DMA memory barriers + +From: Michael Chan + +[ Upstream commit 828affc27ed43441bd1efdaf4e07e96dd43a0362 ] + +Each completion ring entry has a valid bit to indicate that the entry +contains a valid completion event. The driver's main poll loop +__bnxt_poll_work() has the proper dma_rmb() to make sure the valid +bit of the next entry has been checked before proceeding further. +But when we call bnxt_rx_pkt() to process the RX event, the RX +completion event consists of two completion entries and only the +first entry has been checked to be valid. We need the same barrier +after checking the next completion entry. Add missing dma_rmb() +barriers in bnxt_rx_pkt() and other similar locations. + +Fixes: 67a95e2022c7 ("bnxt_en: Need memory barrier when processing the completion ring.") +Reported-by: Lance Richardson +Reviewed-by: Andy Gospodarek +Reviewed-by: Lance Richardson +Signed-off-by: Michael Chan +Signed-off-by: David S. 
Miller +Signed-off-by: Sasha Levin +--- + drivers/net/ethernet/broadcom/bnxt/bnxt.c | 12 ++++++++++++ + 1 file changed, 12 insertions(+) + +diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c +index 1b5839ad97b6..e67f07faca78 100644 +--- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c ++++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c +@@ -1724,6 +1724,10 @@ static int bnxt_rx_pkt(struct bnxt *bp, struct bnxt_cp_ring_info *cpr, + if (!RX_CMP_VALID(rxcmp1, tmp_raw_cons)) + return -EBUSY; + ++ /* The valid test of the entry must be done first before ++ * reading any further. ++ */ ++ dma_rmb(); + prod = rxr->rx_prod; + + if (cmp_type == CMP_TYPE_RX_L2_TPA_START_CMP) { +@@ -1918,6 +1922,10 @@ static int bnxt_force_rx_discard(struct bnxt *bp, + if (!RX_CMP_VALID(rxcmp1, tmp_raw_cons)) + return -EBUSY; + ++ /* The valid test of the entry must be done first before ++ * reading any further. ++ */ ++ dma_rmb(); + cmp_type = RX_CMP_TYPE(rxcmp); + if (cmp_type == CMP_TYPE_RX_L2_CMP) { + rxcmp1->rx_cmp_cfa_code_errors_v2 |= +@@ -2314,6 +2322,10 @@ static int bnxt_poll_nitroa0(struct napi_struct *napi, int budget) + if (!TX_CMP_VALID(txcmp, raw_cons)) + break; + ++ /* The valid test of the entry must be done first before ++ * reading any further. ++ */ ++ dma_rmb(); + if ((TX_CMP_TYPE(txcmp) & 0x30) == 0x10) { + tmp_raw_cons = NEXT_RAW_CMP(raw_cons); + cp_cons = RING_CMP(tmp_raw_cons); +-- +2.30.2 + diff --git a/queue-5.4/bpf-clear-zext_dst-of-dead-insns.patch b/queue-5.4/bpf-clear-zext_dst-of-dead-insns.patch new file mode 100644 index 00000000000..d099f95d019 --- /dev/null +++ b/queue-5.4/bpf-clear-zext_dst-of-dead-insns.patch @@ -0,0 +1,65 @@ +From c2c3d8d0a10e43195e67431bdf1a5431a5546ce9 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Thu, 12 Aug 2021 17:18:10 +0200 +Subject: bpf: Clear zext_dst of dead insns + +From: Ilya Leoshkevich + +[ Upstream commit 45c709f8c71b525b51988e782febe84ce933e7e0 ] + +"access skb fields ok" verifier test fails on s390 with the "verifier +bug. zext_dst is set, but no reg is defined" message. The first insns +of the test prog are ... + + 0: 61 01 00 00 00 00 00 00 ldxw %r0,[%r1+0] + 8: 35 00 00 01 00 00 00 00 jge %r0,0,1 + 10: 61 01 00 08 00 00 00 00 ldxw %r0,[%r1+8] + +... and the 3rd one is dead (this does not look intentional to me, but +this is a separate topic). + +sanitize_dead_code() converts dead insns into "ja -1", but keeps +zext_dst. When opt_subreg_zext_lo32_rnd_hi32() tries to parse such +an insn, it sees this discrepancy and bails. This problem can be seen +only with JITs whose bpf_jit_needs_zext() returns true. + +Fix by clearning dead insns' zext_dst. + +The commits that contributed to this problem are: + +1. 5aa5bd14c5f8 ("bpf: add initial suite for selftests"), which + introduced the test with the dead code. +2. 5327ed3d44b7 ("bpf: verifier: mark verified-insn with + sub-register zext flag"), which introduced the zext_dst flag. +3. 83a2881903f3 ("bpf: Account for BPF_FETCH in + insn_has_def32()"), which introduced the sanity check. +4. 9183671af6db ("bpf: Fix leakage under speculation on + mispredicted branches"), which bisect points to. + +It's best to fix this on stable branches that contain the second one, +since that's the point where the inconsistency was introduced. 
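
For reference, the resulting sanitize_dead_code() with the one-line fix applied looks roughly like this (lightly abridged from kernel/bpf/verifier.c):

    static void sanitize_dead_code(struct bpf_verifier_env *env)
    {
            struct bpf_insn_aux_data *aux_data = env->insn_aux_data;
            struct bpf_insn trap = BPF_JMP_IMM(BPF_JA, 0, 0, -1);
            struct bpf_insn *insn = env->prog->insnsi;
            const int insn_cnt = env->prog->len;
            int i;

            for (i = 0; i < insn_cnt; i++) {
                    if (aux_data[i].seen)
                            continue;
                    /* dead insn becomes "ja -1", which is never executed */
                    memcpy(insn + i, &trap, sizeof(trap));
                    /* the fix: a trap defines no register, so it must not
                     * carry a zero-extension request either
                     */
                    aux_data[i].zext_dst = false;
            }
    }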
+ +Fixes: 5327ed3d44b7 ("bpf: verifier: mark verified-insn with sub-register zext flag") +Signed-off-by: Ilya Leoshkevich +Signed-off-by: Daniel Borkmann +Link: https://lore.kernel.org/bpf/20210812151811.184086-2-iii@linux.ibm.com +Signed-off-by: Sasha Levin +--- + kernel/bpf/verifier.c | 1 + + 1 file changed, 1 insertion(+) + +diff --git a/kernel/bpf/verifier.c b/kernel/bpf/verifier.c +index 52c2b11a0b47..0b5a446ee59c 100644 +--- a/kernel/bpf/verifier.c ++++ b/kernel/bpf/verifier.c +@@ -8586,6 +8586,7 @@ static void sanitize_dead_code(struct bpf_verifier_env *env) + if (aux_data[i].seen) + continue; + memcpy(insn + i, &trap, sizeof(trap)); ++ aux_data[i].zext_dst = false; + } + } + +-- +2.30.2 + diff --git a/queue-5.4/cpufreq-armada-37xx-forbid-cpufreq-for-1.2-ghz-varia.patch b/queue-5.4/cpufreq-armada-37xx-forbid-cpufreq-for-1.2-ghz-varia.patch new file mode 100644 index 00000000000..94a66049cd8 --- /dev/null +++ b/queue-5.4/cpufreq-armada-37xx-forbid-cpufreq-for-1.2-ghz-varia.patch @@ -0,0 +1,54 @@ +From c78c895f3773d925a51bbe89e33799e27293a6c0 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Thu, 1 Jul 2021 00:56:01 +0200 +Subject: cpufreq: armada-37xx: forbid cpufreq for 1.2 GHz variant +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +From: Marek Behún + +[ Upstream commit 484f2b7c61b9ae58cc00c5127bcbcd9177af8dfe ] + +The 1.2 GHz variant of the Armada 3720 SOC is unstable with DVFS: when +the SOC boots, the WTMI firmware sets clocks and AVS values that work +correctly with 1.2 GHz CPU frequency, but random crashes occur once +cpufreq driver starts scaling. + +We do not know currently what is the reason: +- it may be that the voltage value for L0 for 1.2 GHz variant provided + by the vendor in the OTP is simply incorrect when scaling is used, +- it may be that some delay is needed somewhere, +- it may be something else. + +The most sane solution now seems to be to simply forbid the cpufreq +driver on 1.2 GHz variant. + +Signed-off-by: Marek Behún +Fixes: 92ce45fb875d ("cpufreq: Add DVFS support for Armada 37xx") +Signed-off-by: Viresh Kumar +Signed-off-by: Sasha Levin +--- + drivers/cpufreq/armada-37xx-cpufreq.c | 6 +++++- + 1 file changed, 5 insertions(+), 1 deletion(-) + +diff --git a/drivers/cpufreq/armada-37xx-cpufreq.c b/drivers/cpufreq/armada-37xx-cpufreq.c +index e4782f562e7a..2de7fd18f66a 100644 +--- a/drivers/cpufreq/armada-37xx-cpufreq.c ++++ b/drivers/cpufreq/armada-37xx-cpufreq.c +@@ -102,7 +102,11 @@ struct armada_37xx_dvfs { + }; + + static struct armada_37xx_dvfs armada_37xx_dvfs[] = { +- {.cpu_freq_max = 1200*1000*1000, .divider = {1, 2, 4, 6} }, ++ /* ++ * The cpufreq scaling for 1.2 GHz variant of the SOC is currently ++ * unstable because we do not know how to configure it properly. 
++ */ ++ /* {.cpu_freq_max = 1200*1000*1000, .divider = {1, 2, 4, 6} }, */ + {.cpu_freq_max = 1000*1000*1000, .divider = {1, 2, 4, 5} }, + {.cpu_freq_max = 800*1000*1000, .divider = {1, 2, 3, 4} }, + {.cpu_freq_max = 600*1000*1000, .divider = {2, 4, 5, 6} }, +-- +2.30.2 + diff --git a/queue-5.4/dccp-add-do-while-0-stubs-for-dccp_pr_debug-macros.patch b/queue-5.4/dccp-add-do-while-0-stubs-for-dccp_pr_debug-macros.patch new file mode 100644 index 00000000000..36cb8854af0 --- /dev/null +++ b/queue-5.4/dccp-add-do-while-0-stubs-for-dccp_pr_debug-macros.patch @@ -0,0 +1,54 @@ +From 2eada32d4174313977c057de3747d7f2ce665fa8 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Sun, 8 Aug 2021 16:04:40 -0700 +Subject: dccp: add do-while-0 stubs for dccp_pr_debug macros + +From: Randy Dunlap + +[ Upstream commit 86aab09a4870bb8346c9579864588c3d7f555299 ] + +GCC complains about empty macros in an 'if' statement, so convert +them to 'do {} while (0)' macros. + +Fixes these build warnings: + +net/dccp/output.c: In function 'dccp_xmit_packet': +../net/dccp/output.c:283:71: warning: suggest braces around empty body in an 'if' statement [-Wempty-body] + 283 | dccp_pr_debug("transmit_skb() returned err=%d\n", err); +net/dccp/ackvec.c: In function 'dccp_ackvec_update_old': +../net/dccp/ackvec.c:163:80: warning: suggest braces around empty body in an 'else' statement [-Wempty-body] + 163 | (unsigned long long)seqno, state); + +Fixes: dc841e30eaea ("dccp: Extend CCID packet dequeueing interface") +Fixes: 380240864451 ("dccp ccid-2: Update code for the Ack Vector input/registration routine") +Signed-off-by: Randy Dunlap +Cc: dccp@vger.kernel.org +Cc: "David S. Miller" +Cc: Jakub Kicinski +Cc: Gerrit Renker +Signed-off-by: David S. Miller +Signed-off-by: Sasha Levin +--- + net/dccp/dccp.h | 6 +++--- + 1 file changed, 3 insertions(+), 3 deletions(-) + +diff --git a/net/dccp/dccp.h b/net/dccp/dccp.h +index 9c3b27c257bb..cb818617699c 100644 +--- a/net/dccp/dccp.h ++++ b/net/dccp/dccp.h +@@ -41,9 +41,9 @@ extern bool dccp_debug; + #define dccp_pr_debug_cat(format, a...) DCCP_PRINTK(dccp_debug, format, ##a) + #define dccp_debug(fmt, a...) dccp_pr_debug_cat(KERN_DEBUG fmt, ##a) + #else +-#define dccp_pr_debug(format, a...) +-#define dccp_pr_debug_cat(format, a...) +-#define dccp_debug(format, a...) ++#define dccp_pr_debug(format, a...) do {} while (0) ++#define dccp_pr_debug_cat(format, a...) do {} while (0) ++#define dccp_debug(format, a...) do {} while (0) + #endif + + extern struct inet_hashinfo dccp_hashinfo; +-- +2.30.2 + diff --git a/queue-5.4/i40e-fix-atr-queue-selection.patch b/queue-5.4/i40e-fix-atr-queue-selection.patch new file mode 100644 index 00000000000..299cc5b268e --- /dev/null +++ b/queue-5.4/i40e-fix-atr-queue-selection.patch @@ -0,0 +1,59 @@ +From a4228012932c24b1bb7c2646320b4ea29d6c386d Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Wed, 18 Aug 2021 10:42:16 -0700 +Subject: i40e: Fix ATR queue selection + +From: Arkadiusz Kubalewski + +[ Upstream commit a222be597e316389f9f8c26033352c124ce93056 ] + +Without this patch, ATR does not work. Receive/transmit uses queue +selection based on SW DCB hashing method. + +If traffic classes are not configured for PF, then use +netdev_pick_tx function for selecting queue for packet transmission. +Instead of calling i40e_swdcb_skb_tx_hash, call netdev_pick_tx, +which ensures that packet is transmitted/received from CPU that is +running the application. + +Reproduction steps: +1. Load i40e driver +2. Map each MSI interrupt of i40e port for each CPU +3. 
Disable ntuple, enable ATR i.e.: +ethtool -K $interface ntuple off +ethtool --set-priv-flags $interface flow-director-atr +4. Run application that is generating traffic and is bound to a +single CPU, i.e.: +taskset -c 9 netperf -H 1.1.1.1 -t TCP_RR -l 10 +5. Observe behavior: +Application's traffic should be restricted to the CPU provided in +taskset. + +Fixes: 89ec1f0886c1 ("i40e: Fix queue-to-TC mapping on Tx") +Signed-off-by: Przemyslaw Patynowski +Signed-off-by: Arkadiusz Kubalewski +Tested-by: Dave Switzer +Signed-off-by: Tony Nguyen +Signed-off-by: Jakub Kicinski +Signed-off-by: Sasha Levin +--- + drivers/net/ethernet/intel/i40e/i40e_txrx.c | 3 +-- + 1 file changed, 1 insertion(+), 2 deletions(-) + +diff --git a/drivers/net/ethernet/intel/i40e/i40e_txrx.c b/drivers/net/ethernet/intel/i40e/i40e_txrx.c +index 8e38c547b53f..06987913837a 100644 +--- a/drivers/net/ethernet/intel/i40e/i40e_txrx.c ++++ b/drivers/net/ethernet/intel/i40e/i40e_txrx.c +@@ -3553,8 +3553,7 @@ u16 i40e_lan_select_queue(struct net_device *netdev, + + /* is DCB enabled at all? */ + if (vsi->tc_config.numtc == 1) +- return i40e_swdcb_skb_tx_hash(netdev, skb, +- netdev->real_num_tx_queues); ++ return netdev_pick_tx(netdev, skb, sb_dev); + + prio = skb->priority; + hw = &vsi->back->hw; +-- +2.30.2 + diff --git a/queue-5.4/iavf-fix-ping-is-lost-after-untrusted-vf-had-tried-t.patch b/queue-5.4/iavf-fix-ping-is-lost-after-untrusted-vf-had-tried-t.patch new file mode 100644 index 00000000000..9da8843a992 --- /dev/null +++ b/queue-5.4/iavf-fix-ping-is-lost-after-untrusted-vf-had-tried-t.patch @@ -0,0 +1,133 @@ +From 8d57b41785021e0729054590d7009f7220b41e2e Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Wed, 18 Aug 2021 10:42:17 -0700 +Subject: iavf: Fix ping is lost after untrusted VF had tried to change MAC + +From: Sylwester Dziedziuch + +[ Upstream commit 8da80c9d50220a8e4190a4eaa0dd6aeefcbbb5bf ] + +Make changes to MAC address dependent on the response of PF. +Disallow changes to HW MAC address and MAC filter from untrusted +VF, thanks to that ping is not lost if VF tries to change MAC. +Add a new field in iavf_mac_filter, to indicate whether there +was response from PF for given filter. Based on this field pass +or discard the filter. +If untrusted VF tried to change it's address, it's not changed. +Still filter was changed, because of that ping couldn't go through. 
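
A rough reproduction sketch, assuming a PF netdev $pf with VF 0 exposed as $vf (the interface names and MAC addresses are placeholders):

    # host: make VF 0 untrusted and administratively set its MAC
    ip link set dev $pf vf 0 trust off
    ip link set dev $pf vf 0 mac 52:54:00:aa:bb:cc

    # VF side: attempt to change the hardware MAC; the PF rejects it
    ip link set dev $vf address 52:54:00:11:22:33

    # without this fix the rejected filter still displaced the old one
    # and ping via $vf stopped; with it, traffic keeps flowing
    ping -c 3 <peer-ip>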
+ +Fixes: c5c922b3e09b ("iavf: fix MAC address setting for VFs when filter is rejected") +Signed-off-by: Przemyslaw Patynowski +Signed-off-by: Sylwester Dziedziuch +Signed-off-by: Mateusz Palczewski +Tested-by: Gurucharan G +Signed-off-by: Tony Nguyen +Signed-off-by: Jakub Kicinski +Signed-off-by: Sasha Levin +--- + drivers/net/ethernet/intel/iavf/iavf.h | 1 + + drivers/net/ethernet/intel/iavf/iavf_main.c | 1 + + .../net/ethernet/intel/iavf/iavf_virtchnl.c | 47 ++++++++++++++++++- + 3 files changed, 47 insertions(+), 2 deletions(-) + +diff --git a/drivers/net/ethernet/intel/iavf/iavf.h b/drivers/net/ethernet/intel/iavf/iavf.h +index 6b9117a350fa..81ca6472937d 100644 +--- a/drivers/net/ethernet/intel/iavf/iavf.h ++++ b/drivers/net/ethernet/intel/iavf/iavf.h +@@ -134,6 +134,7 @@ struct iavf_q_vector { + struct iavf_mac_filter { + struct list_head list; + u8 macaddr[ETH_ALEN]; ++ bool is_new_mac; /* filter is new, wait for PF decision */ + bool remove; /* filter needs to be removed */ + bool add; /* filter needs to be added */ + }; +diff --git a/drivers/net/ethernet/intel/iavf/iavf_main.c b/drivers/net/ethernet/intel/iavf/iavf_main.c +index dc902e371c2c..94a3f000e999 100644 +--- a/drivers/net/ethernet/intel/iavf/iavf_main.c ++++ b/drivers/net/ethernet/intel/iavf/iavf_main.c +@@ -761,6 +761,7 @@ struct iavf_mac_filter *iavf_add_filter(struct iavf_adapter *adapter, + + list_add_tail(&f->list, &adapter->mac_filter_list); + f->add = true; ++ f->is_new_mac = true; + adapter->aq_required |= IAVF_FLAG_AQ_ADD_MAC_FILTER; + } else { + f->remove = false; +diff --git a/drivers/net/ethernet/intel/iavf/iavf_virtchnl.c b/drivers/net/ethernet/intel/iavf/iavf_virtchnl.c +index 9655318803b7..4d471a6f2946 100644 +--- a/drivers/net/ethernet/intel/iavf/iavf_virtchnl.c ++++ b/drivers/net/ethernet/intel/iavf/iavf_virtchnl.c +@@ -564,6 +564,47 @@ void iavf_del_ether_addrs(struct iavf_adapter *adapter) + kfree(veal); + } + ++/** ++ * iavf_mac_add_ok ++ * @adapter: adapter structure ++ * ++ * Submit list of filters based on PF response. ++ **/ ++static void iavf_mac_add_ok(struct iavf_adapter *adapter) ++{ ++ struct iavf_mac_filter *f, *ftmp; ++ ++ spin_lock_bh(&adapter->mac_vlan_list_lock); ++ list_for_each_entry_safe(f, ftmp, &adapter->mac_filter_list, list) { ++ f->is_new_mac = false; ++ } ++ spin_unlock_bh(&adapter->mac_vlan_list_lock); ++} ++ ++/** ++ * iavf_mac_add_reject ++ * @adapter: adapter structure ++ * ++ * Remove filters from list based on PF response. 
++ **/ ++static void iavf_mac_add_reject(struct iavf_adapter *adapter) ++{ ++ struct net_device *netdev = adapter->netdev; ++ struct iavf_mac_filter *f, *ftmp; ++ ++ spin_lock_bh(&adapter->mac_vlan_list_lock); ++ list_for_each_entry_safe(f, ftmp, &adapter->mac_filter_list, list) { ++ if (f->remove && ether_addr_equal(f->macaddr, netdev->dev_addr)) ++ f->remove = false; ++ ++ if (f->is_new_mac) { ++ list_del(&f->list); ++ kfree(f); ++ } ++ } ++ spin_unlock_bh(&adapter->mac_vlan_list_lock); ++} ++ + /** + * iavf_add_vlans + * @adapter: adapter structure +@@ -1316,6 +1357,7 @@ void iavf_virtchnl_completion(struct iavf_adapter *adapter, + case VIRTCHNL_OP_ADD_ETH_ADDR: + dev_err(&adapter->pdev->dev, "Failed to add MAC filter, error %s\n", + iavf_stat_str(&adapter->hw, v_retval)); ++ iavf_mac_add_reject(adapter); + /* restore administratively set MAC address */ + ether_addr_copy(adapter->hw.mac.addr, netdev->dev_addr); + break; +@@ -1385,10 +1427,11 @@ void iavf_virtchnl_completion(struct iavf_adapter *adapter, + } + } + switch (v_opcode) { +- case VIRTCHNL_OP_ADD_ETH_ADDR: { ++ case VIRTCHNL_OP_ADD_ETH_ADDR: ++ if (!v_retval) ++ iavf_mac_add_ok(adapter); + if (!ether_addr_equal(netdev->dev_addr, adapter->hw.mac.addr)) + ether_addr_copy(netdev->dev_addr, adapter->hw.mac.addr); +- } + break; + case VIRTCHNL_OP_GET_STATS: { + struct iavf_eth_stats *stats = +-- +2.30.2 + diff --git a/queue-5.4/iommu-check-if-group-is-null-before-remove-device.patch b/queue-5.4/iommu-check-if-group-is-null-before-remove-device.patch new file mode 100644 index 00000000000..193cce503c8 --- /dev/null +++ b/queue-5.4/iommu-check-if-group-is-null-before-remove-device.patch @@ -0,0 +1,55 @@ +From 31572e7a3b58674bcd68893db81d12835435fdcd Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Sat, 31 Jul 2021 09:47:37 +0200 +Subject: iommu: Check if group is NULL before remove device + +From: Frank Wunderlich + +[ Upstream commit 5aa95d8834e07907e64937d792c12ffef7fb271f ] + +If probe_device is failing, iommu_group is not initialized because +iommu_group_add_device is not reached, so freeing it will result +in NULL pointer access. + +iommu_bus_init + ->bus_iommu_probe + ->probe_iommu_group in for each:/* return -22 in fail case */ + ->iommu_probe_device + ->__iommu_probe_device /* return -22 here.*/ + -> ops->probe_device /* return -22 here.*/ + -> iommu_group_get_for_dev + -> ops->device_group + -> iommu_group_add_device //good case + ->remove_iommu_group //in fail case, it will remove group + ->iommu_release_device + ->iommu_group_remove_device // here we don't have group + +In my case ops->probe_device (mtk_iommu_probe_device from +mtk_iommu_v1.c) is due to failing fwspec->ops mismatch. + +Fixes: d72e31c93746 ("iommu: IOMMU Groups") +Signed-off-by: Frank Wunderlich +Link: https://lore.kernel.org/r/20210731074737.4573-1-linux@fw-web.de +Signed-off-by: Joerg Roedel +Signed-off-by: Sasha Levin +--- + drivers/iommu/iommu.c | 3 +++ + 1 file changed, 3 insertions(+) + +diff --git a/drivers/iommu/iommu.c b/drivers/iommu/iommu.c +index 9d7232e26ecf..c5758fb696cc 100644 +--- a/drivers/iommu/iommu.c ++++ b/drivers/iommu/iommu.c +@@ -775,6 +775,9 @@ void iommu_group_remove_device(struct device *dev) + struct iommu_group *group = dev->iommu_group; + struct group_device *tmp_device, *device = NULL; + ++ if (!group) ++ return; ++ + dev_info(dev, "Removing from iommu group %d\n", group->id); + + /* Pre-notify listeners that a device is being removed. 
*/ +-- +2.30.2 + diff --git a/queue-5.4/net-6pack-fix-slab-out-of-bounds-in-decode_data.patch b/queue-5.4/net-6pack-fix-slab-out-of-bounds-in-decode_data.patch new file mode 100644 index 00000000000..f7b99f3068a --- /dev/null +++ b/queue-5.4/net-6pack-fix-slab-out-of-bounds-in-decode_data.patch @@ -0,0 +1,67 @@ +From f56ebc8fee81e13d5378c9b1789dd4764c03b909 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Fri, 13 Aug 2021 18:14:33 +0300 +Subject: net: 6pack: fix slab-out-of-bounds in decode_data + +From: Pavel Skripkin + +[ Upstream commit 19d1532a187669ce86d5a2696eb7275310070793 ] + +Syzbot reported slab-out-of bounds write in decode_data(). +The problem was in missing validation checks. + +Syzbot's reproducer generated malicious input, which caused +decode_data() to be called a lot in sixpack_decode(). Since +rx_count_cooked is only 400 bytes and noone reported before, +that 400 bytes is not enough, let's just check if input is malicious +and complain about buffer overrun. + +Fail log: +================================================================== +BUG: KASAN: slab-out-of-bounds in drivers/net/hamradio/6pack.c:843 +Write of size 1 at addr ffff888087c5544e by task kworker/u4:0/7 + +CPU: 0 PID: 7 Comm: kworker/u4:0 Not tainted 5.6.0-rc3-syzkaller #0 +... +Workqueue: events_unbound flush_to_ldisc +Call Trace: + __dump_stack lib/dump_stack.c:77 [inline] + dump_stack+0x197/0x210 lib/dump_stack.c:118 + print_address_description.constprop.0.cold+0xd4/0x30b mm/kasan/report.c:374 + __kasan_report.cold+0x1b/0x32 mm/kasan/report.c:506 + kasan_report+0x12/0x20 mm/kasan/common.c:641 + __asan_report_store1_noabort+0x17/0x20 mm/kasan/generic_report.c:137 + decode_data.part.0+0x23b/0x270 drivers/net/hamradio/6pack.c:843 + decode_data drivers/net/hamradio/6pack.c:965 [inline] + sixpack_decode drivers/net/hamradio/6pack.c:968 [inline] + +Reported-and-tested-by: syzbot+fc8cd9a673d4577fb2e4@syzkaller.appspotmail.com +Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") +Signed-off-by: Pavel Skripkin +Reviewed-by: Dan Carpenter +Signed-off-by: David S. Miller +Signed-off-by: Sasha Levin +--- + drivers/net/hamradio/6pack.c | 6 ++++++ + 1 file changed, 6 insertions(+) + +diff --git a/drivers/net/hamradio/6pack.c b/drivers/net/hamradio/6pack.c +index 71d6629e65c9..da13683d52d1 100644 +--- a/drivers/net/hamradio/6pack.c ++++ b/drivers/net/hamradio/6pack.c +@@ -839,6 +839,12 @@ static void decode_data(struct sixpack *sp, unsigned char inbyte) + return; + } + ++ if (sp->rx_count_cooked + 2 >= sizeof(sp->cooked_buf)) { ++ pr_err("6pack: cooked buffer overrun, data loss\n"); ++ sp->rx_count = 0; ++ return; ++ } ++ + buf = sp->raw_buf; + sp->cooked_buf[sp->rx_count_cooked++] = + buf[0] | ((buf[1] << 2) & 0xc0); +-- +2.30.2 + diff --git a/queue-5.4/net-mdio-mux-don-t-ignore-memory-allocation-errors.patch b/queue-5.4/net-mdio-mux-don-t-ignore-memory-allocation-errors.patch new file mode 100644 index 00000000000..2eeace2fd0e --- /dev/null +++ b/queue-5.4/net-mdio-mux-don-t-ignore-memory-allocation-errors.patch @@ -0,0 +1,96 @@ +From ef19b1e55f0cd20877624b2fe850bbd725b1fc6c Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Tue, 17 Aug 2021 20:38:02 -0700 +Subject: net: mdio-mux: Don't ignore memory allocation errors + +From: Saravana Kannan + +[ Upstream commit 99d81e942474cc7677d12f673f42a7ea699e2589 ] + +If we are seeing memory allocation errors, don't try to continue +registering child mdiobus devices. It's unlikely they'll succeed. 
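
The shape of the fix is the usual unwind-on-error pattern for child registration loops; schematically (alloc_child(), register_child() and unregister_children() are placeholders, not driver functions):

    static int register_children(struct parent *pb)
    {
            struct child *cb;
            int i, err;

            for (i = 0; i < pb->n_children; i++) {
                    cb = alloc_child(pb, i);
                    if (!cb) {
                            err = -ENOMEM;
                            goto err_unwind; /* do not "continue" past ENOMEM */
                    }
                    err = register_child(cb);
                    if (err)
                            goto err_unwind;
                    list_add(&cb->node, &pb->children);
            }
            return 0;

    err_unwind:
            unregister_children(pb); /* tear down whatever did succeed */
            return err;
    }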
+ +Fixes: 342fa1964439 ("mdio: mux: make child bus walking more permissive and errors more verbose") +Signed-off-by: Saravana Kannan +Reviewed-by: Andrew Lunn +Acked-by: Marc Zyngier +Tested-by: Marc Zyngier +Acked-by: Kevin Hilman +Tested-by: Kevin Hilman +Signed-off-by: David S. Miller +Signed-off-by: Sasha Levin +--- + drivers/net/phy/mdio-mux.c | 28 ++++++++++++++++++---------- + 1 file changed, 18 insertions(+), 10 deletions(-) + +diff --git a/drivers/net/phy/mdio-mux.c b/drivers/net/phy/mdio-mux.c +index 6a1d3540210b..c96ef3b3fa3a 100644 +--- a/drivers/net/phy/mdio-mux.c ++++ b/drivers/net/phy/mdio-mux.c +@@ -82,6 +82,17 @@ out: + + static int parent_count; + ++static void mdio_mux_uninit_children(struct mdio_mux_parent_bus *pb) ++{ ++ struct mdio_mux_child_bus *cb = pb->children; ++ ++ while (cb) { ++ mdiobus_unregister(cb->mii_bus); ++ mdiobus_free(cb->mii_bus); ++ cb = cb->next; ++ } ++} ++ + int mdio_mux_init(struct device *dev, + struct device_node *mux_node, + int (*switch_fn)(int cur, int desired, void *data), +@@ -144,7 +155,7 @@ int mdio_mux_init(struct device *dev, + cb = devm_kzalloc(dev, sizeof(*cb), GFP_KERNEL); + if (!cb) { + ret_val = -ENOMEM; +- continue; ++ goto err_loop; + } + cb->bus_number = v; + cb->parent = pb; +@@ -152,8 +163,7 @@ int mdio_mux_init(struct device *dev, + cb->mii_bus = mdiobus_alloc(); + if (!cb->mii_bus) { + ret_val = -ENOMEM; +- devm_kfree(dev, cb); +- continue; ++ goto err_loop; + } + cb->mii_bus->priv = cb; + +@@ -182,6 +192,10 @@ int mdio_mux_init(struct device *dev, + + dev_err(dev, "Error: No acceptable child buses found\n"); + devm_kfree(dev, pb); ++ ++err_loop: ++ mdio_mux_uninit_children(pb); ++ of_node_put(child_bus_node); + err_pb_kz: + put_device(&parent_bus->dev); + err_parent_bus: +@@ -193,14 +207,8 @@ EXPORT_SYMBOL_GPL(mdio_mux_init); + void mdio_mux_uninit(void *mux_handle) + { + struct mdio_mux_parent_bus *pb = mux_handle; +- struct mdio_mux_child_bus *cb = pb->children; +- +- while (cb) { +- mdiobus_unregister(cb->mii_bus); +- mdiobus_free(cb->mii_bus); +- cb = cb->next; +- } + ++ mdio_mux_uninit_children(pb); + put_device(&pb->mii_bus->dev); + } + EXPORT_SYMBOL_GPL(mdio_mux_uninit); +-- +2.30.2 + diff --git a/queue-5.4/net-mdio-mux-handle-eprobe_defer-correctly.patch b/queue-5.4/net-mdio-mux-handle-eprobe_defer-correctly.patch new file mode 100644 index 00000000000..046f5cc6387 --- /dev/null +++ b/queue-5.4/net-mdio-mux-handle-eprobe_defer-correctly.patch @@ -0,0 +1,58 @@ +From 24174dfc032da5ba2073500b0c32584372585108 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Tue, 17 Aug 2021 20:38:03 -0700 +Subject: net: mdio-mux: Handle -EPROBE_DEFER correctly + +From: Saravana Kannan + +[ Upstream commit 7bd0cef5dac685f09ef8b0b2a7748ff42d284dc7 ] + +When registering mdiobus children, if we get an -EPROBE_DEFER, we shouldn't +ignore it and continue registering the rest of the mdiobus children. This +would permanently prevent the deferring child mdiobus from working instead +of reattempting it in the future. So, if a child mdiobus needs to be +reattempted in the future, defer the entire mdio-mux initialization. + +This fixes the issue where PHYs sitting under the mdio-mux aren't +initialized correctly if the PHY's interrupt controller is not yet ready +when the mdio-mux is being probed. Additional context in the link below. 
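
If it is unclear whether a bus is stuck in deferral rather than failed outright, the pending-probe list can be inspected (assuming debugfs is mounted at the usual location):

    # devices whose probe returned -EPROBE_DEFER and will be retried
    cat /sys/kernel/debug/devices_deferred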
+ +Fixes: 0ca2997d1452 ("netdev/of/phy: Add MDIO bus multiplexer support.") +Link: https://lore.kernel.org/lkml/CAGETcx95kHrv8wA-O+-JtfH7H9biJEGJtijuPVN0V5dUKUAB3A@mail.gmail.com/#t +Signed-off-by: Saravana Kannan +Reviewed-by: Andrew Lunn +Acked-by: Marc Zyngier +Tested-by: Marc Zyngier +Acked-by: Kevin Hilman +Tested-by: Kevin Hilman +Signed-off-by: David S. Miller +Signed-off-by: Sasha Levin +--- + drivers/net/phy/mdio-mux.c | 8 ++++++-- + 1 file changed, 6 insertions(+), 2 deletions(-) + +diff --git a/drivers/net/phy/mdio-mux.c b/drivers/net/phy/mdio-mux.c +index c96ef3b3fa3a..ccb3ee704eb1 100644 +--- a/drivers/net/phy/mdio-mux.c ++++ b/drivers/net/phy/mdio-mux.c +@@ -175,11 +175,15 @@ int mdio_mux_init(struct device *dev, + cb->mii_bus->write = mdio_mux_write; + r = of_mdiobus_register(cb->mii_bus, child_bus_node); + if (r) { ++ mdiobus_free(cb->mii_bus); ++ if (r == -EPROBE_DEFER) { ++ ret_val = r; ++ goto err_loop; ++ } ++ devm_kfree(dev, cb); + dev_err(dev, + "Error: Failed to register MDIO bus for child %pOF\n", + child_bus_node); +- mdiobus_free(cb->mii_bus); +- devm_kfree(dev, cb); + } else { + cb->next = pb->children; + pb->children = cb; +-- +2.30.2 + diff --git a/queue-5.4/net-qlcnic-add-missed-unlock-in-qlcnic_83xx_flash_re.patch b/queue-5.4/net-qlcnic-add-missed-unlock-in-qlcnic_83xx_flash_re.patch new file mode 100644 index 00000000000..a5c8230fc20 --- /dev/null +++ b/queue-5.4/net-qlcnic-add-missed-unlock-in-qlcnic_83xx_flash_re.patch @@ -0,0 +1,42 @@ +From b78272abfa40ddede0063c48d4f62fe41e6c6ec5 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Mon, 16 Aug 2021 21:14:04 +0800 +Subject: net: qlcnic: add missed unlock in qlcnic_83xx_flash_read32 + +From: Dinghao Liu + +[ Upstream commit 0a298d133893c72c96e2156ed7cb0f0c4a306a3e ] + +qlcnic_83xx_unlock_flash() is called on all paths after we call +qlcnic_83xx_lock_flash(), except for one error path on failure +of QLCRD32(), which may cause a deadlock. This bug is suggested +by a static analysis tool, please advise. 
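
The invariant being restored is that every return taken after the flash lock is acquired must pass through the matching unlock; in schematic form (my_dev, dev_lock() and dev_read32() are placeholders):

    static int read_words(struct my_dev *d, u32 *buf, int n)
    {
            int i, err;

            if (dev_lock(d))
                    return -EIO;

            for (i = 0; i < n; i++) {
                    err = dev_read32(d, buf + i);
                    if (err) {
                            dev_unlock(d); /* the unlock this patch adds */
                            return err;
                    }
            }

            dev_unlock(d);
            return 0;
    }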
+ +Fixes: 81d0aeb0a4fff ("qlcnic: flash template based firmware reset recovery") +Signed-off-by: Dinghao Liu +Link: https://lore.kernel.org/r/20210816131405.24024-1-dinghao.liu@zju.edu.cn +Signed-off-by: Jakub Kicinski +Signed-off-by: Sasha Levin +--- + drivers/net/ethernet/qlogic/qlcnic/qlcnic_83xx_hw.c | 4 +++- + 1 file changed, 3 insertions(+), 1 deletion(-) + +diff --git a/drivers/net/ethernet/qlogic/qlcnic/qlcnic_83xx_hw.c b/drivers/net/ethernet/qlogic/qlcnic/qlcnic_83xx_hw.c +index 29b9c728a65e..f2014c10f7c9 100644 +--- a/drivers/net/ethernet/qlogic/qlcnic/qlcnic_83xx_hw.c ++++ b/drivers/net/ethernet/qlogic/qlcnic/qlcnic_83xx_hw.c +@@ -3158,8 +3158,10 @@ int qlcnic_83xx_flash_read32(struct qlcnic_adapter *adapter, u32 flash_addr, + + indirect_addr = QLC_83XX_FLASH_DIRECT_DATA(addr); + ret = QLCRD32(adapter, indirect_addr, &err); +- if (err == -EIO) ++ if (err == -EIO) { ++ qlcnic_83xx_unlock_flash(adapter); + return err; ++ } + + word = ret; + *(u32 *)p_data = word; +-- +2.30.2 + diff --git a/queue-5.4/ovs-clear-skb-tstamp-in-forwarding-path.patch b/queue-5.4/ovs-clear-skb-tstamp-in-forwarding-path.patch new file mode 100644 index 00000000000..27470e467ab --- /dev/null +++ b/queue-5.4/ovs-clear-skb-tstamp-in-forwarding-path.patch @@ -0,0 +1,39 @@ +From 452fdec58660a92af8b9b6545d80610b98f19438 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Wed, 18 Aug 2021 10:22:15 +0800 +Subject: ovs: clear skb->tstamp in forwarding path + +From: kaixi.fan + +[ Upstream commit 01634047bf0d5c2d9b7d8095bb4de1663dbeedeb ] + +fq qdisc requires tstamp to be cleared in the forwarding path. Now ovs +doesn't clear skb->tstamp. We encountered a problem with linux +version 5.4.56 and ovs version 2.14.1, and packets failed to +dequeue from qdisc when fq qdisc was attached to ovs port. + +Fixes: fb420d5d91c1 ("tcp/fq: move back to CLOCK_MONOTONIC") +Signed-off-by: kaixi.fan +Signed-off-by: xiexiaohui +Reviewed-by: Cong Wang +Signed-off-by: David S. Miller +Signed-off-by: Sasha Levin +--- + net/openvswitch/vport.c | 1 + + 1 file changed, 1 insertion(+) + +diff --git a/net/openvswitch/vport.c b/net/openvswitch/vport.c +index 3fc38d16c456..19af0efeb8dc 100644 +--- a/net/openvswitch/vport.c ++++ b/net/openvswitch/vport.c +@@ -499,6 +499,7 @@ void ovs_vport_send(struct vport *vport, struct sk_buff *skb, u8 mac_proto) + } + + skb->dev = vport->dev; ++ skb->tstamp = 0; + vport->ops->send(skb); + return; + +-- +2.30.2 + diff --git a/queue-5.4/ptp_pch-restore-dependency-on-pci.patch b/queue-5.4/ptp_pch-restore-dependency-on-pci.patch new file mode 100644 index 00000000000..3e7e10c832d --- /dev/null +++ b/queue-5.4/ptp_pch-restore-dependency-on-pci.patch @@ -0,0 +1,38 @@ +From 5fe32a02783b01bc75aa09b476bb89fa3fb11f7b Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Fri, 13 Aug 2021 20:33:27 +0300 +Subject: ptp_pch: Restore dependency on PCI + +From: Andy Shevchenko + +[ Upstream commit 55c8fca1dae1fb0d11deaa21b65a647dedb1bc50 ] + +During the swap dependency on PCH_GBE to selection PTP_1588_CLOCK_PCH +incidentally dropped the implicit dependency on the PCI. Restore it. + +Fixes: 18d359ceb044 ("pch_gbe, ptp_pch: Fix the dependency direction between these drivers") +Reported-by: kernel test robot +Signed-off-by: Andy Shevchenko +Signed-off-by: David S. 
Miller +Signed-off-by: Sasha Levin +--- + drivers/ptp/Kconfig | 3 ++- + 1 file changed, 2 insertions(+), 1 deletion(-) + +diff --git a/drivers/ptp/Kconfig b/drivers/ptp/Kconfig +index 0517272a268e..9fb6f7643ea9 100644 +--- a/drivers/ptp/Kconfig ++++ b/drivers/ptp/Kconfig +@@ -92,7 +92,8 @@ config DP83640_PHY + config PTP_1588_CLOCK_PCH + tristate "Intel PCH EG20T as PTP clock" + depends on X86_32 || COMPILE_TEST +- depends on HAS_IOMEM && NET ++ depends on HAS_IOMEM && PCI ++ depends on NET + imply PTP_1588_CLOCK + help + This driver adds support for using the PCH EG20T as a PTP +-- +2.30.2 + diff --git a/queue-5.4/series b/queue-5.4/series index 7f9d8073fe6..c6f22e53478 100644 --- a/queue-5.4/series +++ b/queue-5.4/series @@ -22,3 +22,23 @@ arm-dts-nomadik-fix-up-interrupt-controller-node-nam.patch net-usb-lan78xx-don-t-modify-phy_device-state-concur.patch drm-amd-display-fix-dynamic-bpp-issue-with-8k30-with.patch bluetooth-hidp-use-correct-wait-queue-when-removing-.patch +iommu-check-if-group-is-null-before-remove-device.patch +cpufreq-armada-37xx-forbid-cpufreq-for-1.2-ghz-varia.patch +dccp-add-do-while-0-stubs-for-dccp_pr_debug-macros.patch +virtio-protect-vqs-list-access.patch +vhost-fix-the-calculation-in-vhost_overflow.patch +bpf-clear-zext_dst-of-dead-insns.patch +bnxt-don-t-lock-the-tx-queue-from-napi-poll.patch +bnxt-disable-napi-before-canceling-dim.patch +net-6pack-fix-slab-out-of-bounds-in-decode_data.patch +ptp_pch-restore-dependency-on-pci.patch +bnxt_en-add-missing-dma-memory-barriers.patch +vrf-reset-skb-conntrack-connection-on-vrf-rcv.patch +virtio-net-support-xdp-when-not-more-queues.patch +virtio-net-use-netif_f_gro_hw-instead-of-netif_f_lro.patch +net-qlcnic-add-missed-unlock-in-qlcnic_83xx_flash_re.patch +net-mdio-mux-don-t-ignore-memory-allocation-errors.patch +net-mdio-mux-handle-eprobe_defer-correctly.patch +ovs-clear-skb-tstamp-in-forwarding-path.patch +i40e-fix-atr-queue-selection.patch +iavf-fix-ping-is-lost-after-untrusted-vf-had-tried-t.patch diff --git a/queue-5.4/vhost-fix-the-calculation-in-vhost_overflow.patch b/queue-5.4/vhost-fix-the-calculation-in-vhost_overflow.patch new file mode 100644 index 00000000000..37f4190d069 --- /dev/null +++ b/queue-5.4/vhost-fix-the-calculation-in-vhost_overflow.patch @@ -0,0 +1,49 @@ +From bcb9042bb3f5a18e6885131afb7ddbbffa90a04f Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Wed, 28 Jul 2021 21:07:56 +0800 +Subject: vhost: Fix the calculation in vhost_overflow() + +From: Xie Yongji + +[ Upstream commit f7ad318ea0ad58ebe0e595e59aed270bb643b29b ] + +This fixes the incorrect calculation for integer overflow +when the last address of iova range is 0xffffffff. + +Fixes: ec33d031a14b ("vhost: detect 32 bit integer wrap around") +Reported-by: Jason Wang +Signed-off-by: Xie Yongji +Acked-by: Jason Wang +Link: https://lore.kernel.org/r/20210728130756.97-2-xieyongji@bytedance.com +Signed-off-by: Michael S. Tsirkin +Signed-off-by: Sasha Levin +--- + drivers/vhost/vhost.c | 10 ++++++++-- + 1 file changed, 8 insertions(+), 2 deletions(-) + +diff --git a/drivers/vhost/vhost.c b/drivers/vhost/vhost.c +index a279ecacbf60..97be299f0a8d 100644 +--- a/drivers/vhost/vhost.c ++++ b/drivers/vhost/vhost.c +@@ -702,10 +702,16 @@ static bool log_access_ok(void __user *log_base, u64 addr, unsigned long sz) + (sz + VHOST_PAGE_SIZE * 8 - 1) / VHOST_PAGE_SIZE / 8); + } + ++/* Make sure 64 bit math will not overflow. */ + static bool vhost_overflow(u64 uaddr, u64 size) + { +- /* Make sure 64 bit math will not overflow. 
*/ +- return uaddr > ULONG_MAX || size > ULONG_MAX || uaddr > ULONG_MAX - size; ++ if (uaddr > ULONG_MAX || size > ULONG_MAX) ++ return true; ++ ++ if (!size) ++ return false; ++ ++ return uaddr > ULONG_MAX - size + 1; + } + + /* Caller should have vq mutex and device mutex. */ +-- +2.30.2 + diff --git a/queue-5.4/virtio-net-support-xdp-when-not-more-queues.patch b/queue-5.4/virtio-net-support-xdp-when-not-more-queues.patch new file mode 100644 index 00000000000..fced2397a83 --- /dev/null +++ b/queue-5.4/virtio-net-support-xdp-when-not-more-queues.patch @@ -0,0 +1,168 @@ +From 2f3e671182f11df3607fcda06a01f1586cc4df7c Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Wed, 10 Mar 2021 10:24:45 +0800 +Subject: virtio-net: support XDP when not more queues + +From: Xuan Zhuo + +[ Upstream commit 97c2c69e1926260c78c7f1c0b2c987934f1dc7a1 ] + +The number of queues implemented by many virtio backends is limited, +especially some machines have a large number of CPUs. In this case, it +is often impossible to allocate a separate queue for +XDP_TX/XDP_REDIRECT, then xdp cannot be loaded to work, even xdp does +not use the XDP_TX/XDP_REDIRECT. + +This patch allows XDP_TX/XDP_REDIRECT to run by reuse the existing SQ +with __netif_tx_lock() hold when there are not enough queues. + +Signed-off-by: Xuan Zhuo +Reviewed-by: Dust Li +Acked-by: Jason Wang +Signed-off-by: David S. Miller +Signed-off-by: Sasha Levin +--- + drivers/net/virtio_net.c | 62 +++++++++++++++++++++++++++++++--------- + 1 file changed, 49 insertions(+), 13 deletions(-) + +diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c +index 15453d6fcc23..36f8aeb113a8 100644 +--- a/drivers/net/virtio_net.c ++++ b/drivers/net/virtio_net.c +@@ -195,6 +195,9 @@ struct virtnet_info { + /* # of XDP queue pairs currently used by the driver */ + u16 xdp_queue_pairs; + ++ /* xdp_queue_pairs may be 0, when xdp is already loaded. So add this. */ ++ bool xdp_enabled; ++ + /* I like... big packets and I cannot lie! */ + bool big_packets; + +@@ -485,12 +488,41 @@ static int __virtnet_xdp_xmit_one(struct virtnet_info *vi, + return 0; + } + +-static struct send_queue *virtnet_xdp_sq(struct virtnet_info *vi) +-{ +- unsigned int qp; +- +- qp = vi->curr_queue_pairs - vi->xdp_queue_pairs + smp_processor_id(); +- return &vi->sq[qp]; ++/* when vi->curr_queue_pairs > nr_cpu_ids, the txq/sq is only used for xdp tx on ++ * the current cpu, so it does not need to be locked. ++ * ++ * Here we use marco instead of inline functions because we have to deal with ++ * three issues at the same time: 1. the choice of sq. 2. judge and execute the ++ * lock/unlock of txq 3. make sparse happy. It is difficult for two inline ++ * functions to perfectly solve these three problems at the same time. 
++ */ ++#define virtnet_xdp_get_sq(vi) ({ \ ++ struct netdev_queue *txq; \ ++ typeof(vi) v = (vi); \ ++ unsigned int qp; \ ++ \ ++ if (v->curr_queue_pairs > nr_cpu_ids) { \ ++ qp = v->curr_queue_pairs - v->xdp_queue_pairs; \ ++ qp += smp_processor_id(); \ ++ txq = netdev_get_tx_queue(v->dev, qp); \ ++ __netif_tx_acquire(txq); \ ++ } else { \ ++ qp = smp_processor_id() % v->curr_queue_pairs; \ ++ txq = netdev_get_tx_queue(v->dev, qp); \ ++ __netif_tx_lock(txq, raw_smp_processor_id()); \ ++ } \ ++ v->sq + qp; \ ++}) ++ ++#define virtnet_xdp_put_sq(vi, q) { \ ++ struct netdev_queue *txq; \ ++ typeof(vi) v = (vi); \ ++ \ ++ txq = netdev_get_tx_queue(v->dev, (q) - v->sq); \ ++ if (v->curr_queue_pairs > nr_cpu_ids) \ ++ __netif_tx_release(txq); \ ++ else \ ++ __netif_tx_unlock(txq); \ + } + + static int virtnet_xdp_xmit(struct net_device *dev, +@@ -516,7 +548,7 @@ static int virtnet_xdp_xmit(struct net_device *dev, + if (!xdp_prog) + return -ENXIO; + +- sq = virtnet_xdp_sq(vi); ++ sq = virtnet_xdp_get_sq(vi); + + if (unlikely(flags & ~XDP_XMIT_FLAGS_MASK)) { + ret = -EINVAL; +@@ -564,12 +596,13 @@ out: + sq->stats.kicks += kicks; + u64_stats_update_end(&sq->stats.syncp); + ++ virtnet_xdp_put_sq(vi, sq); + return ret; + } + + static unsigned int virtnet_get_headroom(struct virtnet_info *vi) + { +- return vi->xdp_queue_pairs ? VIRTIO_XDP_HEADROOM : 0; ++ return vi->xdp_enabled ? VIRTIO_XDP_HEADROOM : 0; + } + + /* We copy the packet for XDP in the following cases: +@@ -1458,12 +1491,13 @@ static int virtnet_poll(struct napi_struct *napi, int budget) + xdp_do_flush_map(); + + if (xdp_xmit & VIRTIO_XDP_TX) { +- sq = virtnet_xdp_sq(vi); ++ sq = virtnet_xdp_get_sq(vi); + if (virtqueue_kick_prepare(sq->vq) && virtqueue_notify(sq->vq)) { + u64_stats_update_begin(&sq->stats.syncp); + sq->stats.kicks++; + u64_stats_update_end(&sq->stats.syncp); + } ++ virtnet_xdp_put_sq(vi, sq); + } + + return received; +@@ -2480,10 +2514,9 @@ static int virtnet_xdp_set(struct net_device *dev, struct bpf_prog *prog, + + /* XDP requires extra queues for XDP_TX */ + if (curr_qp + xdp_qp > vi->max_queue_pairs) { +- NL_SET_ERR_MSG_MOD(extack, "Too few free TX rings available"); +- netdev_warn(dev, "request %i queues but max is %i\n", ++ netdev_warn(dev, "XDP request %i queues but max is %i. 
XDP_TX and XDP_REDIRECT will operate in a slower locked tx mode.\n", + curr_qp + xdp_qp, vi->max_queue_pairs); +- return -ENOMEM; ++ xdp_qp = 0; + } + + old_prog = rtnl_dereference(vi->rq[0].xdp_prog); +@@ -2520,11 +2553,14 @@ static int virtnet_xdp_set(struct net_device *dev, struct bpf_prog *prog, + vi->xdp_queue_pairs = xdp_qp; + + if (prog) { ++ vi->xdp_enabled = true; + for (i = 0; i < vi->max_queue_pairs; i++) { + rcu_assign_pointer(vi->rq[i].xdp_prog, prog); + if (i == 0 && !old_prog) + virtnet_clear_guest_offloads(vi); + } ++ } else { ++ vi->xdp_enabled = false; + } + + for (i = 0; i < vi->max_queue_pairs; i++) { +@@ -2609,7 +2645,7 @@ static int virtnet_set_features(struct net_device *dev, + int err; + + if ((dev->features ^ features) & NETIF_F_LRO) { +- if (vi->xdp_queue_pairs) ++ if (vi->xdp_enabled) + return -EBUSY; + + if (features & NETIF_F_LRO) +-- +2.30.2 + diff --git a/queue-5.4/virtio-net-use-netif_f_gro_hw-instead-of-netif_f_lro.patch b/queue-5.4/virtio-net-use-netif_f_gro_hw-instead-of-netif_f_lro.patch new file mode 100644 index 00000000000..98b3ddf6f4a --- /dev/null +++ b/queue-5.4/virtio-net-use-netif_f_gro_hw-instead-of-netif_f_lro.patch @@ -0,0 +1,101 @@ +From 382337353db50d281b5994145b84946fb5a93dac Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Tue, 17 Aug 2021 16:06:59 +0800 +Subject: virtio-net: use NETIF_F_GRO_HW instead of NETIF_F_LRO + +From: Jason Wang + +[ Upstream commit dbcf24d153884439dad30484a0e3f02350692e4c ] + +Commit a02e8964eaf92 ("virtio-net: ethtool configurable LRO") +maps LRO to virtio guest offloading features and allows the +administrator to enable and disable those features via ethtool. + +This leads to several issues: + +- For a device that doesn't support control guest offloads, the "LRO" + can't be disabled triggering WARN in dev_disable_lro() when turning + off LRO or when enabling forwarding bridging etc. + +- For a device that supports control guest offloads, the guest + offloads are disabled in cases of bridging, forwarding etc slowing + down the traffic. + +Fix this by using NETIF_F_GRO_HW instead. Though the spec does not +guarantee packets to be re-segmented as the original ones, +we can add that to the spec, possibly with a flag for devices to +differentiate between GRO and LRO. + +Further, we never advertised LRO historically before a02e8964eaf92 +("virtio-net: ethtool configurable LRO") and so bridged/forwarded +configs effectively always relied on virtio receive offloads behaving +like GRO - thus even if this breaks any configs it is at least not +a regression. + +Fixes: a02e8964eaf92 ("virtio-net: ethtool configurable LRO") +Acked-by: Michael S. Tsirkin +Reported-by: Ivan +Tested-by: Ivan +Signed-off-by: Jason Wang +Signed-off-by: David S. 
Miller +Signed-off-by: Sasha Levin +--- + drivers/net/virtio_net.c | 14 +++++++------- + 1 file changed, 7 insertions(+), 7 deletions(-) + +diff --git a/drivers/net/virtio_net.c b/drivers/net/virtio_net.c +index 36f8aeb113a8..37c2cecd1e50 100644 +--- a/drivers/net/virtio_net.c ++++ b/drivers/net/virtio_net.c +@@ -63,7 +63,7 @@ static const unsigned long guest_offloads[] = { + VIRTIO_NET_F_GUEST_CSUM + }; + +-#define GUEST_OFFLOAD_LRO_MASK ((1ULL << VIRTIO_NET_F_GUEST_TSO4) | \ ++#define GUEST_OFFLOAD_GRO_HW_MASK ((1ULL << VIRTIO_NET_F_GUEST_TSO4) | \ + (1ULL << VIRTIO_NET_F_GUEST_TSO6) | \ + (1ULL << VIRTIO_NET_F_GUEST_ECN) | \ + (1ULL << VIRTIO_NET_F_GUEST_UFO)) +@@ -2493,7 +2493,7 @@ static int virtnet_xdp_set(struct net_device *dev, struct bpf_prog *prog, + virtio_has_feature(vi->vdev, VIRTIO_NET_F_GUEST_ECN) || + virtio_has_feature(vi->vdev, VIRTIO_NET_F_GUEST_UFO) || + virtio_has_feature(vi->vdev, VIRTIO_NET_F_GUEST_CSUM))) { +- NL_SET_ERR_MSG_MOD(extack, "Can't set XDP while host is implementing LRO/CSUM, disable LRO/CSUM first"); ++ NL_SET_ERR_MSG_MOD(extack, "Can't set XDP while host is implementing GRO_HW/CSUM, disable GRO_HW/CSUM first"); + return -EOPNOTSUPP; + } + +@@ -2644,15 +2644,15 @@ static int virtnet_set_features(struct net_device *dev, + u64 offloads; + int err; + +- if ((dev->features ^ features) & NETIF_F_LRO) { ++ if ((dev->features ^ features) & NETIF_F_GRO_HW) { + if (vi->xdp_enabled) + return -EBUSY; + +- if (features & NETIF_F_LRO) ++ if (features & NETIF_F_GRO_HW) + offloads = vi->guest_offloads_capable; + else + offloads = vi->guest_offloads_capable & +- ~GUEST_OFFLOAD_LRO_MASK; ++ ~GUEST_OFFLOAD_GRO_HW_MASK; + + err = virtnet_set_guest_offloads(vi, offloads); + if (err) +@@ -3128,9 +3128,9 @@ static int virtnet_probe(struct virtio_device *vdev) + dev->features |= NETIF_F_RXCSUM; + if (virtio_has_feature(vdev, VIRTIO_NET_F_GUEST_TSO4) || + virtio_has_feature(vdev, VIRTIO_NET_F_GUEST_TSO6)) +- dev->features |= NETIF_F_LRO; ++ dev->features |= NETIF_F_GRO_HW; + if (virtio_has_feature(vdev, VIRTIO_NET_F_CTRL_GUEST_OFFLOADS)) +- dev->hw_features |= NETIF_F_LRO; ++ dev->hw_features |= NETIF_F_GRO_HW; + + dev->vlan_features = dev->features; + +-- +2.30.2 + diff --git a/queue-5.4/virtio-protect-vqs-list-access.patch b/queue-5.4/virtio-protect-vqs-list-access.patch new file mode 100644 index 00000000000..7519ad12901 --- /dev/null +++ b/queue-5.4/virtio-protect-vqs-list-access.patch @@ -0,0 +1,97 @@ +From ec7c95e32fa39841854ad14149368a7a80ee74a7 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Wed, 21 Jul 2021 17:26:47 +0300 +Subject: virtio: Protect vqs list access + +From: Parav Pandit + +[ Upstream commit 0e566c8f0f2e8325e35f6f97e13cde5356b41814 ] + +VQs may be accessed to mark the device broken while they are +created/destroyed. Hence protect the access to the vqs list. + +Fixes: e2dcdfe95c0b ("virtio: virtio_break_device() to mark all virtqueues broken.") +Signed-off-by: Parav Pandit +Link: https://lore.kernel.org/r/20210721142648.1525924-4-parav@nvidia.com +Signed-off-by: Michael S. 
Tsirkin +Signed-off-by: Sasha Levin +--- + drivers/virtio/virtio.c | 1 + + drivers/virtio/virtio_ring.c | 8 ++++++++ + include/linux/virtio.h | 1 + + 3 files changed, 10 insertions(+) + +diff --git a/drivers/virtio/virtio.c b/drivers/virtio/virtio.c +index a977e32a88f2..59a05f1b8105 100644 +--- a/drivers/virtio/virtio.c ++++ b/drivers/virtio/virtio.c +@@ -342,6 +342,7 @@ int register_virtio_device(struct virtio_device *dev) + virtio_add_status(dev, VIRTIO_CONFIG_S_ACKNOWLEDGE); + + INIT_LIST_HEAD(&dev->vqs); ++ spin_lock_init(&dev->vqs_list_lock); + + /* + * device_add() causes the bus infrastructure to look for a matching +diff --git a/drivers/virtio/virtio_ring.c b/drivers/virtio/virtio_ring.c +index 97e8a195e18f..f6011c9ed32f 100644 +--- a/drivers/virtio/virtio_ring.c ++++ b/drivers/virtio/virtio_ring.c +@@ -1668,7 +1668,9 @@ static struct virtqueue *vring_create_virtqueue_packed( + cpu_to_le16(vq->packed.event_flags_shadow); + } + ++ spin_lock(&vdev->vqs_list_lock); + list_add_tail(&vq->vq.list, &vdev->vqs); ++ spin_unlock(&vdev->vqs_list_lock); + return &vq->vq; + + err_desc_extra: +@@ -2126,7 +2128,9 @@ struct virtqueue *__vring_new_virtqueue(unsigned int index, + memset(vq->split.desc_state, 0, vring.num * + sizeof(struct vring_desc_state_split)); + ++ spin_lock(&vdev->vqs_list_lock); + list_add_tail(&vq->vq.list, &vdev->vqs); ++ spin_unlock(&vdev->vqs_list_lock); + return &vq->vq; + } + EXPORT_SYMBOL_GPL(__vring_new_virtqueue); +@@ -2210,7 +2214,9 @@ void vring_del_virtqueue(struct virtqueue *_vq) + } + if (!vq->packed_ring) + kfree(vq->split.desc_state); ++ spin_lock(&vq->vq.vdev->vqs_list_lock); + list_del(&_vq->list); ++ spin_unlock(&vq->vq.vdev->vqs_list_lock); + kfree(vq); + } + EXPORT_SYMBOL_GPL(vring_del_virtqueue); +@@ -2274,10 +2280,12 @@ void virtio_break_device(struct virtio_device *dev) + { + struct virtqueue *_vq; + ++ spin_lock(&dev->vqs_list_lock); + list_for_each_entry(_vq, &dev->vqs, list) { + struct vring_virtqueue *vq = to_vvq(_vq); + vq->broken = true; + } ++ spin_unlock(&dev->vqs_list_lock); + } + EXPORT_SYMBOL_GPL(virtio_break_device); + +diff --git a/include/linux/virtio.h b/include/linux/virtio.h +index 15f906e4a748..7c075463c7f2 100644 +--- a/include/linux/virtio.h ++++ b/include/linux/virtio.h +@@ -111,6 +111,7 @@ struct virtio_device { + bool config_enabled; + bool config_change_pending; + spinlock_t config_lock; ++ spinlock_t vqs_list_lock; /* Protects VQs list access */ + struct device dev; + struct virtio_device_id id; + const struct virtio_config_ops *config; +-- +2.30.2 + diff --git a/queue-5.4/vrf-reset-skb-conntrack-connection-on-vrf-rcv.patch b/queue-5.4/vrf-reset-skb-conntrack-connection-on-vrf-rcv.patch new file mode 100644 index 00000000000..c8adb86e944 --- /dev/null +++ b/queue-5.4/vrf-reset-skb-conntrack-connection-on-vrf-rcv.patch @@ -0,0 +1,209 @@ +From 45aad27907403d19cf8e78488efd99f1e02b1506 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Sun, 15 Aug 2021 12:00:02 +0000 +Subject: vrf: Reset skb conntrack connection on VRF rcv + +From: Lahav Schlesinger + +[ Upstream commit 09e856d54bda5f288ef8437a90ab2b9b3eab83d1 ] + +To fix the "reverse-NAT" for replies. + +When a packet is sent over a VRF, the POST_ROUTING hooks are called +twice: Once from the VRF interface, and once from the "actual" +interface the packet will be sent from: +1) First SNAT: l3mdev_l3_out() -> vrf_l3_out() -> .. -> vrf_output_direct() + This causes the POST_ROUTING hooks to run. +2) Second SNAT: 'ip_output()' calls POST_ROUTING hooks again. 
+ +Similarly for replies, first ip_rcv() calls PRE_ROUTING hooks, and +second vrf_l3_rcv() calls them again. + +As an example, consider the following SNAT rule: +> iptables -t nat -A POSTROUTING -p udp -m udp --dport 53 -j SNAT --to-source 2.2.2.2 -o vrf_1 + +In this case sending over a VRF will create 2 conntrack entries. +The first is from the VRF interface, which performs the IP SNAT. +The second will run the SNAT, but since the "expected reply" will remain +the same, conntrack randomizes the source port of the packet: +e..g With a socket bound to 1.1.1.1:10000, sending to 3.3.3.3:53, the conntrack +rules are: +udp 17 29 src=2.2.2.2 dst=3.3.3.3 sport=10000 dport=53 packets=1 bytes=68 [UNREPLIED] src=3.3.3.3 dst=2.2.2.2 sport=53 dport=61033 packets=0 bytes=0 mark=0 use=1 +udp 17 29 src=1.1.1.1 dst=3.3.3.3 sport=10000 dport=53 packets=1 bytes=68 [UNREPLIED] src=3.3.3.3 dst=2.2.2.2 sport=53 dport=10000 packets=0 bytes=0 mark=0 use=1 + +i.e. First SNAT IP from 1.1.1.1 --> 2.2.2.2, and second the src port is +SNAT-ed from 10000 --> 61033. + +But when a reply is sent (3.3.3.3:53 -> 2.2.2.2:61033) only the later +conntrack entry is matched: +udp 17 29 src=2.2.2.2 dst=3.3.3.3 sport=10000 dport=53 packets=1 bytes=68 src=3.3.3.3 dst=2.2.2.2 sport=53 dport=61033 packets=1 bytes=49 mark=0 use=1 +udp 17 28 src=1.1.1.1 dst=3.3.3.3 sport=10000 dport=53 packets=1 bytes=68 [UNREPLIED] src=3.3.3.3 dst=2.2.2.2 sport=53 dport=10000 packets=0 bytes=0 mark=0 use=1 + +And a "port 61033 unreachable" ICMP packet is sent back. + +The issue is that when PRE_ROUTING hooks are called from vrf_l3_rcv(), +the skb already has a conntrack flow attached to it, which means +nf_conntrack_in() will not resolve the flow again. + +This means only the dest port is "reverse-NATed" (61033 -> 10000) but +the dest IP remains 2.2.2.2, and since the socket is bound to 1.1.1.1 it's +not received. +This can be verified by logging the 4-tuple of the packet in '__udp4_lib_rcv()'. + +The fix is then to reset the flow when skb is received on a VRF, to let +conntrack resolve the flow again (which now will hit the earlier flow). 
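
For context, nf_reset_ct() simply drops the skb's conntrack reference so that the next nf_conntrack_in() resolves the tuple from scratch; roughly, per include/linux/skbuff.h:

    static inline void nf_reset_ct(struct sk_buff *skb)
    {
    #if defined(CONFIG_NF_CONNTRACK) || defined(CONFIG_NF_CONNTRACK_MODULE)
            nf_conntrack_put(skb_nfct(skb));
            skb->_nfct = 0;
    #endif
    }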
+ +To reproduce: (Without the fix "Got pkt_to_nat_port" will not be printed by + running 'bash ./repro'): + $ cat run_in_A1.py + import logging + logging.getLogger("scapy.runtime").setLevel(logging.ERROR) + from scapy.all import * + import argparse + + def get_packet_to_send(udp_dst_port, msg_name): + return Ether(src='11:22:33:44:55:66', dst=iface_mac)/ \ + IP(src='3.3.3.3', dst='2.2.2.2')/ \ + UDP(sport=53, dport=udp_dst_port)/ \ + Raw(f'{msg_name}\x0012345678901234567890') + + parser = argparse.ArgumentParser() + parser.add_argument('-iface_mac', dest="iface_mac", type=str, required=True, + help="From run_in_A3.py") + parser.add_argument('-socket_port', dest="socket_port", type=str, + required=True, help="From run_in_A3.py") + parser.add_argument('-v1_mac', dest="v1_mac", type=str, required=True, + help="From script") + + args, _ = parser.parse_known_args() + iface_mac = args.iface_mac + socket_port = int(args.socket_port) + v1_mac = args.v1_mac + + print(f'Source port before NAT: {socket_port}') + + while True: + pkts = sniff(iface='_v0', store=True, count=1, timeout=10) + if 0 == len(pkts): + print('Something failed, rerun the script :(', flush=True) + break + pkt = pkts[0] + if not pkt.haslayer('UDP'): + continue + + pkt_sport = pkt.getlayer('UDP').sport + print(f'Source port after NAT: {pkt_sport}', flush=True) + + pkt_to_send = get_packet_to_send(pkt_sport, 'pkt_to_nat_port') + sendp(pkt_to_send, '_v0', verbose=False) # Will not be received + + pkt_to_send = get_packet_to_send(socket_port, 'pkt_to_socket_port') + sendp(pkt_to_send, '_v0', verbose=False) + break + + $ cat run_in_A2.py + import socket + import netifaces + + print(f"{netifaces.ifaddresses('e00000')[netifaces.AF_LINK][0]['addr']}", + flush=True) + s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM) + s.setsockopt(socket.SOL_SOCKET, socket.SO_BINDTODEVICE, + str('vrf_1' + '\0').encode('utf-8')) + s.connect(('3.3.3.3', 53)) + print(f'{s. getsockname()[1]}', flush=True) + s.settimeout(5) + + while True: + try: + # Periodically send in order to keep the conntrack entry alive. + s.send(b'a'*40) + resp = s.recvfrom(1024) + msg_name = resp[0].decode('utf-8').split('\0')[0] + print(f"Got {msg_name}", flush=True) + except Exception as e: + pass + + $ cat repro.sh + ip netns del A1 2> /dev/null + ip netns del A2 2> /dev/null + ip netns add A1 + ip netns add A2 + + ip -n A1 link add _v0 type veth peer name _v1 netns A2 + ip -n A1 link set _v0 up + + ip -n A2 link add e00000 type bond + ip -n A2 link add lo0 type dummy + ip -n A2 link add vrf_1 type vrf table 10001 + ip -n A2 link set vrf_1 up + ip -n A2 link set e00000 master vrf_1 + + ip -n A2 addr add 1.1.1.1/24 dev e00000 + ip -n A2 link set e00000 up + ip -n A2 link set _v1 master e00000 + ip -n A2 link set _v1 up + ip -n A2 link set lo0 up + ip -n A2 addr add 2.2.2.2/32 dev lo0 + + ip -n A2 neigh add 1.1.1.10 lladdr 77:77:77:77:77:77 dev e00000 + ip -n A2 route add 3.3.3.3/32 via 1.1.1.10 dev e00000 table 10001 + + ip netns exec A2 iptables -t nat -A POSTROUTING -p udp -m udp --dport 53 -j \ + SNAT --to-source 2.2.2.2 -o vrf_1 + + sleep 5 + ip netns exec A2 python3 run_in_A2.py > x & + XPID=$! 
+ sleep 5 + + IFACE_MAC=`sed -n 1p x` + SOCKET_PORT=`sed -n 2p x` + V1_MAC=`ip -n A2 link show _v1 | sed -n 2p | awk '{print $2'}` + ip netns exec A1 python3 run_in_A1.py -iface_mac ${IFACE_MAC} -socket_port \ + ${SOCKET_PORT} -v1_mac ${SOCKET_PORT} + sleep 5 + + kill -9 $XPID + wait $XPID 2> /dev/null + ip netns del A1 + ip netns del A2 + tail x -n 2 + rm x + set +x + +Fixes: 73e20b761acf ("net: vrf: Add support for PREROUTING rules on vrf device") +Signed-off-by: Lahav Schlesinger +Reviewed-by: David Ahern +Link: https://lore.kernel.org/r/20210815120002.2787653-1-lschlesinger@drivenets.com +Signed-off-by: Jakub Kicinski +Signed-off-by: Sasha Levin +--- + drivers/net/vrf.c | 4 ++++ + 1 file changed, 4 insertions(+) + +diff --git a/drivers/net/vrf.c b/drivers/net/vrf.c +index f08ed52d51f3..9b626c169554 100644 +--- a/drivers/net/vrf.c ++++ b/drivers/net/vrf.c +@@ -1036,6 +1036,8 @@ static struct sk_buff *vrf_ip6_rcv(struct net_device *vrf_dev, + bool need_strict = rt6_need_strict(&ipv6_hdr(skb)->daddr); + bool is_ndisc = ipv6_ndisc_frame(skb); + ++ nf_reset_ct(skb); ++ + /* loopback, multicast & non-ND link-local traffic; do not push through + * packet taps again. Reset pkt_type for upper layers to process skb. + * For strict packets with a source LLA, determine the dst using the +@@ -1092,6 +1094,8 @@ static struct sk_buff *vrf_ip_rcv(struct net_device *vrf_dev, + skb->skb_iif = vrf_dev->ifindex; + IPCB(skb)->flags |= IPSKB_L3SLAVE; + ++ nf_reset_ct(skb); ++ + if (ipv4_is_multicast(ip_hdr(skb)->daddr)) + goto out; + +-- +2.30.2 +