git.ipfire.org Git - thirdparty/kernel/stable-queue.git/commitdiff
Fixes for 6.1
author    Sasha Levin <sashal@kernel.org>
          Sun, 25 Aug 2024 11:50:52 +0000 (07:50 -0400)
committer Sasha Levin <sashal@kernel.org>
          Sun, 25 Aug 2024 11:50:52 +0000 (07:50 -0400)
Signed-off-by: Sasha Levin <sashal@kernel.org>
40 files changed:
queue-6.1/bluetooth-hci_core-fix-le-quote-calculation.patch [new file with mode: 0644]
queue-6.1/bluetooth-smp-fix-assumption-of-central-always-being.patch [new file with mode: 0644]
queue-6.1/bonding-fix-bond_ipsec_offload_ok-return-type.patch [new file with mode: 0644]
queue-6.1/bonding-fix-null-pointer-deref-in-bond_ipsec_offload.patch [new file with mode: 0644]
queue-6.1/bonding-fix-xfrm-real_dev-null-pointer-dereference.patch [new file with mode: 0644]
queue-6.1/bonding-fix-xfrm-state-handling-when-clearing-active.patch [new file with mode: 0644]
queue-6.1/dpaa2-switch-fix-error-checking-in-dpaa2_switch_seed.patch [new file with mode: 0644]
queue-6.1/drm-amdkfd-reserve-the-bo-before-validating-it.patch [new file with mode: 0644]
queue-6.1/ice-add-xdp_buff-to-ice_rx_ring-struct.patch [new file with mode: 0644]
queue-6.1/ice-fix-ice_last_offset-formula.patch [new file with mode: 0644]
queue-6.1/ice-fix-page-reuse-when-page_size-is-over-8k.patch [new file with mode: 0644]
queue-6.1/ice-prepare-legacy-rx-for-upcoming-xdp-multi-buffer-.patch [new file with mode: 0644]
queue-6.1/ice-pull-out-next_to_clean-bump-out-of-ice_put_rx_bu.patch [new file with mode: 0644]
queue-6.1/ice-store-page-count-inside-ice_rx_buf.patch [new file with mode: 0644]
queue-6.1/ip6_tunnel-fix-broken-gro.patch [new file with mode: 0644]
queue-6.1/ipv6-fix-possible-uaf-in-ip6_finish_output2.patch [new file with mode: 0644]
queue-6.1/ipv6-prevent-possible-uaf-in-ip6_xmit.patch [new file with mode: 0644]
queue-6.1/ipv6-prevent-uaf-in-ip6_send_skb.patch [new file with mode: 0644]
queue-6.1/kcm-serialise-kcm_sendmsg-for-the-same-socket.patch [new file with mode: 0644]
queue-6.1/net-dsa-mv88e6xxx-fix-out-of-bound-access.patch [new file with mode: 0644]
queue-6.1/net-dsa-tag_ocelot-call-only-the-relevant-portion-of.patch [new file with mode: 0644]
queue-6.1/net-dsa-tag_ocelot-do-not-rely-on-skb_mac_header-for.patch [new file with mode: 0644]
queue-6.1/net-mctp-test-use-correct-skb-for-route-input-check.patch [new file with mode: 0644]
queue-6.1/net-mscc-ocelot-fix-qos-class-for-injected-packets-w.patch [new file with mode: 0644]
queue-6.1/net-mscc-ocelot-serialize-access-to-the-injection-ex.patch [new file with mode: 0644]
queue-6.1/net-mscc-ocelot-use-ocelot_xmit_get_vlan_info-also-f.patch [new file with mode: 0644]
queue-6.1/net-xilinx-axienet-always-disable-promiscuous-mode.patch [new file with mode: 0644]
queue-6.1/net-xilinx-axienet-fix-dangling-multicast-addresses.patch [new file with mode: 0644]
queue-6.1/netem-fix-return-value-if-duplicate-enqueue-fails.patch [new file with mode: 0644]
queue-6.1/netfilter-flowtable-validate-vlan-header.patch [new file with mode: 0644]
queue-6.1/netfilter-nft_counter-disable-bh-in-nft_counter_offl.patch [new file with mode: 0644]
queue-6.1/netfilter-nft_counter-synchronize-nft_counter_reset-.patch [new file with mode: 0644]
queue-6.1/octeontx2-af-fix-cpt-af-register-offset-calculation.patch [new file with mode: 0644]
queue-6.1/selftests-net-synchronize-udpgro-tests-tx-and-rx-con.patch [new file with mode: 0644]
queue-6.1/selftests-udpgro-report-error-when-receive-failed.patch [new file with mode: 0644]
queue-6.1/series
queue-6.1/tc-testing-don-t-access-non-existent-variable-on-exc.patch [new file with mode: 0644]
queue-6.1/tcp-dccp-bypass-empty-buckets-in-inet_twsk_purge.patch [new file with mode: 0644]
queue-6.1/tcp-dccp-do-not-care-about-families-in-inet_twsk_pur.patch [new file with mode: 0644]
queue-6.1/tcp-prevent-concurrent-execution-of-tcp_sk_exit_batc.patch [new file with mode: 0644]

diff --git a/queue-6.1/bluetooth-hci_core-fix-le-quote-calculation.patch b/queue-6.1/bluetooth-hci_core-fix-le-quote-calculation.patch
new file mode 100644 (file)
index 0000000..a6e7f8d
--- /dev/null
@@ -0,0 +1,76 @@
+From ec1c9a4df4f8310a3d4b420029bcaaf3e22545c2 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Mon, 12 Aug 2024 11:22:08 -0400
+Subject: Bluetooth: hci_core: Fix LE quote calculation
+
+From: Luiz Augusto von Dentz <luiz.von.dentz@intel.com>
+
+[ Upstream commit 932021a11805b9da4bd6abf66fe233cccd59fe0e ]
+
+Function hci_sched_le needs to update the respective counter variable
+in place, otherwise the likes of hci_quote_sent would attempt to use a
+possibly outdated value of conn->{le_cnt,acl_cnt}.
+
+Link: https://github.com/bluez/bluez/issues/915
+Fixes: 73d80deb7bdf ("Bluetooth: prioritizing data over HCI")
+Signed-off-by: Luiz Augusto von Dentz <luiz.von.dentz@intel.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ net/bluetooth/hci_core.c | 19 +++++++------------
+ 1 file changed, 7 insertions(+), 12 deletions(-)
+
+diff --git a/net/bluetooth/hci_core.c b/net/bluetooth/hci_core.c
+index cf164ec9899c3..210e03a3609d4 100644
+--- a/net/bluetooth/hci_core.c
++++ b/net/bluetooth/hci_core.c
+@@ -3744,19 +3744,19 @@ static void hci_sched_le(struct hci_dev *hdev)
+ {
+       struct hci_chan *chan;
+       struct sk_buff *skb;
+-      int quote, cnt, tmp;
++      int quote, *cnt, tmp;
+       BT_DBG("%s", hdev->name);
+       if (!hci_conn_num(hdev, LE_LINK))
+               return;
+-      cnt = hdev->le_pkts ? hdev->le_cnt : hdev->acl_cnt;
++      cnt = hdev->le_pkts ? &hdev->le_cnt : &hdev->acl_cnt;
+-      __check_timeout(hdev, cnt, LE_LINK);
++      __check_timeout(hdev, *cnt, LE_LINK);
+-      tmp = cnt;
+-      while (cnt && (chan = hci_chan_sent(hdev, LE_LINK, &quote))) {
++      tmp = *cnt;
++      while (*cnt && (chan = hci_chan_sent(hdev, LE_LINK, &quote))) {
+               u32 priority = (skb_peek(&chan->data_q))->priority;
+               while (quote-- && (skb = skb_peek(&chan->data_q))) {
+                       BT_DBG("chan %p skb %p len %d priority %u", chan, skb,
+@@ -3771,7 +3771,7 @@ static void hci_sched_le(struct hci_dev *hdev)
+                       hci_send_frame(hdev, skb);
+                       hdev->le_last_tx = jiffies;
+-                      cnt--;
++                      (*cnt)--;
+                       chan->sent++;
+                       chan->conn->sent++;
+@@ -3781,12 +3781,7 @@ static void hci_sched_le(struct hci_dev *hdev)
+               }
+       }
+-      if (hdev->le_pkts)
+-              hdev->le_cnt = cnt;
+-      else
+-              hdev->acl_cnt = cnt;
+-
+-      if (cnt != tmp)
++      if (*cnt != tmp)
+               hci_prio_recalculate(hdev, LE_LINK);
+ }
+-- 
+2.43.0
+
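
To see why the pointer matters, here is a minimal userspace C sketch of the copy-versus-pointer pattern the fix switches between; the names are hypothetical stand-ins for the hci_dev fields, not the kernel's code:

#include <stdio.h>

struct hdev { int le_cnt; };

/* Old scheme: work on a private copy and write it back at the end.
 * A helper consulting d->le_cnt mid-loop sees the stale value. */
static void consume_by_copy(struct hdev *d)
{
	int cnt = d->le_cnt;

	cnt--;
	printf("copy:    helper sees %d (stale)\n", d->le_cnt);
	d->le_cnt = cnt;		/* write-back happens late */
}

/* New scheme (what the fix switches to): work through a pointer so
 * every decrement is immediately visible in the device struct. */
static void consume_by_pointer(struct hdev *d)
{
	int *cnt = &d->le_cnt;

	(*cnt)--;
	printf("pointer: helper sees %d (fresh)\n", d->le_cnt);
}

int main(void)
{
	struct hdev a = { .le_cnt = 3 }, b = { .le_cnt = 3 };

	consume_by_copy(&a);	/* helper would act on 3, though 2 remain */
	consume_by_pointer(&b);	/* helper acts on the fresh value, 2 */
	return 0;
}
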
diff --git a/queue-6.1/bluetooth-smp-fix-assumption-of-central-always-being.patch b/queue-6.1/bluetooth-smp-fix-assumption-of-central-always-being.patch
new file mode 100644 (file)
index 0000000..ce44de6
--- /dev/null
@@ -0,0 +1,447 @@
+From 55bb96d0aa71fd3938bdd64730c5144628903f4a Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Wed, 30 Aug 2023 15:08:06 -0700
+Subject: Bluetooth: SMP: Fix assumption of Central always being Initiator
+
+From: Luiz Augusto von Dentz <luiz.von.dentz@intel.com>
+
+[ Upstream commit 28cd47f75185c4818b0fb1b46f2f02faaba96376 ]
+
+SMP initiator role shall be considered the one that initiates the
+pairing procedure with SMP_CMD_PAIRING_REQ:
+
+BLUETOOTH CORE SPECIFICATION Version 5.3 | Vol 3, Part H
+page 1557:
+
+Figure 2.1: LE pairing phases
+
+Note that sending SMP_CMD_SECURITY_REQ does not change the role to
+Initiator.
+
+Link: https://github.com/bluez/bluez/issues/567
+Fixes: b28b4943660f ("Bluetooth: Add strict checks for allowed SMP PDUs")
+Signed-off-by: Luiz Augusto von Dentz <luiz.von.dentz@intel.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ net/bluetooth/smp.c | 144 ++++++++++++++++++++++----------------------
+ 1 file changed, 72 insertions(+), 72 deletions(-)
+
+diff --git a/net/bluetooth/smp.c b/net/bluetooth/smp.c
+index ecb005bce65ac..d444fa1bd9f97 100644
+--- a/net/bluetooth/smp.c
++++ b/net/bluetooth/smp.c
+@@ -913,7 +913,7 @@ static int tk_request(struct l2cap_conn *conn, u8 remote_oob, u8 auth,
+        * Confirms and the responder Enters the passkey.
+        */
+       if (smp->method == OVERLAP) {
+-              if (hcon->role == HCI_ROLE_MASTER)
++              if (test_bit(SMP_FLAG_INITIATOR, &smp->flags))
+                       smp->method = CFM_PASSKEY;
+               else
+                       smp->method = REQ_PASSKEY;
+@@ -963,7 +963,7 @@ static u8 smp_confirm(struct smp_chan *smp)
+       smp_send_cmd(smp->conn, SMP_CMD_PAIRING_CONFIRM, sizeof(cp), &cp);
+-      if (conn->hcon->out)
++      if (test_bit(SMP_FLAG_INITIATOR, &smp->flags))
+               SMP_ALLOW_CMD(smp, SMP_CMD_PAIRING_CONFIRM);
+       else
+               SMP_ALLOW_CMD(smp, SMP_CMD_PAIRING_RANDOM);
+@@ -979,7 +979,8 @@ static u8 smp_random(struct smp_chan *smp)
+       int ret;
+       bt_dev_dbg(conn->hcon->hdev, "conn %p %s", conn,
+-                 conn->hcon->out ? "initiator" : "responder");
++                 test_bit(SMP_FLAG_INITIATOR, &smp->flags) ? "initiator" :
++                 "responder");
+       ret = smp_c1(smp->tk, smp->rrnd, smp->preq, smp->prsp,
+                    hcon->init_addr_type, &hcon->init_addr,
+@@ -993,7 +994,7 @@ static u8 smp_random(struct smp_chan *smp)
+               return SMP_CONFIRM_FAILED;
+       }
+-      if (hcon->out) {
++      if (test_bit(SMP_FLAG_INITIATOR, &smp->flags)) {
+               u8 stk[16];
+               __le64 rand = 0;
+               __le16 ediv = 0;
+@@ -1250,14 +1251,15 @@ static void smp_distribute_keys(struct smp_chan *smp)
+       rsp = (void *) &smp->prsp[1];
+       /* The responder sends its keys first */
+-      if (hcon->out && (smp->remote_key_dist & KEY_DIST_MASK)) {
++      if (test_bit(SMP_FLAG_INITIATOR, &smp->flags) &&
++          (smp->remote_key_dist & KEY_DIST_MASK)) {
+               smp_allow_key_dist(smp);
+               return;
+       }
+       req = (void *) &smp->preq[1];
+-      if (hcon->out) {
++      if (test_bit(SMP_FLAG_INITIATOR, &smp->flags)) {
+               keydist = &rsp->init_key_dist;
+               *keydist &= req->init_key_dist;
+       } else {
+@@ -1426,7 +1428,7 @@ static int sc_mackey_and_ltk(struct smp_chan *smp, u8 mackey[16], u8 ltk[16])
+       struct hci_conn *hcon = smp->conn->hcon;
+       u8 *na, *nb, a[7], b[7];
+-      if (hcon->out) {
++      if (test_bit(SMP_FLAG_INITIATOR, &smp->flags)) {
+               na   = smp->prnd;
+               nb   = smp->rrnd;
+       } else {
+@@ -1454,7 +1456,7 @@ static void sc_dhkey_check(struct smp_chan *smp)
+       a[6] = hcon->init_addr_type;
+       b[6] = hcon->resp_addr_type;
+-      if (hcon->out) {
++      if (test_bit(SMP_FLAG_INITIATOR, &smp->flags)) {
+               local_addr = a;
+               remote_addr = b;
+               memcpy(io_cap, &smp->preq[1], 3);
+@@ -1533,7 +1535,7 @@ static u8 sc_passkey_round(struct smp_chan *smp, u8 smp_op)
+               /* The round is only complete when the initiator
+                * receives pairing random.
+                */
+-              if (!hcon->out) {
++              if (!test_bit(SMP_FLAG_INITIATOR, &smp->flags)) {
+                       smp_send_cmd(conn, SMP_CMD_PAIRING_RANDOM,
+                                    sizeof(smp->prnd), smp->prnd);
+                       if (smp->passkey_round == 20)
+@@ -1561,7 +1563,7 @@ static u8 sc_passkey_round(struct smp_chan *smp, u8 smp_op)
+               SMP_ALLOW_CMD(smp, SMP_CMD_PAIRING_RANDOM);
+-              if (hcon->out) {
++              if (test_bit(SMP_FLAG_INITIATOR, &smp->flags)) {
+                       smp_send_cmd(conn, SMP_CMD_PAIRING_RANDOM,
+                                    sizeof(smp->prnd), smp->prnd);
+                       return 0;
+@@ -1572,7 +1574,7 @@ static u8 sc_passkey_round(struct smp_chan *smp, u8 smp_op)
+       case SMP_CMD_PUBLIC_KEY:
+       default:
+               /* Initiating device starts the round */
+-              if (!hcon->out)
++              if (!test_bit(SMP_FLAG_INITIATOR, &smp->flags))
+                       return 0;
+               bt_dev_dbg(hdev, "Starting passkey round %u",
+@@ -1617,7 +1619,7 @@ static int sc_user_reply(struct smp_chan *smp, u16 mgmt_op, __le32 passkey)
+       }
+       /* Initiator sends DHKey check first */
+-      if (hcon->out) {
++      if (test_bit(SMP_FLAG_INITIATOR, &smp->flags)) {
+               sc_dhkey_check(smp);
+               SMP_ALLOW_CMD(smp, SMP_CMD_DHKEY_CHECK);
+       } else if (test_and_clear_bit(SMP_FLAG_DHKEY_PENDING, &smp->flags)) {
+@@ -1740,7 +1742,7 @@ static u8 smp_cmd_pairing_req(struct l2cap_conn *conn, struct sk_buff *skb)
+       struct smp_cmd_pairing rsp, *req = (void *) skb->data;
+       struct l2cap_chan *chan = conn->smp;
+       struct hci_dev *hdev = conn->hcon->hdev;
+-      struct smp_chan *smp;
++      struct smp_chan *smp = chan->data;
+       u8 key_size, auth, sec_level;
+       int ret;
+@@ -1749,16 +1751,14 @@ static u8 smp_cmd_pairing_req(struct l2cap_conn *conn, struct sk_buff *skb)
+       if (skb->len < sizeof(*req))
+               return SMP_INVALID_PARAMS;
+-      if (conn->hcon->role != HCI_ROLE_SLAVE)
++      if (smp && test_bit(SMP_FLAG_INITIATOR, &smp->flags))
+               return SMP_CMD_NOTSUPP;
+-      if (!chan->data)
++      if (!smp) {
+               smp = smp_chan_create(conn);
+-      else
+-              smp = chan->data;
+-
+-      if (!smp)
+-              return SMP_UNSPECIFIED;
++              if (!smp)
++                      return SMP_UNSPECIFIED;
++      }
+       /* We didn't start the pairing, so match remote */
+       auth = req->auth_req & AUTH_REQ_MASK(hdev);
+@@ -1940,7 +1940,7 @@ static u8 smp_cmd_pairing_rsp(struct l2cap_conn *conn, struct sk_buff *skb)
+       if (skb->len < sizeof(*rsp))
+               return SMP_INVALID_PARAMS;
+-      if (conn->hcon->role != HCI_ROLE_MASTER)
++      if (!test_bit(SMP_FLAG_INITIATOR, &smp->flags))
+               return SMP_CMD_NOTSUPP;
+       skb_pull(skb, sizeof(*rsp));
+@@ -2035,7 +2035,7 @@ static u8 sc_check_confirm(struct smp_chan *smp)
+       if (smp->method == REQ_PASSKEY || smp->method == DSP_PASSKEY)
+               return sc_passkey_round(smp, SMP_CMD_PAIRING_CONFIRM);
+-      if (conn->hcon->out) {
++      if (test_bit(SMP_FLAG_INITIATOR, &smp->flags)) {
+               smp_send_cmd(conn, SMP_CMD_PAIRING_RANDOM, sizeof(smp->prnd),
+                            smp->prnd);
+               SMP_ALLOW_CMD(smp, SMP_CMD_PAIRING_RANDOM);
+@@ -2057,7 +2057,7 @@ static int fixup_sc_false_positive(struct smp_chan *smp)
+       u8 auth;
+       /* The issue is only observed when we're in responder role */
+-      if (hcon->out)
++      if (test_bit(SMP_FLAG_INITIATOR, &smp->flags))
+               return SMP_UNSPECIFIED;
+       if (hci_dev_test_flag(hdev, HCI_SC_ONLY)) {
+@@ -2093,7 +2093,8 @@ static u8 smp_cmd_pairing_confirm(struct l2cap_conn *conn, struct sk_buff *skb)
+       struct hci_dev *hdev = hcon->hdev;
+       bt_dev_dbg(hdev, "conn %p %s", conn,
+-                 hcon->out ? "initiator" : "responder");
++                 test_bit(SMP_FLAG_INITIATOR, &smp->flags) ? "initiator" :
++                 "responder");
+       if (skb->len < sizeof(smp->pcnf))
+               return SMP_INVALID_PARAMS;
+@@ -2115,7 +2116,7 @@ static u8 smp_cmd_pairing_confirm(struct l2cap_conn *conn, struct sk_buff *skb)
+                       return ret;
+       }
+-      if (conn->hcon->out) {
++      if (test_bit(SMP_FLAG_INITIATOR, &smp->flags)) {
+               smp_send_cmd(conn, SMP_CMD_PAIRING_RANDOM, sizeof(smp->prnd),
+                            smp->prnd);
+               SMP_ALLOW_CMD(smp, SMP_CMD_PAIRING_RANDOM);
+@@ -2150,7 +2151,7 @@ static u8 smp_cmd_pairing_random(struct l2cap_conn *conn, struct sk_buff *skb)
+       if (!test_bit(SMP_FLAG_SC, &smp->flags))
+               return smp_random(smp);
+-      if (hcon->out) {
++      if (test_bit(SMP_FLAG_INITIATOR, &smp->flags)) {
+               pkax = smp->local_pk;
+               pkbx = smp->remote_pk;
+               na   = smp->prnd;
+@@ -2163,7 +2164,7 @@ static u8 smp_cmd_pairing_random(struct l2cap_conn *conn, struct sk_buff *skb)
+       }
+       if (smp->method == REQ_OOB) {
+-              if (!hcon->out)
++              if (!test_bit(SMP_FLAG_INITIATOR, &smp->flags))
+                       smp_send_cmd(conn, SMP_CMD_PAIRING_RANDOM,
+                                    sizeof(smp->prnd), smp->prnd);
+               SMP_ALLOW_CMD(smp, SMP_CMD_DHKEY_CHECK);
+@@ -2174,7 +2175,7 @@ static u8 smp_cmd_pairing_random(struct l2cap_conn *conn, struct sk_buff *skb)
+       if (smp->method == REQ_PASSKEY || smp->method == DSP_PASSKEY)
+               return sc_passkey_round(smp, SMP_CMD_PAIRING_RANDOM);
+-      if (hcon->out) {
++      if (test_bit(SMP_FLAG_INITIATOR, &smp->flags)) {
+               u8 cfm[16];
+               err = smp_f4(smp->tfm_cmac, smp->remote_pk, smp->local_pk,
+@@ -2215,7 +2216,7 @@ static u8 smp_cmd_pairing_random(struct l2cap_conn *conn, struct sk_buff *skb)
+               return SMP_UNSPECIFIED;
+       if (smp->method == REQ_OOB) {
+-              if (hcon->out) {
++              if (test_bit(SMP_FLAG_INITIATOR, &smp->flags)) {
+                       sc_dhkey_check(smp);
+                       SMP_ALLOW_CMD(smp, SMP_CMD_DHKEY_CHECK);
+               }
+@@ -2289,10 +2290,27 @@ bool smp_sufficient_security(struct hci_conn *hcon, u8 sec_level,
+       return false;
+ }
++static void smp_send_pairing_req(struct smp_chan *smp, __u8 auth)
++{
++      struct smp_cmd_pairing cp;
++
++      if (smp->conn->hcon->type == ACL_LINK)
++              build_bredr_pairing_cmd(smp, &cp, NULL);
++      else
++              build_pairing_cmd(smp->conn, &cp, NULL, auth);
++
++      smp->preq[0] = SMP_CMD_PAIRING_REQ;
++      memcpy(&smp->preq[1], &cp, sizeof(cp));
++
++      smp_send_cmd(smp->conn, SMP_CMD_PAIRING_REQ, sizeof(cp), &cp);
++      SMP_ALLOW_CMD(smp, SMP_CMD_PAIRING_RSP);
++
++      set_bit(SMP_FLAG_INITIATOR, &smp->flags);
++}
++
+ static u8 smp_cmd_security_req(struct l2cap_conn *conn, struct sk_buff *skb)
+ {
+       struct smp_cmd_security_req *rp = (void *) skb->data;
+-      struct smp_cmd_pairing cp;
+       struct hci_conn *hcon = conn->hcon;
+       struct hci_dev *hdev = hcon->hdev;
+       struct smp_chan *smp;
+@@ -2341,16 +2359,20 @@ static u8 smp_cmd_security_req(struct l2cap_conn *conn, struct sk_buff *skb)
+       skb_pull(skb, sizeof(*rp));
+-      memset(&cp, 0, sizeof(cp));
+-      build_pairing_cmd(conn, &cp, NULL, auth);
++      smp_send_pairing_req(smp, auth);
+-      smp->preq[0] = SMP_CMD_PAIRING_REQ;
+-      memcpy(&smp->preq[1], &cp, sizeof(cp));
++      return 0;
++}
+-      smp_send_cmd(conn, SMP_CMD_PAIRING_REQ, sizeof(cp), &cp);
+-      SMP_ALLOW_CMD(smp, SMP_CMD_PAIRING_RSP);
++static void smp_send_security_req(struct smp_chan *smp, __u8 auth)
++{
++      struct smp_cmd_security_req cp;
+-      return 0;
++      cp.auth_req = auth;
++      smp_send_cmd(smp->conn, SMP_CMD_SECURITY_REQ, sizeof(cp), &cp);
++      SMP_ALLOW_CMD(smp, SMP_CMD_PAIRING_REQ);
++
++      clear_bit(SMP_FLAG_INITIATOR, &smp->flags);
+ }
+ int smp_conn_security(struct hci_conn *hcon, __u8 sec_level)
+@@ -2421,23 +2443,11 @@ int smp_conn_security(struct hci_conn *hcon, __u8 sec_level)
+                       authreq |= SMP_AUTH_MITM;
+       }
+-      if (hcon->role == HCI_ROLE_MASTER) {
+-              struct smp_cmd_pairing cp;
+-
+-              build_pairing_cmd(conn, &cp, NULL, authreq);
+-              smp->preq[0] = SMP_CMD_PAIRING_REQ;
+-              memcpy(&smp->preq[1], &cp, sizeof(cp));
+-
+-              smp_send_cmd(conn, SMP_CMD_PAIRING_REQ, sizeof(cp), &cp);
+-              SMP_ALLOW_CMD(smp, SMP_CMD_PAIRING_RSP);
+-      } else {
+-              struct smp_cmd_security_req cp;
+-              cp.auth_req = authreq;
+-              smp_send_cmd(conn, SMP_CMD_SECURITY_REQ, sizeof(cp), &cp);
+-              SMP_ALLOW_CMD(smp, SMP_CMD_PAIRING_REQ);
+-      }
++      if (hcon->role == HCI_ROLE_MASTER)
++              smp_send_pairing_req(smp, authreq);
++      else
++              smp_send_security_req(smp, authreq);
+-      set_bit(SMP_FLAG_INITIATOR, &smp->flags);
+       ret = 0;
+ unlock:
+@@ -2688,8 +2698,6 @@ static int smp_cmd_sign_info(struct l2cap_conn *conn, struct sk_buff *skb)
+ static u8 sc_select_method(struct smp_chan *smp)
+ {
+-      struct l2cap_conn *conn = smp->conn;
+-      struct hci_conn *hcon = conn->hcon;
+       struct smp_cmd_pairing *local, *remote;
+       u8 local_mitm, remote_mitm, local_io, remote_io, method;
+@@ -2702,7 +2710,7 @@ static u8 sc_select_method(struct smp_chan *smp)
+        * the "struct smp_cmd_pairing" from them we need to skip the
+        * first byte which contains the opcode.
+        */
+-      if (hcon->out) {
++      if (test_bit(SMP_FLAG_INITIATOR, &smp->flags)) {
+               local = (void *) &smp->preq[1];
+               remote = (void *) &smp->prsp[1];
+       } else {
+@@ -2771,7 +2779,7 @@ static int smp_cmd_public_key(struct l2cap_conn *conn, struct sk_buff *skb)
+       /* Non-initiating device sends its public key after receiving
+        * the key from the initiating device.
+        */
+-      if (!hcon->out) {
++      if (!test_bit(SMP_FLAG_INITIATOR, &smp->flags)) {
+               err = sc_send_public_key(smp);
+               if (err)
+                       return err;
+@@ -2833,7 +2841,7 @@ static int smp_cmd_public_key(struct l2cap_conn *conn, struct sk_buff *skb)
+       }
+       if (smp->method == REQ_OOB) {
+-              if (hcon->out)
++              if (test_bit(SMP_FLAG_INITIATOR, &smp->flags))
+                       smp_send_cmd(conn, SMP_CMD_PAIRING_RANDOM,
+                                    sizeof(smp->prnd), smp->prnd);
+@@ -2842,7 +2850,7 @@ static int smp_cmd_public_key(struct l2cap_conn *conn, struct sk_buff *skb)
+               return 0;
+       }
+-      if (hcon->out)
++      if (test_bit(SMP_FLAG_INITIATOR, &smp->flags))
+               SMP_ALLOW_CMD(smp, SMP_CMD_PAIRING_CONFIRM);
+       if (smp->method == REQ_PASSKEY) {
+@@ -2857,7 +2865,7 @@ static int smp_cmd_public_key(struct l2cap_conn *conn, struct sk_buff *skb)
+       /* The Initiating device waits for the non-initiating device to
+        * send the confirm value.
+        */
+-      if (conn->hcon->out)
++      if (test_bit(SMP_FLAG_INITIATOR, &smp->flags))
+               return 0;
+       err = smp_f4(smp->tfm_cmac, smp->local_pk, smp->remote_pk, smp->prnd,
+@@ -2891,7 +2899,7 @@ static int smp_cmd_dhkey_check(struct l2cap_conn *conn, struct sk_buff *skb)
+       a[6] = hcon->init_addr_type;
+       b[6] = hcon->resp_addr_type;
+-      if (hcon->out) {
++      if (test_bit(SMP_FLAG_INITIATOR, &smp->flags)) {
+               local_addr = a;
+               remote_addr = b;
+               memcpy(io_cap, &smp->prsp[1], 3);
+@@ -2916,7 +2924,7 @@ static int smp_cmd_dhkey_check(struct l2cap_conn *conn, struct sk_buff *skb)
+       if (crypto_memneq(check->e, e, 16))
+               return SMP_DHKEY_CHECK_FAILED;
+-      if (!hcon->out) {
++      if (!test_bit(SMP_FLAG_INITIATOR, &smp->flags)) {
+               if (test_bit(SMP_FLAG_WAIT_USER, &smp->flags)) {
+                       set_bit(SMP_FLAG_DHKEY_PENDING, &smp->flags);
+                       return 0;
+@@ -2928,7 +2936,7 @@ static int smp_cmd_dhkey_check(struct l2cap_conn *conn, struct sk_buff *skb)
+       sc_add_ltk(smp);
+-      if (hcon->out) {
++      if (test_bit(SMP_FLAG_INITIATOR, &smp->flags)) {
+               hci_le_start_enc(hcon, 0, 0, smp->tk, smp->enc_key_size);
+               hcon->enc_key_size = smp->enc_key_size;
+       }
+@@ -3077,7 +3085,6 @@ static void bredr_pairing(struct l2cap_chan *chan)
+       struct l2cap_conn *conn = chan->conn;
+       struct hci_conn *hcon = conn->hcon;
+       struct hci_dev *hdev = hcon->hdev;
+-      struct smp_cmd_pairing req;
+       struct smp_chan *smp;
+       bt_dev_dbg(hdev, "chan %p", chan);
+@@ -3129,14 +3136,7 @@ static void bredr_pairing(struct l2cap_chan *chan)
+       bt_dev_dbg(hdev, "starting SMP over BR/EDR");
+-      /* Prepare and send the BR/EDR SMP Pairing Request */
+-      build_bredr_pairing_cmd(smp, &req, NULL);
+-
+-      smp->preq[0] = SMP_CMD_PAIRING_REQ;
+-      memcpy(&smp->preq[1], &req, sizeof(req));
+-
+-      smp_send_cmd(conn, SMP_CMD_PAIRING_REQ, sizeof(req), &req);
+-      SMP_ALLOW_CMD(smp, SMP_CMD_PAIRING_RSP);
++      smp_send_pairing_req(smp, 0x00);
+ }
+ static void smp_resume_cb(struct l2cap_chan *chan)
+-- 
+2.43.0
+
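
The core of the change is that "initiator" is tracked as an SMP flag set or cleared at the points where the PDUs are sent, rather than inferred from the HCI role. A minimal userspace C sketch of that flag handling, with hypothetical names mirroring the kernel's:

#include <stdio.h>

/* The SMP initiator is whoever sends the Pairing Request, not
 * whoever holds the HCI Central (master) role. */
#define SMP_FLAG_INITIATOR	(1u << 0)

struct smp_chan { unsigned int flags; };

static void send_pairing_req(struct smp_chan *smp)
{
	smp->flags |= SMP_FLAG_INITIATOR;	/* we initiate the pairing */
}

static void send_security_req(struct smp_chan *smp)
{
	smp->flags &= ~SMP_FLAG_INITIATOR;	/* the peer will initiate */
}

int main(void)
{
	struct smp_chan central = { 0 };

	/* A Central that sends a Security Request is *not* the
	 * initiator, even though its HCI role is master. */
	send_security_req(&central);
	printf("initiator: %s\n",
	       (central.flags & SMP_FLAG_INITIATOR) ? "yes" : "no");

	send_pairing_req(&central);	/* now it actually initiates */
	printf("initiator: %s\n",
	       (central.flags & SMP_FLAG_INITIATOR) ? "yes" : "no");
	return 0;
}
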
diff --git a/queue-6.1/bonding-fix-bond_ipsec_offload_ok-return-type.patch b/queue-6.1/bonding-fix-bond_ipsec_offload_ok-return-type.patch
new file mode 100644 (file)
index 0000000..75708e2
--- /dev/null
@@ -0,0 +1,68 @@
+From a9032efb26139f75ee491fcc4702e68f835df48a Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Fri, 16 Aug 2024 14:48:10 +0300
+Subject: bonding: fix bond_ipsec_offload_ok return type
+
+From: Nikolay Aleksandrov <razor@blackwall.org>
+
+[ Upstream commit fc59b9a5f7201b9f7272944596113a82cc7773d5 ]
+
+Fix the return type, which should be bool.
+
+Fixes: 955b785ec6b3 ("bonding: fix suspicious RCU usage in bond_ipsec_offload_ok()")
+Signed-off-by: Nikolay Aleksandrov <razor@blackwall.org>
+Reviewed-by: Hangbin Liu <liuhangbin@gmail.com>
+Signed-off-by: Paolo Abeni <pabeni@redhat.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/net/bonding/bond_main.c | 18 ++++++------------
+ 1 file changed, 6 insertions(+), 12 deletions(-)
+
+diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
+index be5348d0b22e5..5c7b249a1bcfa 100644
+--- a/drivers/net/bonding/bond_main.c
++++ b/drivers/net/bonding/bond_main.c
+@@ -595,34 +595,28 @@ static bool bond_ipsec_offload_ok(struct sk_buff *skb, struct xfrm_state *xs)
+       struct net_device *real_dev;
+       struct slave *curr_active;
+       struct bonding *bond;
+-      int err;
++      bool ok = false;
+       bond = netdev_priv(bond_dev);
+       rcu_read_lock();
+       curr_active = rcu_dereference(bond->curr_active_slave);
+       real_dev = curr_active->dev;
+-      if (BOND_MODE(bond) != BOND_MODE_ACTIVEBACKUP) {
+-              err = false;
++      if (BOND_MODE(bond) != BOND_MODE_ACTIVEBACKUP)
+               goto out;
+-      }
+-      if (!xs->xso.real_dev) {
+-              err = false;
++      if (!xs->xso.real_dev)
+               goto out;
+-      }
+       if (!real_dev->xfrmdev_ops ||
+           !real_dev->xfrmdev_ops->xdo_dev_offload_ok ||
+-          netif_is_bond_master(real_dev)) {
+-              err = false;
++          netif_is_bond_master(real_dev))
+               goto out;
+-      }
+-      err = real_dev->xfrmdev_ops->xdo_dev_offload_ok(skb, xs);
++      ok = real_dev->xfrmdev_ops->xdo_dev_offload_ok(skb, xs);
+ out:
+       rcu_read_unlock();
+-      return err;
++      return ok;
+ }
+ static const struct xfrmdev_ops bond_xfrmdev_ops = {
+-- 
+2.43.0
+
diff --git a/queue-6.1/bonding-fix-null-pointer-deref-in-bond_ipsec_offload.patch b/queue-6.1/bonding-fix-null-pointer-deref-in-bond_ipsec_offload.patch
new file mode 100644 (file)
index 0000000..28271de
--- /dev/null
@@ -0,0 +1,37 @@
+From 7abf778de24c2cc4f31846d32dd3196e27451227 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Fri, 16 Aug 2024 14:48:11 +0300
+Subject: bonding: fix null pointer deref in bond_ipsec_offload_ok
+
+From: Nikolay Aleksandrov <razor@blackwall.org>
+
+[ Upstream commit 95c90e4ad89d493a7a14fa200082e466e2548f9d ]
+
+We must check if there is an active slave before dereferencing the pointer.
+
+Fixes: 18cb261afd7b ("bonding: support hardware encryption offload to slaves")
+Signed-off-by: Nikolay Aleksandrov <razor@blackwall.org>
+Reviewed-by: Hangbin Liu <liuhangbin@gmail.com>
+Reviewed-by: Eric Dumazet <edumazet@google.com>
+Signed-off-by: Paolo Abeni <pabeni@redhat.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/net/bonding/bond_main.c | 2 ++
+ 1 file changed, 2 insertions(+)
+
+diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
+index 5c7b249a1bcfa..2414861e04eb0 100644
+--- a/drivers/net/bonding/bond_main.c
++++ b/drivers/net/bonding/bond_main.c
+@@ -600,6 +600,8 @@ static bool bond_ipsec_offload_ok(struct sk_buff *skb, struct xfrm_state *xs)
+       bond = netdev_priv(bond_dev);
+       rcu_read_lock();
+       curr_active = rcu_dereference(bond->curr_active_slave);
++      if (!curr_active)
++              goto out;
+       real_dev = curr_active->dev;
+       if (BOND_MODE(bond) != BOND_MODE_ACTIVEBACKUP)
+-- 
+2.43.0
+
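
A minimal userspace C reduction of the bug, with hypothetical stand-in types: the active-slave pointer can legitimately be NULL (e.g. during failover), so it must be checked before its ->dev is read.

#include <stdio.h>
#include <stdbool.h>
#include <stddef.h>

struct slave   { const char *name; };
struct bonding { struct slave *curr_active_slave; };

static bool offload_ok(struct bonding *bond)
{
	/* rcu_dereference() in the kernel; may yield NULL while the
	 * bond has no active slave. */
	struct slave *curr = bond->curr_active_slave;

	if (!curr)		/* the added check */
		return false;

	printf("offloading via %s\n", curr->name);
	return true;
}

int main(void)
{
	struct bonding bond = { .curr_active_slave = NULL };

	printf("ok = %d\n", offload_ok(&bond));	/* safely 0, no crash */
	return 0;
}
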
diff --git a/queue-6.1/bonding-fix-xfrm-real_dev-null-pointer-dereference.patch b/queue-6.1/bonding-fix-xfrm-real_dev-null-pointer-dereference.patch
new file mode 100644 (file)
index 0000000..d6f6d45
--- /dev/null
@@ -0,0 +1,81 @@
+From f396657cefa68e23285f641a23b72f36ad935e1a Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Fri, 16 Aug 2024 14:48:12 +0300
+Subject: bonding: fix xfrm real_dev null pointer dereference
+
+From: Nikolay Aleksandrov <razor@blackwall.org>
+
+[ Upstream commit f8cde9805981c50d0c029063dc7d82821806fc44 ]
+
+We shouldn't set real_dev to NULL because packets can be in transit and
+xfrm might call xdo_dev_offload_ok() in parallel. All callbacks assume
+real_dev is set.
+
+ Example trace:
+ kernel: BUG: unable to handle page fault for address: 0000000000001030
+ kernel: bond0: (slave eni0np1): making interface the new active one
+ kernel: #PF: supervisor write access in kernel mode
+ kernel: #PF: error_code(0x0002) - not-present page
+ kernel: PGD 0 P4D 0
+ kernel: Oops: 0002 [#1] PREEMPT SMP
+ kernel: CPU: 4 PID: 2237 Comm: ping Not tainted 6.7.7+ #12
+ kernel: Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 1.16.3-2.fc40 04/01/2014
+ kernel: RIP: 0010:nsim_ipsec_offload_ok+0xc/0x20 [netdevsim]
+ kernel: bond0: (slave eni0np1): bond_ipsec_add_sa_all: failed to add SA
+ kernel: Code: e0 0f 0b 48 83 7f 38 00 74 de 0f 0b 48 8b 47 08 48 8b 37 48 8b 78 40 e9 b2 e5 9a d7 66 90 0f 1f 44 00 00 48 8b 86 80 02 00 00 <83> 80 30 10 00 00 01 b8 01 00 00 00 c3 0f 1f 80 00 00 00 00 0f 1f
+ kernel: bond0: (slave eni0np1): making interface the new active one
+ kernel: RSP: 0018:ffffabde81553b98 EFLAGS: 00010246
+ kernel: bond0: (slave eni0np1): bond_ipsec_add_sa_all: failed to add SA
+ kernel:
+ kernel: RAX: 0000000000000000 RBX: ffff9eb404e74900 RCX: ffff9eb403d97c60
+ kernel: RDX: ffffffffc090de10 RSI: ffff9eb404e74900 RDI: ffff9eb3c5de9e00
+ kernel: RBP: ffff9eb3c0a42000 R08: 0000000000000010 R09: 0000000000000014
+ kernel: R10: 7974203030303030 R11: 3030303030303030 R12: 0000000000000000
+ kernel: R13: ffff9eb3c5de9e00 R14: ffffabde81553cc8 R15: ffff9eb404c53000
+ kernel: FS:  00007f2a77a3ad00(0000) GS:ffff9eb43bd00000(0000) knlGS:0000000000000000
+ kernel: CS:  0010 DS: 0000 ES: 0000 CR0: 0000000080050033
+ kernel: CR2: 0000000000001030 CR3: 00000001122ab000 CR4: 0000000000350ef0
+ kernel: bond0: (slave eni0np1): making interface the new active one
+ kernel: Call Trace:
+ kernel:  <TASK>
+ kernel:  ? __die+0x1f/0x60
+ kernel: bond0: (slave eni0np1): bond_ipsec_add_sa_all: failed to add SA
+ kernel:  ? page_fault_oops+0x142/0x4c0
+ kernel:  ? do_user_addr_fault+0x65/0x670
+ kernel:  ? kvm_read_and_reset_apf_flags+0x3b/0x50
+ kernel: bond0: (slave eni0np1): making interface the new active one
+ kernel:  ? exc_page_fault+0x7b/0x180
+ kernel:  ? asm_exc_page_fault+0x22/0x30
+ kernel:  ? nsim_bpf_uninit+0x50/0x50 [netdevsim]
+ kernel: bond0: (slave eni0np1): bond_ipsec_add_sa_all: failed to add SA
+ kernel:  ? nsim_ipsec_offload_ok+0xc/0x20 [netdevsim]
+ kernel: bond0: (slave eni0np1): making interface the new active one
+ kernel:  bond_ipsec_offload_ok+0x7b/0x90 [bonding]
+ kernel:  xfrm_output+0x61/0x3b0
+ kernel: bond0: (slave eni0np1): bond_ipsec_add_sa_all: failed to add SA
+ kernel:  ip_push_pending_frames+0x56/0x80
+
+Fixes: 18cb261afd7b ("bonding: support hardware encryption offload to slaves")
+Signed-off-by: Nikolay Aleksandrov <razor@blackwall.org>
+Reviewed-by: Hangbin Liu <liuhangbin@gmail.com>
+Signed-off-by: Paolo Abeni <pabeni@redhat.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/net/bonding/bond_main.c | 1 -
+ 1 file changed, 1 deletion(-)
+
+diff --git a/drivers/net/bonding/bond_main.c b/drivers/net/bonding/bond_main.c
+index 2414861e04eb0..c218352814430 100644
+--- a/drivers/net/bonding/bond_main.c
++++ b/drivers/net/bonding/bond_main.c
+@@ -578,7 +578,6 @@ static void bond_ipsec_del_sa_all(struct bonding *bond)
+               } else {
+                       slave->dev->xfrmdev_ops->xdo_dev_state_delete(ipsec->xs);
+               }
+-              ipsec->xs->xso.real_dev = NULL;
+       }
+       spin_unlock_bh(&bond->ipsec_lock);
+       rcu_read_unlock();
+-- 
+2.43.0
+
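
A minimal userspace C reduction of the race, again with hypothetical stand-in types: clearing a pointer that concurrent readers (here, xfrm invoking the offload callbacks for packets still in transit) may dereference is exactly what the removed line did.

#include <stdio.h>

struct net_device_stub	{ const char *name; };
struct xso_stub		{ struct net_device_stub *real_dev; };

/* All offload callbacks assume real_dev is set. */
static void offload_ok(struct xso_stub *xso)
{
	printf("offload via %s\n", xso->real_dev->name);	/* crashes if NULL */
}

int main(void)
{
	struct net_device_stub dev = { "eni0np1" };
	struct xso_stub xso = { &dev };

	/* xso.real_dev = NULL;  -- the removed line; with it, a packet
	 * still in transit would hit the NULL dereference above. */
	offload_ok(&xso);
	return 0;
}
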
diff --git a/queue-6.1/bonding-fix-xfrm-state-handling-when-clearing-active.patch b/queue-6.1/bonding-fix-xfrm-state-handling-when-clearing-active.patch
new file mode 100644 (file)
index 0000000..ace480a
--- /dev/null
@@ -0,0 +1,43 @@
+From 463fbe98cdf1625749a0ea6d1c32220699e2f9d3 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Fri, 16 Aug 2024 14:48:13 +0300
+Subject: bonding: fix xfrm state handling when clearing active slave
+
+From: Nikolay Aleksandrov <razor@blackwall.org>
+
+[ Upstream commit c4c5c5d2ef40a9f67a9241dc5422eac9ffe19547 ]
+
+If the active slave is cleared manually, the xfrm state is not flushed.
+This leads to an xfrm add/del imbalance and to adding the same state
+multiple times. For example, when the device cannot handle any more
+states we get:
+ [ 1169.884811] bond0: (slave eni0np1): bond_ipsec_add_sa_all: failed to add SA
+because it's filled with the same state after multiple active slave
+clearings. This change also has a few nice side effects: user-space
+gets a notification for the change, the old device gets its mac address
+and promisc/mcast adjusted properly.
+
+Fixes: 18cb261afd7b ("bonding: support hardware encryption offload to slaves")
+Signed-off-by: Nikolay Aleksandrov <razor@blackwall.org>
+Reviewed-by: Hangbin Liu <liuhangbin@gmail.com>
+Signed-off-by: Paolo Abeni <pabeni@redhat.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/net/bonding/bond_options.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+diff --git a/drivers/net/bonding/bond_options.c b/drivers/net/bonding/bond_options.c
+index 685fb4703ee1f..06c4cd0f00024 100644
+--- a/drivers/net/bonding/bond_options.c
++++ b/drivers/net/bonding/bond_options.c
+@@ -932,7 +932,7 @@ static int bond_option_active_slave_set(struct bonding *bond,
+       /* check to see if we are clearing active */
+       if (!slave_dev) {
+               netdev_dbg(bond->dev, "Clearing current active slave\n");
+-              RCU_INIT_POINTER(bond->curr_active_slave, NULL);
++              bond_change_active_slave(bond, NULL);
+               bond_select_active_slave(bond);
+       } else {
+               struct slave *old_active = rtnl_dereference(bond->curr_active_slave);
+-- 
+2.43.0
+
diff --git a/queue-6.1/dpaa2-switch-fix-error-checking-in-dpaa2_switch_seed.patch b/queue-6.1/dpaa2-switch-fix-error-checking-in-dpaa2_switch_seed.patch
new file mode 100644 (file)
index 0000000..f852ba1
--- /dev/null
@@ -0,0 +1,56 @@
+From 51a05e96516e03c3789924b69655fdbfa0e8cb0c Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Sat, 17 Aug 2024 09:52:46 +0300
+Subject: dpaa2-switch: Fix error checking in dpaa2_switch_seed_bp()
+
+From: Dan Carpenter <dan.carpenter@linaro.org>
+
+[ Upstream commit c50e7475961c36ec4d21d60af055b32f9436b431 ]
+
+The dpaa2_switch_add_bufs() function returns the number of bufs that it
+was able to add.  It returns BUFS_PER_CMD (7) for complete success or a
+smaller number if there are not enough pages available.  However, the
+error checking is looking at the total number of bufs instead of the
+number which were added on this iteration.  Thus the error checking
+only works correctly for the first iteration through the loop and
+subsequent iterations are always counted as a success.
+
+Fix this by checking only the bufs added in the current iteration.
+
+Fixes: 0b1b71370458 ("staging: dpaa2-switch: handle Rx path on control interface")
+Signed-off-by: Dan Carpenter <dan.carpenter@linaro.org>
+Reviewed-by: Simon Horman <horms@kernel.org>
+Reviewed-by: Ioana Ciornei <ioana.ciornei@nxp.com>
+Tested-by: Ioana Ciornei <ioana.ciornei@nxp.com>
+Link: https://patch.msgid.link/eec27f30-b43f-42b6-b8ee-04a6f83423b6@stanley.mountain
+Signed-off-by: Jakub Kicinski <kuba@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/net/ethernet/freescale/dpaa2/dpaa2-switch.c | 7 ++++---
+ 1 file changed, 4 insertions(+), 3 deletions(-)
+
+diff --git a/drivers/net/ethernet/freescale/dpaa2/dpaa2-switch.c b/drivers/net/ethernet/freescale/dpaa2/dpaa2-switch.c
+index b98ef4ba172f6..d6c871f227947 100644
+--- a/drivers/net/ethernet/freescale/dpaa2/dpaa2-switch.c
++++ b/drivers/net/ethernet/freescale/dpaa2/dpaa2-switch.c
+@@ -2583,13 +2583,14 @@ static int dpaa2_switch_refill_bp(struct ethsw_core *ethsw)
+ static int dpaa2_switch_seed_bp(struct ethsw_core *ethsw)
+ {
+-      int *count, i;
++      int *count, ret, i;
+       for (i = 0; i < DPAA2_ETHSW_NUM_BUFS; i += BUFS_PER_CMD) {
++              ret = dpaa2_switch_add_bufs(ethsw, ethsw->bpid);
+               count = &ethsw->buf_count;
+-              *count += dpaa2_switch_add_bufs(ethsw, ethsw->bpid);
++              *count += ret;
+-              if (unlikely(*count < BUFS_PER_CMD))
++              if (unlikely(ret < BUFS_PER_CMD))
+                       return -ENOMEM;
+       }
+-- 
+2.43.0
+
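
The bug is a general pattern: checking a running total instead of the per-iteration return value. A minimal userspace C sketch, where the stub function and its failure mode are hypothetical:

#include <stdio.h>

#define BUFS_PER_CMD	7
#define NUM_BUFS	(BUFS_PER_CMD * 3)

/* Stub: pretend the second call can only add 5 of 7 buffers. */
static int add_bufs(int iteration)
{
	return iteration == 1 ? 5 : BUFS_PER_CMD;
}

int main(void)
{
	int count = 0;

	for (int i = 0; i * BUFS_PER_CMD < NUM_BUFS; i++) {
		int ret = add_bufs(i);

		count += ret;

		/* Buggy form: `if (count < BUFS_PER_CMD)` only catches
		 * a short first iteration - the running total hides
		 * later failures. The per-iteration return does not. */
		if (ret < BUFS_PER_CMD) {
			printf("iteration %d short: %d/%d\n",
			       i, ret, BUFS_PER_CMD);
			return -1;
		}
	}
	printf("seeded %d buffers\n", count);
	return 0;
}
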
diff --git a/queue-6.1/drm-amdkfd-reserve-the-bo-before-validating-it.patch b/queue-6.1/drm-amdkfd-reserve-the-bo-before-validating-it.patch
new file mode 100644 (file)
index 0000000..3c07dbb
--- /dev/null
@@ -0,0 +1,126 @@
+From a801782e970f078f6318a74ceada0d31c9ea3815 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Thu, 11 Jan 2024 12:27:07 +0800
+Subject: drm/amdkfd: reserve the BO before validating it
+
+From: Lang Yu <Lang.Yu@amd.com>
+
+[ Upstream commit 0c93bd49576677ae1a18817d5ec000ef031d5187 ]
+
+Fix a ttm_bo_validate() warning (see the trace below).
+
+v2: Avoid unmapping attachment repeatedly when ERESTARTSYS.
+
+v3: Lock the BO before accessing ttm->sg to avoid race conditions.(Felix)
+
+[   41.708711] WARNING: CPU: 0 PID: 1463 at drivers/gpu/drm/ttm/ttm_bo.c:846 ttm_bo_validate+0x146/0x1b0 [ttm]
+[   41.708989] Call Trace:
+[   41.708992]  <TASK>
+[   41.708996]  ? show_regs+0x6c/0x80
+[   41.709000]  ? ttm_bo_validate+0x146/0x1b0 [ttm]
+[   41.709008]  ? __warn+0x93/0x190
+[   41.709014]  ? ttm_bo_validate+0x146/0x1b0 [ttm]
+[   41.709024]  ? report_bug+0x1f9/0x210
+[   41.709035]  ? handle_bug+0x46/0x80
+[   41.709041]  ? exc_invalid_op+0x1d/0x80
+[   41.709048]  ? asm_exc_invalid_op+0x1f/0x30
+[   41.709057]  ? amdgpu_amdkfd_gpuvm_dmaunmap_mem+0x2c/0x80 [amdgpu]
+[   41.709185]  ? ttm_bo_validate+0x146/0x1b0 [ttm]
+[   41.709197]  ? amdgpu_amdkfd_gpuvm_dmaunmap_mem+0x2c/0x80 [amdgpu]
+[   41.709337]  ? srso_alias_return_thunk+0x5/0x7f
+[   41.709346]  kfd_mem_dmaunmap_attachment+0x9e/0x1e0 [amdgpu]
+[   41.709467]  amdgpu_amdkfd_gpuvm_dmaunmap_mem+0x56/0x80 [amdgpu]
+[   41.709586]  kfd_ioctl_unmap_memory_from_gpu+0x1b7/0x300 [amdgpu]
+[   41.709710]  kfd_ioctl+0x1ec/0x650 [amdgpu]
+[   41.709822]  ? __pfx_kfd_ioctl_unmap_memory_from_gpu+0x10/0x10 [amdgpu]
+[   41.709945]  ? srso_alias_return_thunk+0x5/0x7f
+[   41.709949]  ? tomoyo_file_ioctl+0x20/0x30
+[   41.709959]  __x64_sys_ioctl+0x9c/0xd0
+[   41.709967]  do_syscall_64+0x3f/0x90
+[   41.709973]  entry_SYSCALL_64_after_hwframe+0x6e/0xd8
+
+Fixes: 101b8104307e ("drm/amdkfd: Move dma unmapping after TLB flush")
+Signed-off-by: Lang Yu <Lang.Yu@amd.com>
+Reviewed-by: Felix Kuehling <Felix.Kuehling@amd.com>
+Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h    |  2 +-
+ .../gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c  | 20 ++++++++++++++++---
+ drivers/gpu/drm/amd/amdkfd/kfd_chardev.c      |  4 +++-
+ 3 files changed, 21 insertions(+), 5 deletions(-)
+
+diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
+index 585d608c10e8e..4b694886715cf 100644
+--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
++++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
+@@ -286,7 +286,7 @@ int amdgpu_amdkfd_gpuvm_map_memory_to_gpu(struct amdgpu_device *adev,
+                                         struct kgd_mem *mem, void *drm_priv);
+ int amdgpu_amdkfd_gpuvm_unmap_memory_from_gpu(
+               struct amdgpu_device *adev, struct kgd_mem *mem, void *drm_priv);
+-void amdgpu_amdkfd_gpuvm_dmaunmap_mem(struct kgd_mem *mem, void *drm_priv);
++int amdgpu_amdkfd_gpuvm_dmaunmap_mem(struct kgd_mem *mem, void *drm_priv);
+ int amdgpu_amdkfd_gpuvm_sync_memory(
+               struct amdgpu_device *adev, struct kgd_mem *mem, bool intr);
+ int amdgpu_amdkfd_gpuvm_map_gtt_bo_to_kernel(struct kgd_mem *mem,
+diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
+index 3e7f4d8dc9d13..d486f5dc052e4 100644
+--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
++++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
+@@ -2025,21 +2025,35 @@ int amdgpu_amdkfd_gpuvm_map_memory_to_gpu(
+       return ret;
+ }
+-void amdgpu_amdkfd_gpuvm_dmaunmap_mem(struct kgd_mem *mem, void *drm_priv)
++int amdgpu_amdkfd_gpuvm_dmaunmap_mem(struct kgd_mem *mem, void *drm_priv)
+ {
+       struct kfd_mem_attachment *entry;
+       struct amdgpu_vm *vm;
++      int ret;
+       vm = drm_priv_to_vm(drm_priv);
+       mutex_lock(&mem->lock);
++      ret = amdgpu_bo_reserve(mem->bo, true);
++      if (ret)
++              goto out;
++
+       list_for_each_entry(entry, &mem->attachments, list) {
+-              if (entry->bo_va->base.vm == vm)
+-                      kfd_mem_dmaunmap_attachment(mem, entry);
++              if (entry->bo_va->base.vm != vm)
++                      continue;
++              if (entry->bo_va->base.bo->tbo.ttm &&
++                  !entry->bo_va->base.bo->tbo.ttm->sg)
++                      continue;
++
++              kfd_mem_dmaunmap_attachment(mem, entry);
+       }
++      amdgpu_bo_unreserve(mem->bo);
++out:
+       mutex_unlock(&mem->lock);
++
++      return ret;
+ }
+ int amdgpu_amdkfd_gpuvm_unmap_memory_from_gpu(
+diff --git a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
+index 2b21ce967e766..e3cd66c4d95d8 100644
+--- a/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
++++ b/drivers/gpu/drm/amd/amdkfd/kfd_chardev.c
+@@ -1410,7 +1410,9 @@ static int kfd_ioctl_unmap_memory_from_gpu(struct file *filep,
+                       kfd_flush_tlb(peer_pdd, TLB_FLUSH_HEAVYWEIGHT);
+               /* Remove dma mapping after tlb flush to avoid IO_PAGE_FAULT */
+-              amdgpu_amdkfd_gpuvm_dmaunmap_mem(mem, peer_pdd->drm_priv);
++              err = amdgpu_amdkfd_gpuvm_dmaunmap_mem(mem, peer_pdd->drm_priv);
++              if (err)
++                      goto sync_memory_failed;
+       }
+       mutex_unlock(&p->mutex);
+-- 
+2.43.0
+
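
The shape of the fix is the usual reserve/act/unreserve skeleton with a failed reservation propagated to the caller instead of being ignored. A minimal userspace C sketch with hypothetical stubs standing in for the amdgpu helpers:

#include <stdio.h>

static int bo_reserve(int interruptible)
{
	(void)interruptible;
	return 0;			/* may fail, e.g. -ERESTARTSYS */
}

static void bo_unreserve(void) { }

static void dmaunmap_attachment(int i)
{
	printf("unmap attachment %d\n", i);
}

static int dmaunmap_mem(void)
{
	int ret;

	ret = bo_reserve(1);	/* take the lock *before* touching BO state */
	if (ret)
		goto out;	/* ...and tell the caller if we could not */

	for (int i = 0; i < 2; i++)
		dmaunmap_attachment(i);

	bo_unreserve();
out:
	return ret;
}

int main(void)
{
	return dmaunmap_mem();
}
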
diff --git a/queue-6.1/ice-add-xdp_buff-to-ice_rx_ring-struct.patch b/queue-6.1/ice-add-xdp_buff-to-ice_rx_ring-struct.patch
new file mode 100644 (file)
index 0000000..9098621
--- /dev/null
@@ -0,0 +1,157 @@
+From f7b90d53f5a829f4e859a3ec164b81c6716805d6 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Tue, 31 Jan 2023 21:44:55 +0100
+Subject: ice: Add xdp_buff to ice_rx_ring struct
+
+From: Maciej Fijalkowski <maciej.fijalkowski@intel.com>
+
+[ Upstream commit cb0473e0e9dccaa0ddafb252f2c3ef943b86bb56 ]
+
+In preparation for XDP multi-buffer support, let's store the xdp_buff on
+the Rx ring struct. This will allow us to combine fragmented frames across
+separate NAPI cycles in the same way skb fragments are currently handled.
+This means the skb pointer on the Rx ring will become redundant and will
+be removed. For now it is kept; the layout of the Rx ring struct was not
+revisited, as some member movement will be needed later on, and that will
+be the time to take care of it.
+
+Signed-off-by: Maciej Fijalkowski <maciej.fijalkowski@intel.com>
+Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
+Reviewed-by: Alexander Lobakin <alexandr.lobakin@intel.com>
+Link: https://lore.kernel.org/bpf/20230131204506.219292-3-maciej.fijalkowski@intel.com
+Stable-dep-of: 50b2143356e8 ("ice: fix page reuse when PAGE_SIZE is over 8k")
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/net/ethernet/intel/ice/ice_base.c |  1 +
+ drivers/net/ethernet/intel/ice/ice_txrx.c | 39 +++++++++++++----------
+ drivers/net/ethernet/intel/ice/ice_txrx.h |  1 +
+ 3 files changed, 25 insertions(+), 16 deletions(-)
+
+diff --git a/drivers/net/ethernet/intel/ice/ice_base.c b/drivers/net/ethernet/intel/ice/ice_base.c
+index c7c6f01538e0d..4db4ec4b8857a 100644
+--- a/drivers/net/ethernet/intel/ice/ice_base.c
++++ b/drivers/net/ethernet/intel/ice/ice_base.c
+@@ -534,6 +534,7 @@ int ice_vsi_cfg_rxq(struct ice_rx_ring *ring)
+               }
+       }
++      xdp_init_buff(&ring->xdp, ice_rx_pg_size(ring) / 2, &ring->xdp_rxq);
+       err = ice_setup_rx_ctx(ring);
+       if (err) {
+               dev_err(dev, "ice_setup_rx_ctx failed for RxQ %d, err %d\n",
+diff --git a/drivers/net/ethernet/intel/ice/ice_txrx.c b/drivers/net/ethernet/intel/ice/ice_txrx.c
+index 2db20263420d8..d3411170f3eaf 100644
+--- a/drivers/net/ethernet/intel/ice/ice_txrx.c
++++ b/drivers/net/ethernet/intel/ice/ice_txrx.c
+@@ -523,8 +523,16 @@ int ice_setup_rx_ring(struct ice_rx_ring *rx_ring)
+       return -ENOMEM;
+ }
++/**
++ * ice_rx_frame_truesize
++ * @rx_ring: ptr to Rx ring
++ * @size: size
++ *
++ * calculate the truesize, taking into account the PAGE_SIZE of the
++ * underlying arch
++ */
+ static unsigned int
+-ice_rx_frame_truesize(struct ice_rx_ring *rx_ring, unsigned int __maybe_unused size)
++ice_rx_frame_truesize(struct ice_rx_ring *rx_ring, const unsigned int size)
+ {
+       unsigned int truesize;
+@@ -1103,21 +1111,20 @@ ice_is_non_eop(struct ice_rx_ring *rx_ring, union ice_32b_rx_flex_desc *rx_desc)
+  */
+ int ice_clean_rx_irq(struct ice_rx_ring *rx_ring, int budget)
+ {
+-      unsigned int total_rx_bytes = 0, total_rx_pkts = 0, frame_sz = 0;
++      unsigned int total_rx_bytes = 0, total_rx_pkts = 0;
+       u16 cleaned_count = ICE_DESC_UNUSED(rx_ring);
+       unsigned int offset = rx_ring->rx_offset;
++      struct xdp_buff *xdp = &rx_ring->xdp;
+       struct ice_tx_ring *xdp_ring = NULL;
+       unsigned int xdp_res, xdp_xmit = 0;
+       struct sk_buff *skb = rx_ring->skb;
+       struct bpf_prog *xdp_prog = NULL;
+-      struct xdp_buff xdp;
+       bool failure;
+       /* Frame size depend on rx_ring setup when PAGE_SIZE=4K */
+ #if (PAGE_SIZE < 8192)
+-      frame_sz = ice_rx_frame_truesize(rx_ring, 0);
++      xdp->frame_sz = ice_rx_frame_truesize(rx_ring, 0);
+ #endif
+-      xdp_init_buff(&xdp, frame_sz, &rx_ring->xdp_rxq);
+       xdp_prog = READ_ONCE(rx_ring->xdp_prog);
+       if (xdp_prog)
+@@ -1171,30 +1178,30 @@ int ice_clean_rx_irq(struct ice_rx_ring *rx_ring, int budget)
+               rx_buf = ice_get_rx_buf(rx_ring, size, &rx_buf_pgcnt);
+               if (!size) {
+-                      xdp.data = NULL;
+-                      xdp.data_end = NULL;
+-                      xdp.data_hard_start = NULL;
+-                      xdp.data_meta = NULL;
++                      xdp->data = NULL;
++                      xdp->data_end = NULL;
++                      xdp->data_hard_start = NULL;
++                      xdp->data_meta = NULL;
+                       goto construct_skb;
+               }
+               hard_start = page_address(rx_buf->page) + rx_buf->page_offset -
+                            offset;
+-              xdp_prepare_buff(&xdp, hard_start, offset, size, !!offset);
++              xdp_prepare_buff(xdp, hard_start, offset, size, !!offset);
+ #if (PAGE_SIZE > 4096)
+               /* At larger PAGE_SIZE, frame_sz depend on len size */
+-              xdp.frame_sz = ice_rx_frame_truesize(rx_ring, size);
++              xdp->frame_sz = ice_rx_frame_truesize(rx_ring, size);
+ #endif
+               if (!xdp_prog)
+                       goto construct_skb;
+-              xdp_res = ice_run_xdp(rx_ring, &xdp, xdp_prog, xdp_ring);
++              xdp_res = ice_run_xdp(rx_ring, xdp, xdp_prog, xdp_ring);
+               if (!xdp_res)
+                       goto construct_skb;
+               if (xdp_res & (ICE_XDP_TX | ICE_XDP_REDIR)) {
+                       xdp_xmit |= xdp_res;
+-                      ice_rx_buf_adjust_pg_offset(rx_buf, xdp.frame_sz);
++                      ice_rx_buf_adjust_pg_offset(rx_buf, xdp->frame_sz);
+               } else {
+                       rx_buf->pagecnt_bias++;
+               }
+@@ -1207,11 +1214,11 @@ int ice_clean_rx_irq(struct ice_rx_ring *rx_ring, int budget)
+ construct_skb:
+               if (skb) {
+                       ice_add_rx_frag(rx_ring, rx_buf, skb, size);
+-              } else if (likely(xdp.data)) {
++              } else if (likely(xdp->data)) {
+                       if (ice_ring_uses_build_skb(rx_ring))
+-                              skb = ice_build_skb(rx_ring, rx_buf, &xdp);
++                              skb = ice_build_skb(rx_ring, rx_buf, xdp);
+                       else
+-                              skb = ice_construct_skb(rx_ring, rx_buf, &xdp);
++                              skb = ice_construct_skb(rx_ring, rx_buf, xdp);
+               }
+               /* exit if we failed to retrieve a buffer */
+               if (!skb) {
+diff --git a/drivers/net/ethernet/intel/ice/ice_txrx.h b/drivers/net/ethernet/intel/ice/ice_txrx.h
+index bfbe4b16df96a..ef8245f795c5b 100644
+--- a/drivers/net/ethernet/intel/ice/ice_txrx.h
++++ b/drivers/net/ethernet/intel/ice/ice_txrx.h
+@@ -295,6 +295,7 @@ struct ice_rx_ring {
+       struct bpf_prog *xdp_prog;
+       struct ice_tx_ring *xdp_ring;
+       struct xsk_buff_pool *xsk_pool;
++      struct xdp_buff xdp;
+       struct sk_buff *skb;
+       dma_addr_t dma;                 /* physical address of ring */
+       u64 cached_phctime;
+-- 
+2.43.0
+
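
The motivation is that stack-local state dies between NAPI poll invocations, while state stored on the ring persists. A minimal userspace C sketch of that distinction, with hypothetical names:

#include <stdio.h>

struct xdp_buff_stub	{ int frags; };
struct rx_ring_stub	{ struct xdp_buff_stub xdp; };

static void napi_poll(struct rx_ring_stub *ring, int got_frag)
{
	/* struct xdp_buff_stub xdp = { 0 };  -- old: reset every poll */
	if (got_frag)
		ring->xdp.frags++;	/* new: accumulates across polls */
}

int main(void)
{
	struct rx_ring_stub ring = { { 0 } };

	napi_poll(&ring, 1);		/* first fragment of a frame */
	napi_poll(&ring, 1);		/* second fragment, next NAPI cycle */
	printf("frags = %d\n", ring.xdp.frags);	/* 2: frame still intact */
	return 0;
}
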
diff --git a/queue-6.1/ice-fix-ice_last_offset-formula.patch b/queue-6.1/ice-fix-ice_last_offset-formula.patch
new file mode 100644 (file)
index 0000000..f272f4b
--- /dev/null
@@ -0,0 +1,39 @@
+From 36b9f3f0e9634a6ed1ff7ebb14fc3baf67d13b2c Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Wed, 7 Aug 2024 12:53:25 +0200
+Subject: ice: fix ICE_LAST_OFFSET formula
+
+From: Maciej Fijalkowski <maciej.fijalkowski@intel.com>
+
+[ Upstream commit b966ad832942b5a11e002f9b5ef102b08425b84a ]
+
+For bigger PAGE_SIZE archs, the ice driver works on 3k Rx buffers.
+Therefore, ICE_LAST_OFFSET should take into account ICE_RXBUF_3072, not
+ICE_RXBUF_2048.
+
+Fixes: 7237f5b0dba4 ("ice: introduce legacy Rx flag")
+Suggested-by: Luiz Capitulino <luizcap@redhat.com>
+Signed-off-by: Maciej Fijalkowski <maciej.fijalkowski@intel.com>
+Tested-by: Chandan Kumar Rout <chandanx.rout@intel.com> (A Contingent Worker at Intel)
+Signed-off-by: Tony Nguyen <anthony.l.nguyen@intel.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/net/ethernet/intel/ice/ice_txrx.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+diff --git a/drivers/net/ethernet/intel/ice/ice_txrx.c b/drivers/net/ethernet/intel/ice/ice_txrx.c
+index 6ad4bdb0124e7..8577bf0ed5402 100644
+--- a/drivers/net/ethernet/intel/ice/ice_txrx.c
++++ b/drivers/net/ethernet/intel/ice/ice_txrx.c
+@@ -812,7 +812,7 @@ ice_can_reuse_rx_page(struct ice_rx_buf *rx_buf)
+               return false;
+ #if (PAGE_SIZE >= 8192)
+ #define ICE_LAST_OFFSET \
+-      (SKB_WITH_OVERHEAD(PAGE_SIZE) - ICE_RXBUF_2048)
++      (SKB_WITH_OVERHEAD(PAGE_SIZE) - ICE_RXBUF_3072)
+       if (rx_buf->page_offset > ICE_LAST_OFFSET)
+               return false;
+ #endif /* PAGE_SIZE >= 8192) */
+-- 
+2.43.0
+
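
A minimal userspace C sketch of the corrected bound, assuming an illustrative skb_shared_info overhead (the kernel computes the real value via SKB_WITH_OVERHEAD()):

#include <stdio.h>

#define PAGE_SZ		8192
#define SHINFO_OVERHEAD	320	/* assumed; really SKB_DATA_ALIGN(sizeof(struct skb_shared_info)) */
#define ICE_RXBUF_3072	3072

int main(void)
{
	/* Corrected bound: the last offset from which one more 3k
	 * buffer (plus shared-info overhead) still fits in the page. */
	int last_offset = (PAGE_SZ - SHINFO_OVERHEAD) - ICE_RXBUF_3072;
	int offset = 4096;	/* current write offset in the page */

	printf("last_offset = %d, reuse: %s\n", last_offset,
	       offset > last_offset ? "no" : "yes");
	return 0;
}
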
diff --git a/queue-6.1/ice-fix-page-reuse-when-page_size-is-over-8k.patch b/queue-6.1/ice-fix-page-reuse-when-page_size-is-over-8k.patch
new file mode 100644 (file)
index 0000000..2c4b2a4
--- /dev/null
@@ -0,0 +1,67 @@
+From 397898ecdfd709636dc4c82b49da092e8e00166e Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Wed, 7 Aug 2024 12:53:24 +0200
+Subject: ice: fix page reuse when PAGE_SIZE is over 8k
+
+From: Maciej Fijalkowski <maciej.fijalkowski@intel.com>
+
+[ Upstream commit 50b2143356e888777fc5bca023c39f34f404613a ]
+
+Architectures that have PAGE_SIZE >= 8192, such as arm64, should act
+the same as x86 does currently, meaning a page should be reused only
+when no one else is busy with it.
+
+Do two things independently of underlying PAGE_SIZE:
+- store the page count under ice_rx_buf::pgcnt
+- then act upon its value vs ice_rx_buf::pagecnt_bias when making the
+  decision regarding page reuse
+
+Fixes: 2b245cb29421 ("ice: Implement transmit and NAPI support")
+Signed-off-by: Maciej Fijalkowski <maciej.fijalkowski@intel.com>
+Tested-by: Chandan Kumar Rout <chandanx.rout@intel.com> (A Contingent Worker at Intel)
+Signed-off-by: Tony Nguyen <anthony.l.nguyen@intel.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/net/ethernet/intel/ice/ice_txrx.c | 12 +++---------
+ 1 file changed, 3 insertions(+), 9 deletions(-)
+
+diff --git a/drivers/net/ethernet/intel/ice/ice_txrx.c b/drivers/net/ethernet/intel/ice/ice_txrx.c
+index 6f930d99b496b..6ad4bdb0124e7 100644
+--- a/drivers/net/ethernet/intel/ice/ice_txrx.c
++++ b/drivers/net/ethernet/intel/ice/ice_txrx.c
+@@ -807,16 +807,15 @@ ice_can_reuse_rx_page(struct ice_rx_buf *rx_buf)
+       if (!dev_page_is_reusable(page))
+               return false;
+-#if (PAGE_SIZE < 8192)
+       /* if we are only owner of page we can reuse it */
+       if (unlikely(rx_buf->pgcnt - pagecnt_bias > 1))
+               return false;
+-#else
++#if (PAGE_SIZE >= 8192)
+ #define ICE_LAST_OFFSET \
+       (SKB_WITH_OVERHEAD(PAGE_SIZE) - ICE_RXBUF_2048)
+       if (rx_buf->page_offset > ICE_LAST_OFFSET)
+               return false;
+-#endif /* PAGE_SIZE < 8192) */
++#endif /* PAGE_SIZE >= 8192) */
+       /* If we have drained the page fragment pool we need to update
+        * the pagecnt_bias and page count so that we fully restock the
+@@ -904,12 +903,7 @@ ice_get_rx_buf(struct ice_rx_ring *rx_ring, const unsigned int size,
+       struct ice_rx_buf *rx_buf;
+       rx_buf = &rx_ring->rx_buf[ntc];
+-      rx_buf->pgcnt =
+-#if (PAGE_SIZE < 8192)
+-              page_count(rx_buf->page);
+-#else
+-              0;
+-#endif
++      rx_buf->pgcnt = page_count(rx_buf->page);
+       prefetchw(rx_buf->page);
+       if (!size)
+-- 
+2.43.0
+
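
A minimal userspace C sketch of the reuse test the patch unifies across page sizes (names hypothetical): reuse is allowed only when the raw page refcount minus the driver's own bias leaves at most one reference, i.e. the driver is the sole owner.

#include <stdio.h>
#include <stdbool.h>

static bool can_reuse(int pgcnt, int pagecnt_bias)
{
	return (pgcnt - pagecnt_bias) <= 1;
}

int main(void)
{
	printf("%d\n", can_reuse(2, 1));	/* 1: only the driver holds it */
	printf("%d\n", can_reuse(3, 1));	/* 0: the stack still owns a ref */
	return 0;
}
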
diff --git a/queue-6.1/ice-prepare-legacy-rx-for-upcoming-xdp-multi-buffer-.patch b/queue-6.1/ice-prepare-legacy-rx-for-upcoming-xdp-multi-buffer-.patch
new file mode 100644 (file)
index 0000000..7e68af7
--- /dev/null
@@ -0,0 +1,189 @@
+From 8b9909b519c45ba924a2f15a586aee0de8c98848 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Tue, 31 Jan 2023 21:44:54 +0100
+Subject: ice: Prepare legacy-rx for upcoming XDP multi-buffer support
+
+From: Maciej Fijalkowski <maciej.fijalkowski@intel.com>
+
+[ Upstream commit c61bcebde72de7f5dc194d28f29894f0f7661ff7 ]
+
+The Rx path is going to be modified so that a fragmented frame is
+gathered within the xdp_buff in the first place. This approach implies
+that the underlying buffer has to provide tailroom for skb_shared_info.
+This is currently the case when the ring uses build_skb, but not when
+the legacy-rx knob is turned on. That case configures 2k Rx buffers and
+has no way to provide either headroom or tailroom - FWIW it currently
+has XDP_PACKET_HEADROOM, which is broken and is removed here. 2k Rx
+buffers were used so that the driver in this setting could support a 9k
+MTU, as it can chain up to 5 Rx buffers. With the offset configured, the
+HW writing 2k of data would cross the half-page boundary, which broke
+the assumptions of our internal page recycling tricks.
+
+If the above were fixed and the legacy-rx path left as is, the packet's
+content would be corrupted again when referring to skb_shared_info via
+xdp_get_shared_info_from_buff(). Hence the Rx buffer size needs to be
+lowered, and with it the supported MTU. This lets us keep the unified
+data path, and users with an 8k MTU (if any on legacy-rx) would still
+be good to go. However, the tendency is to drop support for this code
+path at some point.
+
+Add ICE_RXBUF_1664 as vsi::rx_buf_len and ICE_MAX_FRAME_LEGACY_RX (8320)
+as vsi::max_frame for legacy-rx. For bigger page sizes configure 3k Rx
+buffers, not 2k.
+
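+A quick sanity check of the new constants (a sketch; the BUILD_BUG_ON
+below is illustrative and not part of this patch):
+
+  /* 5 chained 1664-byte buffers give the 8320-byte legacy-rx cap */
+  BUILD_BUG_ON(ICE_MAX_CHAINED_RX_BUFS * ICE_RXBUF_1664 !=
+               ICE_MAX_FRAME_LEGACY_RX);
+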
+Since headroom support is removed, disable data_meta support on
+legacy-rx. When preparing the XDP buff, rely on the
+ice_rx_ring::rx_offset setting to decide whether to support data_meta.
+
+Signed-off-by: Maciej Fijalkowski <maciej.fijalkowski@intel.com>
+Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
+Reviewed-by: Alexander Lobakin <alexandr.lobakin@intel.com>
+Link: https://lore.kernel.org/bpf/20230131204506.219292-2-maciej.fijalkowski@intel.com
+Stable-dep-of: 50b2143356e8 ("ice: fix page reuse when PAGE_SIZE is over 8k")
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/net/ethernet/intel/ice/ice_base.c |  3 ---
+ drivers/net/ethernet/intel/ice/ice_lib.c  |  8 ++------
+ drivers/net/ethernet/intel/ice/ice_main.c | 10 ++++++++--
+ drivers/net/ethernet/intel/ice/ice_txrx.c | 17 +++++------------
+ drivers/net/ethernet/intel/ice/ice_txrx.h |  2 ++
+ 5 files changed, 17 insertions(+), 23 deletions(-)
+
+diff --git a/drivers/net/ethernet/intel/ice/ice_base.c b/drivers/net/ethernet/intel/ice/ice_base.c
+index 818eca6aa4a41..c7c6f01538e0d 100644
+--- a/drivers/net/ethernet/intel/ice/ice_base.c
++++ b/drivers/net/ethernet/intel/ice/ice_base.c
+@@ -355,9 +355,6 @@ static unsigned int ice_rx_offset(struct ice_rx_ring *rx_ring)
+ {
+       if (ice_ring_uses_build_skb(rx_ring))
+               return ICE_SKB_PAD;
+-      else if (ice_is_xdp_ena_vsi(rx_ring->vsi))
+-              return XDP_PACKET_HEADROOM;
+-
+       return 0;
+ }
+diff --git a/drivers/net/ethernet/intel/ice/ice_lib.c b/drivers/net/ethernet/intel/ice/ice_lib.c
+index 7661e735d0992..347c6c23bfc1c 100644
+--- a/drivers/net/ethernet/intel/ice/ice_lib.c
++++ b/drivers/net/ethernet/intel/ice/ice_lib.c
+@@ -1818,8 +1818,8 @@ void ice_update_eth_stats(struct ice_vsi *vsi)
+ void ice_vsi_cfg_frame_size(struct ice_vsi *vsi)
+ {
+       if (!vsi->netdev || test_bit(ICE_FLAG_LEGACY_RX, vsi->back->flags)) {
+-              vsi->max_frame = ICE_AQ_SET_MAC_FRAME_SIZE_MAX;
+-              vsi->rx_buf_len = ICE_RXBUF_2048;
++              vsi->max_frame = ICE_MAX_FRAME_LEGACY_RX;
++              vsi->rx_buf_len = ICE_RXBUF_1664;
+ #if (PAGE_SIZE < 8192)
+       } else if (!ICE_2K_TOO_SMALL_WITH_PADDING &&
+                  (vsi->netdev->mtu <= ETH_DATA_LEN)) {
+@@ -1828,11 +1828,7 @@ void ice_vsi_cfg_frame_size(struct ice_vsi *vsi)
+ #endif
+       } else {
+               vsi->max_frame = ICE_AQ_SET_MAC_FRAME_SIZE_MAX;
+-#if (PAGE_SIZE < 8192)
+               vsi->rx_buf_len = ICE_RXBUF_3072;
+-#else
+-              vsi->rx_buf_len = ICE_RXBUF_2048;
+-#endif
+       }
+ }
+diff --git a/drivers/net/ethernet/intel/ice/ice_main.c b/drivers/net/ethernet/intel/ice/ice_main.c
+index 6e55861dd86fe..9dbfbc90485e4 100644
+--- a/drivers/net/ethernet/intel/ice/ice_main.c
++++ b/drivers/net/ethernet/intel/ice/ice_main.c
+@@ -7328,8 +7328,8 @@ static void ice_rebuild(struct ice_pf *pf, enum ice_reset_req reset_type)
+  */
+ static int ice_max_xdp_frame_size(struct ice_vsi *vsi)
+ {
+-      if (PAGE_SIZE >= 8192 || test_bit(ICE_FLAG_LEGACY_RX, vsi->back->flags))
+-              return ICE_RXBUF_2048 - XDP_PACKET_HEADROOM;
++      if (test_bit(ICE_FLAG_LEGACY_RX, vsi->back->flags))
++              return ICE_RXBUF_1664;
+       else
+               return ICE_RXBUF_3072;
+ }
+@@ -7362,6 +7362,12 @@ static int ice_change_mtu(struct net_device *netdev, int new_mtu)
+                                  frame_size - ICE_ETH_PKT_HDR_PAD);
+                       return -EINVAL;
+               }
++      } else if (test_bit(ICE_FLAG_LEGACY_RX, pf->flags)) {
++              if (new_mtu + ICE_ETH_PKT_HDR_PAD > ICE_MAX_FRAME_LEGACY_RX) {
++                      netdev_err(netdev, "Too big MTU for legacy-rx; Max is %d\n",
++                                 ICE_MAX_FRAME_LEGACY_RX - ICE_ETH_PKT_HDR_PAD);
++                      return -EINVAL;
++              }
+       }
+       /* if a reset is in progress, wait for some time for it to complete */
+diff --git a/drivers/net/ethernet/intel/ice/ice_txrx.c b/drivers/net/ethernet/intel/ice/ice_txrx.c
+index bd62781191b3d..2db20263420d8 100644
+--- a/drivers/net/ethernet/intel/ice/ice_txrx.c
++++ b/drivers/net/ethernet/intel/ice/ice_txrx.c
+@@ -984,17 +984,15 @@ static struct sk_buff *
+ ice_construct_skb(struct ice_rx_ring *rx_ring, struct ice_rx_buf *rx_buf,
+                 struct xdp_buff *xdp)
+ {
+-      unsigned int metasize = xdp->data - xdp->data_meta;
+       unsigned int size = xdp->data_end - xdp->data;
+       unsigned int headlen;
+       struct sk_buff *skb;
+       /* prefetch first cache line of first page */
+-      net_prefetch(xdp->data_meta);
++      net_prefetch(xdp->data);
+       /* allocate a skb to store the frags */
+-      skb = __napi_alloc_skb(&rx_ring->q_vector->napi,
+-                             ICE_RX_HDR_SIZE + metasize,
++      skb = __napi_alloc_skb(&rx_ring->q_vector->napi, ICE_RX_HDR_SIZE,
+                              GFP_ATOMIC | __GFP_NOWARN);
+       if (unlikely(!skb))
+               return NULL;
+@@ -1006,13 +1004,8 @@ ice_construct_skb(struct ice_rx_ring *rx_ring, struct ice_rx_buf *rx_buf,
+               headlen = eth_get_headlen(skb->dev, xdp->data, ICE_RX_HDR_SIZE);
+       /* align pull length to size of long to optimize memcpy performance */
+-      memcpy(__skb_put(skb, headlen + metasize), xdp->data_meta,
+-             ALIGN(headlen + metasize, sizeof(long)));
+-
+-      if (metasize) {
+-              skb_metadata_set(skb, metasize);
+-              __skb_pull(skb, metasize);
+-      }
++      memcpy(__skb_put(skb, headlen), xdp->data, ALIGN(headlen,
++                                                       sizeof(long)));
+       /* if we exhaust the linear part then add what is left as a frag */
+       size -= headlen;
+@@ -1187,7 +1180,7 @@ int ice_clean_rx_irq(struct ice_rx_ring *rx_ring, int budget)
+               hard_start = page_address(rx_buf->page) + rx_buf->page_offset -
+                            offset;
+-              xdp_prepare_buff(&xdp, hard_start, offset, size, true);
++              xdp_prepare_buff(&xdp, hard_start, offset, size, !!offset);
+ #if (PAGE_SIZE > 4096)
+               /* At larger PAGE_SIZE, frame_sz depend on len size */
+               xdp.frame_sz = ice_rx_frame_truesize(rx_ring, size);
+diff --git a/drivers/net/ethernet/intel/ice/ice_txrx.h b/drivers/net/ethernet/intel/ice/ice_txrx.h
+index 932b5661ec4d6..bfbe4b16df96a 100644
+--- a/drivers/net/ethernet/intel/ice/ice_txrx.h
++++ b/drivers/net/ethernet/intel/ice/ice_txrx.h
+@@ -9,10 +9,12 @@
+ #define ICE_DFLT_IRQ_WORK     256
+ #define ICE_RXBUF_3072                3072
+ #define ICE_RXBUF_2048                2048
++#define ICE_RXBUF_1664                1664
+ #define ICE_RXBUF_1536                1536
+ #define ICE_MAX_CHAINED_RX_BUFS       5
+ #define ICE_MAX_BUF_TXD               8
+ #define ICE_MIN_TX_LEN                17
++#define ICE_MAX_FRAME_LEGACY_RX 8320
+ /* The size limit for a transmit buffer in a descriptor is (16K - 1).
+  * In order to align with the read requests we will align the value to
+-- 
+2.43.0
+
diff --git a/queue-6.1/ice-pull-out-next_to_clean-bump-out-of-ice_put_rx_bu.patch b/queue-6.1/ice-pull-out-next_to_clean-bump-out-of-ice_put_rx_bu.patch
new file mode 100644 (file)
index 0000000..315ceb4
--- /dev/null
@@ -0,0 +1,129 @@
+From f7516356fb4b3f52398e6feee8989a849753f378 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Tue, 31 Jan 2023 21:44:57 +0100
+Subject: ice: Pull out next_to_clean bump out of ice_put_rx_buf()
+
+From: Maciej Fijalkowski <maciej.fijalkowski@intel.com>
+
+[ Upstream commit d7956d81f1502d3818500cff4847f3e9ae0c6aa4 ]
+
+The plan is to move ice_put_rx_buf() to the end of ice_clean_rx_irq(),
+so, to keep the ability to walk through the HW Rx descriptors, pull the
+next_to_clean handling out of ice_put_rx_buf().
+
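+The caller-side bump then becomes a cheap compare-and-reset rather than
+a modulo on every buffer (the exact pattern used in the diff below):
+
+  if (++ntc == cnt)
+          ntc = 0;
+  ice_put_rx_buf(rx_ring, rx_buf);
+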
+Signed-off-by: Maciej Fijalkowski <maciej.fijalkowski@intel.com>
+Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
+Reviewed-by: Alexander Lobakin <alexandr.lobakin@intel.com>
+Link: https://lore.kernel.org/bpf/20230131204506.219292-5-maciej.fijalkowski@intel.com
+Stable-dep-of: 50b2143356e8 ("ice: fix page reuse when PAGE_SIZE is over 8k")
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/net/ethernet/intel/ice/ice_txrx.c | 29 +++++++++++++----------
+ 1 file changed, 16 insertions(+), 13 deletions(-)
+
+diff --git a/drivers/net/ethernet/intel/ice/ice_txrx.c b/drivers/net/ethernet/intel/ice/ice_txrx.c
+index 977b268802dfa..6f930d99b496b 100644
+--- a/drivers/net/ethernet/intel/ice/ice_txrx.c
++++ b/drivers/net/ethernet/intel/ice/ice_txrx.c
+@@ -898,11 +898,12 @@ ice_reuse_rx_page(struct ice_rx_ring *rx_ring, struct ice_rx_buf *old_buf)
+  * for use by the CPU.
+  */
+ static struct ice_rx_buf *
+-ice_get_rx_buf(struct ice_rx_ring *rx_ring, const unsigned int size)
++ice_get_rx_buf(struct ice_rx_ring *rx_ring, const unsigned int size,
++             const unsigned int ntc)
+ {
+       struct ice_rx_buf *rx_buf;
+-      rx_buf = &rx_ring->rx_buf[rx_ring->next_to_clean];
++      rx_buf = &rx_ring->rx_buf[ntc];
+       rx_buf->pgcnt =
+ #if (PAGE_SIZE < 8192)
+               page_count(rx_buf->page);
+@@ -1040,19 +1041,12 @@ ice_construct_skb(struct ice_rx_ring *rx_ring, struct ice_rx_buf *rx_buf,
+  * @rx_ring: Rx descriptor ring to transact packets on
+  * @rx_buf: Rx buffer to pull data from
+  *
+- * This function will update next_to_clean and then clean up the contents
+- * of the rx_buf. It will either recycle the buffer or unmap it and free
+- * the associated resources.
++ * This function will clean up the contents of the rx_buf. It will either
++ * recycle the buffer or unmap it and free the associated resources.
+  */
+ static void
+ ice_put_rx_buf(struct ice_rx_ring *rx_ring, struct ice_rx_buf *rx_buf)
+ {
+-      u16 ntc = rx_ring->next_to_clean + 1;
+-
+-      /* fetch, update, and store next to clean */
+-      ntc = (ntc < rx_ring->count) ? ntc : 0;
+-      rx_ring->next_to_clean = ntc;
+-
+       if (!rx_buf)
+               return;
+@@ -1114,6 +1108,8 @@ int ice_clean_rx_irq(struct ice_rx_ring *rx_ring, int budget)
+       unsigned int xdp_res, xdp_xmit = 0;
+       struct sk_buff *skb = rx_ring->skb;
+       struct bpf_prog *xdp_prog = NULL;
++      u32 ntc = rx_ring->next_to_clean;
++      u32 cnt = rx_ring->count;
+       bool failure;
+       /* Frame size depend on rx_ring setup when PAGE_SIZE=4K */
+@@ -1136,7 +1132,7 @@ int ice_clean_rx_irq(struct ice_rx_ring *rx_ring, int budget)
+               u16 rx_ptype;
+               /* get the Rx desc from Rx ring based on 'next_to_clean' */
+-              rx_desc = ICE_RX_DESC(rx_ring, rx_ring->next_to_clean);
++              rx_desc = ICE_RX_DESC(rx_ring, ntc);
+               /* status_error_len will always be zero for unused descriptors
+                * because it's cleared in cleanup, and overlaps with hdr_addr
+@@ -1160,6 +1156,8 @@ int ice_clean_rx_irq(struct ice_rx_ring *rx_ring, int budget)
+                       if (rx_desc->wb.rxdid == FDIR_DESC_RXDID &&
+                           ctrl_vsi->vf)
+                               ice_vc_fdir_irq_handler(ctrl_vsi, rx_desc);
++                      if (++ntc == cnt)
++                              ntc = 0;
+                       ice_put_rx_buf(rx_ring, NULL);
+                       cleaned_count++;
+                       continue;
+@@ -1169,7 +1167,7 @@ int ice_clean_rx_irq(struct ice_rx_ring *rx_ring, int budget)
+                       ICE_RX_FLX_DESC_PKT_LEN_M;
+               /* retrieve a buffer from the ring */
+-              rx_buf = ice_get_rx_buf(rx_ring, size);
++              rx_buf = ice_get_rx_buf(rx_ring, size, ntc);
+               if (!size) {
+                       xdp->data = NULL;
+@@ -1203,6 +1201,8 @@ int ice_clean_rx_irq(struct ice_rx_ring *rx_ring, int budget)
+               total_rx_pkts++;
+               cleaned_count++;
++              if (++ntc == cnt)
++                      ntc = 0;
+               ice_put_rx_buf(rx_ring, rx_buf);
+               continue;
+ construct_skb:
+@@ -1222,6 +1222,8 @@ int ice_clean_rx_irq(struct ice_rx_ring *rx_ring, int budget)
+                       break;
+               }
++              if (++ntc == cnt)
++                      ntc = 0;
+               ice_put_rx_buf(rx_ring, rx_buf);
+               cleaned_count++;
+@@ -1262,6 +1264,7 @@ int ice_clean_rx_irq(struct ice_rx_ring *rx_ring, int budget)
+               total_rx_pkts++;
+       }
++      rx_ring->next_to_clean = ntc;
+       /* return up to cleaned_count buffers to hardware */
+       failure = ice_alloc_rx_bufs(rx_ring, cleaned_count);
+-- 
+2.43.0
+
diff --git a/queue-6.1/ice-store-page-count-inside-ice_rx_buf.patch b/queue-6.1/ice-store-page-count-inside-ice_rx_buf.patch
new file mode 100644 (file)
index 0000000..5df803a
--- /dev/null
@@ -0,0 +1,178 @@
+From 1487b5d5a6b47fe2c002bd7daae2d059c440f7e7 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Tue, 31 Jan 2023 21:44:56 +0100
+Subject: ice: Store page count inside ice_rx_buf
+
+From: Maciej Fijalkowski <maciej.fijalkowski@intel.com>
+
+[ Upstream commit ac0753391195011ded23696d7882428e5c419a98 ]
+
+This will allow us to avoid carrying an additional auxiliary array of
+page counts when dealing with XDP multi-buffer support. Previously,
+combining a fragmented frame into an skb was not affected in the same
+way that XDP will be, since the whole frame needs to be in place before
+the XDP prog is executed. Therefore, when going through the HW Rx
+descriptors one by one, the calls to ice_put_rx_buf() need to happen
+*after* the XDP prog has run on a potentially multi-buffered frame, so
+some additional storage for the page count is needed.
+
+Adding the page count to the rx buf makes it easier to walk through the
+processed entries at the end of the rx cleaning routine and decide
+whether or not the buffers should be recycled.
+
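+The reuse test in ice_can_reuse_rx_page() can then read the count
+straight from the buffer, as in the diff below:
+
+  /* if we are only owner of page we can reuse it */
+  if (unlikely(rx_buf->pgcnt - pagecnt_bias > 1))
+          return false;
+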
+While at it, bump ice_rx_buf::pagecnt_bias from u16 up to u32. It has
+been proven many times that calculations on variables smaller than the
+standard register size are harmful, and that was also the case during
+the experiments with embedding the page count in ice_rx_buf - when it
+was added as a u16 it had a performance impact.
+
+Signed-off-by: Maciej Fijalkowski <maciej.fijalkowski@intel.com>
+Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
+Reviewed-by: Alexander Lobakin <alexandr.lobakin@intel.com>
+Link: https://lore.kernel.org/bpf/20230131204506.219292-4-maciej.fijalkowski@intel.com
+Stable-dep-of: 50b2143356e8 ("ice: fix page reuse when PAGE_SIZE is over 8k")
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/net/ethernet/intel/ice/ice_txrx.c | 26 +++++++++--------------
+ drivers/net/ethernet/intel/ice/ice_txrx.h |  3 ++-
+ 2 files changed, 12 insertions(+), 17 deletions(-)
+
+diff --git a/drivers/net/ethernet/intel/ice/ice_txrx.c b/drivers/net/ethernet/intel/ice/ice_txrx.c
+index d3411170f3eaf..977b268802dfa 100644
+--- a/drivers/net/ethernet/intel/ice/ice_txrx.c
++++ b/drivers/net/ethernet/intel/ice/ice_txrx.c
+@@ -791,7 +791,6 @@ ice_rx_buf_adjust_pg_offset(struct ice_rx_buf *rx_buf, unsigned int size)
+ /**
+  * ice_can_reuse_rx_page - Determine if page can be reused for another Rx
+  * @rx_buf: buffer containing the page
+- * @rx_buf_pgcnt: rx_buf page refcount pre xdp_do_redirect() call
+  *
+  * If page is reusable, we have a green light for calling ice_reuse_rx_page,
+  * which will assign the current buffer to the buffer that next_to_alloc is
+@@ -799,7 +798,7 @@ ice_rx_buf_adjust_pg_offset(struct ice_rx_buf *rx_buf, unsigned int size)
+  * page freed
+  */
+ static bool
+-ice_can_reuse_rx_page(struct ice_rx_buf *rx_buf, int rx_buf_pgcnt)
++ice_can_reuse_rx_page(struct ice_rx_buf *rx_buf)
+ {
+       unsigned int pagecnt_bias = rx_buf->pagecnt_bias;
+       struct page *page = rx_buf->page;
+@@ -810,7 +809,7 @@ ice_can_reuse_rx_page(struct ice_rx_buf *rx_buf, int rx_buf_pgcnt)
+ #if (PAGE_SIZE < 8192)
+       /* if we are only owner of page we can reuse it */
+-      if (unlikely((rx_buf_pgcnt - pagecnt_bias) > 1))
++      if (unlikely(rx_buf->pgcnt - pagecnt_bias > 1))
+               return false;
+ #else
+ #define ICE_LAST_OFFSET \
+@@ -894,19 +893,17 @@ ice_reuse_rx_page(struct ice_rx_ring *rx_ring, struct ice_rx_buf *old_buf)
+  * ice_get_rx_buf - Fetch Rx buffer and synchronize data for use
+  * @rx_ring: Rx descriptor ring to transact packets on
+  * @size: size of buffer to add to skb
+- * @rx_buf_pgcnt: rx_buf page refcount
+  *
+  * This function will pull an Rx buffer from the ring and synchronize it
+  * for use by the CPU.
+  */
+ static struct ice_rx_buf *
+-ice_get_rx_buf(struct ice_rx_ring *rx_ring, const unsigned int size,
+-             int *rx_buf_pgcnt)
++ice_get_rx_buf(struct ice_rx_ring *rx_ring, const unsigned int size)
+ {
+       struct ice_rx_buf *rx_buf;
+       rx_buf = &rx_ring->rx_buf[rx_ring->next_to_clean];
+-      *rx_buf_pgcnt =
++      rx_buf->pgcnt =
+ #if (PAGE_SIZE < 8192)
+               page_count(rx_buf->page);
+ #else
+@@ -1042,15 +1039,13 @@ ice_construct_skb(struct ice_rx_ring *rx_ring, struct ice_rx_buf *rx_buf,
+  * ice_put_rx_buf - Clean up used buffer and either recycle or free
+  * @rx_ring: Rx descriptor ring to transact packets on
+  * @rx_buf: Rx buffer to pull data from
+- * @rx_buf_pgcnt: Rx buffer page count pre xdp_do_redirect()
+  *
+  * This function will update next_to_clean and then clean up the contents
+  * of the rx_buf. It will either recycle the buffer or unmap it and free
+  * the associated resources.
+  */
+ static void
+-ice_put_rx_buf(struct ice_rx_ring *rx_ring, struct ice_rx_buf *rx_buf,
+-             int rx_buf_pgcnt)
++ice_put_rx_buf(struct ice_rx_ring *rx_ring, struct ice_rx_buf *rx_buf)
+ {
+       u16 ntc = rx_ring->next_to_clean + 1;
+@@ -1061,7 +1056,7 @@ ice_put_rx_buf(struct ice_rx_ring *rx_ring, struct ice_rx_buf *rx_buf,
+       if (!rx_buf)
+               return;
+-      if (ice_can_reuse_rx_page(rx_buf, rx_buf_pgcnt)) {
++      if (ice_can_reuse_rx_page(rx_buf)) {
+               /* hand second half of page back to the ring */
+               ice_reuse_rx_page(rx_ring, rx_buf);
+       } else {
+@@ -1137,7 +1132,6 @@ int ice_clean_rx_irq(struct ice_rx_ring *rx_ring, int budget)
+               unsigned char *hard_start;
+               unsigned int size;
+               u16 stat_err_bits;
+-              int rx_buf_pgcnt;
+               u16 vlan_tag = 0;
+               u16 rx_ptype;
+@@ -1166,7 +1160,7 @@ int ice_clean_rx_irq(struct ice_rx_ring *rx_ring, int budget)
+                       if (rx_desc->wb.rxdid == FDIR_DESC_RXDID &&
+                           ctrl_vsi->vf)
+                               ice_vc_fdir_irq_handler(ctrl_vsi, rx_desc);
+-                      ice_put_rx_buf(rx_ring, NULL, 0);
++                      ice_put_rx_buf(rx_ring, NULL);
+                       cleaned_count++;
+                       continue;
+               }
+@@ -1175,7 +1169,7 @@ int ice_clean_rx_irq(struct ice_rx_ring *rx_ring, int budget)
+                       ICE_RX_FLX_DESC_PKT_LEN_M;
+               /* retrieve a buffer from the ring */
+-              rx_buf = ice_get_rx_buf(rx_ring, size, &rx_buf_pgcnt);
++              rx_buf = ice_get_rx_buf(rx_ring, size);
+               if (!size) {
+                       xdp->data = NULL;
+@@ -1209,7 +1203,7 @@ int ice_clean_rx_irq(struct ice_rx_ring *rx_ring, int budget)
+               total_rx_pkts++;
+               cleaned_count++;
+-              ice_put_rx_buf(rx_ring, rx_buf, rx_buf_pgcnt);
++              ice_put_rx_buf(rx_ring, rx_buf);
+               continue;
+ construct_skb:
+               if (skb) {
+@@ -1228,7 +1222,7 @@ int ice_clean_rx_irq(struct ice_rx_ring *rx_ring, int budget)
+                       break;
+               }
+-              ice_put_rx_buf(rx_ring, rx_buf, rx_buf_pgcnt);
++              ice_put_rx_buf(rx_ring, rx_buf);
+               cleaned_count++;
+               /* skip if it is NOP desc */
+diff --git a/drivers/net/ethernet/intel/ice/ice_txrx.h b/drivers/net/ethernet/intel/ice/ice_txrx.h
+index ef8245f795c5b..c1d9b3cebb059 100644
+--- a/drivers/net/ethernet/intel/ice/ice_txrx.h
++++ b/drivers/net/ethernet/intel/ice/ice_txrx.h
+@@ -172,7 +172,8 @@ struct ice_rx_buf {
+       dma_addr_t dma;
+       struct page *page;
+       unsigned int page_offset;
+-      u16 pagecnt_bias;
++      unsigned int pgcnt;
++      unsigned int pagecnt_bias;
+ };
+ struct ice_q_stats {
+-- 
+2.43.0
+
diff --git a/queue-6.1/ip6_tunnel-fix-broken-gro.patch b/queue-6.1/ip6_tunnel-fix-broken-gro.patch
new file mode 100644 (file)
index 0000000..625cb6d
--- /dev/null
@@ -0,0 +1,78 @@
+From 9871a8e9025768cfb2232221931b6235f5615680 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Thu, 15 Aug 2024 17:14:16 +0200
+Subject: ip6_tunnel: Fix broken GRO
+
+From: Thomas Bogendoerfer <tbogendoerfer@suse.de>
+
+[ Upstream commit 4b3e33fcc38f7750604b065c55a43e94c5bc3145 ]
+
+The GRO code checks for matching layer 2 headers to see if a packet
+belongs to the same flow, and because ip6 tunnel sets
+dev->hard_header_len this check fails in cases where it shouldn't. To
+fix this, don't set hard_header_len; use needed_headroom instead, like
+ipv4/ip_tunnel.c does.
+
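+A sketch of the replacement sizing (taken from the diff below): the
+tunnel's extra bytes are advertised via needed_headroom, which GRO does
+not inspect when matching flows, instead of via hard_header_len:
+
+  dev->needed_headroom = tdev->hard_header_len +
+                         tdev->needed_headroom + t_hlen;
+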
+Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2")
+Signed-off-by: Thomas Bogendoerfer <tbogendoerfer@suse.de>
+Link: https://patch.msgid.link/20240815151419.109864-1-tbogendoerfer@suse.de
+Signed-off-by: Paolo Abeni <pabeni@redhat.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ net/ipv6/ip6_tunnel.c | 12 +++++++-----
+ 1 file changed, 7 insertions(+), 5 deletions(-)
+
+diff --git a/net/ipv6/ip6_tunnel.c b/net/ipv6/ip6_tunnel.c
+index 2699915bb85be..f3324f2a40466 100644
+--- a/net/ipv6/ip6_tunnel.c
++++ b/net/ipv6/ip6_tunnel.c
+@@ -1510,7 +1510,8 @@ static void ip6_tnl_link_config(struct ip6_tnl *t)
+                       tdev = __dev_get_by_index(t->net, p->link);
+               if (tdev) {
+-                      dev->hard_header_len = tdev->hard_header_len + t_hlen;
++                      dev->needed_headroom = tdev->hard_header_len +
++                              tdev->needed_headroom + t_hlen;
+                       mtu = min_t(unsigned int, tdev->mtu, IP6_MAX_MTU);
+                       mtu = mtu - t_hlen;
+@@ -1734,7 +1735,9 @@ ip6_tnl_siocdevprivate(struct net_device *dev, struct ifreq *ifr,
+ int ip6_tnl_change_mtu(struct net_device *dev, int new_mtu)
+ {
+       struct ip6_tnl *tnl = netdev_priv(dev);
++      int t_hlen;
++      t_hlen = tnl->hlen + sizeof(struct ipv6hdr);
+       if (tnl->parms.proto == IPPROTO_IPV6) {
+               if (new_mtu < IPV6_MIN_MTU)
+                       return -EINVAL;
+@@ -1743,10 +1746,10 @@ int ip6_tnl_change_mtu(struct net_device *dev, int new_mtu)
+                       return -EINVAL;
+       }
+       if (tnl->parms.proto == IPPROTO_IPV6 || tnl->parms.proto == 0) {
+-              if (new_mtu > IP6_MAX_MTU - dev->hard_header_len)
++              if (new_mtu > IP6_MAX_MTU - dev->hard_header_len - t_hlen)
+                       return -EINVAL;
+       } else {
+-              if (new_mtu > IP_MAX_MTU - dev->hard_header_len)
++              if (new_mtu > IP_MAX_MTU - dev->hard_header_len - t_hlen)
+                       return -EINVAL;
+       }
+       dev->mtu = new_mtu;
+@@ -1892,12 +1895,11 @@ ip6_tnl_dev_init_gen(struct net_device *dev)
+       t_hlen = t->hlen + sizeof(struct ipv6hdr);
+       dev->type = ARPHRD_TUNNEL6;
+-      dev->hard_header_len = LL_MAX_HEADER + t_hlen;
+       dev->mtu = ETH_DATA_LEN - t_hlen;
+       if (!(t->parms.flags & IP6_TNL_F_IGN_ENCAP_LIMIT))
+               dev->mtu -= 8;
+       dev->min_mtu = ETH_MIN_MTU;
+-      dev->max_mtu = IP6_MAX_MTU - dev->hard_header_len;
++      dev->max_mtu = IP6_MAX_MTU - dev->hard_header_len - t_hlen;
+       netdev_hold(dev, &t->dev_tracker, GFP_KERNEL);
+       return 0;
+-- 
+2.43.0
+
diff --git a/queue-6.1/ipv6-fix-possible-uaf-in-ip6_finish_output2.patch b/queue-6.1/ipv6-fix-possible-uaf-in-ip6_finish_output2.patch
new file mode 100644 (file)
index 0000000..f0f89c9
--- /dev/null
@@ -0,0 +1,49 @@
+From caa607f6a8b62faf6c48dcfbcb0a946f52134a5f Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Tue, 20 Aug 2024 16:08:58 +0000
+Subject: ipv6: fix possible UAF in ip6_finish_output2()
+
+From: Eric Dumazet <edumazet@google.com>
+
+[ Upstream commit da273b377ae0d9bd255281ed3c2adb228321687b ]
+
+If skb_expand_head() returns NULL, the skb has been freed and the
+associated dst/idev could also have been freed.
+
+We need to hold rcu_read_lock() to make sure the dst and
+associated idev are alive.
+
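+A sketch of the pattern applied below - the RCU read-side section spans
+the reallocation, so the error path can still bump the idev counter
+safely even though the skb (and its dst reference) is gone:
+
+  rcu_read_lock();
+  skb = skb_expand_head(skb, hh_len);
+  if (!skb) {
+          IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS);
+          rcu_read_unlock();
+          return -ENOMEM;
+  }
+  rcu_read_unlock();
+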
+Fixes: 5796015fa968 ("ipv6: allocate enough headroom in ip6_finish_output2()")
+Signed-off-by: Eric Dumazet <edumazet@google.com>
+Cc: Vasily Averin <vasily.averin@linux.dev>
+Reviewed-by: David Ahern <dsahern@kernel.org>
+Link: https://patch.msgid.link/20240820160859.3786976-3-edumazet@google.com
+Signed-off-by: Jakub Kicinski <kuba@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ net/ipv6/ip6_output.c | 4 ++++
+ 1 file changed, 4 insertions(+)
+
+diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
+index 796cf0a0a4225..cfca9627398ee 100644
+--- a/net/ipv6/ip6_output.c
++++ b/net/ipv6/ip6_output.c
+@@ -69,11 +69,15 @@ static int ip6_finish_output2(struct net *net, struct sock *sk, struct sk_buff *
+       /* Be paranoid, rather than too clever. */
+       if (unlikely(hh_len > skb_headroom(skb)) && dev->header_ops) {
++              /* Make sure idev stays alive */
++              rcu_read_lock();
+               skb = skb_expand_head(skb, hh_len);
+               if (!skb) {
+                       IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS);
++                      rcu_read_unlock();
+                       return -ENOMEM;
+               }
++              rcu_read_unlock();
+       }
+       hdr = ipv6_hdr(skb);
+-- 
+2.43.0
+
diff --git a/queue-6.1/ipv6-prevent-possible-uaf-in-ip6_xmit.patch b/queue-6.1/ipv6-prevent-possible-uaf-in-ip6_xmit.patch
new file mode 100644 (file)
index 0000000..3a08192
--- /dev/null
@@ -0,0 +1,48 @@
+From 0ad605bd732f80427d1d7c4101328c96d4cc6545 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Tue, 20 Aug 2024 16:08:59 +0000
+Subject: ipv6: prevent possible UAF in ip6_xmit()
+
+From: Eric Dumazet <edumazet@google.com>
+
+[ Upstream commit 2d5ff7e339d04622d8282661df36151906d0e1c7 ]
+
+If skb_expand_head() returns NULL, the skb has been freed and the
+associated dst/idev could also have been freed.
+
+We must use rcu_read_lock() to prevent a possible UAF.
+
+Fixes: 0c9f227bee11 ("ipv6: use skb_expand_head in ip6_xmit")
+Signed-off-by: Eric Dumazet <edumazet@google.com>
+Cc: Vasily Averin <vasily.averin@linux.dev>
+Reviewed-by: David Ahern <dsahern@kernel.org>
+Link: https://patch.msgid.link/20240820160859.3786976-4-edumazet@google.com
+Signed-off-by: Jakub Kicinski <kuba@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ net/ipv6/ip6_output.c | 4 ++++
+ 1 file changed, 4 insertions(+)
+
+diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
+index cfca9627398ee..f2227e662d1cf 100644
+--- a/net/ipv6/ip6_output.c
++++ b/net/ipv6/ip6_output.c
+@@ -278,11 +278,15 @@ int ip6_xmit(const struct sock *sk, struct sk_buff *skb, struct flowi6 *fl6,
+               head_room += opt->opt_nflen + opt->opt_flen;
+       if (unlikely(head_room > skb_headroom(skb))) {
++              /* Make sure idev stays alive */
++              rcu_read_lock();
+               skb = skb_expand_head(skb, head_room);
+               if (!skb) {
+                       IP6_INC_STATS(net, idev, IPSTATS_MIB_OUTDISCARDS);
++                      rcu_read_unlock();
+                       return -ENOBUFS;
+               }
++              rcu_read_unlock();
+       }
+       if (opt) {
+-- 
+2.43.0
+
diff --git a/queue-6.1/ipv6-prevent-uaf-in-ip6_send_skb.patch b/queue-6.1/ipv6-prevent-uaf-in-ip6_send_skb.patch
new file mode 100644 (file)
index 0000000..60c8536
--- /dev/null
@@ -0,0 +1,158 @@
+From 9445c2c32f1f2946d4aac92867e49e38a79bbbd3 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Tue, 20 Aug 2024 16:08:57 +0000
+Subject: ipv6: prevent UAF in ip6_send_skb()
+
+From: Eric Dumazet <edumazet@google.com>
+
+[ Upstream commit faa389b2fbaaec7fd27a390b4896139f9da662e3 ]
+
+syzbot reported a UAF in ip6_send_skb() [1]
+
+After ip6_local_out() has returned, we can no longer safely dereference
+rt unless we hold rcu_read_lock().
+
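+A sketch of the fix (see the diff below): rt points at an RCU-freed
+dst, so the read-side section must span both the transmit call and the
+error path that dereferences rt:
+
+  rcu_read_lock();
+  err = ip6_local_out(net, skb->sk, skb);
+  if (err) {
+          /* error accounting dereferences rt for the idev stats */
+  }
+  rcu_read_unlock();
+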
+A similar issue has been fixed in commit
+a688caa34beb ("ipv6: take rcu lock in rawv6_send_hdrinc()")
+
+Another potential issue in ip6_finish_output2() is handled in a
+separate patch.
+
+[1]
+ BUG: KASAN: slab-use-after-free in ip6_send_skb+0x18d/0x230 net/ipv6/ip6_output.c:1964
+Read of size 8 at addr ffff88806dde4858 by task syz.1.380/6530
+
+CPU: 1 UID: 0 PID: 6530 Comm: syz.1.380 Not tainted 6.11.0-rc3-syzkaller-00306-gdf6cbc62cc9b #0
+Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 08/06/2024
+Call Trace:
+ <TASK>
+  __dump_stack lib/dump_stack.c:93 [inline]
+  dump_stack_lvl+0x241/0x360 lib/dump_stack.c:119
+  print_address_description mm/kasan/report.c:377 [inline]
+  print_report+0x169/0x550 mm/kasan/report.c:488
+  kasan_report+0x143/0x180 mm/kasan/report.c:601
+  ip6_send_skb+0x18d/0x230 net/ipv6/ip6_output.c:1964
+  rawv6_push_pending_frames+0x75c/0x9e0 net/ipv6/raw.c:588
+  rawv6_sendmsg+0x19c7/0x23c0 net/ipv6/raw.c:926
+  sock_sendmsg_nosec net/socket.c:730 [inline]
+  __sock_sendmsg+0x1a6/0x270 net/socket.c:745
+  sock_write_iter+0x2dd/0x400 net/socket.c:1160
+ do_iter_readv_writev+0x60a/0x890
+  vfs_writev+0x37c/0xbb0 fs/read_write.c:971
+  do_writev+0x1b1/0x350 fs/read_write.c:1018
+  do_syscall_x64 arch/x86/entry/common.c:52 [inline]
+  do_syscall_64+0xf3/0x230 arch/x86/entry/common.c:83
+ entry_SYSCALL_64_after_hwframe+0x77/0x7f
+RIP: 0033:0x7f936bf79e79
+Code: ff ff c3 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 40 00 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 c7 c1 a8 ff ff ff f7 d8 64 89 01 48
+RSP: 002b:00007f936cd7f038 EFLAGS: 00000246 ORIG_RAX: 0000000000000014
+RAX: ffffffffffffffda RBX: 00007f936c115f80 RCX: 00007f936bf79e79
+RDX: 0000000000000001 RSI: 0000000020000040 RDI: 0000000000000004
+RBP: 00007f936bfe7916 R08: 0000000000000000 R09: 0000000000000000
+R10: 0000000000000000 R11: 0000000000000246 R12: 0000000000000000
+R13: 0000000000000000 R14: 00007f936c115f80 R15: 00007fff2860a7a8
+ </TASK>
+
+Allocated by task 6530:
+  kasan_save_stack mm/kasan/common.c:47 [inline]
+  kasan_save_track+0x3f/0x80 mm/kasan/common.c:68
+  unpoison_slab_object mm/kasan/common.c:312 [inline]
+  __kasan_slab_alloc+0x66/0x80 mm/kasan/common.c:338
+  kasan_slab_alloc include/linux/kasan.h:201 [inline]
+  slab_post_alloc_hook mm/slub.c:3988 [inline]
+  slab_alloc_node mm/slub.c:4037 [inline]
+  kmem_cache_alloc_noprof+0x135/0x2a0 mm/slub.c:4044
+  dst_alloc+0x12b/0x190 net/core/dst.c:89
+  ip6_blackhole_route+0x59/0x340 net/ipv6/route.c:2670
+  make_blackhole net/xfrm/xfrm_policy.c:3120 [inline]
+  xfrm_lookup_route+0xd1/0x1c0 net/xfrm/xfrm_policy.c:3313
+  ip6_dst_lookup_flow+0x13e/0x180 net/ipv6/ip6_output.c:1257
+  rawv6_sendmsg+0x1283/0x23c0 net/ipv6/raw.c:898
+  sock_sendmsg_nosec net/socket.c:730 [inline]
+  __sock_sendmsg+0x1a6/0x270 net/socket.c:745
+  ____sys_sendmsg+0x525/0x7d0 net/socket.c:2597
+  ___sys_sendmsg net/socket.c:2651 [inline]
+  __sys_sendmsg+0x2b0/0x3a0 net/socket.c:2680
+  do_syscall_x64 arch/x86/entry/common.c:52 [inline]
+  do_syscall_64+0xf3/0x230 arch/x86/entry/common.c:83
+ entry_SYSCALL_64_after_hwframe+0x77/0x7f
+
+Freed by task 45:
+  kasan_save_stack mm/kasan/common.c:47 [inline]
+  kasan_save_track+0x3f/0x80 mm/kasan/common.c:68
+  kasan_save_free_info+0x40/0x50 mm/kasan/generic.c:579
+  poison_slab_object+0xe0/0x150 mm/kasan/common.c:240
+  __kasan_slab_free+0x37/0x60 mm/kasan/common.c:256
+  kasan_slab_free include/linux/kasan.h:184 [inline]
+  slab_free_hook mm/slub.c:2252 [inline]
+  slab_free mm/slub.c:4473 [inline]
+  kmem_cache_free+0x145/0x350 mm/slub.c:4548
+  dst_destroy+0x2ac/0x460 net/core/dst.c:124
+  rcu_do_batch kernel/rcu/tree.c:2569 [inline]
+  rcu_core+0xafd/0x1830 kernel/rcu/tree.c:2843
+  handle_softirqs+0x2c4/0x970 kernel/softirq.c:554
+  __do_softirq kernel/softirq.c:588 [inline]
+  invoke_softirq kernel/softirq.c:428 [inline]
+  __irq_exit_rcu+0xf4/0x1c0 kernel/softirq.c:637
+  irq_exit_rcu+0x9/0x30 kernel/softirq.c:649
+  instr_sysvec_apic_timer_interrupt arch/x86/kernel/apic/apic.c:1043 [inline]
+  sysvec_apic_timer_interrupt+0xa6/0xc0 arch/x86/kernel/apic/apic.c:1043
+  asm_sysvec_apic_timer_interrupt+0x1a/0x20 arch/x86/include/asm/idtentry.h:702
+
+Last potentially related work creation:
+  kasan_save_stack+0x3f/0x60 mm/kasan/common.c:47
+  __kasan_record_aux_stack+0xac/0xc0 mm/kasan/generic.c:541
+  __call_rcu_common kernel/rcu/tree.c:3106 [inline]
+  call_rcu+0x167/0xa70 kernel/rcu/tree.c:3210
+  refdst_drop include/net/dst.h:263 [inline]
+  skb_dst_drop include/net/dst.h:275 [inline]
+  nf_ct_frag6_queue net/ipv6/netfilter/nf_conntrack_reasm.c:306 [inline]
+  nf_ct_frag6_gather+0xb9a/0x2080 net/ipv6/netfilter/nf_conntrack_reasm.c:485
+  ipv6_defrag+0x2c8/0x3c0 net/ipv6/netfilter/nf_defrag_ipv6_hooks.c:67
+  nf_hook_entry_hookfn include/linux/netfilter.h:154 [inline]
+  nf_hook_slow+0xc3/0x220 net/netfilter/core.c:626
+  nf_hook include/linux/netfilter.h:269 [inline]
+  __ip6_local_out+0x6fa/0x800 net/ipv6/output_core.c:143
+  ip6_local_out+0x26/0x70 net/ipv6/output_core.c:153
+  ip6_send_skb+0x112/0x230 net/ipv6/ip6_output.c:1959
+  rawv6_push_pending_frames+0x75c/0x9e0 net/ipv6/raw.c:588
+  rawv6_sendmsg+0x19c7/0x23c0 net/ipv6/raw.c:926
+  sock_sendmsg_nosec net/socket.c:730 [inline]
+  __sock_sendmsg+0x1a6/0x270 net/socket.c:745
+  sock_write_iter+0x2dd/0x400 net/socket.c:1160
+ do_iter_readv_writev+0x60a/0x890
+
+Fixes: 0625491493d9 ("ipv6: ip6_push_pending_frames() should increment IPSTATS_MIB_OUTDISCARDS")
+Signed-off-by: Eric Dumazet <edumazet@google.com>
+Reported-by: syzbot <syzkaller@googlegroups.com>
+Reviewed-by: David Ahern <dsahern@kernel.org>
+Link: https://patch.msgid.link/20240820160859.3786976-2-edumazet@google.com
+Signed-off-by: Jakub Kicinski <kuba@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ net/ipv6/ip6_output.c | 2 ++
+ 1 file changed, 2 insertions(+)
+
+diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c
+index df79044fbf3c4..796cf0a0a4225 100644
+--- a/net/ipv6/ip6_output.c
++++ b/net/ipv6/ip6_output.c
+@@ -1993,6 +1993,7 @@ int ip6_send_skb(struct sk_buff *skb)
+       struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
+       int err;
++      rcu_read_lock();
+       err = ip6_local_out(net, skb->sk, skb);
+       if (err) {
+               if (err > 0)
+@@ -2002,6 +2003,7 @@ int ip6_send_skb(struct sk_buff *skb)
+                                     IPSTATS_MIB_OUTDISCARDS);
+       }
++      rcu_read_unlock();
+       return err;
+ }
+-- 
+2.43.0
+
diff --git a/queue-6.1/kcm-serialise-kcm_sendmsg-for-the-same-socket.patch b/queue-6.1/kcm-serialise-kcm_sendmsg-for-the-same-socket.patch
new file mode 100644 (file)
index 0000000..cf93907
--- /dev/null
@@ -0,0 +1,223 @@
+From e7df5604f4167b01735313ee072e77d063209128 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Thu, 15 Aug 2024 15:04:37 -0700
+Subject: kcm: Serialise kcm_sendmsg() for the same socket.
+
+From: Kuniyuki Iwashima <kuniyu@amazon.com>
+
+[ Upstream commit 807067bf014d4a3ae2cc55bd3de16f22a01eb580 ]
+
+syzkaller reported a UAF in kcm_release(). [0]
+
+The scenario is
+
+  1. Thread A builds a skb with MSG_MORE and sets kcm->seq_skb.
+
+  2. Thread A resumes building skb from kcm->seq_skb but is blocked
+     by sk_stream_wait_memory()
+
+  3. Thread B calls sendmsg() concurrently, finishes building kcm->seq_skb
+     and puts the skb to the write queue
+
+  4. Thread A faces an error and finally frees the skb that is already
+     in the write queue
+
+  5. kcm_release() double-frees the skb in the write queue
+
+When a thread is building a MSG_MORE skb, another thread must not touch it.
+
+Let's add a per-sk mutex and serialise kcm_sendmsg().
+
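+A sketch of the resulting locking order (as in the diff below); the
+mutex sits outside lock_sock() because the socket lock is dropped while
+a sender sleeps in sk_stream_wait_memory():
+
+  mutex_lock(&kcm->tx_mutex);
+  lock_sock(sk);
+  /* build or resume kcm->seq_skb */
+  release_sock(sk);
+  mutex_unlock(&kcm->tx_mutex);
+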
+[0]:
+BUG: KASAN: slab-use-after-free in __skb_unlink include/linux/skbuff.h:2366 [inline]
+BUG: KASAN: slab-use-after-free in __skb_dequeue include/linux/skbuff.h:2385 [inline]
+BUG: KASAN: slab-use-after-free in __skb_queue_purge_reason include/linux/skbuff.h:3175 [inline]
+BUG: KASAN: slab-use-after-free in __skb_queue_purge include/linux/skbuff.h:3181 [inline]
+BUG: KASAN: slab-use-after-free in kcm_release+0x170/0x4c8 net/kcm/kcmsock.c:1691
+Read of size 8 at addr ffff0000ced0fc80 by task syz-executor329/6167
+
+CPU: 1 PID: 6167 Comm: syz-executor329 Tainted: G    B              6.8.0-rc5-syzkaller-g9abbc24128bc #0
+Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 01/25/2024
+Call trace:
+ dump_backtrace+0x1b8/0x1e4 arch/arm64/kernel/stacktrace.c:291
+ show_stack+0x2c/0x3c arch/arm64/kernel/stacktrace.c:298
+ __dump_stack lib/dump_stack.c:88 [inline]
+ dump_stack_lvl+0xd0/0x124 lib/dump_stack.c:106
+ print_address_description mm/kasan/report.c:377 [inline]
+ print_report+0x178/0x518 mm/kasan/report.c:488
+ kasan_report+0xd8/0x138 mm/kasan/report.c:601
+ __asan_report_load8_noabort+0x20/0x2c mm/kasan/report_generic.c:381
+ __skb_unlink include/linux/skbuff.h:2366 [inline]
+ __skb_dequeue include/linux/skbuff.h:2385 [inline]
+ __skb_queue_purge_reason include/linux/skbuff.h:3175 [inline]
+ __skb_queue_purge include/linux/skbuff.h:3181 [inline]
+ kcm_release+0x170/0x4c8 net/kcm/kcmsock.c:1691
+ __sock_release net/socket.c:659 [inline]
+ sock_close+0xa4/0x1e8 net/socket.c:1421
+ __fput+0x30c/0x738 fs/file_table.c:376
+ ____fput+0x20/0x30 fs/file_table.c:404
+ task_work_run+0x230/0x2e0 kernel/task_work.c:180
+ exit_task_work include/linux/task_work.h:38 [inline]
+ do_exit+0x618/0x1f64 kernel/exit.c:871
+ do_group_exit+0x194/0x22c kernel/exit.c:1020
+ get_signal+0x1500/0x15ec kernel/signal.c:2893
+ do_signal+0x23c/0x3b44 arch/arm64/kernel/signal.c:1249
+ do_notify_resume+0x74/0x1f4 arch/arm64/kernel/entry-common.c:148
+ exit_to_user_mode_prepare arch/arm64/kernel/entry-common.c:169 [inline]
+ exit_to_user_mode arch/arm64/kernel/entry-common.c:178 [inline]
+ el0_svc+0xac/0x168 arch/arm64/kernel/entry-common.c:713
+ el0t_64_sync_handler+0x84/0xfc arch/arm64/kernel/entry-common.c:730
+ el0t_64_sync+0x190/0x194 arch/arm64/kernel/entry.S:598
+
+Allocated by task 6166:
+ kasan_save_stack mm/kasan/common.c:47 [inline]
+ kasan_save_track+0x40/0x78 mm/kasan/common.c:68
+ kasan_save_alloc_info+0x70/0x84 mm/kasan/generic.c:626
+ unpoison_slab_object mm/kasan/common.c:314 [inline]
+ __kasan_slab_alloc+0x74/0x8c mm/kasan/common.c:340
+ kasan_slab_alloc include/linux/kasan.h:201 [inline]
+ slab_post_alloc_hook mm/slub.c:3813 [inline]
+ slab_alloc_node mm/slub.c:3860 [inline]
+ kmem_cache_alloc_node+0x204/0x4c0 mm/slub.c:3903
+ __alloc_skb+0x19c/0x3d8 net/core/skbuff.c:641
+ alloc_skb include/linux/skbuff.h:1296 [inline]
+ kcm_sendmsg+0x1d3c/0x2124 net/kcm/kcmsock.c:783
+ sock_sendmsg_nosec net/socket.c:730 [inline]
+ __sock_sendmsg net/socket.c:745 [inline]
+ sock_sendmsg+0x220/0x2c0 net/socket.c:768
+ splice_to_socket+0x7cc/0xd58 fs/splice.c:889
+ do_splice_from fs/splice.c:941 [inline]
+ direct_splice_actor+0xec/0x1d8 fs/splice.c:1164
+ splice_direct_to_actor+0x438/0xa0c fs/splice.c:1108
+ do_splice_direct_actor fs/splice.c:1207 [inline]
+ do_splice_direct+0x1e4/0x304 fs/splice.c:1233
+ do_sendfile+0x460/0xb3c fs/read_write.c:1295
+ __do_sys_sendfile64 fs/read_write.c:1362 [inline]
+ __se_sys_sendfile64 fs/read_write.c:1348 [inline]
+ __arm64_sys_sendfile64+0x160/0x3b4 fs/read_write.c:1348
+ __invoke_syscall arch/arm64/kernel/syscall.c:37 [inline]
+ invoke_syscall+0x98/0x2b8 arch/arm64/kernel/syscall.c:51
+ el0_svc_common+0x130/0x23c arch/arm64/kernel/syscall.c:136
+ do_el0_svc+0x48/0x58 arch/arm64/kernel/syscall.c:155
+ el0_svc+0x54/0x168 arch/arm64/kernel/entry-common.c:712
+ el0t_64_sync_handler+0x84/0xfc arch/arm64/kernel/entry-common.c:730
+ el0t_64_sync+0x190/0x194 arch/arm64/kernel/entry.S:598
+
+Freed by task 6167:
+ kasan_save_stack mm/kasan/common.c:47 [inline]
+ kasan_save_track+0x40/0x78 mm/kasan/common.c:68
+ kasan_save_free_info+0x5c/0x74 mm/kasan/generic.c:640
+ poison_slab_object+0x124/0x18c mm/kasan/common.c:241
+ __kasan_slab_free+0x3c/0x78 mm/kasan/common.c:257
+ kasan_slab_free include/linux/kasan.h:184 [inline]
+ slab_free_hook mm/slub.c:2121 [inline]
+ slab_free mm/slub.c:4299 [inline]
+ kmem_cache_free+0x15c/0x3d4 mm/slub.c:4363
+ kfree_skbmem+0x10c/0x19c
+ __kfree_skb net/core/skbuff.c:1109 [inline]
+ kfree_skb_reason+0x240/0x6f4 net/core/skbuff.c:1144
+ kfree_skb include/linux/skbuff.h:1244 [inline]
+ kcm_release+0x104/0x4c8 net/kcm/kcmsock.c:1685
+ __sock_release net/socket.c:659 [inline]
+ sock_close+0xa4/0x1e8 net/socket.c:1421
+ __fput+0x30c/0x738 fs/file_table.c:376
+ ____fput+0x20/0x30 fs/file_table.c:404
+ task_work_run+0x230/0x2e0 kernel/task_work.c:180
+ exit_task_work include/linux/task_work.h:38 [inline]
+ do_exit+0x618/0x1f64 kernel/exit.c:871
+ do_group_exit+0x194/0x22c kernel/exit.c:1020
+ get_signal+0x1500/0x15ec kernel/signal.c:2893
+ do_signal+0x23c/0x3b44 arch/arm64/kernel/signal.c:1249
+ do_notify_resume+0x74/0x1f4 arch/arm64/kernel/entry-common.c:148
+ exit_to_user_mode_prepare arch/arm64/kernel/entry-common.c:169 [inline]
+ exit_to_user_mode arch/arm64/kernel/entry-common.c:178 [inline]
+ el0_svc+0xac/0x168 arch/arm64/kernel/entry-common.c:713
+ el0t_64_sync_handler+0x84/0xfc arch/arm64/kernel/entry-common.c:730
+ el0t_64_sync+0x190/0x194 arch/arm64/kernel/entry.S:598
+
+The buggy address belongs to the object at ffff0000ced0fc80
+ which belongs to the cache skbuff_head_cache of size 240
+The buggy address is located 0 bytes inside of
+ freed 240-byte region [ffff0000ced0fc80, ffff0000ced0fd70)
+
+The buggy address belongs to the physical page:
+page:00000000d35f4ae4 refcount:1 mapcount:0 mapping:0000000000000000 index:0x0 pfn:0x10ed0f
+flags: 0x5ffc00000000800(slab|node=0|zone=2|lastcpupid=0x7ff)
+page_type: 0xffffffff()
+raw: 05ffc00000000800 ffff0000c1cbf640 fffffdffc3423100 dead000000000004
+raw: 0000000000000000 00000000000c000c 00000001ffffffff 0000000000000000
+page dumped because: kasan: bad access detected
+
+Memory state around the buggy address:
+ ffff0000ced0fb80: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
+ ffff0000ced0fc00: fb fb fb fb fb fb fc fc fc fc fc fc fc fc fc fc
+>ffff0000ced0fc80: fa fb fb fb fb fb fb fb fb fb fb fb fb fb fb fb
+                   ^
+ ffff0000ced0fd00: fb fb fb fb fb fb fb fb fb fb fb fb fb fb fc fc
+ ffff0000ced0fd80: fc fc fc fc fc fc fc fc fa fb fb fb fb fb fb fb
+
+Fixes: ab7ac4eb9832 ("kcm: Kernel Connection Multiplexor module")
+Reported-by: syzbot+b72d86aa5df17ce74c60@syzkaller.appspotmail.com
+Closes: https://syzkaller.appspot.com/bug?extid=b72d86aa5df17ce74c60
+Tested-by: syzbot+b72d86aa5df17ce74c60@syzkaller.appspotmail.com
+Signed-off-by: Kuniyuki Iwashima <kuniyu@amazon.com>
+Reviewed-by: Eric Dumazet <edumazet@google.com>
+Link: https://patch.msgid.link/20240815220437.69511-1-kuniyu@amazon.com
+Signed-off-by: Jakub Kicinski <kuba@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ include/net/kcm.h | 1 +
+ net/kcm/kcmsock.c | 4 ++++
+ 2 files changed, 5 insertions(+)
+
+diff --git a/include/net/kcm.h b/include/net/kcm.h
+index 2d704f8f49059..8e8252e08a9ce 100644
+--- a/include/net/kcm.h
++++ b/include/net/kcm.h
+@@ -70,6 +70,7 @@ struct kcm_sock {
+       struct work_struct tx_work;
+       struct list_head wait_psock_list;
+       struct sk_buff *seq_skb;
++      struct mutex tx_mutex;
+       u32 tx_stopped : 1;
+       /* Don't use bit fields here, these are set under different locks */
+diff --git a/net/kcm/kcmsock.c b/net/kcm/kcmsock.c
+index 7d37bf4334d26..462bdb6bfa4d8 100644
+--- a/net/kcm/kcmsock.c
++++ b/net/kcm/kcmsock.c
+@@ -912,6 +912,7 @@ static int kcm_sendmsg(struct socket *sock, struct msghdr *msg, size_t len)
+                 !(msg->msg_flags & MSG_MORE) : !!(msg->msg_flags & MSG_EOR);
+       int err = -EPIPE;
++      mutex_lock(&kcm->tx_mutex);
+       lock_sock(sk);
+       /* Per tcp_sendmsg this should be in poll */
+@@ -1060,6 +1061,7 @@ static int kcm_sendmsg(struct socket *sock, struct msghdr *msg, size_t len)
+       KCM_STATS_ADD(kcm->stats.tx_bytes, copied);
+       release_sock(sk);
++      mutex_unlock(&kcm->tx_mutex);
+       return copied;
+ out_error:
+@@ -1085,6 +1087,7 @@ static int kcm_sendmsg(struct socket *sock, struct msghdr *msg, size_t len)
+               sk->sk_write_space(sk);
+       release_sock(sk);
++      mutex_unlock(&kcm->tx_mutex);
+       return err;
+ }
+@@ -1325,6 +1328,7 @@ static void init_kcm_sock(struct kcm_sock *kcm, struct kcm_mux *mux)
+       spin_unlock_bh(&mux->lock);
+       INIT_WORK(&kcm->tx_work, kcm_tx_work);
++      mutex_init(&kcm->tx_mutex);
+       spin_lock_bh(&mux->rx_lock);
+       kcm_rcv_ready(kcm);
+-- 
+2.43.0
+
diff --git a/queue-6.1/net-dsa-mv88e6xxx-fix-out-of-bound-access.patch b/queue-6.1/net-dsa-mv88e6xxx-fix-out-of-bound-access.patch
new file mode 100644 (file)
index 0000000..c58ae50
--- /dev/null
@@ -0,0 +1,39 @@
+From 207617bd9d1d443b7230056673b65952ebb37384 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Mon, 19 Aug 2024 19:52:50 -0400
+Subject: net: dsa: mv88e6xxx: Fix out-of-bound access
+
+From: Joseph Huang <Joseph.Huang@garmin.com>
+
+[ Upstream commit 528876d867a23b5198022baf2e388052ca67c952 ]
+
+If an ATU violation was caused by a CPU Load operation, the SPID could
+be larger than DSA_MAX_PORTS (the size of mv88e6xxx_chip.ports[] array).
+
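+The fix is simply to bound the index before touching the per-port
+counter, as in the diff below:
+
+  if (spid < ARRAY_SIZE(chip->ports))
+          chip->ports[spid].atu_full_violation++;
+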
+Fixes: 75c05a74e745 ("net: dsa: mv88e6xxx: Fix counting of ATU violations")
+Signed-off-by: Joseph Huang <Joseph.Huang@garmin.com>
+Reviewed-by: Andrew Lunn <andrew@lunn.ch>
+Link: https://patch.msgid.link/20240819235251.1331763-1-Joseph.Huang@garmin.com
+Signed-off-by: Jakub Kicinski <kuba@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/net/dsa/mv88e6xxx/global1_atu.c | 3 ++-
+ 1 file changed, 2 insertions(+), 1 deletion(-)
+
+diff --git a/drivers/net/dsa/mv88e6xxx/global1_atu.c b/drivers/net/dsa/mv88e6xxx/global1_atu.c
+index 7c513a03789cf..17fd62616ce6d 100644
+--- a/drivers/net/dsa/mv88e6xxx/global1_atu.c
++++ b/drivers/net/dsa/mv88e6xxx/global1_atu.c
+@@ -453,7 +453,8 @@ static irqreturn_t mv88e6xxx_g1_atu_prob_irq_thread_fn(int irq, void *dev_id)
+               trace_mv88e6xxx_atu_full_violation(chip->dev, spid,
+                                                  entry.portvec, entry.mac,
+                                                  fid);
+-              chip->ports[spid].atu_full_violation++;
++              if (spid < ARRAY_SIZE(chip->ports))
++                      chip->ports[spid].atu_full_violation++;
+       }
+       mv88e6xxx_reg_unlock(chip);
+-- 
+2.43.0
+
diff --git a/queue-6.1/net-dsa-tag_ocelot-call-only-the-relevant-portion-of.patch b/queue-6.1/net-dsa-tag_ocelot-call-only-the-relevant-portion-of.patch
new file mode 100644 (file)
index 0000000..1841c8c
--- /dev/null
@@ -0,0 +1,142 @@
+From d9c54abf576a1a42a1f9242c86e246f29fcd9d88 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Fri, 21 Apr 2023 01:56:01 +0300
+Subject: net: dsa: tag_ocelot: call only the relevant portion of
+ __skb_vlan_pop() on TX
+
+From: Vladimir Oltean <vladimir.oltean@nxp.com>
+
+[ Upstream commit 0bcf2e4aca6c29a07555b713f2fb461dc38d5977 ]
+
+ocelot_xmit_get_vlan_info() calls __skb_vlan_pop() as the most
+appropriate helper I could find which strips away a VLAN header.
+That's all I need it to do, but __skb_vlan_pop() has more logic, which
+will become incompatible with the future revert of commit 6d1ccff62780
+("net: reset mac header in dev_start_xmit()").
+
+Namely, it performs a sanity check on skb_mac_header(), which will stop
+being set after the above revert, so it will return an error instead of
+removing the VLAN tag.
+
+ocelot_xmit_get_vlan_info() gets called in 2 circumstances:
+
+(1) the port is under a VLAN-aware bridge and the bridge sends
+    VLAN-tagged packets
+
+(2) the port is under a VLAN-aware bridge and somebody else (an 8021q
+    upper) sends VLAN-tagged packets (using a VID that isn't in the
+    bridge vlan tables)
+
+In case (1), there is actually no bug to defend against, because
+br_dev_xmit() calls skb_reset_mac_header() and things continue to work.
+
+However, in case (2), illustrated using the commands below, it can be
+seen that our intervention is needed, since __skb_vlan_pop() complains:
+
+$ ip link add br0 type bridge vlan_filtering 1 && ip link set br0 up
+$ ip link set $eth master br0 && ip link set $eth up
+$ ip link add link $eth name $eth.100 type vlan id 100 && ip link set $eth.100 up
+$ ip addr add 192.168.100.1/24 dev $eth.100
+
+I could fend off the checks in __skb_vlan_pop() with some
+skb_mac_header_was_set() calls, but seeing how few callers of
+__skb_vlan_pop() there are from TX paths, that seems rather
+unproductive.
+
+As an alternative solution, extract the bare minimum logic to strip a
+VLAN header, and move it to a new helper named vlan_remove_tag(), close
+to the definition of vlan_insert_tag(). Document it appropriately and
+make ocelot_xmit_get_vlan_info() call this smaller helper instead.
+
+Seeing that it doesn't appear illegal to test skb->protocol in the TX
+path, I guess it would be good for vlan_remove_tag() to also absorb
+the vlan_set_encap_proto() function call.
+
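+The essence of the new helper (full body and kerneldoc are in the diff
+below): read the TCI, slide the MAC addresses over the tag, fix up
+skb->protocol and pull VLAN_HLEN:
+
+  *vlan_tci = ntohs(vhdr->h_vlan_TCI);
+  memmove(skb->data + VLAN_HLEN, skb->data, 2 * ETH_ALEN);
+  vlan_set_encap_proto(skb, vhdr);
+  return __skb_pull(skb, VLAN_HLEN);
+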
+Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>
+Reviewed-by: Simon Horman <simon.horman@corigine.com>
+Reviewed-by: Florian Fainelli <f.fainelli@gmail.com>
+Reviewed-by: Eric Dumazet <edumazet@google.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Stable-dep-of: 67c3ca2c5cfe ("net: mscc: ocelot: use ocelot_xmit_get_vlan_info() also for FDMA and register injection")
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ include/linux/if_vlan.h | 21 +++++++++++++++++++++
+ net/core/skbuff.c       |  8 +-------
+ net/dsa/tag_ocelot.c    |  2 +-
+ 3 files changed, 23 insertions(+), 8 deletions(-)
+
+diff --git a/include/linux/if_vlan.h b/include/linux/if_vlan.h
+index e0d0a645be7cf..83266201746c1 100644
+--- a/include/linux/if_vlan.h
++++ b/include/linux/if_vlan.h
+@@ -704,6 +704,27 @@ static inline void vlan_set_encap_proto(struct sk_buff *skb,
+               skb->protocol = htons(ETH_P_802_2);
+ }
++/**
++ * vlan_remove_tag - remove outer VLAN tag from payload
++ * @skb: skbuff to remove tag from
++ * @vlan_tci: buffer to store value
++ *
++ * Expects the skb to contain a VLAN tag in the payload, and to have skb->data
++ * pointing at the MAC header.
++ *
++ * Returns a new pointer to skb->data, or NULL on failure to pull.
++ */
++static inline void *vlan_remove_tag(struct sk_buff *skb, u16 *vlan_tci)
++{
++      struct vlan_hdr *vhdr = (struct vlan_hdr *)(skb->data + ETH_HLEN);
++
++      *vlan_tci = ntohs(vhdr->h_vlan_TCI);
++
++      memmove(skb->data + VLAN_HLEN, skb->data, 2 * ETH_ALEN);
++      vlan_set_encap_proto(skb, vhdr);
++      return __skb_pull(skb, VLAN_HLEN);
++}
++
+ /**
+  * skb_vlan_tagged - check if skb is vlan tagged.
+  * @skb: skbuff to query
+diff --git a/net/core/skbuff.c b/net/core/skbuff.c
+index 4d46788cd493a..768b8d65a5baa 100644
+--- a/net/core/skbuff.c
++++ b/net/core/skbuff.c
+@@ -5791,7 +5791,6 @@ EXPORT_SYMBOL(skb_ensure_writable);
+  */
+ int __skb_vlan_pop(struct sk_buff *skb, u16 *vlan_tci)
+ {
+-      struct vlan_hdr *vhdr;
+       int offset = skb->data - skb_mac_header(skb);
+       int err;
+@@ -5807,13 +5806,8 @@ int __skb_vlan_pop(struct sk_buff *skb, u16 *vlan_tci)
+       skb_postpull_rcsum(skb, skb->data + (2 * ETH_ALEN), VLAN_HLEN);
+-      vhdr = (struct vlan_hdr *)(skb->data + ETH_HLEN);
+-      *vlan_tci = ntohs(vhdr->h_vlan_TCI);
+-
+-      memmove(skb->data + VLAN_HLEN, skb->data, 2 * ETH_ALEN);
+-      __skb_pull(skb, VLAN_HLEN);
++      vlan_remove_tag(skb, vlan_tci);
+-      vlan_set_encap_proto(skb, vhdr);
+       skb->mac_header += VLAN_HLEN;
+       if (skb_network_offset(skb) < ETH_HLEN)
+diff --git a/net/dsa/tag_ocelot.c b/net/dsa/tag_ocelot.c
+index afca3cdf190a0..18dda9423fae5 100644
+--- a/net/dsa/tag_ocelot.c
++++ b/net/dsa/tag_ocelot.c
+@@ -26,7 +26,7 @@ static void ocelot_xmit_get_vlan_info(struct sk_buff *skb, struct dsa_port *dp,
+       br_vlan_get_proto(br, &proto);
+       if (ntohs(hdr->h_vlan_proto) == proto) {
+-              __skb_vlan_pop(skb, &tci);
++              vlan_remove_tag(skb, &tci);
+               *vlan_tci = tci;
+       } else {
+               rcu_read_lock();
+-- 
+2.43.0
+
diff --git a/queue-6.1/net-dsa-tag_ocelot-do-not-rely-on-skb_mac_header-for.patch b/queue-6.1/net-dsa-tag_ocelot-do-not-rely-on-skb_mac_header-for.patch
new file mode 100644 (file)
index 0000000..d3b3cc7
--- /dev/null
@@ -0,0 +1,42 @@
+From a87a0ea313ec6e65248cb6705d3ffc929d4944b7 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Fri, 21 Apr 2023 01:55:56 +0300
+Subject: net: dsa: tag_ocelot: do not rely on skb_mac_header() for VLAN xmit
+
+From: Vladimir Oltean <vladimir.oltean@nxp.com>
+
+[ Upstream commit eabb1494c9f20362ae53a9991481a1523be4f4b7 ]
+
+skb_mac_header() will no longer be available in the TX path when
+reverting commit 6d1ccff62780 ("net: reset mac header in
+dev_start_xmit()"). As preparation for that, let's use
+skb_vlan_eth_hdr() to get to the VLAN header instead, which assumes it's
+located at skb->data (assumption which holds true here).
+
+Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>
+Reviewed-by: Eric Dumazet <edumazet@google.com>
+Reviewed-by: Simon Horman <simon.horman@corigine.com>
+Reviewed-by: Florian Fainelli <f.fainelli@gmail.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Stable-dep-of: 67c3ca2c5cfe ("net: mscc: ocelot: use ocelot_xmit_get_vlan_info() also for FDMA and register injection")
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ net/dsa/tag_ocelot.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+diff --git a/net/dsa/tag_ocelot.c b/net/dsa/tag_ocelot.c
+index 0d81f172b7a6e..afca3cdf190a0 100644
+--- a/net/dsa/tag_ocelot.c
++++ b/net/dsa/tag_ocelot.c
+@@ -22,7 +22,7 @@ static void ocelot_xmit_get_vlan_info(struct sk_buff *skb, struct dsa_port *dp,
+               return;
+       }
+-      hdr = (struct vlan_ethhdr *)skb_mac_header(skb);
++      hdr = skb_vlan_eth_hdr(skb);
+       br_vlan_get_proto(br, &proto);
+       if (ntohs(hdr->h_vlan_proto) == proto) {
+-- 
+2.43.0
+
diff --git a/queue-6.1/net-mctp-test-use-correct-skb-for-route-input-check.patch b/queue-6.1/net-mctp-test-use-correct-skb-for-route-input-check.patch
new file mode 100644 (file)
index 0000000..9a75c85
--- /dev/null
@@ -0,0 +1,44 @@
+From e84e8a7932841ba91ffec6b2455e57edb81eb494 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Fri, 16 Aug 2024 18:29:17 +0800
+Subject: net: mctp: test: Use correct skb for route input check
+
+From: Jeremy Kerr <jk@codeconstruct.com.au>
+
+[ Upstream commit ce335db0621648472f9bb4b7191eb2e13a5793cf ]
+
+In the MCTP route input test, we're routing one skb, then (when delivery
+is expected) checking the resulting routed skb.
+
+However, we're currently checking the original skb length, rather than
+the routed skb. Check the routed skb instead; the original will have
+been freed at this point.
+
+Fixes: 8892c0490779 ("mctp: Add route input to socket tests")
+Reported-by: Dan Carpenter <dan.carpenter@linaro.org>
+Closes: https://lore.kernel.org/kernel-janitors/4ad204f0-94cf-46c5-bdab-49592addf315@kili.mountain/
+Signed-off-by: Jeremy Kerr <jk@codeconstruct.com.au>
+Reviewed-by: Simon Horman <horms@kernel.org>
+Link: https://patch.msgid.link/20240816-mctp-kunit-skb-fix-v1-1-3c367ac89c27@codeconstruct.com.au
+Signed-off-by: Jakub Kicinski <kuba@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ net/mctp/test/route-test.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+diff --git a/net/mctp/test/route-test.c b/net/mctp/test/route-test.c
+index 92ea4158f7fc4..a944490a724d3 100644
+--- a/net/mctp/test/route-test.c
++++ b/net/mctp/test/route-test.c
+@@ -354,7 +354,7 @@ static void mctp_test_route_input_sk(struct kunit *test)
+               skb2 = skb_recv_datagram(sock->sk, MSG_DONTWAIT, &rc);
+               KUNIT_EXPECT_NOT_ERR_OR_NULL(test, skb2);
+-              KUNIT_EXPECT_EQ(test, skb->len, 1);
++              KUNIT_EXPECT_EQ(test, skb2->len, 1);
+               skb_free_datagram(sock->sk, skb2);
+-- 
+2.43.0
+
diff --git a/queue-6.1/net-mscc-ocelot-fix-qos-class-for-injected-packets-w.patch b/queue-6.1/net-mscc-ocelot-fix-qos-class-for-injected-packets-w.patch
new file mode 100644 (file)
index 0000000..c76e77d
--- /dev/null
@@ -0,0 +1,106 @@
+From a1a6eed1e5817d51720741cbf6a3b6432a68d45b Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Thu, 15 Aug 2024 03:07:03 +0300
+Subject: net: mscc: ocelot: fix QoS class for injected packets with
+ "ocelot-8021q"
+
+From: Vladimir Oltean <vladimir.oltean@nxp.com>
+
+[ Upstream commit e1b9e80236c540fa85d76e2d510d1b38e1968c5d ]
+
+There are 2 distinct code paths (listed below) in the source code which
+set up an injection header for Ocelot(-like) switches. Code path (2)
+fails to set the QoS class and source port correctly. The improper QoS
+classification is especially a problem for the "ocelot-8021q"
+alternative DSA tagging protocol, because we support tc-taprio and each
+packet needs to be scheduled precisely through its time slot. This
+includes PTP, which is normally assigned to a traffic class other than
+0, but would be sent through TC 0 nonetheless.
+
+The code paths are:
+
+(1) ocelot_xmit_common() from net/dsa/tag_ocelot.c - called only by the
+    standard "ocelot" DSA tagging protocol which uses NPI-based
+    injection - sets up bit fields in the tag manually to account for
+    a small difference (destination port offset) between Ocelot and
+    Seville. Namely, ocelot_ifh_set_dest() is omitted from
+    ocelot_xmit_common(), because there's also seville_ifh_set_dest().
+
+(2) ocelot_ifh_set_basic(), called by:
+    - ocelot_fdma_prepare_skb() for FDMA transmission of the ocelot
+      switchdev driver
+    - ocelot_port_xmit() -> ocelot_port_inject_frame() for
+      register-based transmission of the ocelot switchdev driver
+    - felix_port_deferred_xmit() -> ocelot_port_inject_frame() for the
+      DSA tagger ocelot-8021q when it must transmit PTP frames (also
+      through register-based injection).
+    This second code path sets the bit fields according to its own logic.
+
+The problem is that (2) doesn't call ocelot_ifh_set_qos_class().
+Copying that logic from ocelot_xmit_common() fixes that.
+
+Unfortunately, although desirable, it is not easily possible to
+de-duplicate code paths (1) and (2), and make net/dsa/tag_ocelot.c
+directly call ocelot_ifh_set_basic(), because of the ocelot/seville
+difference. This is the "minimal" fix with some logic duplicated (but
+at least more consolidated).
+
+Fixes: 0a6f17c6ae21 ("net: dsa: tag_ocelot_8021q: add support for PTP timestamping")
+Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/net/ethernet/mscc/ocelot.c      | 10 +++++++++-
+ drivers/net/ethernet/mscc/ocelot_fdma.c |  1 -
+ 2 files changed, 9 insertions(+), 2 deletions(-)
+
+diff --git a/drivers/net/ethernet/mscc/ocelot.c b/drivers/net/ethernet/mscc/ocelot.c
+index b594f3054afb6..ad179a9dca61a 100644
+--- a/drivers/net/ethernet/mscc/ocelot.c
++++ b/drivers/net/ethernet/mscc/ocelot.c
+@@ -1103,13 +1103,21 @@ void ocelot_ifh_set_basic(void *ifh, struct ocelot *ocelot, int port,
+                         u32 rew_op, struct sk_buff *skb)
+ {
+       struct ocelot_port *ocelot_port = ocelot->ports[port];
++      struct net_device *dev = skb->dev;
+       u64 vlan_tci, tag_type;
++      int qos_class;
+       ocelot_xmit_get_vlan_info(skb, ocelot_port->bridge, &vlan_tci,
+                                 &tag_type);
++      qos_class = netdev_get_num_tc(dev) ?
++                  netdev_get_prio_tc_map(dev, skb->priority) : skb->priority;
++
++      memset(ifh, 0, OCELOT_TAG_LEN);
+       ocelot_ifh_set_bypass(ifh, 1);
++      ocelot_ifh_set_src(ifh, BIT_ULL(ocelot->num_phys_ports));
+       ocelot_ifh_set_dest(ifh, BIT_ULL(port));
++      ocelot_ifh_set_qos_class(ifh, qos_class);
+       ocelot_ifh_set_tag_type(ifh, tag_type);
+       ocelot_ifh_set_vlan_tci(ifh, vlan_tci);
+       if (rew_op)
+@@ -1120,7 +1128,7 @@ EXPORT_SYMBOL(ocelot_ifh_set_basic);
+ void ocelot_port_inject_frame(struct ocelot *ocelot, int port, int grp,
+                             u32 rew_op, struct sk_buff *skb)
+ {
+-      u32 ifh[OCELOT_TAG_LEN / 4] = {0};
++      u32 ifh[OCELOT_TAG_LEN / 4];
+       unsigned int i, count, last;
+       ocelot_write_rix(ocelot, QS_INJ_CTRL_GAP_SIZE(1) |
+diff --git a/drivers/net/ethernet/mscc/ocelot_fdma.c b/drivers/net/ethernet/mscc/ocelot_fdma.c
+index e9d2e96adb229..cc9bce5a4dcdf 100644
+--- a/drivers/net/ethernet/mscc/ocelot_fdma.c
++++ b/drivers/net/ethernet/mscc/ocelot_fdma.c
+@@ -665,7 +665,6 @@ static int ocelot_fdma_prepare_skb(struct ocelot *ocelot, int port, u32 rew_op,
+       ifh = skb_push(skb, OCELOT_TAG_LEN);
+       skb_put(skb, ETH_FCS_LEN);
+-      memset(ifh, 0, OCELOT_TAG_LEN);
+       ocelot_ifh_set_basic(ifh, ocelot, port, rew_op, skb);
+       return 0;
+-- 
+2.43.0
+
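As a reading aid, the QoS class selection copied into ocelot_ifh_set_basic() above boils down to the sketch below; ocelot_skb_qos_class() is a hypothetical name, not a kernel symbol:

#include <linux/netdevice.h>
#include <linux/skbuff.h>

/* Hedged sketch, equivalent to the ternary added above: when the netdev
 * has traffic classes (e.g. configured via tc-taprio or mqprio), map
 * skb->priority through the prio-to-TC table; otherwise use
 * skb->priority directly.
 */
static int ocelot_skb_qos_class(struct net_device *dev, struct sk_buff *skb)
{
	if (netdev_get_num_tc(dev))
		return netdev_get_prio_tc_map(dev, skb->priority);
	return skb->priority;
}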
diff --git a/queue-6.1/net-mscc-ocelot-serialize-access-to-the-injection-ex.patch b/queue-6.1/net-mscc-ocelot-serialize-access-to-the-injection-ex.patch
new file mode 100644 (file)
index 0000000..a21eeb3
--- /dev/null
@@ -0,0 +1,245 @@
+From ec63457374787b7992800dee9839023c323de015 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Thu, 15 Aug 2024 03:07:04 +0300
+Subject: net: mscc: ocelot: serialize access to the injection/extraction
+ groups
+
+From: Vladimir Oltean <vladimir.oltean@nxp.com>
+
+[ Upstream commit c5e12ac3beb0dd3a718296b2d8af5528e9ab728e ]
+
+As explained by Horatiu Vultur in commit 603ead96582d ("net: sparx5: Add
+spinlock for frame transmission from CPU") which is for a similar
+hardware design, multiple CPUs can simultaneously perform injection
+or extraction. There are only 2 register groups for injection and 2
+for extraction, and the driver only uses one of each. So we'd better
+serialize access using spin locks, otherwise frame corruption is
+possible.
+
+Note that unlike in sparx5, FDMA in ocelot does not have this issue
+because struct ocelot_fdma_tx_ring already contains an xmit_lock.
+
+I guess this is mostly a problem for NXP LS1028A, as that is dual core.
+I don't think VSC7514 is. So I'm blaming the commit where LS1028A (aka
+the felix DSA driver) started using register-based packet injection and
+extraction.
+
+Fixes: 0a6f17c6ae21 ("net: dsa: tag_ocelot_8021q: add support for PTP timestamping")
+Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/net/dsa/ocelot/felix.c             | 11 +++++
+ drivers/net/ethernet/mscc/ocelot.c         | 52 ++++++++++++++++++++++
+ drivers/net/ethernet/mscc/ocelot_vsc7514.c |  4 ++
+ include/soc/mscc/ocelot.h                  |  9 ++++
+ 4 files changed, 76 insertions(+)
+
+diff --git a/drivers/net/dsa/ocelot/felix.c b/drivers/net/dsa/ocelot/felix.c
+index 2d2c6f941272c..73da407bb0685 100644
+--- a/drivers/net/dsa/ocelot/felix.c
++++ b/drivers/net/dsa/ocelot/felix.c
+@@ -528,7 +528,9 @@ static int felix_tag_8021q_setup(struct dsa_switch *ds)
+        * so we need to be careful that there are no extra frames to be
+        * dequeued over MMIO, since we would never know to discard them.
+        */
++      ocelot_lock_xtr_grp_bh(ocelot, 0);
+       ocelot_drain_cpu_queue(ocelot, 0);
++      ocelot_unlock_xtr_grp_bh(ocelot, 0);
+       return 0;
+ }
+@@ -1493,6 +1495,8 @@ static void felix_port_deferred_xmit(struct kthread_work *work)
+       int port = xmit_work->dp->index;
+       int retries = 10;
++      ocelot_lock_inj_grp(ocelot, 0);
++
+       do {
+               if (ocelot_can_inject(ocelot, 0))
+                       break;
+@@ -1501,6 +1505,7 @@ static void felix_port_deferred_xmit(struct kthread_work *work)
+       } while (--retries);
+       if (!retries) {
++              ocelot_unlock_inj_grp(ocelot, 0);
+               dev_err(ocelot->dev, "port %d failed to inject skb\n",
+                       port);
+               ocelot_port_purge_txtstamp_skb(ocelot, port, skb);
+@@ -1510,6 +1515,8 @@ static void felix_port_deferred_xmit(struct kthread_work *work)
+       ocelot_port_inject_frame(ocelot, port, 0, rew_op, skb);
++      ocelot_unlock_inj_grp(ocelot, 0);
++
+       consume_skb(skb);
+       kfree(xmit_work);
+ }
+@@ -1658,6 +1665,8 @@ static bool felix_check_xtr_pkt(struct ocelot *ocelot)
+       if (!felix->info->quirk_no_xtr_irq)
+               return false;
++      ocelot_lock_xtr_grp(ocelot, grp);
++
+       while (ocelot_read(ocelot, QS_XTR_DATA_PRESENT) & BIT(grp)) {
+               struct sk_buff *skb;
+               unsigned int type;
+@@ -1694,6 +1703,8 @@ static bool felix_check_xtr_pkt(struct ocelot *ocelot)
+               ocelot_drain_cpu_queue(ocelot, 0);
+       }
++      ocelot_unlock_xtr_grp(ocelot, grp);
++
+       return true;
+ }
+diff --git a/drivers/net/ethernet/mscc/ocelot.c b/drivers/net/ethernet/mscc/ocelot.c
+index ad179a9dca61a..310a36356f568 100644
+--- a/drivers/net/ethernet/mscc/ocelot.c
++++ b/drivers/net/ethernet/mscc/ocelot.c
+@@ -994,6 +994,48 @@ void ocelot_ptp_rx_timestamp(struct ocelot *ocelot, struct sk_buff *skb,
+ }
+ EXPORT_SYMBOL(ocelot_ptp_rx_timestamp);
++void ocelot_lock_inj_grp(struct ocelot *ocelot, int grp)
++                       __acquires(&ocelot->inj_lock)
++{
++      spin_lock(&ocelot->inj_lock);
++}
++EXPORT_SYMBOL_GPL(ocelot_lock_inj_grp);
++
++void ocelot_unlock_inj_grp(struct ocelot *ocelot, int grp)
++                         __releases(&ocelot->inj_lock)
++{
++      spin_unlock(&ocelot->inj_lock);
++}
++EXPORT_SYMBOL_GPL(ocelot_unlock_inj_grp);
++
++void ocelot_lock_xtr_grp(struct ocelot *ocelot, int grp)
++                       __acquires(&ocelot->xtr_lock)
++{
++      spin_lock(&ocelot->xtr_lock);
++}
++EXPORT_SYMBOL_GPL(ocelot_lock_xtr_grp);
++
++void ocelot_unlock_xtr_grp(struct ocelot *ocelot, int grp)
++                         __releases(&ocelot->xtr_lock)
++{
++      spin_unlock(&ocelot->xtr_lock);
++}
++EXPORT_SYMBOL_GPL(ocelot_unlock_xtr_grp);
++
++void ocelot_lock_xtr_grp_bh(struct ocelot *ocelot, int grp)
++                          __acquires(&ocelot->xtr_lock)
++{
++      spin_lock_bh(&ocelot->xtr_lock);
++}
++EXPORT_SYMBOL_GPL(ocelot_lock_xtr_grp_bh);
++
++void ocelot_unlock_xtr_grp_bh(struct ocelot *ocelot, int grp)
++                            __releases(&ocelot->xtr_lock)
++{
++      spin_unlock_bh(&ocelot->xtr_lock);
++}
++EXPORT_SYMBOL_GPL(ocelot_unlock_xtr_grp_bh);
++
+ int ocelot_xtr_poll_frame(struct ocelot *ocelot, int grp, struct sk_buff **nskb)
+ {
+       u64 timestamp, src_port, len;
+@@ -1004,6 +1046,8 @@ int ocelot_xtr_poll_frame(struct ocelot *ocelot, int grp, struct sk_buff **nskb)
+       u32 val, *buf;
+       int err;
++      lockdep_assert_held(&ocelot->xtr_lock);
++
+       err = ocelot_xtr_poll_xfh(ocelot, grp, xfh);
+       if (err)
+               return err;
+@@ -1079,6 +1123,8 @@ bool ocelot_can_inject(struct ocelot *ocelot, int grp)
+ {
+       u32 val = ocelot_read(ocelot, QS_INJ_STATUS);
++      lockdep_assert_held(&ocelot->inj_lock);
++
+       if (!(val & QS_INJ_STATUS_FIFO_RDY(BIT(grp))))
+               return false;
+       if (val & QS_INJ_STATUS_WMARK_REACHED(BIT(grp)))
+@@ -1131,6 +1177,8 @@ void ocelot_port_inject_frame(struct ocelot *ocelot, int port, int grp,
+       u32 ifh[OCELOT_TAG_LEN / 4];
+       unsigned int i, count, last;
++      lockdep_assert_held(&ocelot->inj_lock);
++
+       ocelot_write_rix(ocelot, QS_INJ_CTRL_GAP_SIZE(1) |
+                        QS_INJ_CTRL_SOF, QS_INJ_CTRL, grp);
+@@ -1167,6 +1215,8 @@ EXPORT_SYMBOL(ocelot_port_inject_frame);
+ void ocelot_drain_cpu_queue(struct ocelot *ocelot, int grp)
+ {
++      lockdep_assert_held(&ocelot->xtr_lock);
++
+       while (ocelot_read(ocelot, QS_XTR_DATA_PRESENT) & BIT(grp))
+               ocelot_read_rix(ocelot, QS_XTR_RD, grp);
+ }
+@@ -2758,6 +2808,8 @@ int ocelot_init(struct ocelot *ocelot)
+       mutex_init(&ocelot->tas_lock);
+       spin_lock_init(&ocelot->ptp_clock_lock);
+       spin_lock_init(&ocelot->ts_id_lock);
++      spin_lock_init(&ocelot->inj_lock);
++      spin_lock_init(&ocelot->xtr_lock);
+       ocelot->owq = alloc_ordered_workqueue("ocelot-owq", 0);
+       if (!ocelot->owq)
+diff --git a/drivers/net/ethernet/mscc/ocelot_vsc7514.c b/drivers/net/ethernet/mscc/ocelot_vsc7514.c
+index 6f22aea08a644..bf39a053dc82f 100644
+--- a/drivers/net/ethernet/mscc/ocelot_vsc7514.c
++++ b/drivers/net/ethernet/mscc/ocelot_vsc7514.c
+@@ -159,6 +159,8 @@ static irqreturn_t ocelot_xtr_irq_handler(int irq, void *arg)
+       struct ocelot *ocelot = arg;
+       int grp = 0, err;
++      ocelot_lock_xtr_grp(ocelot, grp);
++
+       while (ocelot_read(ocelot, QS_XTR_DATA_PRESENT) & BIT(grp)) {
+               struct sk_buff *skb;
+@@ -177,6 +179,8 @@ static irqreturn_t ocelot_xtr_irq_handler(int irq, void *arg)
+       if (err < 0)
+               ocelot_drain_cpu_queue(ocelot, 0);
++      ocelot_unlock_xtr_grp(ocelot, grp);
++
+       return IRQ_HANDLED;
+ }
+diff --git a/include/soc/mscc/ocelot.h b/include/soc/mscc/ocelot.h
+index 9b904ea2f0db9..9b5562f545486 100644
+--- a/include/soc/mscc/ocelot.h
++++ b/include/soc/mscc/ocelot.h
+@@ -977,6 +977,9 @@ struct ocelot {
+       const struct ocelot_stat_layout *stats_layout;
+       struct list_head                stats_regions;
++      spinlock_t                      inj_lock;
++      spinlock_t                      xtr_lock;
++
+       u32                             pool_size[OCELOT_SB_NUM][OCELOT_SB_POOL_NUM];
+       int                             packet_buffer_size;
+       int                             num_frame_refs;
+@@ -1125,6 +1128,12 @@ void __ocelot_target_write_ix(struct ocelot *ocelot, enum ocelot_target target,
+                             u32 val, u32 reg, u32 offset);
+ /* Packet I/O */
++void ocelot_lock_inj_grp(struct ocelot *ocelot, int grp);
++void ocelot_unlock_inj_grp(struct ocelot *ocelot, int grp);
++void ocelot_lock_xtr_grp(struct ocelot *ocelot, int grp);
++void ocelot_unlock_xtr_grp(struct ocelot *ocelot, int grp);
++void ocelot_lock_xtr_grp_bh(struct ocelot *ocelot, int grp);
++void ocelot_unlock_xtr_grp_bh(struct ocelot *ocelot, int grp);
+ bool ocelot_can_inject(struct ocelot *ocelot, int grp);
+ void ocelot_port_inject_frame(struct ocelot *ocelot, int port, int grp,
+                             u32 rew_op, struct sk_buff *skb);
+-- 
+2.43.0
+
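For orientation, the expected caller pattern after this patch looks roughly like the sketch below; example_register_inject() is illustrative, not a kernel symbol:

#include <linux/skbuff.h>
#include <soc/mscc/ocelot.h>

/* Hedged sketch: register-based injection must now hold the injection
 * group lock across the can-inject check and the frame write, so two
 * CPUs cannot interleave words of different frames into one group.
 */
static void example_register_inject(struct ocelot *ocelot, int port,
				    struct sk_buff *skb)
{
	ocelot_lock_inj_grp(ocelot, 0);

	if (ocelot_can_inject(ocelot, 0))
		ocelot_port_inject_frame(ocelot, port, 0, 0, skb);

	ocelot_unlock_inj_grp(ocelot, 0);
}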
diff --git a/queue-6.1/net-mscc-ocelot-use-ocelot_xmit_get_vlan_info-also-f.patch b/queue-6.1/net-mscc-ocelot-use-ocelot_xmit_get_vlan_info-also-f.patch
new file mode 100644 (file)
index 0000000..0f2da98
--- /dev/null
@@ -0,0 +1,347 @@
+From ad707ca9b851301fb030d7047d93c8c95de029e0 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Thu, 15 Aug 2024 03:07:02 +0300
+Subject: net: mscc: ocelot: use ocelot_xmit_get_vlan_info() also for FDMA and
+ register injection
+
+From: Vladimir Oltean <vladimir.oltean@nxp.com>
+
+[ Upstream commit 67c3ca2c5cfe6a50772514e3349b5e7b3b0fac03 ]
+
+Problem description
+-------------------
+
+On an NXP LS1028A (felix DSA driver) with the following configuration:
+
+- ocelot-8021q tagging protocol
+- VLAN-aware bridge (with STP) spanning at least swp0 and swp1
+- 8021q VLAN upper interfaces on swp0 and swp1: swp0.700, swp1.700
+- ptp4l on swp0.700 and swp1.700
+
+we see that the ptp4l instances do not see each other's traffic,
+and they all go to the grand master state due to the
+ANNOUNCE_RECEIPT_TIMEOUT_EXPIRES condition.
+
+Jumping to the conclusion for the impatient
+-------------------------------------------
+
+There is a zero-day bug in the ocelot switchdev driver in the way it
+handles VLAN-tagged packet injection. The correct logic already exists in
+the source code, in function ocelot_xmit_get_vlan_info() added by commit
+5ca721c54d86 ("net: dsa: tag_ocelot: set the classified VLAN during xmit").
+But it is used only for normal NPI-based injection with the DSA "ocelot"
+tagging protocol. The other injection code paths (register-based and
+FDMA-based) roll their own wrong logic. This affects and was noticed on
+the DSA "ocelot-8021q" protocol because it uses register-based injection.
+
+By moving ocelot_xmit_get_vlan_info() to a place that's common for both
+the DSA tagger and the ocelot switch library, it can also be called from
+ocelot_port_inject_frame() in ocelot.c.
+
+We need to touch the lines with ocelot_ifh_port_set()'s prototype
+anyway, so let's rename it to something clearer regarding what it does,
+and add a kernel-doc. ocelot_ifh_set_basic() should do.
+
+Investigation notes
+-------------------
+
+Debugging reveals that PTP event (aka those carrying timestamps, like
+Sync) frames injected into swp0.700 (but also swp1.700) hit the wire
+with two VLAN tags:
+
+00000000: 01 1b 19 00 00 00 00 01 02 03 04 05 81 00 02 bc
+                                              ~~~~~~~~~~~
+00000010: 81 00 02 bc 88 f7 00 12 00 2c 00 00 02 00 00 00
+          ~~~~~~~~~~~
+00000020: 00 00 00 00 00 00 00 00 00 00 00 01 02 ff fe 03
+00000030: 04 05 00 01 00 04 00 00 00 00 00 00 00 00 00 00
+00000040: 00 00
+
+The second (unexpected) VLAN tag makes felix_check_xtr_pkt() ->
+ptp_classify_raw() fail to see these as PTP packets at the link
+partner's receiving end, and return PTP_CLASS_NONE (because the BPF
+classifier is not written to expect 2 VLAN tags).
+
+The reason why packets have 2 VLAN tags is because the transmission
+code treats VLAN incorrectly.
+
+Neither ocelot switchdev, nor felix DSA, declare the NETIF_F_HW_VLAN_CTAG_TX
+feature. Therefore, at xmit time, all VLANs should be in the skb head,
+and none should be in the hwaccel area. This is done by:
+
+static struct sk_buff *validate_xmit_vlan(struct sk_buff *skb,
+                                         netdev_features_t features)
+{
+       if (skb_vlan_tag_present(skb) &&
+           !vlan_hw_offload_capable(features, skb->vlan_proto))
+               skb = __vlan_hwaccel_push_inside(skb);
+       return skb;
+}
+
+But ocelot_port_inject_frame() handles things incorrectly:
+
+       ocelot_ifh_port_set(ifh, port, rew_op, skb_vlan_tag_get(skb));
+
+void ocelot_ifh_port_set(struct sk_buff *skb, void *ifh, int port, u32 rew_op)
+{
+       (...)
+       if (vlan_tag)
+               ocelot_ifh_set_vlan_tci(ifh, vlan_tag);
+       (...)
+}
+
+The way __vlan_hwaccel_push_inside() pushes the tag inside the skb head
+is by calling:
+
+static inline void __vlan_hwaccel_clear_tag(struct sk_buff *skb)
+{
+       skb->vlan_present = 0;
+}
+
+which does _not_ zero out skb->vlan_tci as seen by skb_vlan_tag_get().
+This means that ocelot, when it calls skb_vlan_tag_get(), sees
+(and uses) a residual skb->vlan_tci, while the same VLAN tag is
+_already_ in the skb head.
+
+The trivial fix for double VLAN headers is to replace the content of
+ocelot_ifh_port_set() with:
+
+       if (skb_vlan_tag_present(skb))
+               ocelot_ifh_set_vlan_tci(ifh, skb_vlan_tag_get(skb));
+
+but this would not be correct either, because, as mentioned,
+vlan_hw_offload_capable() is false for us, so we'd be inserting dead
+code and we'd always transmit packets with VID=0 in the injection frame
+header.
+
+I can't actually test the ocelot switchdev driver and rely exclusively
+on code inspection, but I don't think traffic from 8021q uppers has ever
+been injected properly, and not double-tagged. Thus I'm blaming the
+introduction of VLAN fields in the injection header - early driver code.
+
+As hinted at in the early conclusion, what we _want_ to happen for
+VLAN transmission was already described once in commit 5ca721c54d86
+("net: dsa: tag_ocelot: set the classified VLAN during xmit").
+
+ocelot_xmit_get_vlan_info() intends to ensure that if the port through
+which we're transmitting is under a VLAN-aware bridge, the outer VLAN
+tag from the skb head is stripped from there and inserted into the
+injection frame header (so that the packet is processed in hardware
+through that actual VLAN). And in all other cases, the packet is sent
+with VID=0 in the injection frame header, since the port is VLAN-unaware
+and has logic to strip this VID on egress (making it invisible to the
+wire).
+
+Fixes: 08d02364b12f ("net: mscc: fix the injection header")
+Signed-off-by: Vladimir Oltean <vladimir.oltean@nxp.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/net/ethernet/mscc/ocelot.c      | 29 +++++++++++----
+ drivers/net/ethernet/mscc/ocelot_fdma.c |  2 +-
+ include/linux/dsa/ocelot.h              | 47 +++++++++++++++++++++++++
+ include/soc/mscc/ocelot.h               |  3 +-
+ net/dsa/tag_ocelot.c                    | 37 ++-----------------
+ 5 files changed, 75 insertions(+), 43 deletions(-)
+
+diff --git a/drivers/net/ethernet/mscc/ocelot.c b/drivers/net/ethernet/mscc/ocelot.c
+index 01b6e13f4692f..b594f3054afb6 100644
+--- a/drivers/net/ethernet/mscc/ocelot.c
++++ b/drivers/net/ethernet/mscc/ocelot.c
+@@ -1088,17 +1088,34 @@ bool ocelot_can_inject(struct ocelot *ocelot, int grp)
+ }
+ EXPORT_SYMBOL(ocelot_can_inject);
+-void ocelot_ifh_port_set(void *ifh, int port, u32 rew_op, u32 vlan_tag)
++/**
++ * ocelot_ifh_set_basic - Set basic information in Injection Frame Header
++ * @ifh: Pointer to Injection Frame Header memory
++ * @ocelot: Switch private data structure
++ * @port: Egress port number
++ * @rew_op: Egress rewriter operation for PTP
++ * @skb: Pointer to socket buffer (packet)
++ *
++ * Populate the Injection Frame Header with basic information for this skb: the
++ * analyzer bypass bit, destination port, VLAN info, egress rewriter info.
++ */
++void ocelot_ifh_set_basic(void *ifh, struct ocelot *ocelot, int port,
++                        u32 rew_op, struct sk_buff *skb)
+ {
++      struct ocelot_port *ocelot_port = ocelot->ports[port];
++      u64 vlan_tci, tag_type;
++
++      ocelot_xmit_get_vlan_info(skb, ocelot_port->bridge, &vlan_tci,
++                                &tag_type);
++
+       ocelot_ifh_set_bypass(ifh, 1);
+       ocelot_ifh_set_dest(ifh, BIT_ULL(port));
+-      ocelot_ifh_set_tag_type(ifh, IFH_TAG_TYPE_C);
+-      if (vlan_tag)
+-              ocelot_ifh_set_vlan_tci(ifh, vlan_tag);
++      ocelot_ifh_set_tag_type(ifh, tag_type);
++      ocelot_ifh_set_vlan_tci(ifh, vlan_tci);
+       if (rew_op)
+               ocelot_ifh_set_rew_op(ifh, rew_op);
+ }
+-EXPORT_SYMBOL(ocelot_ifh_port_set);
++EXPORT_SYMBOL(ocelot_ifh_set_basic);
+ void ocelot_port_inject_frame(struct ocelot *ocelot, int port, int grp,
+                             u32 rew_op, struct sk_buff *skb)
+@@ -1109,7 +1126,7 @@ void ocelot_port_inject_frame(struct ocelot *ocelot, int port, int grp,
+       ocelot_write_rix(ocelot, QS_INJ_CTRL_GAP_SIZE(1) |
+                        QS_INJ_CTRL_SOF, QS_INJ_CTRL, grp);
+-      ocelot_ifh_port_set(ifh, port, rew_op, skb_vlan_tag_get(skb));
++      ocelot_ifh_set_basic(ifh, ocelot, port, rew_op, skb);
+       for (i = 0; i < OCELOT_TAG_LEN / 4; i++)
+               ocelot_write_rix(ocelot, ifh[i], QS_INJ_WR, grp);
+diff --git a/drivers/net/ethernet/mscc/ocelot_fdma.c b/drivers/net/ethernet/mscc/ocelot_fdma.c
+index 8e3894cf5f7cd..e9d2e96adb229 100644
+--- a/drivers/net/ethernet/mscc/ocelot_fdma.c
++++ b/drivers/net/ethernet/mscc/ocelot_fdma.c
+@@ -666,7 +666,7 @@ static int ocelot_fdma_prepare_skb(struct ocelot *ocelot, int port, u32 rew_op,
+       ifh = skb_push(skb, OCELOT_TAG_LEN);
+       skb_put(skb, ETH_FCS_LEN);
+       memset(ifh, 0, OCELOT_TAG_LEN);
+-      ocelot_ifh_port_set(ifh, port, rew_op, skb_vlan_tag_get(skb));
++      ocelot_ifh_set_basic(ifh, ocelot, port, rew_op, skb);
+       return 0;
+ }
+diff --git a/include/linux/dsa/ocelot.h b/include/linux/dsa/ocelot.h
+index dca2969015d80..6fbfbde68a37c 100644
+--- a/include/linux/dsa/ocelot.h
++++ b/include/linux/dsa/ocelot.h
+@@ -5,6 +5,8 @@
+ #ifndef _NET_DSA_TAG_OCELOT_H
+ #define _NET_DSA_TAG_OCELOT_H
++#include <linux/if_bridge.h>
++#include <linux/if_vlan.h>
+ #include <linux/kthread.h>
+ #include <linux/packing.h>
+ #include <linux/skbuff.h>
+@@ -273,4 +275,49 @@ static inline u32 ocelot_ptp_rew_op(struct sk_buff *skb)
+       return rew_op;
+ }
++/**
++ * ocelot_xmit_get_vlan_info: Determine VLAN_TCI and TAG_TYPE for injected frame
++ * @skb: Pointer to socket buffer
++ * @br: Pointer to bridge device that the port is under, if any
++ * @vlan_tci: Output argument: VLAN TCI to set in the injection header
++ * @tag_type: Output argument: IFH tag type (IFH_TAG_TYPE_C or IFH_TAG_TYPE_S)
++ *
++ * If the port is under a VLAN-aware bridge, remove the VLAN header from the
++ * payload and move it into the DSA tag, which will make the switch classify
++ * the packet to the bridge VLAN. Otherwise, leave the classified VLAN at zero,
++ * which is the pvid of standalone ports (OCELOT_STANDALONE_PVID), although not
++ * of VLAN-unaware bridge ports (that would be ocelot_vlan_unaware_pvid()).
++ * Anyway, VID 0 is fine because it is stripped on egress for these port modes,
++ * and source address learning is not performed for packets injected from the
++ * CPU anyway, so it doesn't matter that the VID is "wrong".
++ */
++static inline void ocelot_xmit_get_vlan_info(struct sk_buff *skb,
++                                           struct net_device *br,
++                                           u64 *vlan_tci, u64 *tag_type)
++{
++      struct vlan_ethhdr *hdr;
++      u16 proto, tci;
++
++      if (!br || !br_vlan_enabled(br)) {
++              *vlan_tci = 0;
++              *tag_type = IFH_TAG_TYPE_C;
++              return;
++      }
++
++      hdr = (struct vlan_ethhdr *)skb_mac_header(skb);
++      br_vlan_get_proto(br, &proto);
++
++      if (ntohs(hdr->h_vlan_proto) == proto) {
++              vlan_remove_tag(skb, &tci);
++              *vlan_tci = tci;
++      } else {
++              rcu_read_lock();
++              br_vlan_get_pvid_rcu(br, &tci);
++              rcu_read_unlock();
++              *vlan_tci = tci;
++      }
++
++      *tag_type = (proto != ETH_P_8021Q) ? IFH_TAG_TYPE_S : IFH_TAG_TYPE_C;
++}
++
+ #endif
+diff --git a/include/soc/mscc/ocelot.h b/include/soc/mscc/ocelot.h
+index 195ca8f0b6f9d..9b904ea2f0db9 100644
+--- a/include/soc/mscc/ocelot.h
++++ b/include/soc/mscc/ocelot.h
+@@ -1128,7 +1128,8 @@ void __ocelot_target_write_ix(struct ocelot *ocelot, enum ocelot_target target,
+ bool ocelot_can_inject(struct ocelot *ocelot, int grp);
+ void ocelot_port_inject_frame(struct ocelot *ocelot, int port, int grp,
+                             u32 rew_op, struct sk_buff *skb);
+-void ocelot_ifh_port_set(void *ifh, int port, u32 rew_op, u32 vlan_tag);
++void ocelot_ifh_set_basic(void *ifh, struct ocelot *ocelot, int port,
++                        u32 rew_op, struct sk_buff *skb);
+ int ocelot_xtr_poll_frame(struct ocelot *ocelot, int grp, struct sk_buff **skb);
+ void ocelot_drain_cpu_queue(struct ocelot *ocelot, int grp);
+ void ocelot_ptp_rx_timestamp(struct ocelot *ocelot, struct sk_buff *skb,
+diff --git a/net/dsa/tag_ocelot.c b/net/dsa/tag_ocelot.c
+index 18dda9423fae5..ce9d2b20d67a9 100644
+--- a/net/dsa/tag_ocelot.c
++++ b/net/dsa/tag_ocelot.c
+@@ -4,40 +4,6 @@
+ #include <linux/dsa/ocelot.h>
+ #include "dsa_priv.h"
+-/* If the port is under a VLAN-aware bridge, remove the VLAN header from the
+- * payload and move it into the DSA tag, which will make the switch classify
+- * the packet to the bridge VLAN. Otherwise, leave the classified VLAN at zero,
+- * which is the pvid of standalone and VLAN-unaware bridge ports.
+- */
+-static void ocelot_xmit_get_vlan_info(struct sk_buff *skb, struct dsa_port *dp,
+-                                    u64 *vlan_tci, u64 *tag_type)
+-{
+-      struct net_device *br = dsa_port_bridge_dev_get(dp);
+-      struct vlan_ethhdr *hdr;
+-      u16 proto, tci;
+-
+-      if (!br || !br_vlan_enabled(br)) {
+-              *vlan_tci = 0;
+-              *tag_type = IFH_TAG_TYPE_C;
+-              return;
+-      }
+-
+-      hdr = skb_vlan_eth_hdr(skb);
+-      br_vlan_get_proto(br, &proto);
+-
+-      if (ntohs(hdr->h_vlan_proto) == proto) {
+-              vlan_remove_tag(skb, &tci);
+-              *vlan_tci = tci;
+-      } else {
+-              rcu_read_lock();
+-              br_vlan_get_pvid_rcu(br, &tci);
+-              rcu_read_unlock();
+-              *vlan_tci = tci;
+-      }
+-
+-      *tag_type = (proto != ETH_P_8021Q) ? IFH_TAG_TYPE_S : IFH_TAG_TYPE_C;
+-}
+-
+ static void ocelot_xmit_common(struct sk_buff *skb, struct net_device *netdev,
+                              __be32 ifh_prefix, void **ifh)
+ {
+@@ -49,7 +15,8 @@ static void ocelot_xmit_common(struct sk_buff *skb, struct net_device *netdev,
+       u32 rew_op = 0;
+       u64 qos_class;
+-      ocelot_xmit_get_vlan_info(skb, dp, &vlan_tci, &tag_type);
++      ocelot_xmit_get_vlan_info(skb, dsa_port_bridge_dev_get(dp), &vlan_tci,
++                                &tag_type);
+       qos_class = netdev_get_num_tc(netdev) ?
+                   netdev_get_prio_tc_map(netdev, skb->priority) : skb->priority;
+-- 
+2.43.0
+
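To make the skb_vlan_tag_get() pitfall described above concrete, here is a hedged illustration; example_tag_is_stale() is hypothetical and assumes the pre-6.1 skb layout where vlan_present is a flag separate from vlan_tci:

#include <linux/if_vlan.h>
#include <linux/skbuff.h>

/* Illustration only: __vlan_hwaccel_push_inside() moves the hwaccel tag
 * into the skb head and clears vlan_present, but does not zero
 * skb->vlan_tci, so reading the TCI without first checking
 * skb_vlan_tag_present() returns a stale value.
 */
static bool example_tag_is_stale(struct sk_buff *skb)
{
	u16 tci_before = skb_vlan_tag_get(skb);

	skb = __vlan_hwaccel_push_inside(skb);
	if (!skb)
		return false;

	return !skb_vlan_tag_present(skb) &&
	       skb_vlan_tag_get(skb) == tci_before;
}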
diff --git a/queue-6.1/net-xilinx-axienet-always-disable-promiscuous-mode.patch b/queue-6.1/net-xilinx-axienet-always-disable-promiscuous-mode.patch
new file mode 100644 (file)
index 0000000..b0eb22f
--- /dev/null
@@ -0,0 +1,42 @@
+From a60b1c01007426a8687949dcab7436f73726bcce Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Thu, 22 Aug 2024 11:40:55 -0400
+Subject: net: xilinx: axienet: Always disable promiscuous mode
+
+From: Sean Anderson <sean.anderson@linux.dev>
+
+[ Upstream commit 4ae738dfef2c0323752ab81786e2d298c9939321 ]
+
+If promiscuous mode is disabled when there are fewer than four multicast
+addresses, then it will not be reflected in the hardware. Fix this by
+always clearing the promiscuous mode flag even when we program multicast
+addresses.
+
+Fixes: 8a3b7a252dca ("drivers/net/ethernet/xilinx: added Xilinx AXI Ethernet driver")
+Signed-off-by: Sean Anderson <sean.anderson@linux.dev>
+Reviewed-by: Simon Horman <horms@kernel.org>
+Link: https://patch.msgid.link/20240822154059.1066595-2-sean.anderson@linux.dev
+Signed-off-by: Jakub Kicinski <kuba@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/net/ethernet/xilinx/xilinx_axienet_main.c | 4 ++++
+ 1 file changed, 4 insertions(+)
+
+diff --git a/drivers/net/ethernet/xilinx/xilinx_axienet_main.c b/drivers/net/ethernet/xilinx/xilinx_axienet_main.c
+index ff777735be66b..ff4b31e93d75f 100644
+--- a/drivers/net/ethernet/xilinx/xilinx_axienet_main.c
++++ b/drivers/net/ethernet/xilinx/xilinx_axienet_main.c
+@@ -429,6 +429,10 @@ static void axienet_set_multicast_list(struct net_device *ndev)
+       } else if (!netdev_mc_empty(ndev)) {
+               struct netdev_hw_addr *ha;
++              reg = axienet_ior(lp, XAE_FMI_OFFSET);
++              reg &= ~XAE_FMI_PM_MASK;
++              axienet_iow(lp, XAE_FMI_OFFSET, reg);
++
+               i = 0;
+               netdev_for_each_mc_addr(ha, ndev) {
+                       if (i >= XAE_MULTICAST_CAM_TABLE_NUM)
+-- 
+2.43.0
+
diff --git a/queue-6.1/net-xilinx-axienet-fix-dangling-multicast-addresses.patch b/queue-6.1/net-xilinx-axienet-fix-dangling-multicast-addresses.patch
new file mode 100644 (file)
index 0000000..18974f3
--- /dev/null
@@ -0,0 +1,94 @@
+From 60a9f9d705c4f96475bd3824cecf7b34bae43805 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Thu, 22 Aug 2024 11:40:56 -0400
+Subject: net: xilinx: axienet: Fix dangling multicast addresses
+
+From: Sean Anderson <sean.anderson@linux.dev>
+
+[ Upstream commit 797a68c9de0f5a5447baf4bd3bb9c10a3993435b ]
+
+If a multicast address is removed but there are still some multicast
+addresses, that address would remain programmed into the frame filter.
+Fix this by explicitly setting the enable bit for each filter.
+
+Fixes: 8a3b7a252dca ("drivers/net/ethernet/xilinx: added Xilinx AXI Ethernet driver")
+Signed-off-by: Sean Anderson <sean.anderson@linux.dev>
+Reviewed-by: Simon Horman <horms@kernel.org>
+Link: https://patch.msgid.link/20240822154059.1066595-3-sean.anderson@linux.dev
+Signed-off-by: Jakub Kicinski <kuba@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/net/ethernet/xilinx/xilinx_axienet.h  |  1 +
+ .../net/ethernet/xilinx/xilinx_axienet_main.c | 21 ++++++++-----------
+ 2 files changed, 10 insertions(+), 12 deletions(-)
+
+diff --git a/drivers/net/ethernet/xilinx/xilinx_axienet.h b/drivers/net/ethernet/xilinx/xilinx_axienet.h
+index 969bea5541976..503c32413474a 100644
+--- a/drivers/net/ethernet/xilinx/xilinx_axienet.h
++++ b/drivers/net/ethernet/xilinx/xilinx_axienet.h
+@@ -169,6 +169,7 @@
+ #define XAE_UAW0_OFFSET               0x00000700 /* Unicast address word 0 */
+ #define XAE_UAW1_OFFSET               0x00000704 /* Unicast address word 1 */
+ #define XAE_FMI_OFFSET                0x00000708 /* Frame Filter Control */
++#define XAE_FFE_OFFSET                0x0000070C /* Frame Filter Enable */
+ #define XAE_AF0_OFFSET                0x00000710 /* Address Filter 0 */
+ #define XAE_AF1_OFFSET                0x00000714 /* Address Filter 1 */
+diff --git a/drivers/net/ethernet/xilinx/xilinx_axienet_main.c b/drivers/net/ethernet/xilinx/xilinx_axienet_main.c
+index ff4b31e93d75f..59d1cfbf7d6b7 100644
+--- a/drivers/net/ethernet/xilinx/xilinx_axienet_main.c
++++ b/drivers/net/ethernet/xilinx/xilinx_axienet_main.c
+@@ -411,7 +411,7 @@ static int netdev_set_mac_address(struct net_device *ndev, void *p)
+  */
+ static void axienet_set_multicast_list(struct net_device *ndev)
+ {
+-      int i;
++      int i = 0;
+       u32 reg, af0reg, af1reg;
+       struct axienet_local *lp = netdev_priv(ndev);
+@@ -433,7 +433,6 @@ static void axienet_set_multicast_list(struct net_device *ndev)
+               reg &= ~XAE_FMI_PM_MASK;
+               axienet_iow(lp, XAE_FMI_OFFSET, reg);
+-              i = 0;
+               netdev_for_each_mc_addr(ha, ndev) {
+                       if (i >= XAE_MULTICAST_CAM_TABLE_NUM)
+                               break;
+@@ -452,6 +451,7 @@ static void axienet_set_multicast_list(struct net_device *ndev)
+                       axienet_iow(lp, XAE_FMI_OFFSET, reg);
+                       axienet_iow(lp, XAE_AF0_OFFSET, af0reg);
+                       axienet_iow(lp, XAE_AF1_OFFSET, af1reg);
++                      axienet_iow(lp, XAE_FFE_OFFSET, 1);
+                       i++;
+               }
+       } else {
+@@ -459,18 +459,15 @@ static void axienet_set_multicast_list(struct net_device *ndev)
+               reg &= ~XAE_FMI_PM_MASK;
+               axienet_iow(lp, XAE_FMI_OFFSET, reg);
+-
+-              for (i = 0; i < XAE_MULTICAST_CAM_TABLE_NUM; i++) {
+-                      reg = axienet_ior(lp, XAE_FMI_OFFSET) & 0xFFFFFF00;
+-                      reg |= i;
+-
+-                      axienet_iow(lp, XAE_FMI_OFFSET, reg);
+-                      axienet_iow(lp, XAE_AF0_OFFSET, 0);
+-                      axienet_iow(lp, XAE_AF1_OFFSET, 0);
+-              }
+-
+               dev_info(&ndev->dev, "Promiscuous mode disabled.\n");
+       }
++
++      for (; i < XAE_MULTICAST_CAM_TABLE_NUM; i++) {
++              reg = axienet_ior(lp, XAE_FMI_OFFSET) & 0xFFFFFF00;
++              reg |= i;
++              axienet_iow(lp, XAE_FMI_OFFSET, reg);
++              axienet_iow(lp, XAE_FFE_OFFSET, 0);
++      }
+ }
+ /**
+-- 
+2.43.0
+
diff --git a/queue-6.1/netem-fix-return-value-if-duplicate-enqueue-fails.patch b/queue-6.1/netem-fix-return-value-if-duplicate-enqueue-fails.patch
new file mode 100644 (file)
index 0000000..cbe7bac
--- /dev/null
@@ -0,0 +1,138 @@
+From 6a32a23651048ad6ba3f2ccc93f7f2f0d9f4d961 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Mon, 19 Aug 2024 10:56:45 -0700
+Subject: netem: fix return value if duplicate enqueue fails
+
+From: Stephen Hemminger <stephen@networkplumber.org>
+
+[ Upstream commit c07ff8592d57ed258afee5a5e04991a48dbaf382 ]
+
+There is a bug in netem_enqueue() introduced by
+commit 5845f706388a ("net: netem: fix skb length BUG_ON in __skb_to_sgvec")
+that can lead to a use-after-free.
+
+This commit made netem_enqueue() always return NET_XMIT_SUCCESS
+when a packet is duplicated, which can cause the parent qdisc's q.qlen
+to be mistakenly incremented. When this happens, qlen_notify() may be
+skipped on the parent during destruction, leaving a dangling pointer
+for some classful qdiscs like DRR.
+
+There are two ways for the bug to happen:
+
+- If the duplicated packet is dropped by rootq->enqueue() and then
+  the original packet is also dropped.
+- If rootq->enqueue() sends the duplicated packet to a different qdisc
+  and the original packet is dropped.
+
+In both cases NET_XMIT_SUCCESS is returned even though no packets
+are enqueued at the netem qdisc.
+
+The fix is to defer the enqueue of the duplicate packet until after
+the original packet has been guaranteed to return NET_XMIT_SUCCESS.
+
+Fixes: 5845f706388a ("net: netem: fix skb length BUG_ON in __skb_to_sgvec")
+Reported-by: Budimir Markovic <markovicbudimir@gmail.com>
+Signed-off-by: Stephen Hemminger <stephen@networkplumber.org>
+Reviewed-by: Simon Horman <horms@kernel.org>
+Link: https://patch.msgid.link/20240819175753.5151-1-stephen@networkplumber.org
+Signed-off-by: Jakub Kicinski <kuba@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ net/sched/sch_netem.c | 47 ++++++++++++++++++++++++++-----------------
+ 1 file changed, 29 insertions(+), 18 deletions(-)
+
+diff --git a/net/sched/sch_netem.c b/net/sched/sch_netem.c
+index d0e045116d4e9..a18b24c125f4e 100644
+--- a/net/sched/sch_netem.c
++++ b/net/sched/sch_netem.c
+@@ -437,12 +437,10 @@ static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch,
+       struct netem_sched_data *q = qdisc_priv(sch);
+       /* We don't fill cb now as skb_unshare() may invalidate it */
+       struct netem_skb_cb *cb;
+-      struct sk_buff *skb2;
++      struct sk_buff *skb2 = NULL;
+       struct sk_buff *segs = NULL;
+       unsigned int prev_len = qdisc_pkt_len(skb);
+       int count = 1;
+-      int rc = NET_XMIT_SUCCESS;
+-      int rc_drop = NET_XMIT_DROP;
+       /* Do not fool qdisc_drop_all() */
+       skb->prev = NULL;
+@@ -471,19 +469,11 @@ static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch,
+               skb_orphan_partial(skb);
+       /*
+-       * If we need to duplicate packet, then re-insert at top of the
+-       * qdisc tree, since parent queuer expects that only one
+-       * skb will be queued.
++       * If we need to duplicate packet, then clone it before
++       * original is modified.
+        */
+-      if (count > 1 && (skb2 = skb_clone(skb, GFP_ATOMIC)) != NULL) {
+-              struct Qdisc *rootq = qdisc_root_bh(sch);
+-              u32 dupsave = q->duplicate; /* prevent duplicating a dup... */
+-
+-              q->duplicate = 0;
+-              rootq->enqueue(skb2, rootq, to_free);
+-              q->duplicate = dupsave;
+-              rc_drop = NET_XMIT_SUCCESS;
+-      }
++      if (count > 1)
++              skb2 = skb_clone(skb, GFP_ATOMIC);
+       /*
+        * Randomized packet corruption.
+@@ -495,7 +485,8 @@ static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch,
+               if (skb_is_gso(skb)) {
+                       skb = netem_segment(skb, sch, to_free);
+                       if (!skb)
+-                              return rc_drop;
++                              goto finish_segs;
++
+                       segs = skb->next;
+                       skb_mark_not_on_list(skb);
+                       qdisc_skb_cb(skb)->pkt_len = skb->len;
+@@ -521,7 +512,24 @@ static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch,
+               /* re-link segs, so that qdisc_drop_all() frees them all */
+               skb->next = segs;
+               qdisc_drop_all(skb, sch, to_free);
+-              return rc_drop;
++              if (skb2)
++                      __qdisc_drop(skb2, to_free);
++              return NET_XMIT_DROP;
++      }
++
++      /*
++       * If doing duplication then re-insert at top of the
++       * qdisc tree, since parent queuer expects that only one
++       * skb will be queued.
++       */
++      if (skb2) {
++              struct Qdisc *rootq = qdisc_root_bh(sch);
++              u32 dupsave = q->duplicate; /* prevent duplicating a dup... */
++
++              q->duplicate = 0;
++              rootq->enqueue(skb2, rootq, to_free);
++              q->duplicate = dupsave;
++              skb2 = NULL;
+       }
+       qdisc_qstats_backlog_inc(sch, skb);
+@@ -592,9 +600,12 @@ static int netem_enqueue(struct sk_buff *skb, struct Qdisc *sch,
+       }
+ finish_segs:
++      if (skb2)
++              __qdisc_drop(skb2, to_free);
++
+       if (segs) {
+               unsigned int len, last_len;
+-              int nb;
++              int rc, nb;
+               len = skb ? skb->len : 0;
+               nb = skb ? 1 : 0;
+-- 
+2.43.0
+
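Condensed into C, the reordered control flow described above is roughly as follows; netem_enqueue_sketch(), duplicate_requested() and enqueue_original() are hypothetical names, and error/GSO paths are elided:

#include <net/sch_generic.h>

/* Hypothetical helpers standing in for netem's real logic: */
static bool duplicate_requested(struct Qdisc *sch);
static int enqueue_original(struct sk_buff *skb, struct Qdisc *sch,
			    struct sk_buff **to_free);

/* Hedged sketch of the fixed ordering: clone first, enqueue the
 * original, and only feed the clone back to the root qdisc once the
 * original is guaranteed to return NET_XMIT_SUCCESS.
 */
static int netem_enqueue_sketch(struct sk_buff *skb, struct Qdisc *sch,
				struct sk_buff **to_free)
{
	struct sk_buff *skb2 = NULL;

	if (duplicate_requested(sch))
		skb2 = skb_clone(skb, GFP_ATOMIC);

	if (enqueue_original(skb, sch, to_free) != NET_XMIT_SUCCESS) {
		if (skb2)
			__qdisc_drop(skb2, to_free);	/* no phantom success */
		return NET_XMIT_DROP;
	}

	if (skb2) {
		struct Qdisc *rootq = qdisc_root_bh(sch);

		rootq->enqueue(skb2, rootq, to_free);	/* dup re-inserted at top */
	}

	return NET_XMIT_SUCCESS;
}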
diff --git a/queue-6.1/netfilter-flowtable-validate-vlan-header.patch b/queue-6.1/netfilter-flowtable-validate-vlan-header.patch
new file mode 100644 (file)
index 0000000..3534437
--- /dev/null
@@ -0,0 +1,60 @@
+From e5c35d2af3e784d92b4c61738611b46798907c3c Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Tue, 13 Aug 2024 12:39:46 +0200
+Subject: netfilter: flowtable: validate vlan header
+
+From: Pablo Neira Ayuso <pablo@netfilter.org>
+
+[ Upstream commit 6ea14ccb60c8ab829349979b22b58a941ec4a3ee ]
+
+Ensure there is sufficient room to access the protocol field of the
+VLAN header, validate it once before the flowtable lookup.
+
+=====================================================
+BUG: KMSAN: uninit-value in nf_flow_offload_inet_hook+0x45a/0x5f0 net/netfilter/nf_flow_table_inet.c:32
+ nf_flow_offload_inet_hook+0x45a/0x5f0 net/netfilter/nf_flow_table_inet.c:32
+ nf_hook_entry_hookfn include/linux/netfilter.h:154 [inline]
+ nf_hook_slow+0xf4/0x400 net/netfilter/core.c:626
+ nf_hook_ingress include/linux/netfilter_netdev.h:34 [inline]
+ nf_ingress net/core/dev.c:5440 [inline]
+
+Fixes: 4cd91f7c290f ("netfilter: flowtable: add vlan support")
+Reported-by: syzbot+8407d9bb88cd4c6bf61a@syzkaller.appspotmail.com
+Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ net/netfilter/nf_flow_table_inet.c | 3 +++
+ net/netfilter/nf_flow_table_ip.c   | 3 +++
+ 2 files changed, 6 insertions(+)
+
+diff --git a/net/netfilter/nf_flow_table_inet.c b/net/netfilter/nf_flow_table_inet.c
+index 6eef15648b7b0..b0f1991719324 100644
+--- a/net/netfilter/nf_flow_table_inet.c
++++ b/net/netfilter/nf_flow_table_inet.c
+@@ -17,6 +17,9 @@ nf_flow_offload_inet_hook(void *priv, struct sk_buff *skb,
+       switch (skb->protocol) {
+       case htons(ETH_P_8021Q):
++              if (!pskb_may_pull(skb, skb_mac_offset(skb) + sizeof(*veth)))
++                      return NF_ACCEPT;
++
+               veth = (struct vlan_ethhdr *)skb_mac_header(skb);
+               proto = veth->h_vlan_encapsulated_proto;
+               break;
+diff --git a/net/netfilter/nf_flow_table_ip.c b/net/netfilter/nf_flow_table_ip.c
+index 22bc0e3d8a0b5..34be2c9bc39d8 100644
+--- a/net/netfilter/nf_flow_table_ip.c
++++ b/net/netfilter/nf_flow_table_ip.c
+@@ -275,6 +275,9 @@ static bool nf_flow_skb_encap_protocol(struct sk_buff *skb, __be16 proto,
+       switch (skb->protocol) {
+       case htons(ETH_P_8021Q):
++              if (!pskb_may_pull(skb, skb_mac_offset(skb) + sizeof(*veth)))
++                      return false;
++
+               veth = (struct vlan_ethhdr *)skb_mac_header(skb);
+               if (veth->h_vlan_encapsulated_proto == proto) {
+                       *offset += VLAN_HLEN;
+-- 
+2.43.0
+
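The defensive pattern this fix applies generalizes as in the sketch below; vlan_encap_proto_sketch() is a hypothetical helper. The rule: never dereference a header through skb_mac_header() without first ensuring the bytes are in the linear area.

#include <linux/if_vlan.h>
#include <linux/skbuff.h>

/* Hedged sketch: pull the full VLAN Ethernet header into the linear
 * area before reading the encapsulated protocol field; return 0 when
 * the packet is too short to carry one.
 */
static __be16 vlan_encap_proto_sketch(struct sk_buff *skb)
{
	const struct vlan_ethhdr *veth;

	if (!pskb_may_pull(skb, skb_mac_offset(skb) + sizeof(*veth)))
		return 0;

	veth = (struct vlan_ethhdr *)skb_mac_header(skb);
	return veth->h_vlan_encapsulated_proto;
}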
diff --git a/queue-6.1/netfilter-nft_counter-disable-bh-in-nft_counter_offl.patch b/queue-6.1/netfilter-nft_counter-disable-bh-in-nft_counter_offl.patch
new file mode 100644 (file)
index 0000000..d801329
--- /dev/null
@@ -0,0 +1,55 @@
+From be9a6657ec05c4edaaf03660d275e9c6c1858df5 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Tue, 20 Aug 2024 09:54:30 +0200
+Subject: netfilter: nft_counter: Disable BH in nft_counter_offload_stats().
+
+From: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
+
+[ Upstream commit 1eacdd71b3436b54d5fc8218c4bb0187d92a6892 ]
+
+The sequence counter nft_counter_seq is a per-CPU counter. There is no
+lock associated with it. nft_counter_do_eval() is using the same counter
+and disables BH, which suggests that it can be invoked from a softirq.
+This in turn means that nft_counter_offload_stats(), which disables only
+preemption, can be interrupted by nft_counter_do_eval(), leading to two
+writers for one seqcount_t.
+This can lead to losing stats or to reading statistics while they are
+being updated.
+
+Disable BH during stats update in nft_counter_offload_stats() to ensure
+one writer at a time.
+
+Fixes: b72920f6e4a9d ("netfilter: nftables: counter hardware offload support")
+Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
+Reviewed-by: Florian Westphal <fw@strlen.de>
+Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ net/netfilter/nft_counter.c | 4 ++--
+ 1 file changed, 2 insertions(+), 2 deletions(-)
+
+diff --git a/net/netfilter/nft_counter.c b/net/netfilter/nft_counter.c
+index b5fe7fe4b60db..73e4d278d6c13 100644
+--- a/net/netfilter/nft_counter.c
++++ b/net/netfilter/nft_counter.c
+@@ -264,7 +264,7 @@ static void nft_counter_offload_stats(struct nft_expr *expr,
+       struct nft_counter *this_cpu;
+       seqcount_t *myseq;
+-      preempt_disable();
++      local_bh_disable();
+       this_cpu = this_cpu_ptr(priv->counter);
+       myseq = this_cpu_ptr(&nft_counter_seq);
+@@ -272,7 +272,7 @@ static void nft_counter_offload_stats(struct nft_expr *expr,
+       this_cpu->packets += stats->pkts;
+       this_cpu->bytes += stats->bytes;
+       write_seqcount_end(myseq);
+-      preempt_enable();
++      local_bh_enable();
+ }
+ void nft_counter_init_seqcount(void)
+-- 
+2.43.0
+
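The writer-side pattern at stake is sketched below; the declarations mirror the file-local ones in net/netfilter/nft_counter.c (per-CPU seqcount initialization elided), and the _sketch names are hypothetical. Since softirq writers exist (nft_counter_do_eval()), process-context writers must keep them out with local_bh_disable(); preempt_disable() is not enough.

#include <linux/percpu.h>
#include <linux/seqlock.h>

struct counter_sketch {
	u64 bytes;
	u64 packets;
};

static DEFINE_PER_CPU(seqcount_t, counter_seq_sketch);

/* Hedged sketch: one writer at a time per CPU, softirqs excluded for
 * the duration of the write sequence.
 */
static void counter_add_sketch(struct counter_sketch __percpu *counter,
			       u64 pkts, u64 bytes)
{
	struct counter_sketch *this_cpu;
	seqcount_t *myseq;

	local_bh_disable();
	this_cpu = this_cpu_ptr(counter);
	myseq = this_cpu_ptr(&counter_seq_sketch);

	write_seqcount_begin(myseq);
	this_cpu->packets += pkts;
	this_cpu->bytes += bytes;
	write_seqcount_end(myseq);
	local_bh_enable();
}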
diff --git a/queue-6.1/netfilter-nft_counter-synchronize-nft_counter_reset-.patch b/queue-6.1/netfilter-nft_counter-synchronize-nft_counter_reset-.patch
new file mode 100644 (file)
index 0000000..d6056de
--- /dev/null
@@ -0,0 +1,50 @@
+From 816edb8ef7c8160fc365d92e06bb60c6a74d3162 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Tue, 20 Aug 2024 09:54:31 +0200
+Subject: netfilter: nft_counter: Synchronize nft_counter_reset() against
+ reader.
+
+From: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
+
+[ Upstream commit a0b39e2dc7017ac667b70bdeee5293e410fab2fb ]
+
+nft_counter_reset() resets the counter by subtracting the previously
+retrieved value from the counter. This is a write operation on the
+counter and as such it must be performed within a write sequence of
+nft_counter_seq to serialize against its possible reader.
+
+Update the packets/bytes counters within a write sequence of nft_counter_seq.
+
+Fixes: d84701ecbcd6a ("netfilter: nft_counter: rework atomic dump and reset")
+Signed-off-by: Sebastian Andrzej Siewior <bigeasy@linutronix.de>
+Reviewed-by: Florian Westphal <fw@strlen.de>
+Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ net/netfilter/nft_counter.c | 5 +++++
+ 1 file changed, 5 insertions(+)
+
+diff --git a/net/netfilter/nft_counter.c b/net/netfilter/nft_counter.c
+index 73e4d278d6c13..781d3a26f5df7 100644
+--- a/net/netfilter/nft_counter.c
++++ b/net/netfilter/nft_counter.c
+@@ -107,11 +107,16 @@ static void nft_counter_reset(struct nft_counter_percpu_priv *priv,
+                             struct nft_counter *total)
+ {
+       struct nft_counter *this_cpu;
++      seqcount_t *myseq;
+       local_bh_disable();
+       this_cpu = this_cpu_ptr(priv->counter);
++      myseq = this_cpu_ptr(&nft_counter_seq);
++
++      write_seqcount_begin(myseq);
+       this_cpu->packets -= total->packets;
+       this_cpu->bytes -= total->bytes;
++      write_seqcount_end(myseq);
+       local_bh_enable();
+ }
+-- 
+2.43.0
+
diff --git a/queue-6.1/octeontx2-af-fix-cpt-af-register-offset-calculation.patch b/queue-6.1/octeontx2-af-fix-cpt-af-register-offset-calculation.patch
new file mode 100644 (file)
index 0000000..c4cfbc3
--- /dev/null
@@ -0,0 +1,88 @@
+From a7f7c192846359f2b51d78a4606537ef1909dbcc Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Wed, 21 Aug 2024 12:35:58 +0530
+Subject: octeontx2-af: Fix CPT AF register offset calculation
+
+From: Bharat Bhushan <bbhushan2@marvell.com>
+
+[ Upstream commit af688a99eb1fc7ef69774665d61e6be51cea627a ]
+
+Some CPT AF registers are per LF and others are global. Translation
+of a PF/VF-local LF slot number to the actual LF slot number is required
+only for accessing per-LF registers. Access to CPT AF global registers
+does not require any LF slot number. Also, there is no reason for a CPT
+PF/VF to know the actual LF's register offset.
+
+Without this fix, microcode loading will fail, VFs cannot be created,
+and the hardware is not usable.
+
+Fixes: bc35e28af789 ("octeontx2-af: replace cpt slot with lf id on reg write")
+Signed-off-by: Bharat Bhushan <bbhushan2@marvell.com>
+Reviewed-by: Simon Horman <horms@kernel.org>
+Link: https://patch.msgid.link/20240821070558.1020101-1-bbhushan2@marvell.com
+Signed-off-by: Paolo Abeni <pabeni@redhat.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ .../ethernet/marvell/octeontx2/af/rvu_cpt.c   | 23 +++++++++----------
+ 1 file changed, 11 insertions(+), 12 deletions(-)
+
+diff --git a/drivers/net/ethernet/marvell/octeontx2/af/rvu_cpt.c b/drivers/net/ethernet/marvell/octeontx2/af/rvu_cpt.c
+index b226a4d376aab..160e044c25c24 100644
+--- a/drivers/net/ethernet/marvell/octeontx2/af/rvu_cpt.c
++++ b/drivers/net/ethernet/marvell/octeontx2/af/rvu_cpt.c
+@@ -632,7 +632,9 @@ int rvu_mbox_handler_cpt_inline_ipsec_cfg(struct rvu *rvu,
+       return ret;
+ }
+-static bool is_valid_offset(struct rvu *rvu, struct cpt_rd_wr_reg_msg *req)
++static bool validate_and_update_reg_offset(struct rvu *rvu,
++                                         struct cpt_rd_wr_reg_msg *req,
++                                         u64 *reg_offset)
+ {
+       u64 offset = req->reg_offset;
+       int blkaddr, num_lfs, lf;
+@@ -663,6 +665,11 @@ static bool is_valid_offset(struct rvu *rvu, struct cpt_rd_wr_reg_msg *req)
+               if (lf < 0)
+                       return false;
++              /* Translate local LF's offset to global CPT LF's offset to
++               * access LFX register.
++               */
++              *reg_offset = (req->reg_offset & 0xFF000) + (lf << 3);
++
+               return true;
+       } else if (!(req->hdr.pcifunc & RVU_PFVF_FUNC_MASK)) {
+               /* Registers that can be accessed from PF */
+@@ -697,7 +704,7 @@ int rvu_mbox_handler_cpt_rd_wr_register(struct rvu *rvu,
+                                       struct cpt_rd_wr_reg_msg *rsp)
+ {
+       u64 offset = req->reg_offset;
+-      int blkaddr, lf;
++      int blkaddr;
+       blkaddr = validate_and_get_cpt_blkaddr(req->blkaddr);
+       if (blkaddr < 0)
+@@ -708,18 +715,10 @@ int rvu_mbox_handler_cpt_rd_wr_register(struct rvu *rvu,
+           !is_cpt_vf(rvu, req->hdr.pcifunc))
+               return CPT_AF_ERR_ACCESS_DENIED;
+-      if (!is_valid_offset(rvu, req))
++      if (!validate_and_update_reg_offset(rvu, req, &offset))
+               return CPT_AF_ERR_ACCESS_DENIED;
+-      /* Translate local LF used by VFs to global CPT LF */
+-      lf = rvu_get_lf(rvu, &rvu->hw->block[blkaddr], req->hdr.pcifunc,
+-                      (offset & 0xFFF) >> 3);
+-
+-      /* Translate local LF's offset to global CPT LF's offset */
+-      offset &= 0xFF000;
+-      offset += lf << 3;
+-
+-      rsp->reg_offset = offset;
++      rsp->reg_offset = req->reg_offset;
+       rsp->ret_val = req->ret_val;
+       rsp->is_write = req->is_write;
+-- 
+2.43.0
+
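A small worked example of the offset translation added above, with hypothetical values; cpt_lf_reg_offset_example() is not a kernel symbol:

#include <linux/types.h>

/* Hedged arithmetic sketch: keep the register-block bits of the
 * PF/VF-supplied offset (mask 0xFF000) and substitute the global LF
 * index into the 8-byte slot stride.
 */
static u64 cpt_lf_reg_offset_example(void)
{
	u64 req_reg_offset = 0x45010;	/* per-LF register, local slot 2 */
	int lf = 7;			/* global LF backing local slot 2 */

	return (req_reg_offset & 0xFF000) + ((u64)lf << 3);	/* 0x45038 */
}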
diff --git a/queue-6.1/selftests-net-synchronize-udpgro-tests-tx-and-rx-con.patch b/queue-6.1/selftests-net-synchronize-udpgro-tests-tx-and-rx-con.patch
new file mode 100644 (file)
index 0000000..d37d191
--- /dev/null
@@ -0,0 +1,152 @@
+From 7418aa00138283151c721425b7886be7acfafa30 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Tue, 14 Nov 2023 10:11:31 -0500
+Subject: selftests/net: synchronize udpgro tests' tx and rx connection
+
+From: Lucas Karpinski <lkarpins@redhat.com>
+
+[ Upstream commit 3bdd9fd29cb0f136b307559a19c107210ad5c314 ]
+
+The sockets used by udpgso_bench_rx aren't always ready when
+udpgso_bench_tx transmits packets. This issue is more prevalent in -rt
+kernels, but can occur in both. Replace the hacky sleep calls with a
+function that checks whether the ports in the namespace are ready for
+use.
+
+Suggested-by: Paolo Abeni <pabeni@redhat.com>
+Signed-off-by: Lucas Karpinski <lkarpins@redhat.com>
+Reviewed-by: Willem de Bruijn <willemb@google.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Stable-dep-of: 7167395a4be7 ("selftests: udpgro: report error when receive failed")
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ tools/testing/selftests/net/net_helper.sh     | 22 +++++++++++++++++++
+ tools/testing/selftests/net/udpgro.sh         | 13 +++++------
+ tools/testing/selftests/net/udpgro_bench.sh   |  5 +++--
+ tools/testing/selftests/net/udpgro_frglist.sh |  5 +++--
+ 4 files changed, 34 insertions(+), 11 deletions(-)
+ create mode 100755 tools/testing/selftests/net/net_helper.sh
+
+diff --git a/tools/testing/selftests/net/net_helper.sh b/tools/testing/selftests/net/net_helper.sh
+new file mode 100755
+index 0000000000000..4fe0befa13fbc
+--- /dev/null
++++ b/tools/testing/selftests/net/net_helper.sh
+@@ -0,0 +1,22 @@
++#!/bin/bash
++# SPDX-License-Identifier: GPL-2.0
++#
++# Helper functions
++
++wait_local_port_listen()
++{
++      local listener_ns="${1}"
++      local port="${2}"
++      local protocol="${3}"
++      local port_hex
++      local i
++
++      port_hex="$(printf "%04X" "${port}")"
++      for i in $(seq 10); do
++              if ip netns exec "${listener_ns}" cat /proc/net/"${protocol}"* | \
++                 grep -q "${port_hex}"; then
++                      break
++              fi
++              sleep 0.1
++      done
++}
+diff --git a/tools/testing/selftests/net/udpgro.sh b/tools/testing/selftests/net/udpgro.sh
+index 0c743752669af..af5dc57c8ce93 100755
+--- a/tools/testing/selftests/net/udpgro.sh
++++ b/tools/testing/selftests/net/udpgro.sh
+@@ -3,6 +3,8 @@
+ #
+ # Run a series of udpgro functional tests.
++source net_helper.sh
++
+ readonly PEER_NS="ns-peer-$(mktemp -u XXXXXX)"
+ BPF_FILE="../bpf/xdp_dummy.bpf.o"
+@@ -51,8 +53,7 @@ run_one() {
+               echo "ok" || \
+               echo "failed" &
+-      # Hack: let bg programs complete the startup
+-      sleep 0.2
++      wait_local_port_listen ${PEER_NS} 8000 udp
+       ./udpgso_bench_tx ${tx_args}
+       ret=$?
+       wait $(jobs -p)
+@@ -97,7 +98,7 @@ run_one_nat() {
+               echo "ok" || \
+               echo "failed"&
+-      sleep 0.1
++      wait_local_port_listen "${PEER_NS}" 8000 udp
+       ./udpgso_bench_tx ${tx_args}
+       ret=$?
+       kill -INT $pid
+@@ -118,11 +119,9 @@ run_one_2sock() {
+               echo "ok" || \
+               echo "failed" &
+-      # Hack: let bg programs complete the startup
+-      sleep 0.2
++      wait_local_port_listen "${PEER_NS}" 12345 udp
+       ./udpgso_bench_tx ${tx_args} -p 12345
+-      sleep 0.1
+-      # first UDP GSO socket should be closed at this point
++      wait_local_port_listen "${PEER_NS}" 8000 udp
+       ./udpgso_bench_tx ${tx_args}
+       ret=$?
+       wait $(jobs -p)
+diff --git a/tools/testing/selftests/net/udpgro_bench.sh b/tools/testing/selftests/net/udpgro_bench.sh
+index 894972877e8b0..cb664679b4342 100755
+--- a/tools/testing/selftests/net/udpgro_bench.sh
++++ b/tools/testing/selftests/net/udpgro_bench.sh
+@@ -3,6 +3,8 @@
+ #
+ # Run a series of udpgro benchmarks
++source net_helper.sh
++
+ readonly PEER_NS="ns-peer-$(mktemp -u XXXXXX)"
+ BPF_FILE="../bpf/xdp_dummy.bpf.o"
+@@ -40,8 +42,7 @@ run_one() {
+       ip netns exec "${PEER_NS}" ./udpgso_bench_rx ${rx_args} -r &
+       ip netns exec "${PEER_NS}" ./udpgso_bench_rx -t ${rx_args} -r &
+-      # Hack: let bg programs complete the startup
+-      sleep 0.2
++      wait_local_port_listen "${PEER_NS}" 8000 udp
+       ./udpgso_bench_tx ${tx_args}
+ }
+diff --git a/tools/testing/selftests/net/udpgro_frglist.sh b/tools/testing/selftests/net/udpgro_frglist.sh
+index 0a6359bed0b92..dd47fa96f6b3e 100755
+--- a/tools/testing/selftests/net/udpgro_frglist.sh
++++ b/tools/testing/selftests/net/udpgro_frglist.sh
+@@ -3,6 +3,8 @@
+ #
+ # Run a series of udpgro benchmarks
++source net_helper.sh
++
+ readonly PEER_NS="ns-peer-$(mktemp -u XXXXXX)"
+ BPF_FILE="../bpf/xdp_dummy.bpf.o"
+@@ -45,8 +47,7 @@ run_one() {
+         echo ${rx_args}
+       ip netns exec "${PEER_NS}" ./udpgso_bench_rx ${rx_args} -r &
+-      # Hack: let bg programs complete the startup
+-      sleep 0.2
++      wait_local_port_listen "${PEER_NS}" 8000 udp
+       ./udpgso_bench_tx ${tx_args}
+ }
+-- 
+2.43.0
+
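The helper above works because /proc/net/udp (and its tcp/udp6 siblings) lists each local endpoint as hex ADDR:PORT, so a listener on port 8000 shows up as ":1F40". The following is a minimal Python sketch of the same polling idea, offered only as illustration: the function name mirrors the shell helper, and reading a single /proc file (rather than the "${protocol}"* glob covering the v6 table) is a simplification, not part of the patch.

    import time

    def wait_local_port_listen(port, proto="udp", tries=10):
        # /proc/net/udp stores local endpoints as hex ADDR:PORT,
        # e.g. a listener on port 8000 appears as ":1F40"
        needle = ":%04X" % port
        for _ in range(tries):
            with open("/proc/net/%s" % proto) as f:
                if any(needle in line for line in f):
                    return True
            time.sleep(0.1)  # same 10 x 100ms budget as the shell helper
        return False
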
diff --git a/queue-6.1/selftests-udpgro-report-error-when-receive-failed.patch b/queue-6.1/selftests-udpgro-report-error-when-receive-failed.patch
new file mode 100644 (file)
index 0000000..df9b6a0
--- /dev/null
@@ -0,0 +1,139 @@
+From 21c8895615fabf1679837084b68879aab22311b0 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Thu, 15 Aug 2024 15:59:50 +0800
+Subject: selftests: udpgro: report error when receive failed
+
+From: Hangbin Liu <liuhangbin@gmail.com>
+
+[ Upstream commit 7167395a4be7930ecac6a33b4e54d7e3dd9ee209 ]
+
+Currently, we only check the latest sender's exit code. If the receiver
+reports a failure, it is not recorded. Fix it by checking the exit
+codes of all the involved processes.
+
+Before:
+  bad GRO lookup       ok
+  multiple GRO socks   ./udpgso_bench_rx: recv: bad packet len, got 1452, expected 14520
+
+ ./udpgso_bench_rx: recv: bad packet len, got 1452, expected 14520
+
+ failed
+ $ echo $?
+ 0
+
+After:
+  bad GRO lookup       ok
+  multiple GRO socks   ./udpgso_bench_rx: recv: bad packet len, got 1452, expected 14520
+
+ ./udpgso_bench_rx: recv: bad packet len, got 1452, expected 14520
+
+ failed
+ $ echo $?
+ 1
+
+Fixes: 3327a9c46352 ("selftests: add functionals test for UDP GRO")
+Suggested-by: Paolo Abeni <pabeni@redhat.com>
+Signed-off-by: Hangbin Liu <liuhangbin@gmail.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ tools/testing/selftests/net/udpgro.sh | 44 ++++++++++++++++-----------
+ 1 file changed, 27 insertions(+), 17 deletions(-)
+
+diff --git a/tools/testing/selftests/net/udpgro.sh b/tools/testing/selftests/net/udpgro.sh
+index af5dc57c8ce93..241c6c37994d8 100755
+--- a/tools/testing/selftests/net/udpgro.sh
++++ b/tools/testing/selftests/net/udpgro.sh
+@@ -46,17 +46,19 @@ run_one() {
+       local -r all="$@"
+       local -r tx_args=${all%rx*}
+       local -r rx_args=${all#*rx}
++      local ret=0
+       cfg_veth
+-      ip netns exec "${PEER_NS}" ./udpgso_bench_rx -C 1000 -R 10 ${rx_args} && \
+-              echo "ok" || \
+-              echo "failed" &
++      ip netns exec "${PEER_NS}" ./udpgso_bench_rx -C 1000 -R 10 ${rx_args} &
++      local PID1=$!
+       wait_local_port_listen ${PEER_NS} 8000 udp
+       ./udpgso_bench_tx ${tx_args}
+-      ret=$?
+-      wait $(jobs -p)
++      check_err $?
++      wait ${PID1}
++      check_err $?
++      [ "$ret" -eq 0 ] && echo "ok" || echo "failed"
+       return $ret
+ }
+@@ -73,6 +75,7 @@ run_one_nat() {
+       local -r all="$@"
+       local -r tx_args=${all%rx*}
+       local -r rx_args=${all#*rx}
++      local ret=0
+       if [[ ${tx_args} = *-4* ]]; then
+               ipt_cmd=iptables
+@@ -93,16 +96,17 @@ run_one_nat() {
+       # ... so that GRO will match the UDP_GRO enabled socket, but packets
+       # will land on the 'plain' one
+       ip netns exec "${PEER_NS}" ./udpgso_bench_rx -G ${family} -b ${addr1} -n 0 &
+-      pid=$!
+-      ip netns exec "${PEER_NS}" ./udpgso_bench_rx -C 1000 -R 10 ${family} -b ${addr2%/*} ${rx_args} && \
+-              echo "ok" || \
+-              echo "failed"&
++      local PID1=$!
++      ip netns exec "${PEER_NS}" ./udpgso_bench_rx -C 1000 -R 10 ${family} -b ${addr2%/*} ${rx_args} &
++      local PID2=$!
+       wait_local_port_listen "${PEER_NS}" 8000 udp
+       ./udpgso_bench_tx ${tx_args}
+-      ret=$?
+-      kill -INT $pid
+-      wait $(jobs -p)
++      check_err $?
++      kill -INT ${PID1}
++      wait ${PID2}
++      check_err $?
++      [ "$ret" -eq 0 ] && echo "ok" || echo "failed"
+       return $ret
+ }
+@@ -111,20 +115,26 @@ run_one_2sock() {
+       local -r all="$@"
+       local -r tx_args=${all%rx*}
+       local -r rx_args=${all#*rx}
++      local ret=0
+       cfg_veth
+       ip netns exec "${PEER_NS}" ./udpgso_bench_rx -C 1000 -R 10 ${rx_args} -p 12345 &
+-      ip netns exec "${PEER_NS}" ./udpgso_bench_rx -C 2000 -R 10 ${rx_args} && \
+-              echo "ok" || \
+-              echo "failed" &
++      local PID1=$!
++      ip netns exec "${PEER_NS}" ./udpgso_bench_rx -C 2000 -R 10 ${rx_args} &
++      local PID2=$!
+       wait_local_port_listen "${PEER_NS}" 12345 udp
+       ./udpgso_bench_tx ${tx_args} -p 12345
++      check_err $?
+       wait_local_port_listen "${PEER_NS}" 8000 udp
+       ./udpgso_bench_tx ${tx_args}
+-      ret=$?
+-      wait $(jobs -p)
++      check_err $?
++      wait ${PID1}
++      check_err $?
++      wait ${PID2}
++      check_err $?
++      [ "$ret" -eq 0 ] && echo "ok" || echo "failed"
+       return $ret
+ }
+-- 
+2.43.0
+
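The shape of the fix is worth spelling out: the old scripts returned only the sender's status, while the new check_err() calls fold every process's exit code into one sticky ret. A hedged Python equivalent follows, with subprocess standing in for the netns-wrapped receivers; the command lists and the function name are placeholders, not the selftest code itself.

    import subprocess

    def run_one(rx_cmd, tx_cmd):
        rx = subprocess.Popen(rx_cmd)     # receiver in the background
        tx_ret = subprocess.call(tx_cmd)  # sender's exit code
        rx_ret = rx.wait()                # receiver's exit code too
        ret = tx_ret or rx_ret            # keep the first nonzero status
        print("ok" if ret == 0 else "failed")
        return ret
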
diff --git a/queue-6.1/series b/queue-6.1/series
index 2de5cdb80a150410a4d72a6d6fb2576fafbb2e70..5f4984b2d45cfd8a6bc6bc6742cd574137a1ac6f 100644 (file)
--- a/queue-6.1/series
@@ -217,3 +217,42 @@ btrfs-replace-sb-s_blocksize-by-fs_info-sectorsize.patch
 btrfs-send-allow-cloning-non-aligned-extent-if-it-en.patch
 drm-amd-display-adjust-cursor-position.patch
 platform-surface-aggregator-fix-warning-when-control.patch
+drm-amdkfd-reserve-the-bo-before-validating-it.patch
+bluetooth-hci_core-fix-le-quote-calculation.patch
+bluetooth-smp-fix-assumption-of-central-always-being.patch
+net-dsa-tag_ocelot-do-not-rely-on-skb_mac_header-for.patch
+net-dsa-tag_ocelot-call-only-the-relevant-portion-of.patch
+net-mscc-ocelot-use-ocelot_xmit_get_vlan_info-also-f.patch
+net-mscc-ocelot-fix-qos-class-for-injected-packets-w.patch
+net-mscc-ocelot-serialize-access-to-the-injection-ex.patch
+tc-testing-don-t-access-non-existent-variable-on-exc.patch
+selftests-net-synchronize-udpgro-tests-tx-and-rx-con.patch
+selftests-udpgro-report-error-when-receive-failed.patch
+tcp-dccp-bypass-empty-buckets-in-inet_twsk_purge.patch
+tcp-dccp-do-not-care-about-families-in-inet_twsk_pur.patch
+tcp-prevent-concurrent-execution-of-tcp_sk_exit_batc.patch
+net-mctp-test-use-correct-skb-for-route-input-check.patch
+kcm-serialise-kcm_sendmsg-for-the-same-socket.patch
+netfilter-nft_counter-disable-bh-in-nft_counter_offl.patch
+netfilter-nft_counter-synchronize-nft_counter_reset-.patch
+ip6_tunnel-fix-broken-gro.patch
+bonding-fix-bond_ipsec_offload_ok-return-type.patch
+bonding-fix-null-pointer-deref-in-bond_ipsec_offload.patch
+bonding-fix-xfrm-real_dev-null-pointer-dereference.patch
+bonding-fix-xfrm-state-handling-when-clearing-active.patch
+ice-prepare-legacy-rx-for-upcoming-xdp-multi-buffer-.patch
+ice-add-xdp_buff-to-ice_rx_ring-struct.patch
+ice-store-page-count-inside-ice_rx_buf.patch
+ice-pull-out-next_to_clean-bump-out-of-ice_put_rx_bu.patch
+ice-fix-page-reuse-when-page_size-is-over-8k.patch
+ice-fix-ice_last_offset-formula.patch
+dpaa2-switch-fix-error-checking-in-dpaa2_switch_seed.patch
+net-dsa-mv88e6xxx-fix-out-of-bound-access.patch
+netem-fix-return-value-if-duplicate-enqueue-fails.patch
+ipv6-prevent-uaf-in-ip6_send_skb.patch
+ipv6-fix-possible-uaf-in-ip6_finish_output2.patch
+ipv6-prevent-possible-uaf-in-ip6_xmit.patch
+netfilter-flowtable-validate-vlan-header.patch
+octeontx2-af-fix-cpt-af-register-offset-calculation.patch
+net-xilinx-axienet-always-disable-promiscuous-mode.patch
+net-xilinx-axienet-fix-dangling-multicast-addresses.patch
diff --git a/queue-6.1/tc-testing-don-t-access-non-existent-variable-on-exc.patch b/queue-6.1/tc-testing-don-t-access-non-existent-variable-on-exc.patch
new file mode 100644 (file)
index 0000000..6e7bcc1
--- /dev/null
@@ -0,0 +1,60 @@
+From cac571704ad5d7012ea3eb3a099c3e74d6dc97a4 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Thu, 15 Aug 2024 16:37:13 +0100
+Subject: tc-testing: don't access non-existent variable on exception
+
+From: Simon Horman <horms@kernel.org>
+
+[ Upstream commit a0c9fe5eecc97680323ee83780ea3eaf440ba1b7 ]
+
+Since commit 255c1c7279ab ("tc-testing: Allow test cases to be skipped"),
+the variable test_ordinal no longer exists in call_pre_case(), so it
+should not be accessed when an exception occurs.
+
+This resolves the following splat:
+
+  ...
+  During handling of the above exception, another exception occurred:
+
+  Traceback (most recent call last):
+    File ".../tdc.py", line 1028, in <module>
+      main()
+    File ".../tdc.py", line 1022, in main
+      set_operation_mode(pm, parser, args, remaining)
+    File ".../tdc.py", line 966, in set_operation_mode
+      catresults = test_runner_serial(pm, args, alltests)
+    File ".../tdc.py", line 642, in test_runner_serial
+      (index, tsr) = test_runner(pm, args, alltests)
+    File ".../tdc.py", line 536, in test_runner
+      res = run_one_test(pm, args, index, tidx)
+    File ".../tdc.py", line 419, in run_one_test
+      pm.call_pre_case(tidx)
+    File ".../tdc.py", line 146, in call_pre_case
+      print('test_ordinal is {}'.format(test_ordinal))
+  NameError: name 'test_ordinal' is not defined
+
+Fixes: 255c1c7279ab ("tc-testing: Allow test cases to be skipped")
+Signed-off-by: Simon Horman <horms@kernel.org>
+Acked-by: Jamal Hadi Salim <jhs@mojatatu.com>
+Link: https://patch.msgid.link/20240815-tdc-test-ordinal-v1-1-0255c122a427@kernel.org
+Signed-off-by: Jakub Kicinski <kuba@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ tools/testing/selftests/tc-testing/tdc.py | 1 -
+ 1 file changed, 1 deletion(-)
+
+diff --git a/tools/testing/selftests/tc-testing/tdc.py b/tools/testing/selftests/tc-testing/tdc.py
+index ee22e3447ec7e..4702c99c99d3f 100755
+--- a/tools/testing/selftests/tc-testing/tdc.py
++++ b/tools/testing/selftests/tc-testing/tdc.py
+@@ -129,7 +129,6 @@ class PluginMgr:
+             except Exception as ee:
+                 print('exception {} in call to pre_case for {} plugin'.
+                       format(ee, pgn_inst.__class__))
+-                print('test_ordinal is {}'.format(test_ordinal))
+                 print('testid is {}'.format(caseinfo['id']))
+                 raise
+-- 
+2.43.0
+
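The failure mode is easy to reproduce in isolation: inside an except block, touching a name that was never bound raises a NameError "during handling of the above exception", which replaces the original error exactly as in the traceback above. A standalone sketch (the RuntimeError and the surrounding function are invented for the demo):

    def call_pre_case():
        try:
            raise RuntimeError("plugin failed")   # the original exception
        except Exception as ee:
            print('exception {} in call to pre_case'.format(ee))
            print('test_ordinal is {}'.format(test_ordinal))  # NameError
            raise

    call_pre_case()
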
diff --git a/queue-6.1/tcp-dccp-bypass-empty-buckets-in-inet_twsk_purge.patch b/queue-6.1/tcp-dccp-bypass-empty-buckets-in-inet_twsk_purge.patch
new file mode 100644 (file)
index 0000000..7475f95
--- /dev/null
@@ -0,0 +1,52 @@
+From 8595c4c11387f406f8b7b7f008a2d042b64676e1 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Wed, 27 Mar 2024 19:12:06 +0000
+Subject: tcp/dccp: bypass empty buckets in inet_twsk_purge()
+
+From: Eric Dumazet <edumazet@google.com>
+
+[ Upstream commit 50e2907ef8bb52cf80ecde9eec5c4dac07177146 ]
+
+The TCP ehash table is often sparsely populated.
+
+inet_twsk_purge() spends too much time calling cond_resched().
+
+This patch can reduce the time spent in inet_twsk_purge() by 20x.
+
+Signed-off-by: Eric Dumazet <edumazet@google.com>
+Reviewed-by: Kuniyuki Iwashima <kuniyu@amazon.com>
+Link: https://lore.kernel.org/r/20240327191206.508114-1-edumazet@google.com
+Signed-off-by: Jakub Kicinski <kuba@kernel.org>
+Stable-dep-of: 565d121b6998 ("tcp: prevent concurrent execution of tcp_sk_exit_batch")
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ net/ipv4/inet_timewait_sock.c | 9 +++++++--
+ 1 file changed, 7 insertions(+), 2 deletions(-)
+
+diff --git a/net/ipv4/inet_timewait_sock.c b/net/ipv4/inet_timewait_sock.c
+index 340a8f0c29800..15d6ce41e5de7 100644
+--- a/net/ipv4/inet_timewait_sock.c
++++ b/net/ipv4/inet_timewait_sock.c
+@@ -284,12 +284,17 @@ EXPORT_SYMBOL_GPL(__inet_twsk_schedule);
+ /* Remove all non full sockets (TIME_WAIT and NEW_SYN_RECV) for dead netns */
+ void inet_twsk_purge(struct inet_hashinfo *hashinfo, int family)
+ {
++      struct inet_ehash_bucket *head = &hashinfo->ehash[0];
++      unsigned int ehash_mask = hashinfo->ehash_mask;
+       struct hlist_nulls_node *node;
+       unsigned int slot;
+       struct sock *sk;
+-      for (slot = 0; slot <= hashinfo->ehash_mask; slot++) {
+-              struct inet_ehash_bucket *head = &hashinfo->ehash[slot];
++      for (slot = 0; slot <= ehash_mask; slot++, head++) {
++
++              if (hlist_nulls_empty(&head->chain))
++                      continue;
++
+ restart_rcu:
+               cond_resched();
+               rcu_read_lock();
+-- 
+2.43.0
+
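The patch's structure is a classic fast-path check: test for an empty bucket before doing any per-slot work (in the kernel, cond_resched() and an RCU read section). A rough Python analogue of the traversal, with dicts standing in for sockets and all field names invented, shows why a sparse table benefits:

    def purge(buckets):
        reaped = 0
        for chain in buckets:
            if not chain:      # new fast path: skip empty buckets outright
                continue
            # the expensive per-bucket work only runs for populated slots
            reaped += sum(1 for sk in chain if sk["netns_dead"])
        return reaped

    buckets = [[] for _ in range(1 << 16)]   # mostly empty, like ehash
    buckets[1234] = [{"netns_dead": True}]
    print(purge(buckets))                    # -> 1
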
diff --git a/queue-6.1/tcp-dccp-do-not-care-about-families-in-inet_twsk_pur.patch b/queue-6.1/tcp-dccp-do-not-care-about-families-in-inet_twsk_pur.patch
new file mode 100644 (file)
index 0000000..e5e5972
--- /dev/null
@@ -0,0 +1,194 @@
+From 6129d002a28a32b787ee8ba51cfc1f8b9db92a3b Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Fri, 29 Mar 2024 15:32:03 +0000
+Subject: tcp/dccp: do not care about families in inet_twsk_purge()
+
+From: Eric Dumazet <edumazet@google.com>
+
+[ Upstream commit 1eeb5043573981f3a1278876515851b7f6b1df1b ]
+
+We lost the ability to unload the ipv6 module a long time ago.
+
+Instead of calling expensive inet_twsk_purge() twice,
+we can handle all families in one round.
+
+Also remove an extra line added in my prior patch, per Kuniyuki
+Iwashima's feedback.
+
+Signed-off-by: Eric Dumazet <edumazet@google.com>
+Link: https://lore.kernel.org/netdev/20240327192934.6843-1-kuniyu@amazon.com/
+Reviewed-by: Kuniyuki Iwashima <kuniyu@amazon.com>
+Link: https://lore.kernel.org/r/20240329153203.345203-1-edumazet@google.com
+Signed-off-by: Jakub Kicinski <kuba@kernel.org>
+Stable-dep-of: 565d121b6998 ("tcp: prevent concurrent execution of tcp_sk_exit_batch")
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ include/net/inet_timewait_sock.h | 2 +-
+ include/net/tcp.h                | 2 +-
+ net/dccp/ipv4.c                  | 2 +-
+ net/dccp/ipv6.c                  | 6 ------
+ net/ipv4/inet_timewait_sock.c    | 9 +++------
+ net/ipv4/tcp_ipv4.c              | 2 +-
+ net/ipv4/tcp_minisocks.c         | 6 +++---
+ net/ipv6/tcp_ipv6.c              | 6 ------
+ 8 files changed, 10 insertions(+), 25 deletions(-)
+
+diff --git a/include/net/inet_timewait_sock.h b/include/net/inet_timewait_sock.h
+index 4a8e578405cb3..9365e5af8d6da 100644
+--- a/include/net/inet_timewait_sock.h
++++ b/include/net/inet_timewait_sock.h
+@@ -114,7 +114,7 @@ static inline void inet_twsk_reschedule(struct inet_timewait_sock *tw, int timeo
+ void inet_twsk_deschedule_put(struct inet_timewait_sock *tw);
+-void inet_twsk_purge(struct inet_hashinfo *hashinfo, int family);
++void inet_twsk_purge(struct inet_hashinfo *hashinfo);
+ static inline
+ struct net *twsk_net(const struct inet_timewait_sock *twsk)
+diff --git a/include/net/tcp.h b/include/net/tcp.h
+index cc314c383c532..c7501ca66dd34 100644
+--- a/include/net/tcp.h
++++ b/include/net/tcp.h
+@@ -352,7 +352,7 @@ void tcp_rcv_established(struct sock *sk, struct sk_buff *skb);
+ void tcp_rcv_space_adjust(struct sock *sk);
+ int tcp_twsk_unique(struct sock *sk, struct sock *sktw, void *twp);
+ void tcp_twsk_destructor(struct sock *sk);
+-void tcp_twsk_purge(struct list_head *net_exit_list, int family);
++void tcp_twsk_purge(struct list_head *net_exit_list);
+ ssize_t tcp_splice_read(struct socket *sk, loff_t *ppos,
+                       struct pipe_inode_info *pipe, size_t len,
+                       unsigned int flags);
+diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c
+index f4a2dce3e1048..db8d54fb88060 100644
+--- a/net/dccp/ipv4.c
++++ b/net/dccp/ipv4.c
+@@ -1042,7 +1042,7 @@ static void __net_exit dccp_v4_exit_net(struct net *net)
+ static void __net_exit dccp_v4_exit_batch(struct list_head *net_exit_list)
+ {
+-      inet_twsk_purge(&dccp_hashinfo, AF_INET);
++      inet_twsk_purge(&dccp_hashinfo);
+ }
+ static struct pernet_operations dccp_v4_ops = {
+diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c
+index 016af0301366d..d90bb941f2ada 100644
+--- a/net/dccp/ipv6.c
++++ b/net/dccp/ipv6.c
+@@ -1121,15 +1121,9 @@ static void __net_exit dccp_v6_exit_net(struct net *net)
+       inet_ctl_sock_destroy(pn->v6_ctl_sk);
+ }
+-static void __net_exit dccp_v6_exit_batch(struct list_head *net_exit_list)
+-{
+-      inet_twsk_purge(&dccp_hashinfo, AF_INET6);
+-}
+-
+ static struct pernet_operations dccp_v6_ops = {
+       .init   = dccp_v6_init_net,
+       .exit   = dccp_v6_exit_net,
+-      .exit_batch = dccp_v6_exit_batch,
+       .id     = &dccp_v6_pernet_id,
+       .size   = sizeof(struct dccp_v6_pernet),
+ };
+diff --git a/net/ipv4/inet_timewait_sock.c b/net/ipv4/inet_timewait_sock.c
+index 15d6ce41e5de7..6356a8a47b345 100644
+--- a/net/ipv4/inet_timewait_sock.c
++++ b/net/ipv4/inet_timewait_sock.c
+@@ -282,7 +282,7 @@ void __inet_twsk_schedule(struct inet_timewait_sock *tw, int timeo, bool rearm)
+ EXPORT_SYMBOL_GPL(__inet_twsk_schedule);
+ /* Remove all non full sockets (TIME_WAIT and NEW_SYN_RECV) for dead netns */
+-void inet_twsk_purge(struct inet_hashinfo *hashinfo, int family)
++void inet_twsk_purge(struct inet_hashinfo *hashinfo)
+ {
+       struct inet_ehash_bucket *head = &hashinfo->ehash[0];
+       unsigned int ehash_mask = hashinfo->ehash_mask;
+@@ -291,7 +291,6 @@ void inet_twsk_purge(struct inet_hashinfo *hashinfo, int family)
+       struct sock *sk;
+       for (slot = 0; slot <= ehash_mask; slot++, head++) {
+-
+               if (hlist_nulls_empty(&head->chain))
+                       continue;
+@@ -306,15 +305,13 @@ void inet_twsk_purge(struct inet_hashinfo *hashinfo, int family)
+                                            TCPF_NEW_SYN_RECV))
+                               continue;
+-                      if (sk->sk_family != family ||
+-                          refcount_read(&sock_net(sk)->ns.count))
++                      if (refcount_read(&sock_net(sk)->ns.count))
+                               continue;
+                       if (unlikely(!refcount_inc_not_zero(&sk->sk_refcnt)))
+                               continue;
+-                      if (unlikely(sk->sk_family != family ||
+-                                   refcount_read(&sock_net(sk)->ns.count))) {
++                      if (refcount_read(&sock_net(sk)->ns.count)) {
+                               sock_gen_put(sk);
+                               goto restart;
+                       }
+diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
+index c64ba4f8ddaa9..167de693981a8 100644
+--- a/net/ipv4/tcp_ipv4.c
++++ b/net/ipv4/tcp_ipv4.c
+@@ -3242,7 +3242,7 @@ static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
+ {
+       struct net *net;
+-      tcp_twsk_purge(net_exit_list, AF_INET);
++      tcp_twsk_purge(net_exit_list);
+       list_for_each_entry(net, net_exit_list, exit_list) {
+               inet_pernet_hashinfo_free(net->ipv4.tcp_death_row.hashinfo);
+diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c
+index b3bfa1a09df68..000dce7d0e2d0 100644
+--- a/net/ipv4/tcp_minisocks.c
++++ b/net/ipv4/tcp_minisocks.c
+@@ -347,7 +347,7 @@ void tcp_twsk_destructor(struct sock *sk)
+ }
+ EXPORT_SYMBOL_GPL(tcp_twsk_destructor);
+-void tcp_twsk_purge(struct list_head *net_exit_list, int family)
++void tcp_twsk_purge(struct list_head *net_exit_list)
+ {
+       bool purged_once = false;
+       struct net *net;
+@@ -355,9 +355,9 @@ void tcp_twsk_purge(struct list_head *net_exit_list, int family)
+       list_for_each_entry(net, net_exit_list, exit_list) {
+               if (net->ipv4.tcp_death_row.hashinfo->pernet) {
+                       /* Even if tw_refcount == 1, we must clean up kernel reqsk */
+-                      inet_twsk_purge(net->ipv4.tcp_death_row.hashinfo, family);
++                      inet_twsk_purge(net->ipv4.tcp_death_row.hashinfo);
+               } else if (!purged_once) {
+-                      inet_twsk_purge(&tcp_hashinfo, family);
++                      inet_twsk_purge(&tcp_hashinfo);
+                       purged_once = true;
+               }
+       }
+diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c
+index eb6fc0e2a4533..06b4acbfd314b 100644
+--- a/net/ipv6/tcp_ipv6.c
++++ b/net/ipv6/tcp_ipv6.c
+@@ -2217,15 +2217,9 @@ static void __net_exit tcpv6_net_exit(struct net *net)
+       inet_ctl_sock_destroy(net->ipv6.tcp_sk);
+ }
+-static void __net_exit tcpv6_net_exit_batch(struct list_head *net_exit_list)
+-{
+-      tcp_twsk_purge(net_exit_list, AF_INET6);
+-}
+-
+ static struct pernet_operations tcpv6_net_ops = {
+       .init       = tcpv6_net_init,
+       .exit       = tcpv6_net_exit,
+-      .exit_batch = tcpv6_net_exit_batch,
+ };
+ int __init tcpv6_init(void)
+-- 
+2.43.0
+
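The before/after of this cleanup reduces to dropping a filter: two family-restricted walks become one walk keyed only on whether the owning netns is dead. A toy Python rendering, with field names invented purely for illustration:

    def purge_all(buckets):
        # no sk["family"] test anymore: one traversal reaps both families
        return [sk for chain in buckets for sk in chain
                if sk["netns_refs"] == 0]

    buckets = [[{"family": "AF_INET",  "netns_refs": 0}],
               [{"family": "AF_INET6", "netns_refs": 0}]]
    print(len(purge_all(buckets)))   # -> 2, in one round instead of two
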
diff --git a/queue-6.1/tcp-prevent-concurrent-execution-of-tcp_sk_exit_batc.patch b/queue-6.1/tcp-prevent-concurrent-execution-of-tcp_sk_exit_batc.patch
new file mode 100644 (file)
index 0000000..aa119af
--- /dev/null
@@ -0,0 +1,109 @@
+From 534dffe8c924eae7d781fd2bb2e92921e70ba6b7 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Tue, 13 Aug 2024 00:28:25 +0200
+Subject: tcp: prevent concurrent execution of tcp_sk_exit_batch
+
+From: Florian Westphal <fw@strlen.de>
+
+[ Upstream commit 565d121b69980637f040eb4d84289869cdaabedf ]
+
+It's possible that two threads call tcp_sk_exit_batch() concurrently,
+once from the cleanup_net workqueue, once from a task that failed to clone
+a new netns.  In the latter case, error unwinding calls the exit handlers
+in reverse order for the 'failed' netns.
+
+tcp_sk_exit_batch() calls tcp_twsk_purge().
+The problem is that since commit b099ce2602d8 ("net: Batch inet_twsk_purge"),
+this function picks up twsk in any dying netns, not just the ones passed
+in via the exit_batch list.
+
+This means that the error unwind of setup_net() can "steal" and destroy
+timewait sockets belonging to the exiting netns.
+
+This allows the netns exit worker to proceed to call
+
+WARN_ON_ONCE(!refcount_dec_and_test(&net->ipv4.tcp_death_row.tw_refcount));
+
+without the expected 1 -> 0 transition, which then splats.
+
+At the same time, the error unwind path that is also running
+inet_twsk_purge() will splat as well:
+
+WARNING: .. at lib/refcount.c:31 refcount_warn_saturate+0x1ed/0x210
+...
+ refcount_dec include/linux/refcount.h:351 [inline]
+ inet_twsk_kill+0x758/0x9c0 net/ipv4/inet_timewait_sock.c:70
+ inet_twsk_deschedule_put net/ipv4/inet_timewait_sock.c:221
+ inet_twsk_purge+0x725/0x890 net/ipv4/inet_timewait_sock.c:304
+ tcp_sk_exit_batch+0x1c/0x170 net/ipv4/tcp_ipv4.c:3522
+ ops_exit_list+0x128/0x180 net/core/net_namespace.c:178
+ setup_net+0x714/0xb40 net/core/net_namespace.c:375
+ copy_net_ns+0x2f0/0x670 net/core/net_namespace.c:508
+ create_new_namespaces+0x3ea/0xb10 kernel/nsproxy.c:110
+
+... because refcount_dec() of tw_refcount unexpectedly dropped to 0.
+
+This doesn't seem like an actual bug (no tw sockets got lost and I don't
+see a use-after-free), but rather an erroneous trigger of a debug check.
+
+Add a mutex to force strict ordering: the task that calls tcp_twsk_purge()
+blocks other tasks from doing the final _dec_and_test before the mutex
+owner has removed all tw sockets of the dying netns.
+
+Fixes: e9bd0cca09d1 ("tcp: Don't allocate tcp_death_row outside of struct netns_ipv4.")
+Reported-by: syzbot+8ea26396ff85d23a8929@syzkaller.appspotmail.com
+Closes: https://lore.kernel.org/netdev/0000000000003a5292061f5e4e19@google.com/
+Link: https://lore.kernel.org/netdev/20240812140104.GA21559@breakpoint.cc/
+Signed-off-by: Florian Westphal <fw@strlen.de>
+Reviewed-by: Kuniyuki Iwashima <kuniyu@amazon.com>
+Reviewed-by: Jason Xing <kerneljasonxing@gmail.com>
+Reviewed-by: Eric Dumazet <edumazet@google.com>
+Link: https://patch.msgid.link/20240812222857.29837-1-fw@strlen.de
+Signed-off-by: Jakub Kicinski <kuba@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ net/ipv4/tcp_ipv4.c | 14 ++++++++++++++
+ 1 file changed, 14 insertions(+)
+
+diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c
+index 167de693981a8..1327447a3aade 100644
+--- a/net/ipv4/tcp_ipv4.c
++++ b/net/ipv4/tcp_ipv4.c
+@@ -93,6 +93,8 @@ EXPORT_SYMBOL(tcp_hashinfo);
+ static DEFINE_PER_CPU(struct sock *, ipv4_tcp_sk);
++static DEFINE_MUTEX(tcp_exit_batch_mutex);
++
+ static u32 tcp_v4_init_seq(const struct sk_buff *skb)
+ {
+       return secure_tcp_seq(ip_hdr(skb)->daddr,
+@@ -3242,6 +3244,16 @@ static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
+ {
+       struct net *net;
++      /* make sure concurrent calls to tcp_sk_exit_batch from net_cleanup_work
++       * and failed setup_net error unwinding path are serialized.
++       *
++       * tcp_twsk_purge() handles twsk in any dead netns, not just those in
++       * net_exit_list, the thread that dismantles a particular twsk must
++       * do so without other thread progressing to refcount_dec_and_test() of
++       * tcp_death_row.tw_refcount.
++       */
++      mutex_lock(&tcp_exit_batch_mutex);
++
+       tcp_twsk_purge(net_exit_list);
+       list_for_each_entry(net, net_exit_list, exit_list) {
+@@ -3249,6 +3261,8 @@ static void __net_exit tcp_sk_exit_batch(struct list_head *net_exit_list)
+               WARN_ON_ONCE(!refcount_dec_and_test(&net->ipv4.tcp_death_row.tw_refcount));
+               tcp_fastopen_ctx_destroy(net);
+       }
++
++      mutex_unlock(&tcp_exit_batch_mutex);
+ }
+ static struct pernet_operations __net_initdata tcp_sk_ops = {
+-- 
+2.43.0
+
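The essence of the fix is that the purge and the final refcount 1 -> 0 transition must be atomic with respect to the other potential caller. A threading sketch of that invariant, with a plain counter standing in for tw_refcount and all names hypothetical:

    import threading

    exit_batch_mutex = threading.Lock()

    def tcp_sk_exit_batch(purge, death_row):
        # Serialize the cleanup worker and the failed-clone unwind path:
        # whoever holds the mutex finishes purging before the other side
        # may observe (and decrement) the refcount.
        with exit_batch_mutex:
            purge()  # may reap twsk belonging to any dying netns
            assert death_row["tw_refcount"] == 1  # kernel's WARN_ON_ONCE
            death_row["tw_refcount"] -= 1
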