Fixes for 6.1
author Sasha Levin <sashal@kernel.org>
Sun, 12 Nov 2023 02:50:12 +0000 (21:50 -0500)
committer Sasha Levin <sashal@kernel.org>
Sun, 12 Nov 2023 02:50:12 +0000 (21:50 -0500)
Signed-off-by: Sasha Levin <sashal@kernel.org>
37 files changed:
queue-6.1/blk-core-use-pr_warn_ratelimited-in-bio_check_ro.patch [new file with mode: 0644]
queue-6.1/bpf-check-map-usercnt-after-timer-timer-is-assigned.patch [new file with mode: 0644]
queue-6.1/dccp-call-security_inet_conn_request-after-setting-i.patch [new file with mode: 0644]
queue-6.1/dccp-tcp-call-security_inet_conn_request-after-setti.patch [new file with mode: 0644]
queue-6.1/fix-termination-state-for-idr_for_each_entry_ul.patch [new file with mode: 0644]
queue-6.1/hsr-prevent-use-after-free-in-prp_create_tagged_fram.patch [new file with mode: 0644]
queue-6.1/i2c-iproc-handle-invalid-slave-state.patch [new file with mode: 0644]
queue-6.1/inet-shrink-struct-flowi_common.patch [new file with mode: 0644]
queue-6.1/input-synaptics-rmi4-fix-use-after-free-in-rmi_unreg.patch [new file with mode: 0644]
queue-6.1/llc-verify-mac-len-before-reading-mac-header.patch [new file with mode: 0644]
queue-6.1/nbd-fix-uaf-in-nbd_open.patch [new file with mode: 0644]
queue-6.1/net-page_pool-add-missing-free_percpu-when-page_pool.patch [new file with mode: 0644]
queue-6.1/net-r8169-disable-multicast-filter-for-rtl8168h-and-.patch [new file with mode: 0644]
queue-6.1/net-smc-allow-cdc-msg-send-rather-than-drop-it-with-.patch [new file with mode: 0644]
queue-6.1/net-smc-fix-dangling-sock-under-state-smc_appfinclos.patch [new file with mode: 0644]
queue-6.1/net-smc-put-sk-reference-if-close-work-was-canceled.patch [new file with mode: 0644]
queue-6.1/net-stmmac-xgmac-enable-support-for-multiple-flexibl.patch [new file with mode: 0644]
queue-6.1/netfilter-nat-fix-ipv6-nat-redirect-with-mapped-and-.patch [new file with mode: 0644]
queue-6.1/netfilter-nft_redir-use-struct-nf_nat_range2-through.patch [new file with mode: 0644]
queue-6.1/netfilter-xt_recent-fix-increase-ipv6-literal-buffer.patch [new file with mode: 0644]
queue-6.1/nvme-fix-error-handling-for-io_uring-nvme-passthroug.patch [new file with mode: 0644]
queue-6.1/octeontx2-pf-fix-error-codes.patch [new file with mode: 0644]
queue-6.1/octeontx2-pf-fix-holes-in-error-code.patch [new file with mode: 0644]
queue-6.1/octeontx2-pf-free-pending-and-dropped-sqes.patch [new file with mode: 0644]
queue-6.1/octeontx2-pf-qos-send-queues-management.patch [new file with mode: 0644]
queue-6.1/octeontx2-pf-rename-tot_tx_queues-to-non_qos_queues.patch [new file with mode: 0644]
queue-6.1/pwm-brcmstb-utilize-appropriate-clock-apis-in-suspen.patch [new file with mode: 0644]
queue-6.1/pwm-sti-reduce-number-of-allocations-and-drop-usage-.patch [new file with mode: 0644]
queue-6.1/r8169-respect-userspace-disabling-iff_multicast.patch [new file with mode: 0644]
queue-6.1/risc-v-don-t-fail-in-riscv_of_parent_hartid-for-disa.patch [new file with mode: 0644]
queue-6.1/selftests-pmtu.sh-fix-result-checking.patch [new file with mode: 0644]
queue-6.1/series
queue-6.1/tg3-power-down-device-only-on-system_power_off.patch [new file with mode: 0644]
queue-6.1/tipc-change-nla_policy-for-bearer-related-names-to-n.patch [new file with mode: 0644]
queue-6.1/virtio-vsock-replace-virtio_vsock_pkt-with-sk_buff.patch [new file with mode: 0644]
queue-6.1/vsock-virtio-remove-socket-from-connected-bound-list.patch [new file with mode: 0644]
queue-6.1/watchdog-ixp4xx-make-sure-restart-always-works.patch [new file with mode: 0644]

diff --git a/queue-6.1/blk-core-use-pr_warn_ratelimited-in-bio_check_ro.patch b/queue-6.1/blk-core-use-pr_warn_ratelimited-in-bio_check_ro.patch
new file mode 100644 (file)
index 0000000..070e9d2
--- /dev/null
@@ -0,0 +1,43 @@
+From 0542337a6f654f25f6109d7fd0f7d620af585d19 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Tue, 7 Nov 2023 19:12:47 +0800
+Subject: blk-core: use pr_warn_ratelimited() in bio_check_ro()
+
+From: Yu Kuai <yukuai3@huawei.com>
+
+[ Upstream commit 1b0a151c10a6d823f033023b9fdd9af72a89591b ]
+
+If one of the underlying disks of a raid or dm device is set to read-only,
+then each IO will generate a new log message, which will cause a message
+storm. Such a setup is indeed problematic, but we can't make sure a naive
+customer won't do this, hence use pr_warn_ratelimited() to prevent the
+message storm in this case.
+
+Signed-off-by: Yu Kuai <yukuai3@huawei.com>
+Fixes: 57e95e4670d1 ("block: fix and cleanup bio_check_ro")
+Signed-off-by: Ye Bin <yebin10@huawei.com>
+Link: https://lore.kernel.org/r/20231107111247.2157820-1-yukuai1@huaweicloud.com
+Signed-off-by: Jens Axboe <axboe@kernel.dk>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ block/blk-core.c | 4 ++--
+ 1 file changed, 2 insertions(+), 2 deletions(-)
+
+diff --git a/block/blk-core.c b/block/blk-core.c
+index ebb7a1689b261..6eaf2b0ad7cca 100644
+--- a/block/blk-core.c
++++ b/block/blk-core.c
+@@ -490,8 +490,8 @@ static inline void bio_check_ro(struct bio *bio)
+       if (op_is_write(bio_op(bio)) && bdev_read_only(bio->bi_bdev)) {
+               if (op_is_flush(bio->bi_opf) && !bio_sectors(bio))
+                       return;
+-              pr_warn("Trying to write to read-only block-device %pg\n",
+-                      bio->bi_bdev);
++              pr_warn_ratelimited("Trying to write to read-only block-device %pg\n",
++                                  bio->bi_bdev);
+               /* Older lvm-tools actually trigger this */
+       }
+ }
+-- 
+2.42.0
+
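For context on why the ratelimited variant stops the storm: pr_warn_ratelimited() keeps per-callsite ratelimit state and drops messages once its budget (by default roughly 10 messages every 5 seconds) is exhausted, reporting how many were suppressed. A minimal, hypothetical sketch of the pattern, not taken from the patch above:

#include <linux/printk.h>

/* Hypothetical hot path: a plain pr_warn() would emit one line per
 * rejected write, while the ratelimited variant drops the excess once
 * its per-callsite budget is used up. */
static void warn_ro_write(const char *devname)
{
	pr_warn_ratelimited("rejected write to read-only device %s\n", devname);
}
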
diff --git a/queue-6.1/bpf-check-map-usercnt-after-timer-timer-is-assigned.patch b/queue-6.1/bpf-check-map-usercnt-after-timer-timer-is-assigned.patch
new file mode 100644 (file)
index 0000000..03c599e
--- /dev/null
@@ -0,0 +1,113 @@
+From ef364ef17a004ff8e3a33e4168c585acfdfa4568 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Mon, 30 Oct 2023 14:36:16 +0800
+Subject: bpf: Check map->usercnt after timer->timer is assigned
+
+From: Hou Tao <houtao1@huawei.com>
+
+[ Upstream commit fd381ce60a2d79cc967506208085336d3d268ae0 ]
+
+When there are concurrent uref release and bpf timer init operations,
+the following sequence diagram is possible. It breaks the guarantee
+provided by bpf_timer: the bpf_timer will still be alive after the
+userspace application releases or unpins the map. It also leads to a
+kmemleak on old kernel versions which don't release the bpf_timer when
+the map is released.
+
+bpf program X:
+
+bpf_timer_init()
+  lock timer->lock
+    read timer->timer as NULL
+    read map->usercnt != 0
+
+                process Y:
+
+                close(map_fd)
+                  // put last uref
+                  bpf_map_put_uref()
+                    atomic_dec_and_test(map->usercnt)
+                      array_map_free_timers()
+                        bpf_timer_cancel_and_free()
+                          // just return
+                          read timer->timer is NULL
+
+    t = bpf_map_kmalloc_node()
+    timer->timer = t
+  unlock timer->lock
+
+Fix the problem by checking map->usercnt after timer->timer is assigned,
+so when there are concurrent uref release and bpf timer init, either
+bpf_timer_cancel_and_free() from uref release reads a non-NULL timer
+or the newly-added atomic64_read() returns a zero usercnt.
+
+Because atomic_dec_and_test(map->usercnt) and READ_ONCE(timer->timer)
+in bpf_timer_cancel_and_free() are not protected by a lock, add a
+memory barrier to guarantee the ordering between map->usercnt and
+timer->timer. Also use WRITE_ONCE(timer->timer, x) to match the lockless
+read of timer->timer in bpf_timer_cancel_and_free().
+
+Reported-by: Hsin-Wei Hung <hsinweih@uci.edu>
+Closes: https://lore.kernel.org/bpf/CABcoxUaT2k9hWsS1tNgXyoU3E-=PuOgMn737qK984fbFmfYixQ@mail.gmail.com
+Fixes: b00628b1c7d5 ("bpf: Introduce bpf timers.")
+Signed-off-by: Hou Tao <houtao1@huawei.com>
+Link: https://lore.kernel.org/r/20231030063616.1653024-1-houtao@huaweicloud.com
+Signed-off-by: Alexei Starovoitov <ast@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ kernel/bpf/helpers.c | 25 ++++++++++++++++---------
+ 1 file changed, 16 insertions(+), 9 deletions(-)
+
+diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c
+index a6b04faed282b..6212e4ae084bb 100644
+--- a/kernel/bpf/helpers.c
++++ b/kernel/bpf/helpers.c
+@@ -1156,13 +1156,6 @@ BPF_CALL_3(bpf_timer_init, struct bpf_timer_kern *, timer, struct bpf_map *, map
+               ret = -EBUSY;
+               goto out;
+       }
+-      if (!atomic64_read(&map->usercnt)) {
+-              /* maps with timers must be either held by user space
+-               * or pinned in bpffs.
+-               */
+-              ret = -EPERM;
+-              goto out;
+-      }
+       /* allocate hrtimer via map_kmalloc to use memcg accounting */
+       t = bpf_map_kmalloc_node(map, sizeof(*t), GFP_ATOMIC, map->numa_node);
+       if (!t) {
+@@ -1175,7 +1168,21 @@ BPF_CALL_3(bpf_timer_init, struct bpf_timer_kern *, timer, struct bpf_map *, map
+       rcu_assign_pointer(t->callback_fn, NULL);
+       hrtimer_init(&t->timer, clockid, HRTIMER_MODE_REL_SOFT);
+       t->timer.function = bpf_timer_cb;
+-      timer->timer = t;
++      WRITE_ONCE(timer->timer, t);
++      /* Guarantee the order between timer->timer and map->usercnt. So
++       * when there are concurrent uref release and bpf timer init, either
++       * bpf_timer_cancel_and_free() called by uref release reads a no-NULL
++       * timer or atomic64_read() below returns a zero usercnt.
++       */
++      smp_mb();
++      if (!atomic64_read(&map->usercnt)) {
++              /* maps with timers must be either held by user space
++               * or pinned in bpffs.
++               */
++              WRITE_ONCE(timer->timer, NULL);
++              kfree(t);
++              ret = -EPERM;
++      }
+ out:
+       __bpf_spin_unlock_irqrestore(&timer->lock);
+       return ret;
+@@ -1343,7 +1350,7 @@ void bpf_timer_cancel_and_free(void *val)
+       /* The subsequent bpf_timer_start/cancel() helpers won't be able to use
+        * this timer, since it won't be initialized.
+        */
+-      timer->timer = NULL;
++      WRITE_ONCE(timer->timer, NULL);
+ out:
+       __bpf_spin_unlock_irqrestore(&timer->lock);
+       if (!t)
+-- 
+2.42.0
+
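To make the ordering argument above concrete, here is a condensed, hypothetical version of the publish-then-check pattern the fix uses (struct and function names are invented, and the timer->lock serialization of the real code is elided):

#include <linux/atomic.h>
#include <linux/compiler.h>
#include <linux/errno.h>
#include <linux/slab.h>

struct timer_slot { void *timer; };	/* invented stand-in for the timer field */

/* The release path does atomic64_dec_and_test(usercnt) (a full barrier)
 * and then reads slot->timer with READ_ONCE(); the smp_mb() below pairs
 * with that, so at least one of the two sides observes the other's update. */
static int publish_then_check(struct timer_slot *slot, atomic64_t *usercnt,
			      void *t)
{
	WRITE_ONCE(slot->timer, t);	/* publish first ... */
	smp_mb();			/* ... and order it against the read below */
	if (!atomic64_read(usercnt)) {	/* last user reference already gone */
		WRITE_ONCE(slot->timer, NULL);
		kfree(t);
		return -EPERM;
	}
	return 0;
}
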
diff --git a/queue-6.1/dccp-call-security_inet_conn_request-after-setting-i.patch b/queue-6.1/dccp-call-security_inet_conn_request-after-setting-i.patch
new file mode 100644 (file)
index 0000000..6b935ee
--- /dev/null
@@ -0,0 +1,59 @@
+From d55df215ba76d0171b2e4892577406bf603ac157 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Mon, 30 Oct 2023 13:10:41 -0700
+Subject: dccp: Call security_inet_conn_request() after setting IPv4 addresses.
+
+From: Kuniyuki Iwashima <kuniyu@amazon.com>
+
+[ Upstream commit fa2df45af13091f76b89adb84a28f13818d5d631 ]
+
+Initially, commit 4237c75c0a35 ("[MLSXFRM]: Auto-labeling of child
+sockets") introduced security_inet_conn_request() in some functions
+where reqsk is allocated.  The hook is added just after the allocation,
+so reqsk's IPv4 remote address was not initialised then.
+
+However, SELinux/Smack started to read it in netlbl_req_setattr()
+after the cited commits.
+
+This bug was partially fixed by commit 284904aa7946 ("lsm: Relocate
+the IPv4 security_inet_conn_request() hooks").
+
+This patch fixes the last bug in DCCPv4.
+
+Fixes: 389fb800ac8b ("netlabel: Label incoming TCP connections correctly in SELinux")
+Fixes: 07feee8f812f ("netlabel: Cleanup the Smack/NetLabel code to fix incoming TCP connections")
+Signed-off-by: Kuniyuki Iwashima <kuniyu@amazon.com>
+Acked-by: Paul Moore <paul@paul-moore.com>
+Signed-off-by: Paolo Abeni <pabeni@redhat.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ net/dccp/ipv4.c | 6 +++---
+ 1 file changed, 3 insertions(+), 3 deletions(-)
+
+diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c
+index 247179d4c8865..9fe6d96797169 100644
+--- a/net/dccp/ipv4.c
++++ b/net/dccp/ipv4.c
+@@ -628,9 +628,6 @@ int dccp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
+       if (dccp_parse_options(sk, dreq, skb))
+               goto drop_and_free;
+-      if (security_inet_conn_request(sk, skb, req))
+-              goto drop_and_free;
+-
+       ireq = inet_rsk(req);
+       sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr);
+       sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr);
+@@ -638,6 +635,9 @@ int dccp_v4_conn_request(struct sock *sk, struct sk_buff *skb)
+       ireq->ireq_family = AF_INET;
+       ireq->ir_iif = READ_ONCE(sk->sk_bound_dev_if);
++      if (security_inet_conn_request(sk, skb, req))
++              goto drop_and_free;
++
+       /*
+        * Step 3: Process LISTEN state
+        *
+-- 
+2.42.0
+
diff --git a/queue-6.1/dccp-tcp-call-security_inet_conn_request-after-setti.patch b/queue-6.1/dccp-tcp-call-security_inet_conn_request-after-setti.patch
new file mode 100644 (file)
index 0000000..0bc57be
--- /dev/null
@@ -0,0 +1,85 @@
+From 41c12a435b1d8d7850233fc50e0898436c63ae67 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Mon, 30 Oct 2023 13:10:42 -0700
+Subject: dccp/tcp: Call security_inet_conn_request() after setting IPv6
+ addresses.
+
+From: Kuniyuki Iwashima <kuniyu@amazon.com>
+
+[ Upstream commit 23be1e0e2a83a8543214d2599a31d9a2185a796b ]
+
+Initially, commit 4237c75c0a35 ("[MLSXFRM]: Auto-labeling of child
+sockets") introduced security_inet_conn_request() in some functions
+where reqsk is allocated.  The hook is added just after the allocation,
+so reqsk's IPv6 remote address was not initialised then.
+
+However, SELinux/Smack started to read it in netlbl_req_setattr()
+after commit e1adea927080 ("calipso: Allow request sockets to be
+relabelled by the lsm.").
+
+Commit 284904aa7946 ("lsm: Relocate the IPv4 security_inet_conn_request()
+hooks") fixed that kind of issue only in TCPv4 because IPv6 labeling was
+not supported at that time.  Finally, the same issue was introduced again
+in IPv6.
+
+Let's apply the same fix on DCCPv6 and TCPv6.
+
+Fixes: e1adea927080 ("calipso: Allow request sockets to be relabelled by the lsm.")
+Signed-off-by: Kuniyuki Iwashima <kuniyu@amazon.com>
+Acked-by: Paul Moore <paul@paul-moore.com>
+Signed-off-by: Paolo Abeni <pabeni@redhat.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ net/dccp/ipv6.c       | 6 +++---
+ net/ipv6/syncookies.c | 7 ++++---
+ 2 files changed, 7 insertions(+), 6 deletions(-)
+
+diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c
+index 6fb34eaf1237a..e0b0bf75a46c2 100644
+--- a/net/dccp/ipv6.c
++++ b/net/dccp/ipv6.c
+@@ -359,15 +359,15 @@ static int dccp_v6_conn_request(struct sock *sk, struct sk_buff *skb)
+       if (dccp_parse_options(sk, dreq, skb))
+               goto drop_and_free;
+-      if (security_inet_conn_request(sk, skb, req))
+-              goto drop_and_free;
+-
+       ireq = inet_rsk(req);
+       ireq->ir_v6_rmt_addr = ipv6_hdr(skb)->saddr;
+       ireq->ir_v6_loc_addr = ipv6_hdr(skb)->daddr;
+       ireq->ireq_family = AF_INET6;
+       ireq->ir_mark = inet_request_mark(sk, skb);
++      if (security_inet_conn_request(sk, skb, req))
++              goto drop_and_free;
++
+       if (ipv6_opt_accepted(sk, skb, IP6CB(skb)) ||
+           np->rxopt.bits.rxinfo || np->rxopt.bits.rxoinfo ||
+           np->rxopt.bits.rxhlim || np->rxopt.bits.rxohlim) {
+diff --git a/net/ipv6/syncookies.c b/net/ipv6/syncookies.c
+index 5014aa6634527..8698b49dfc8de 100644
+--- a/net/ipv6/syncookies.c
++++ b/net/ipv6/syncookies.c
+@@ -180,14 +180,15 @@ struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb)
+       treq = tcp_rsk(req);
+       treq->tfo_listener = false;
+-      if (security_inet_conn_request(sk, skb, req))
+-              goto out_free;
+-
+       req->mss = mss;
+       ireq->ir_rmt_port = th->source;
+       ireq->ir_num = ntohs(th->dest);
+       ireq->ir_v6_rmt_addr = ipv6_hdr(skb)->saddr;
+       ireq->ir_v6_loc_addr = ipv6_hdr(skb)->daddr;
++
++      if (security_inet_conn_request(sk, skb, req))
++              goto out_free;
++
+       if (ipv6_opt_accepted(sk, skb, &TCP_SKB_CB(skb)->header.h6) ||
+           np->rxopt.bits.rxinfo || np->rxopt.bits.rxoinfo ||
+           np->rxopt.bits.rxhlim || np->rxopt.bits.rxohlim) {
+-- 
+2.42.0
+
diff --git a/queue-6.1/fix-termination-state-for-idr_for_each_entry_ul.patch b/queue-6.1/fix-termination-state-for-idr_for_each_entry_ul.patch
new file mode 100644 (file)
index 0000000..b8398c8
--- /dev/null
@@ -0,0 +1,64 @@
+From 3be08cb79ea133375cf3b3ab3bd02118f119bb6f Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Tue, 24 Oct 2023 09:53:33 +1100
+Subject: Fix termination state for idr_for_each_entry_ul()
+
+From: NeilBrown <neilb@suse.de>
+
+[ Upstream commit e8ae8ad479e2d037daa33756e5e72850a7bd37a9 ]
+
+The comment for idr_for_each_entry_ul() states
+
+  after normal termination @entry is left with the value NULL
+
+This is not correct in the case where UINT_MAX has an entry in the idr.
+In that case @entry will be non-NULL after termination.
+No current code depends on the documentation being correct, but to
+save future code we should fix it.
+
+Also fix idr_for_each_entry_continue_ul().  While this is not documented
+as leaving @entry as NULL, the mellanox driver appears to depend on
+it doing so.  So make that explicit in the documentation as well as in
+the code.
+
+Fixes: e33d2b74d805 ("idr: fix overflow case for idr_for_each_entry_ul()")
+Cc: Matthew Wilcox <willy@infradead.org>
+Cc: Chris Mi <chrism@mellanox.com>
+Cc: Cong Wang <xiyou.wangcong@gmail.com>
+Signed-off-by: NeilBrown <neilb@suse.de>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ include/linux/idr.h | 6 ++++--
+ 1 file changed, 4 insertions(+), 2 deletions(-)
+
+diff --git a/include/linux/idr.h b/include/linux/idr.h
+index a0dce14090a9e..da5f5fa4a3a6a 100644
+--- a/include/linux/idr.h
++++ b/include/linux/idr.h
+@@ -200,7 +200,7 @@ static inline void idr_preload_end(void)
+  */
+ #define idr_for_each_entry_ul(idr, entry, tmp, id)                    \
+       for (tmp = 0, id = 0;                                           \
+-           tmp <= id && ((entry) = idr_get_next_ul(idr, &(id))) != NULL; \
++           ((entry) = tmp <= id ? idr_get_next_ul(idr, &(id)) : NULL) != NULL; \
+            tmp = id, ++id)
+ /**
+@@ -224,10 +224,12 @@ static inline void idr_preload_end(void)
+  * @id: Entry ID.
+  *
+  * Continue to iterate over entries, continuing after the current position.
++ * After normal termination @entry is left with the value NULL.  This
++ * is convenient for a "not found" value.
+  */
+ #define idr_for_each_entry_continue_ul(idr, entry, tmp, id)           \
+       for (tmp = id;                                                  \
+-           tmp <= id && ((entry) = idr_get_next_ul(idr, &(id))) != NULL; \
++           ((entry) = tmp <= id ? idr_get_next_ul(idr, &(id)) : NULL) != NULL; \
+            tmp = id, ++id)
+ /*
+-- 
+2.42.0
+
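A hypothetical user of the now-documented termination state ("struct item" and its "ready" field are invented): after the loop finishes without a break, @entry is NULL, which doubles as a convenient "not found" result.

#include <linux/idr.h>
#include <linux/types.h>

struct item {			/* invented payload type */
	bool ready;
};

static struct item *find_first_ready(struct idr *idr)
{
	struct item *entry;
	unsigned long tmp, id;

	idr_for_each_entry_ul(idr, entry, tmp, id) {
		if (entry->ready)
			break;		/* entry still points at the match */
	}
	return entry;			/* NULL when nothing matched */
}
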
diff --git a/queue-6.1/hsr-prevent-use-after-free-in-prp_create_tagged_fram.patch b/queue-6.1/hsr-prevent-use-after-free-in-prp_create_tagged_fram.patch
new file mode 100644 (file)
index 0000000..12aa62e
--- /dev/null
@@ -0,0 +1,42 @@
+From 9e7063b06e42ddfb71d39500f0b6e465401d09d1 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Fri, 27 Oct 2023 15:19:01 +0300
+Subject: hsr: Prevent use after free in prp_create_tagged_frame()
+
+From: Dan Carpenter <dan.carpenter@linaro.org>
+
+[ Upstream commit 876f8ab52363f649bcc74072157dfd7adfbabc0d ]
+
+The prp_fill_rct() function can fail.  In that situation, it frees the
+skb and returns NULL.  Meanwhile on the success path, it returns the
+original skb.  So it's straightforward to fix the bug by using the
+returned value.
+
+Fixes: 451d8123f897 ("net: prp: add packet handling support")
+Signed-off-by: Dan Carpenter <dan.carpenter@linaro.org>
+Acked-by: Paolo Abeni <pabeni@redhat.com>
+Link: https://lore.kernel.org/r/57af1f28-7f57-4a96-bcd3-b7a0f2340845@moroto.mountain
+Signed-off-by: Jakub Kicinski <kuba@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ net/hsr/hsr_forward.c | 4 +---
+ 1 file changed, 1 insertion(+), 3 deletions(-)
+
+diff --git a/net/hsr/hsr_forward.c b/net/hsr/hsr_forward.c
+index b71dab630a873..80cdc6f6b34c9 100644
+--- a/net/hsr/hsr_forward.c
++++ b/net/hsr/hsr_forward.c
+@@ -342,9 +342,7 @@ struct sk_buff *prp_create_tagged_frame(struct hsr_frame_info *frame,
+       skb = skb_copy_expand(frame->skb_std, 0,
+                             skb_tailroom(frame->skb_std) + HSR_HLEN,
+                             GFP_ATOMIC);
+-      prp_fill_rct(skb, frame, port);
+-
+-      return skb;
++      return prp_fill_rct(skb, frame, port);
+ }
+ static void hsr_deliver_master(struct sk_buff *skb, struct net_device *dev,
+-- 
+2.42.0
+
diff --git a/queue-6.1/i2c-iproc-handle-invalid-slave-state.patch b/queue-6.1/i2c-iproc-handle-invalid-slave-state.patch
new file mode 100644 (file)
index 0000000..d34e57a
--- /dev/null
@@ -0,0 +1,200 @@
+From ea9042fe39247c12add88da66f6ccda2b3b6f98f Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Thu, 24 Aug 2023 14:23:51 -0700
+Subject: i2c: iproc: handle invalid slave state
+
+From: Roman Bacik <roman.bacik@broadcom.com>
+
+[ Upstream commit ba15a14399c262f91ce30c19fcbdc952262dd1be ]
+
+Add the code to handle an invalid state when both bits S_RX_EVENT
+(indicating a transaction) and S_START_BUSY (indicating the end
+of transaction - transition of START_BUSY from 1 to 0) are set in
+the interrupt status register during a slave read.
+
+Signed-off-by: Roman Bacik <roman.bacik@broadcom.com>
+Fixes: 1ca1b4516088 ("i2c: iproc: handle Master aborted error")
+Acked-by: Ray Jui <ray.jui@broadcom.com>
+Signed-off-by: Wolfram Sang <wsa@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/i2c/busses/i2c-bcm-iproc.c | 133 ++++++++++++++++-------------
+ 1 file changed, 75 insertions(+), 58 deletions(-)
+
+diff --git a/drivers/i2c/busses/i2c-bcm-iproc.c b/drivers/i2c/busses/i2c-bcm-iproc.c
+index 30a2a3200bed9..86a080f24d8a2 100644
+--- a/drivers/i2c/busses/i2c-bcm-iproc.c
++++ b/drivers/i2c/busses/i2c-bcm-iproc.c
+@@ -316,26 +316,44 @@ static void bcm_iproc_i2c_slave_init(
+       iproc_i2c_wr_reg(iproc_i2c, IE_OFFSET, val);
+ }
+-static void bcm_iproc_i2c_check_slave_status(
+-      struct bcm_iproc_i2c_dev *iproc_i2c)
++static bool bcm_iproc_i2c_check_slave_status
++      (struct bcm_iproc_i2c_dev *iproc_i2c, u32 status)
+ {
+       u32 val;
++      bool recover = false;
+-      val = iproc_i2c_rd_reg(iproc_i2c, S_CMD_OFFSET);
+-      /* status is valid only when START_BUSY is cleared after it was set */
+-      if (val & BIT(S_CMD_START_BUSY_SHIFT))
+-              return;
++      /* check slave transmit status only if slave is transmitting */
++      if (!iproc_i2c->slave_rx_only) {
++              val = iproc_i2c_rd_reg(iproc_i2c, S_CMD_OFFSET);
++              /* status is valid only when START_BUSY is cleared */
++              if (!(val & BIT(S_CMD_START_BUSY_SHIFT))) {
++                      val = (val >> S_CMD_STATUS_SHIFT) & S_CMD_STATUS_MASK;
++                      if (val == S_CMD_STATUS_TIMEOUT ||
++                          val == S_CMD_STATUS_MASTER_ABORT) {
++                              dev_warn(iproc_i2c->device,
++                                       (val == S_CMD_STATUS_TIMEOUT) ?
++                                       "slave random stretch time timeout\n" :
++                                       "Master aborted read transaction\n");
++                              recover = true;
++                      }
++              }
++      }
++
++      /* RX_EVENT is not valid when START_BUSY is set */
++      if ((status & BIT(IS_S_RX_EVENT_SHIFT)) &&
++          (status & BIT(IS_S_START_BUSY_SHIFT))) {
++              dev_warn(iproc_i2c->device, "Slave aborted read transaction\n");
++              recover = true;
++      }
+-      val = (val >> S_CMD_STATUS_SHIFT) & S_CMD_STATUS_MASK;
+-      if (val == S_CMD_STATUS_TIMEOUT || val == S_CMD_STATUS_MASTER_ABORT) {
+-              dev_err(iproc_i2c->device, (val == S_CMD_STATUS_TIMEOUT) ?
+-                      "slave random stretch time timeout\n" :
+-                      "Master aborted read transaction\n");
++      if (recover) {
+               /* re-initialize i2c for recovery */
+               bcm_iproc_i2c_enable_disable(iproc_i2c, false);
+               bcm_iproc_i2c_slave_init(iproc_i2c, true);
+               bcm_iproc_i2c_enable_disable(iproc_i2c, true);
+       }
++
++      return recover;
+ }
+ static void bcm_iproc_i2c_slave_read(struct bcm_iproc_i2c_dev *iproc_i2c)
+@@ -420,48 +438,6 @@ static bool bcm_iproc_i2c_slave_isr(struct bcm_iproc_i2c_dev *iproc_i2c,
+       u32 val;
+       u8 value;
+-      /*
+-       * Slave events in case of master-write, master-write-read and,
+-       * master-read
+-       *
+-       * Master-write     : only IS_S_RX_EVENT_SHIFT event
+-       * Master-write-read: both IS_S_RX_EVENT_SHIFT and IS_S_RD_EVENT_SHIFT
+-       *                    events
+-       * Master-read      : both IS_S_RX_EVENT_SHIFT and IS_S_RD_EVENT_SHIFT
+-       *                    events or only IS_S_RD_EVENT_SHIFT
+-       *
+-       * iproc has a slave rx fifo size of 64 bytes. Rx fifo full interrupt
+-       * (IS_S_RX_FIFO_FULL_SHIFT) will be generated when RX fifo becomes
+-       * full. This can happen if Master issues write requests of more than
+-       * 64 bytes.
+-       */
+-      if (status & BIT(IS_S_RX_EVENT_SHIFT) ||
+-          status & BIT(IS_S_RD_EVENT_SHIFT) ||
+-          status & BIT(IS_S_RX_FIFO_FULL_SHIFT)) {
+-              /* disable slave interrupts */
+-              val = iproc_i2c_rd_reg(iproc_i2c, IE_OFFSET);
+-              val &= ~iproc_i2c->slave_int_mask;
+-              iproc_i2c_wr_reg(iproc_i2c, IE_OFFSET, val);
+-
+-              if (status & BIT(IS_S_RD_EVENT_SHIFT))
+-                      /* Master-write-read request */
+-                      iproc_i2c->slave_rx_only = false;
+-              else
+-                      /* Master-write request only */
+-                      iproc_i2c->slave_rx_only = true;
+-
+-              /* schedule tasklet to read data later */
+-              tasklet_schedule(&iproc_i2c->slave_rx_tasklet);
+-
+-              /*
+-               * clear only IS_S_RX_EVENT_SHIFT and
+-               * IS_S_RX_FIFO_FULL_SHIFT interrupt.
+-               */
+-              val = BIT(IS_S_RX_EVENT_SHIFT);
+-              if (status & BIT(IS_S_RX_FIFO_FULL_SHIFT))
+-                      val |= BIT(IS_S_RX_FIFO_FULL_SHIFT);
+-              iproc_i2c_wr_reg(iproc_i2c, IS_OFFSET, val);
+-      }
+       if (status & BIT(IS_S_TX_UNDERRUN_SHIFT)) {
+               iproc_i2c->tx_underrun++;
+@@ -493,8 +469,9 @@ static bool bcm_iproc_i2c_slave_isr(struct bcm_iproc_i2c_dev *iproc_i2c,
+                * less than PKT_LENGTH bytes were output on the SMBUS
+                */
+               iproc_i2c->slave_int_mask &= ~BIT(IE_S_TX_UNDERRUN_SHIFT);
+-              iproc_i2c_wr_reg(iproc_i2c, IE_OFFSET,
+-                               iproc_i2c->slave_int_mask);
++              val = iproc_i2c_rd_reg(iproc_i2c, IE_OFFSET);
++              val &= ~BIT(IE_S_TX_UNDERRUN_SHIFT);
++              iproc_i2c_wr_reg(iproc_i2c, IE_OFFSET, val);
+               /* End of SMBUS for Master Read */
+               val = BIT(S_TX_WR_STATUS_SHIFT);
+@@ -515,9 +492,49 @@ static bool bcm_iproc_i2c_slave_isr(struct bcm_iproc_i2c_dev *iproc_i2c,
+                                BIT(IS_S_START_BUSY_SHIFT));
+       }
+-      /* check slave transmit status only if slave is transmitting */
+-      if (!iproc_i2c->slave_rx_only)
+-              bcm_iproc_i2c_check_slave_status(iproc_i2c);
++      /* if the controller has been reset, immediately return from the ISR */
++      if (bcm_iproc_i2c_check_slave_status(iproc_i2c, status))
++              return true;
++
++      /*
++       * Slave events in case of master-write, master-write-read and,
++       * master-read
++       *
++       * Master-write     : only IS_S_RX_EVENT_SHIFT event
++       * Master-write-read: both IS_S_RX_EVENT_SHIFT and IS_S_RD_EVENT_SHIFT
++       *                    events
++       * Master-read      : both IS_S_RX_EVENT_SHIFT and IS_S_RD_EVENT_SHIFT
++       *                    events or only IS_S_RD_EVENT_SHIFT
++       *
++       * iproc has a slave rx fifo size of 64 bytes. Rx fifo full interrupt
++       * (IS_S_RX_FIFO_FULL_SHIFT) will be generated when RX fifo becomes
++       * full. This can happen if Master issues write requests of more than
++       * 64 bytes.
++       */
++      if (status & BIT(IS_S_RX_EVENT_SHIFT) ||
++          status & BIT(IS_S_RD_EVENT_SHIFT) ||
++          status & BIT(IS_S_RX_FIFO_FULL_SHIFT)) {
++              /* disable slave interrupts */
++              val = iproc_i2c_rd_reg(iproc_i2c, IE_OFFSET);
++              val &= ~iproc_i2c->slave_int_mask;
++              iproc_i2c_wr_reg(iproc_i2c, IE_OFFSET, val);
++
++              if (status & BIT(IS_S_RD_EVENT_SHIFT))
++                      /* Master-write-read request */
++                      iproc_i2c->slave_rx_only = false;
++              else
++                      /* Master-write request only */
++                      iproc_i2c->slave_rx_only = true;
++
++              /* schedule tasklet to read data later */
++              tasklet_schedule(&iproc_i2c->slave_rx_tasklet);
++
++              /* clear IS_S_RX_FIFO_FULL_SHIFT interrupt */
++              if (status & BIT(IS_S_RX_FIFO_FULL_SHIFT)) {
++                      val = BIT(IS_S_RX_FIFO_FULL_SHIFT);
++                      iproc_i2c_wr_reg(iproc_i2c, IS_OFFSET, val);
++              }
++      }
+       return true;
+ }
+-- 
+2.42.0
+
diff --git a/queue-6.1/inet-shrink-struct-flowi_common.patch b/queue-6.1/inet-shrink-struct-flowi_common.patch
new file mode 100644 (file)
index 0000000..ca831d2
--- /dev/null
@@ -0,0 +1,44 @@
+From 54f549733fa56fa6f5de1e4198c516777a13b2da Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Wed, 25 Oct 2023 14:10:37 +0000
+Subject: inet: shrink struct flowi_common
+
+From: Eric Dumazet <edumazet@google.com>
+
+[ Upstream commit 1726483b79a72e0150734d5367e4a0238bf8fcff ]
+
+I am looking at syzbot reports triggering kernel stack overflows
+involving a cascade of ipvlan devices.
+
+We can save 8 bytes in struct flowi_common.
+
+This patch alone will not fix the issue, but it is a start.
+
+Fixes: 24ba14406c5c ("route: Add multipath_hash in flowi_common to make user-define hash")
+Signed-off-by: Eric Dumazet <edumazet@google.com>
+Cc: wenxu <wenxu@ucloud.cn>
+Reviewed-by: David Ahern <dsahern@kernel.org>
+Link: https://lore.kernel.org/r/20231025141037.3448203-1-edumazet@google.com
+Signed-off-by: Paolo Abeni <pabeni@redhat.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ include/net/flow.h | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+diff --git a/include/net/flow.h b/include/net/flow.h
+index 2f0da4f0318b5..079cc493fe67d 100644
+--- a/include/net/flow.h
++++ b/include/net/flow.h
+@@ -39,8 +39,8 @@ struct flowi_common {
+ #define FLOWI_FLAG_KNOWN_NH           0x02
+       __u32   flowic_secid;
+       kuid_t  flowic_uid;
+-      struct flowi_tunnel flowic_tun_key;
+       __u32           flowic_multipath_hash;
++      struct flowi_tunnel flowic_tun_key;
+ };
+ union flowi_uli {
+-- 
+2.42.0
+
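The 8 bytes come purely from alignment padding: struct flowi_tunnel starts with a __be64, so it must sit on an 8-byte boundary, and a lone 32-bit member placed after it costs a hole before the tunnel key plus tail padding after the hash. Moving the hash up fills the existing hole. A simplified, hypothetical illustration (field names invented, 64-bit build assumed):

#include <linux/types.h>

struct layout_before {
	u32	uid;		/* bytes 0-3 */
				/* bytes 4-7: hole so the u64 below is aligned */
	u64	tun_key;	/* bytes 8-15 */
	u32	hash;		/* bytes 16-19 */
				/* bytes 20-23: tail padding to an 8-byte multiple */
};				/* sizeof() == 24 */

struct layout_after {
	u32	uid;		/* bytes 0-3 */
	u32	hash;		/* bytes 4-7, fills the former hole */
	u64	tun_key;	/* bytes 8-15 */
};				/* sizeof() == 16 */
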
diff --git a/queue-6.1/input-synaptics-rmi4-fix-use-after-free-in-rmi_unreg.patch b/queue-6.1/input-synaptics-rmi4-fix-use-after-free-in-rmi_unreg.patch
new file mode 100644 (file)
index 0000000..77c203d
--- /dev/null
@@ -0,0 +1,43 @@
+From 52f66df7f9d6a80f301b583c80168ff716396f9b Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Sun, 29 Oct 2023 02:53:36 +0000
+Subject: Input: synaptics-rmi4 - fix use after free in
+ rmi_unregister_function()
+
+From: Dan Carpenter <dan.carpenter@linaro.org>
+
+[ Upstream commit eb988e46da2e4eae89f5337e047ce372fe33d5b1 ]
+
+The put_device() calls rmi_release_function() which frees "fn" so the
+dereference on the next line "fn->num_of_irqs" is a use after free.
+Move the put_device() to the end to fix this.
+
+Fixes: 24d28e4f1271 ("Input: synaptics-rmi4 - convert irq distribution to irq_domain")
+Signed-off-by: Dan Carpenter <dan.carpenter@linaro.org>
+Link: https://lore.kernel.org/r/706efd36-7561-42f3-adfa-dd1d0bd4f5a1@moroto.mountain
+Signed-off-by: Dmitry Torokhov <dmitry.torokhov@gmail.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/input/rmi4/rmi_bus.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+diff --git a/drivers/input/rmi4/rmi_bus.c b/drivers/input/rmi4/rmi_bus.c
+index 50a0134b6901b..e6557d5f50ce5 100644
+--- a/drivers/input/rmi4/rmi_bus.c
++++ b/drivers/input/rmi4/rmi_bus.c
+@@ -277,11 +277,11 @@ void rmi_unregister_function(struct rmi_function *fn)
+       device_del(&fn->dev);
+       of_node_put(fn->dev.of_node);
+-      put_device(&fn->dev);
+       for (i = 0; i < fn->num_of_irqs; i++)
+               irq_dispose_mapping(fn->irq[i]);
++      put_device(&fn->dev);
+ }
+ /**
+-- 
+2.42.0
+
diff --git a/queue-6.1/llc-verify-mac-len-before-reading-mac-header.patch b/queue-6.1/llc-verify-mac-len-before-reading-mac-header.patch
new file mode 100644 (file)
index 0000000..aef9eb4
--- /dev/null
@@ -0,0 +1,113 @@
+From e59a5f2fb7ab319630f42646ebae0d856244f914 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Wed, 25 Oct 2023 19:42:38 -0400
+Subject: llc: verify mac len before reading mac header
+
+From: Willem de Bruijn <willemb@google.com>
+
+[ Upstream commit 7b3ba18703a63f6fd487183b9262b08e5632da1b ]
+
+LLC reads the mac header with eth_hdr without verifying that the skb
+has an Ethernet header.
+
+Syzbot was able to enter llc_rcv on a tun device. Tun can insert
+packets without mac len and with user configurable skb->protocol
+(passing a tun_pi header when not configuring IFF_NO_PI).
+
+    BUG: KMSAN: uninit-value in llc_station_ac_send_test_r net/llc/llc_station.c:81 [inline]
+    BUG: KMSAN: uninit-value in llc_station_rcv+0x6fb/0x1290 net/llc/llc_station.c:111
+    llc_station_ac_send_test_r net/llc/llc_station.c:81 [inline]
+    llc_station_rcv+0x6fb/0x1290 net/llc/llc_station.c:111
+    llc_rcv+0xc5d/0x14a0 net/llc/llc_input.c:218
+    __netif_receive_skb_one_core net/core/dev.c:5523 [inline]
+    __netif_receive_skb+0x1a6/0x5a0 net/core/dev.c:5637
+    netif_receive_skb_internal net/core/dev.c:5723 [inline]
+    netif_receive_skb+0x58/0x660 net/core/dev.c:5782
+    tun_rx_batched+0x3ee/0x980 drivers/net/tun.c:1555
+    tun_get_user+0x54c5/0x69c0 drivers/net/tun.c:2002
+
+Add a mac_len test before all three eth_hdr(skb) calls under net/llc.
+
+There are further uses in include/net/llc_pdu.h. All these are
+protected by a test skb->protocol == ETH_P_802_2. Which does not
+protect against this tun scenario.
+
+But the mac_len test added in this patch in llc_fixup_skb will
+indirectly protect those too. That is called from llc_rcv before any
+other LLC code.
+
+It is tempting to just add a blanket mac_len check in llc_rcv, but it
+is not clear whether that could break valid LLC paths that do not
+assume an Ethernet header. 802.2 LLC may be used on top of non-802.3
+protocols in principle. The commit referenced below shows that it used
+to be, on top of Token Ring.
+
+At least one of the three eth_hdr uses goes back to before the start
+of git history. But the one that syzbot exercises is introduced in
+this commit. That commit is old enough (2008) that effectively all
+stable kernels should receive this.
+
+Fixes: f83f1768f833 ("[LLC]: skb allocation size for responses")
+Reported-by: syzbot+a8c7be6dee0de1b669cc@syzkaller.appspotmail.com
+Signed-off-by: Willem de Bruijn <willemb@google.com>
+Link: https://lore.kernel.org/r/20231025234251.3796495-1-willemdebruijn.kernel@gmail.com
+Signed-off-by: Jakub Kicinski <kuba@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ net/llc/llc_input.c   | 10 ++++++++--
+ net/llc/llc_s_ac.c    |  3 +++
+ net/llc/llc_station.c |  3 +++
+ 3 files changed, 14 insertions(+), 2 deletions(-)
+
+diff --git a/net/llc/llc_input.c b/net/llc/llc_input.c
+index 7cac441862e21..51bccfb00a9cd 100644
+--- a/net/llc/llc_input.c
++++ b/net/llc/llc_input.c
+@@ -127,8 +127,14 @@ static inline int llc_fixup_skb(struct sk_buff *skb)
+       skb->transport_header += llc_len;
+       skb_pull(skb, llc_len);
+       if (skb->protocol == htons(ETH_P_802_2)) {
+-              __be16 pdulen = eth_hdr(skb)->h_proto;
+-              s32 data_size = ntohs(pdulen) - llc_len;
++              __be16 pdulen;
++              s32 data_size;
++
++              if (skb->mac_len < ETH_HLEN)
++                      return 0;
++
++              pdulen = eth_hdr(skb)->h_proto;
++              data_size = ntohs(pdulen) - llc_len;
+               if (data_size < 0 ||
+                   !pskb_may_pull(skb, data_size))
+diff --git a/net/llc/llc_s_ac.c b/net/llc/llc_s_ac.c
+index 79d1cef8f15a9..06fb8e6944b06 100644
+--- a/net/llc/llc_s_ac.c
++++ b/net/llc/llc_s_ac.c
+@@ -153,6 +153,9 @@ int llc_sap_action_send_test_r(struct llc_sap *sap, struct sk_buff *skb)
+       int rc = 1;
+       u32 data_size;
++      if (skb->mac_len < ETH_HLEN)
++              return 1;
++
+       llc_pdu_decode_sa(skb, mac_da);
+       llc_pdu_decode_da(skb, mac_sa);
+       llc_pdu_decode_ssap(skb, &dsap);
+diff --git a/net/llc/llc_station.c b/net/llc/llc_station.c
+index 05c6ae0920534..f506542925109 100644
+--- a/net/llc/llc_station.c
++++ b/net/llc/llc_station.c
+@@ -76,6 +76,9 @@ static int llc_station_ac_send_test_r(struct sk_buff *skb)
+       u32 data_size;
+       struct sk_buff *nskb;
++      if (skb->mac_len < ETH_HLEN)
++              goto out;
++
+       /* The test request command is type U (llc_len = 3) */
+       data_size = ntohs(eth_hdr(skb)->h_proto) - 3;
+       nskb = llc_alloc_frame(NULL, skb->dev, LLC_PDU_TYPE_U, data_size);
+-- 
+2.42.0
+
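The core of the fix is the guard itself: only trust eth_hdr() once the skb is known to carry a complete Ethernet header. A hypothetical helper showing the check (not the exact code from the patch):

#include <linux/errno.h>
#include <linux/etherdevice.h>
#include <linux/skbuff.h>

/* skb->protocol alone can be set arbitrarily by a tun user, so verify
 * that a full MAC header was actually recorded before reading it. */
static int llc_pdulen_sketch(const struct sk_buff *skb, u32 *pdulen)
{
	if (skb->mac_len < ETH_HLEN)	/* no complete MAC header present */
		return -EINVAL;

	*pdulen = ntohs(eth_hdr(skb)->h_proto);
	return 0;
}
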
diff --git a/queue-6.1/nbd-fix-uaf-in-nbd_open.patch b/queue-6.1/nbd-fix-uaf-in-nbd_open.patch
new file mode 100644 (file)
index 0000000..981ba86
--- /dev/null
@@ -0,0 +1,73 @@
+From 77347a8505e3a84797ef1b8474922e2feefe48bf Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Tue, 7 Nov 2023 18:34:35 +0800
+Subject: nbd: fix uaf in nbd_open
+
+From: Li Lingfeng <lilingfeng3@huawei.com>
+
+[ Upstream commit 327462725b0f759f093788dfbcb2f1fd132f956b ]
+
+Commit 4af5f2e03013 ("nbd: use blk_mq_alloc_disk and
+blk_cleanup_disk") cleans up disk by blk_cleanup_disk() and it won't set
+disk->private_data as NULL as before. UAF may be triggered in nbd_open()
+if someone tries to open nbd device right after nbd_put() since nbd has
+been free in nbd_dev_remove().
+
+Fix this by implementing ->free_disk and free private data in it.
+
+Fixes: 4af5f2e03013 ("nbd: use blk_mq_alloc_disk and blk_cleanup_disk")
+Signed-off-by: Li Lingfeng <lilingfeng3@huawei.com>
+Reviewed-by: Josef Bacik <josef@toxicpanda.com>
+Link: https://lore.kernel.org/r/20231107103435.2074904-1-lilingfeng@huaweicloud.com
+Signed-off-by: Jens Axboe <axboe@kernel.dk>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/block/nbd.c | 11 +++++++++--
+ 1 file changed, 9 insertions(+), 2 deletions(-)
+
+diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c
+index 7718c81e1dba8..e94d2ff6b1223 100644
+--- a/drivers/block/nbd.c
++++ b/drivers/block/nbd.c
+@@ -250,7 +250,6 @@ static void nbd_dev_remove(struct nbd_device *nbd)
+       struct gendisk *disk = nbd->disk;
+       del_gendisk(disk);
+-      put_disk(disk);
+       blk_mq_free_tag_set(&nbd->tag_set);
+       /*
+@@ -261,7 +260,7 @@ static void nbd_dev_remove(struct nbd_device *nbd)
+       idr_remove(&nbd_index_idr, nbd->index);
+       mutex_unlock(&nbd_index_mutex);
+       destroy_workqueue(nbd->recv_workq);
+-      kfree(nbd);
++      put_disk(disk);
+ }
+ static void nbd_dev_remove_work(struct work_struct *work)
+@@ -1608,6 +1607,13 @@ static void nbd_release(struct gendisk *disk, fmode_t mode)
+       nbd_put(nbd);
+ }
++static void nbd_free_disk(struct gendisk *disk)
++{
++      struct nbd_device *nbd = disk->private_data;
++
++      kfree(nbd);
++}
++
+ static const struct block_device_operations nbd_fops =
+ {
+       .owner =        THIS_MODULE,
+@@ -1615,6 +1621,7 @@ static const struct block_device_operations nbd_fops =
+       .release =      nbd_release,
+       .ioctl =        nbd_ioctl,
+       .compat_ioctl = nbd_ioctl,
++      .free_disk =    nbd_free_disk,
+ };
+ #if IS_ENABLED(CONFIG_DEBUG_FS)
+-- 
+2.42.0
+
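The fix relies on the ->free_disk() hook of struct block_device_operations, which the block layer calls only when the last reference to the gendisk is dropped; an opener still holding the disk therefore keeps the private data alive. A hypothetical skeleton of that pattern (names invented, not the nbd code):

#include <linux/blkdev.h>
#include <linux/module.h>
#include <linux/slab.h>

static void sketch_free_disk(struct gendisk *disk)
{
	kfree(disk->private_data);	/* safe: no opener can still reach it */
}

static const struct block_device_operations sketch_fops = {
	.owner		= THIS_MODULE,
	.free_disk	= sketch_free_disk,
};
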
diff --git a/queue-6.1/net-page_pool-add-missing-free_percpu-when-page_pool.patch b/queue-6.1/net-page_pool-add-missing-free_percpu-when-page_pool.patch
new file mode 100644 (file)
index 0000000..e9fd736
--- /dev/null
@@ -0,0 +1,48 @@
+From f6f08cbd9ad20a06e17adec789b23e8478d73984 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Mon, 30 Oct 2023 17:12:56 +0800
+Subject: net: page_pool: add missing free_percpu when page_pool_init fail
+
+From: Jian Shen <shenjian15@huawei.com>
+
+[ Upstream commit 8ffbd1669ed1d58939d6e878dffaa2f60bf961a4 ]
+
+When ptr_ring_init() returns failure in page_pool_init(), free_percpu()
+is not called to free pool->recycle_stats, which may cause a memory
+leak.
+
+Fixes: ad6fa1e1ab1b ("page_pool: Add recycle stats")
+Signed-off-by: Jian Shen <shenjian15@huawei.com>
+Signed-off-by: Jijie Shao <shaojijie@huawei.com>
+Reviewed-by: Yunsheng Lin <linyunsheng@huawei.com>
+Reviewed-by: Jiri Pirko <jiri@nvidia.com>
+Reviewed-by: Somnath Kotur <somnath.kotur@broadcom.com>
+Reviewed-by: Ilias Apalodimas <ilias.apalodimas@linaro.org>
+Link: https://lore.kernel.org/r/20231030091256.2915394-1-shaojijie@huawei.com
+Signed-off-by: Paolo Abeni <pabeni@redhat.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ net/core/page_pool.c | 6 +++++-
+ 1 file changed, 5 insertions(+), 1 deletion(-)
+
+diff --git a/net/core/page_pool.c b/net/core/page_pool.c
+index 2396c99bedeaa..caf6d950d54ad 100644
+--- a/net/core/page_pool.c
++++ b/net/core/page_pool.c
+@@ -209,8 +209,12 @@ static int page_pool_init(struct page_pool *pool,
+               return -ENOMEM;
+ #endif
+-      if (ptr_ring_init(&pool->ring, ring_qsize, GFP_KERNEL) < 0)
++      if (ptr_ring_init(&pool->ring, ring_qsize, GFP_KERNEL) < 0) {
++#ifdef CONFIG_PAGE_POOL_STATS
++              free_percpu(pool->recycle_stats);
++#endif
+               return -ENOMEM;
++      }
+       atomic_set(&pool->pages_state_release_cnt, 0);
+-- 
+2.42.0
+
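The general rule the fix restores: every resource allocated before a failing step must be released on that step's error path, in reverse order of allocation. A hypothetical, simplified init sequence (the pool and stats types are invented):

#include <linux/errno.h>
#include <linux/gfp.h>
#include <linux/percpu.h>
#include <linux/ptr_ring.h>

struct pool_stats_sketch { u64 recycled; };

struct pool_sketch {
	struct ptr_ring ring;
	struct pool_stats_sketch __percpu *stats;
};

static int pool_init_sketch(struct pool_sketch *pool, int ring_qsize)
{
	pool->stats = alloc_percpu(struct pool_stats_sketch);
	if (!pool->stats)
		return -ENOMEM;

	if (ptr_ring_init(&pool->ring, ring_qsize, GFP_KERNEL) < 0) {
		free_percpu(pool->stats);	/* undo the earlier allocation */
		return -ENOMEM;
	}
	return 0;
}
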
diff --git a/queue-6.1/net-r8169-disable-multicast-filter-for-rtl8168h-and-.patch b/queue-6.1/net-r8169-disable-multicast-filter-for-rtl8168h-and-.patch
new file mode 100644 (file)
index 0000000..488cf72
--- /dev/null
@@ -0,0 +1,43 @@
+From 86470605f1916f8312661e33a42580217c7c6cb2 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Mon, 30 Oct 2023 16:50:14 -0400
+Subject: net: r8169: Disable multicast filter for RTL8168H and RTL8107E
+
+From: Patrick Thompson <ptf@google.com>
+
+[ Upstream commit efa5f1311c4998e9e6317c52bc5ee93b3a0f36df ]
+
+RTL8168H and RTL8107E ethernet adapters erroneously filter unicast
+eapol packets unless allmulti is enabled. These devices correspond to
+RTL_GIGA_MAC_VER_46 and VER_48. Add an exception for VER_46 and VER_48
+in the same way that VER_35 has an exception.
+
+Fixes: 6e1d0b898818 ("r8169:add support for RTL8168H and RTL8107E")
+Signed-off-by: Patrick Thompson <ptf@google.com>
+Reviewed-by: Jacob Keller <jacob.e.keller@intel.com>
+Reviewed-by: Heiner Kallweit <hkallweit1@gmail.com>
+Link: https://lore.kernel.org/r/20231030205031.177855-1-ptf@google.com
+Signed-off-by: Paolo Abeni <pabeni@redhat.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/net/ethernet/realtek/r8169_main.c | 4 +++-
+ 1 file changed, 3 insertions(+), 1 deletion(-)
+
+diff --git a/drivers/net/ethernet/realtek/r8169_main.c b/drivers/net/ethernet/realtek/r8169_main.c
+index 94f902d8e975f..c56d3538889b6 100644
+--- a/drivers/net/ethernet/realtek/r8169_main.c
++++ b/drivers/net/ethernet/realtek/r8169_main.c
+@@ -2514,7 +2514,9 @@ static void rtl_set_rx_mode(struct net_device *dev)
+               rx_mode |= AcceptAllPhys;
+       } else if (netdev_mc_count(dev) > MC_FILTER_LIMIT ||
+                  dev->flags & IFF_ALLMULTI ||
+-                 tp->mac_version == RTL_GIGA_MAC_VER_35) {
++                 tp->mac_version == RTL_GIGA_MAC_VER_35 ||
++                 tp->mac_version == RTL_GIGA_MAC_VER_46 ||
++                 tp->mac_version == RTL_GIGA_MAC_VER_48) {
+               /* accept all multicasts */
+       } else if (netdev_mc_empty(dev)) {
+               rx_mode &= ~AcceptMulticast;
+-- 
+2.42.0
+
diff --git a/queue-6.1/net-smc-allow-cdc-msg-send-rather-than-drop-it-with-.patch b/queue-6.1/net-smc-allow-cdc-msg-send-rather-than-drop-it-with-.patch
new file mode 100644 (file)
index 0000000..fcd3c59
--- /dev/null
@@ -0,0 +1,64 @@
+From 7853abecf41949469a9d1cbeff72cfbb11a80a67 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Fri, 3 Nov 2023 14:07:39 +0800
+Subject: net/smc: allow cdc msg send rather than drop it with NULL sndbuf_desc
+
+From: D. Wythe <alibuda@linux.alibaba.com>
+
+[ Upstream commit c5bf605ba4f9d6fbbb120595ab95002f4716edcb ]
+
+This patch re-fixes the issue mentioned by commit 22a825c541d7
+("net/smc: fix NULL sndbuf_desc in smc_cdc_tx_handler()").
+
+Blocking the message send does solve the issue, but it also prevents
+the peer from receiving the final message. Besides, logically, whether
+sndbuf_desc is NULL or not has no impact on the processing of CDC
+message sending.
+
+Hence, this patch allows the CDC message to be sent but checks
+sndbuf_desc with care in smc_cdc_tx_handler().
+
+Fixes: 22a825c541d7 ("net/smc: fix NULL sndbuf_desc in smc_cdc_tx_handler()")
+Signed-off-by: D. Wythe <alibuda@linux.alibaba.com>
+Reviewed-by: Dust Li <dust.li@linux.alibaba.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ net/smc/smc_cdc.c | 9 ++++-----
+ 1 file changed, 4 insertions(+), 5 deletions(-)
+
+diff --git a/net/smc/smc_cdc.c b/net/smc/smc_cdc.c
+index 01bdb7909a14b..3c06625ceb200 100644
+--- a/net/smc/smc_cdc.c
++++ b/net/smc/smc_cdc.c
+@@ -28,13 +28,15 @@ static void smc_cdc_tx_handler(struct smc_wr_tx_pend_priv *pnd_snd,
+ {
+       struct smc_cdc_tx_pend *cdcpend = (struct smc_cdc_tx_pend *)pnd_snd;
+       struct smc_connection *conn = cdcpend->conn;
++      struct smc_buf_desc *sndbuf_desc;
+       struct smc_sock *smc;
+       int diff;
++      sndbuf_desc = conn->sndbuf_desc;
+       smc = container_of(conn, struct smc_sock, conn);
+       bh_lock_sock(&smc->sk);
+-      if (!wc_status) {
+-              diff = smc_curs_diff(cdcpend->conn->sndbuf_desc->len,
++      if (!wc_status && sndbuf_desc) {
++              diff = smc_curs_diff(sndbuf_desc->len,
+                                    &cdcpend->conn->tx_curs_fin,
+                                    &cdcpend->cursor);
+               /* sndbuf_space is decreased in smc_sendmsg */
+@@ -114,9 +116,6 @@ int smc_cdc_msg_send(struct smc_connection *conn,
+       union smc_host_cursor cfed;
+       int rc;
+-      if (unlikely(!READ_ONCE(conn->sndbuf_desc)))
+-              return -ENOBUFS;
+-
+       smc_cdc_add_pending_send(conn, pend);
+       conn->tx_cdc_seq++;
+-- 
+2.42.0
+
diff --git a/queue-6.1/net-smc-fix-dangling-sock-under-state-smc_appfinclos.patch b/queue-6.1/net-smc-fix-dangling-sock-under-state-smc_appfinclos.patch
new file mode 100644 (file)
index 0000000..fdb706b
--- /dev/null
@@ -0,0 +1,111 @@
+From 5f8336187cd90bc38a9010e71ebfce68e99810aa Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Fri, 3 Nov 2023 14:07:38 +0800
+Subject: net/smc: fix dangling sock under state SMC_APPFINCLOSEWAIT
+
+From: D. Wythe <alibuda@linux.alibaba.com>
+
+[ Upstream commit 5211c9729484c923f8d2e06bd29f9322cc42bb8f ]
+
+Consider the following scenario:
+
+                               smc_cdc_rx_handler
+__smc_release
+                               sock_set_flag
+smc_close_active()
+sock_set_flag
+
+__set_bit(DEAD)                        __set_bit(DONE)
+
+Due to __set_bit() not being atomic, the DEAD or DONE flag might be lost.
+If the DEAD flag is lost, the state SMC_CLOSED will never be reached
+in smc_close_passive_work:
+
+if (sock_flag(sk, SOCK_DEAD) &&
+       smc_close_sent_any_close(conn)) {
+       sk->sk_state = SMC_CLOSED;
+} else {
+       /* just shutdown, but not yet closed locally */
+       sk->sk_state = SMC_APPFINCLOSEWAIT;
+}
+
+Replacing sock_set_flag() or __set_bit() with set_bit() will fix this
+problem, since set_bit() is atomic.
+
+Fixes: b38d732477e4 ("smc: socket closing and linkgroup cleanup")
+Signed-off-by: D. Wythe <alibuda@linux.alibaba.com>
+Reviewed-by: Dust Li <dust.li@linux.alibaba.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ net/smc/af_smc.c    | 4 ++--
+ net/smc/smc.h       | 5 +++++
+ net/smc/smc_cdc.c   | 2 +-
+ net/smc/smc_close.c | 2 +-
+ 4 files changed, 9 insertions(+), 4 deletions(-)
+
+diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c
+index 4ea41d6e36969..d676119984c09 100644
+--- a/net/smc/af_smc.c
++++ b/net/smc/af_smc.c
+@@ -274,7 +274,7 @@ static int __smc_release(struct smc_sock *smc)
+       if (!smc->use_fallback) {
+               rc = smc_close_active(smc);
+-              sock_set_flag(sk, SOCK_DEAD);
++              smc_sock_set_flag(sk, SOCK_DEAD);
+               sk->sk_shutdown |= SHUTDOWN_MASK;
+       } else {
+               if (sk->sk_state != SMC_CLOSED) {
+@@ -1710,7 +1710,7 @@ static int smc_clcsock_accept(struct smc_sock *lsmc, struct smc_sock **new_smc)
+               if (new_clcsock)
+                       sock_release(new_clcsock);
+               new_sk->sk_state = SMC_CLOSED;
+-              sock_set_flag(new_sk, SOCK_DEAD);
++              smc_sock_set_flag(new_sk, SOCK_DEAD);
+               sock_put(new_sk); /* final */
+               *new_smc = NULL;
+               goto out;
+diff --git a/net/smc/smc.h b/net/smc/smc.h
+index 1d36720fc019c..bcb57e60b2155 100644
+--- a/net/smc/smc.h
++++ b/net/smc/smc.h
+@@ -377,4 +377,9 @@ int smc_nl_dump_hs_limitation(struct sk_buff *skb, struct netlink_callback *cb);
+ int smc_nl_enable_hs_limitation(struct sk_buff *skb, struct genl_info *info);
+ int smc_nl_disable_hs_limitation(struct sk_buff *skb, struct genl_info *info);
++static inline void smc_sock_set_flag(struct sock *sk, enum sock_flags flag)
++{
++      set_bit(flag, &sk->sk_flags);
++}
++
+ #endif        /* __SMC_H */
+diff --git a/net/smc/smc_cdc.c b/net/smc/smc_cdc.c
+index 89105e95b4523..01bdb7909a14b 100644
+--- a/net/smc/smc_cdc.c
++++ b/net/smc/smc_cdc.c
+@@ -385,7 +385,7 @@ static void smc_cdc_msg_recv_action(struct smc_sock *smc,
+               smc->sk.sk_shutdown |= RCV_SHUTDOWN;
+               if (smc->clcsock && smc->clcsock->sk)
+                       smc->clcsock->sk->sk_shutdown |= RCV_SHUTDOWN;
+-              sock_set_flag(&smc->sk, SOCK_DONE);
++              smc_sock_set_flag(&smc->sk, SOCK_DONE);
+               sock_hold(&smc->sk); /* sock_put in close_work */
+               if (!queue_work(smc_close_wq, &conn->close_work))
+                       sock_put(&smc->sk);
+diff --git a/net/smc/smc_close.c b/net/smc/smc_close.c
+index dbdf03e8aa5b5..449ef454b53be 100644
+--- a/net/smc/smc_close.c
++++ b/net/smc/smc_close.c
+@@ -173,7 +173,7 @@ void smc_close_active_abort(struct smc_sock *smc)
+               break;
+       }
+-      sock_set_flag(sk, SOCK_DEAD);
++      smc_sock_set_flag(sk, SOCK_DEAD);
+       sk->sk_state_change(sk);
+       if (release_clcsock) {
+-- 
+2.42.0
+
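For context on why the atomic helper matters here: __set_bit() is a plain read-modify-write of the whole flags word, so two CPUs setting different bits concurrently can each load the same old value and one of the updates is overwritten; set_bit() is an atomic read-modify-write, so both bits survive. A hypothetical illustration (bit numbers invented):

#include <linux/bitops.h>

/* Lost-update scenario with the non-atomic helper:
 *
 *   CPU0: __set_bit(0, &flags)          CPU1: __set_bit(1, &flags)
 *         old = flags                         old = flags          (same value)
 *         flags = old | BIT(0)                flags = old | BIT(1) (bit 0 lost)
 *
 * The atomic variant keeps both updates: */
static void mark_flags(unsigned long *flags)
{
	set_bit(0, flags);	/* atomic read-modify-write */
	set_bit(1, flags);	/* atomic read-modify-write */
}
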
diff --git a/queue-6.1/net-smc-put-sk-reference-if-close-work-was-canceled.patch b/queue-6.1/net-smc-put-sk-reference-if-close-work-was-canceled.patch
new file mode 100644 (file)
index 0000000..cab4fd7
--- /dev/null
@@ -0,0 +1,40 @@
+From 2daffade9ae01e94c5c2521448a0a5f202d39365 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Fri, 3 Nov 2023 14:07:40 +0800
+Subject: net/smc: put sk reference if close work was canceled
+
+From: D. Wythe <alibuda@linux.alibaba.com>
+
+[ Upstream commit aa96fbd6d78d9770323b21e2c92bd38821be8852 ]
+
+Note that we always hold a reference to sock when attempting
+to submit close_work. Therefore, if we have successfully
+canceled close_work from pending, we MUST release that reference
+to avoid potential leaks.
+
+Fixes: 42bfba9eaa33 ("net/smc: immediate termination for SMCD link groups")
+Signed-off-by: D. Wythe <alibuda@linux.alibaba.com>
+Reviewed-by: Dust Li <dust.li@linux.alibaba.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ net/smc/smc_close.c | 3 ++-
+ 1 file changed, 2 insertions(+), 1 deletion(-)
+
+diff --git a/net/smc/smc_close.c b/net/smc/smc_close.c
+index 449ef454b53be..10219f55aad14 100644
+--- a/net/smc/smc_close.c
++++ b/net/smc/smc_close.c
+@@ -116,7 +116,8 @@ static void smc_close_cancel_work(struct smc_sock *smc)
+       struct sock *sk = &smc->sk;
+       release_sock(sk);
+-      cancel_work_sync(&smc->conn.close_work);
++      if (cancel_work_sync(&smc->conn.close_work))
++              sock_put(sk);
+       cancel_delayed_work_sync(&smc->conn.tx_work);
+       lock_sock(sk);
+ }
+-- 
+2.42.0
+
diff --git a/queue-6.1/net-stmmac-xgmac-enable-support-for-multiple-flexibl.patch b/queue-6.1/net-stmmac-xgmac-enable-support-for-multiple-flexibl.patch
new file mode 100644 (file)
index 0000000..b931db7
--- /dev/null
@@ -0,0 +1,68 @@
+From e4eaab234e08ae1b7f732d7d07c38d25ed439eca Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Tue, 31 Oct 2023 10:27:29 +0800
+Subject: net: stmmac: xgmac: Enable support for multiple Flexible PPS outputs
+
+From: Furong Xu <0x1207@gmail.com>
+
+[ Upstream commit db456d90a4c1b43b6251fa4348c8adc59b583274 ]
+
+From XGMAC Core 3.20 onward, each Flexible PPS output has an individual
+PPSEN bit to select Fixed mode or Flexible mode. The PPSEN bit must be
+set, or the output stays in Fixed PPS mode by default.
+On XGMAC Core versions prior to 3.20, only PPSEN0 (bit 4) is writable.
+PPSEN{1,2,3} are read-only reserved and those outputs are already in
+Flexible mode by default, so always setting PPSEN{1,2,3} in the new code
+does not make things worse ;-)
+
+Fixes: 95eaf3cd0a90 ("net: stmmac: dwxgmac: Add Flexible PPS support")
+Reviewed-by: Serge Semin <fancer.lancer@gmail.com>
+Reviewed-by: Jacob Keller <jacob.e.keller@intel.com>
+Signed-off-by: Furong Xu <0x1207@gmail.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/net/ethernet/stmicro/stmmac/dwxgmac2.h     |  2 +-
+ .../net/ethernet/stmicro/stmmac/dwxgmac2_core.c    | 14 +++++++++++++-
+ 2 files changed, 14 insertions(+), 2 deletions(-)
+
+diff --git a/drivers/net/ethernet/stmicro/stmmac/dwxgmac2.h b/drivers/net/ethernet/stmicro/stmmac/dwxgmac2.h
+index 1913385df6856..880a75bf2eb1f 100644
+--- a/drivers/net/ethernet/stmicro/stmmac/dwxgmac2.h
++++ b/drivers/net/ethernet/stmicro/stmmac/dwxgmac2.h
+@@ -222,7 +222,7 @@
+       ((val) << XGMAC_PPS_MINIDX(x))
+ #define XGMAC_PPSCMD_START            0x2
+ #define XGMAC_PPSCMD_STOP             0x5
+-#define XGMAC_PPSEN0                  BIT(4)
++#define XGMAC_PPSENx(x)                       BIT(4 + (x) * 8)
+ #define XGMAC_PPSx_TARGET_TIME_SEC(x) (0x00000d80 + (x) * 0x10)
+ #define XGMAC_PPSx_TARGET_TIME_NSEC(x)        (0x00000d84 + (x) * 0x10)
+ #define XGMAC_TRGTBUSY0                       BIT(31)
+diff --git a/drivers/net/ethernet/stmicro/stmmac/dwxgmac2_core.c b/drivers/net/ethernet/stmicro/stmmac/dwxgmac2_core.c
+index c6c4d7948fe5f..f30e08a106cbe 100644
+--- a/drivers/net/ethernet/stmicro/stmmac/dwxgmac2_core.c
++++ b/drivers/net/ethernet/stmicro/stmmac/dwxgmac2_core.c
+@@ -1135,7 +1135,19 @@ static int dwxgmac2_flex_pps_config(void __iomem *ioaddr, int index,
+       val |= XGMAC_PPSCMDx(index, XGMAC_PPSCMD_START);
+       val |= XGMAC_TRGTMODSELx(index, XGMAC_PPSCMD_START);
+-      val |= XGMAC_PPSEN0;
++
++      /* XGMAC Core has 4 PPS outputs at most.
++       *
++       * Prior XGMAC Core 3.20, Fixed mode or Flexible mode are selectable for
++       * PPS0 only via PPSEN0. PPS{1,2,3} are in Flexible mode by default,
++       * and can not be switched to Fixed mode, since PPSEN{1,2,3} are
++       * read-only reserved to 0.
++       * But we always set PPSEN{1,2,3} do not make things worse ;-)
++       *
++       * From XGMAC Core 3.20 and later, PPSEN{0,1,2,3} are writable and must
++       * be set, or the PPS outputs stay in Fixed PPS mode by default.
++       */
++      val |= XGMAC_PPSENx(index);
+       writel(cfg->start.tv_sec, ioaddr + XGMAC_PPSx_TARGET_TIME_SEC(index));
+-- 
+2.42.0
+
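The new XGMAC_PPSENx() macro spaces each output's enable bit eight bits apart,
starting at bit 4, so index 0 reproduces the old XGMAC_PPSEN0 value. Below is a
minimal stand-alone sketch (BIT() is redefined locally so it compiles outside
the kernel tree) that prints the resulting bit positions:

#include <stdio.h>

/* Local stand-ins so this builds outside the kernel tree. */
#define BIT(n)          (1UL << (n))
#define XGMAC_PPSENx(x) BIT(4 + (x) * 8)

int main(void)
{
        /* XGMAC exposes at most four Flexible PPS outputs, index 0..3. */
        for (int x = 0; x < 4; x++)
                printf("PPSEN%d -> bit %2d, mask 0x%08lx\n",
                       x, 4 + x * 8, XGMAC_PPSENx(x));
        return 0; /* bits 4, 12, 20 and 28 */
}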
diff --git a/queue-6.1/netfilter-nat-fix-ipv6-nat-redirect-with-mapped-and-.patch b/queue-6.1/netfilter-nat-fix-ipv6-nat-redirect-with-mapped-and-.patch
new file mode 100644 (file)
index 0000000..4fb459c
--- /dev/null
@@ -0,0 +1,97 @@
+From 7f92464ecb7569ffd1203fc661d6017b7e2e85b8 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Wed, 8 Nov 2023 13:18:53 +0100
+Subject: netfilter: nat: fix ipv6 nat redirect with mapped and scoped
+ addresses
+
+From: Florian Westphal <fw@strlen.de>
+
+[ Upstream commit 80abbe8a8263106fe45a4f293b92b5c74cc9cc8a ]
+
+The ipv6 redirect target was derived from the ipv4 one, i.e. it's
+identical to a 'dnat' with the first (primary) address assigned to the
+network interface.  The code has been moved around to make it usable
+from nf_tables too, but it's still the same as it was back when this
+was added in 2012.
+
+IPv6, however, has different types of addresses; if the 'wrong' address
+comes first, the redirection does not work.
+
+In Daniel's case, the addresses are:
+  inet6 ::ffff:192 ...
+  inet6 2a01: ...
+
+... so the function attempts to redirect to the mapped address.
+
+Add more checks before the address is deemed correct:
+1. If the packet's daddr is scoped, search for a scoped address too
+2. Skip tentative addresses
+3. Skip mapped addresses
+
+Use the first address that appears to match our needs.
+
+Reported-by: Daniel Huhardeaux <tech@tootai.net>
+Closes: https://lore.kernel.org/netfilter/71be06b8-6aa0-4cf9-9e0b-e2839b01b22f@tootai.net/
+Fixes: 115e23ac78f8 ("netfilter: ip6tables: add REDIRECT target")
+Signed-off-by: Florian Westphal <fw@strlen.de>
+Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ net/netfilter/nf_nat_redirect.c | 27 ++++++++++++++++++++++++++-
+ 1 file changed, 26 insertions(+), 1 deletion(-)
+
+diff --git a/net/netfilter/nf_nat_redirect.c b/net/netfilter/nf_nat_redirect.c
+index 6616ba5d0b049..5b37487d9d11f 100644
+--- a/net/netfilter/nf_nat_redirect.c
++++ b/net/netfilter/nf_nat_redirect.c
+@@ -80,6 +80,26 @@ EXPORT_SYMBOL_GPL(nf_nat_redirect_ipv4);
+ static const struct in6_addr loopback_addr = IN6ADDR_LOOPBACK_INIT;
++static bool nf_nat_redirect_ipv6_usable(const struct inet6_ifaddr *ifa, unsigned int scope)
++{
++      unsigned int ifa_addr_type = ipv6_addr_type(&ifa->addr);
++
++      if (ifa_addr_type & IPV6_ADDR_MAPPED)
++              return false;
++
++      if ((ifa->flags & IFA_F_TENTATIVE) && (!(ifa->flags & IFA_F_OPTIMISTIC)))
++              return false;
++
++      if (scope) {
++              unsigned int ifa_scope = ifa_addr_type & IPV6_ADDR_SCOPE_MASK;
++
++              if (!(scope & ifa_scope))
++                      return false;
++      }
++
++      return true;
++}
++
+ unsigned int
+ nf_nat_redirect_ipv6(struct sk_buff *skb, const struct nf_nat_range2 *range,
+                    unsigned int hooknum)
+@@ -89,14 +109,19 @@ nf_nat_redirect_ipv6(struct sk_buff *skb, const struct nf_nat_range2 *range,
+       if (hooknum == NF_INET_LOCAL_OUT) {
+               newdst.in6 = loopback_addr;
+       } else {
++              unsigned int scope = ipv6_addr_scope(&ipv6_hdr(skb)->daddr);
+               struct inet6_dev *idev;
+-              struct inet6_ifaddr *ifa;
+               bool addr = false;
+               idev = __in6_dev_get(skb->dev);
+               if (idev != NULL) {
++                      const struct inet6_ifaddr *ifa;
++
+                       read_lock_bh(&idev->lock);
+                       list_for_each_entry(ifa, &idev->addr_list, if_list) {
++                              if (!nf_nat_redirect_ipv6_usable(ifa, scope))
++                                      continue;
++
+                               newdst.in6 = ifa->addr;
+                               addr = true;
+                               break;
+-- 
+2.42.0
+
diff --git a/queue-6.1/netfilter-nft_redir-use-struct-nf_nat_range2-through.patch b/queue-6.1/netfilter-nft_redir-use-struct-nf_nat_range2-through.patch
new file mode 100644 (file)
index 0000000..8e9c4e2
--- /dev/null
@@ -0,0 +1,372 @@
+From 8d64f2d44d2141b8cbca5ef6876f6e12553d3dfb Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Wed, 15 Mar 2023 21:48:01 +0000
+Subject: netfilter: nft_redir: use `struct nf_nat_range2` throughout and
+ deduplicate eval call-backs
+
+From: Jeremy Sowden <jeremy@azazel.net>
+
+[ Upstream commit 6f56ad1b92328997e1b1792047099df6f8d7acb5 ]
+
+`nf_nat_redirect_ipv4` takes a `struct nf_nat_ipv4_multi_range_compat`,
+but converts it internally to a `struct nf_nat_range2`.  Change the
+function to take the latter, factor out the code now shared with
+`nf_nat_redirect_ipv6`, move the conversion to the xt_REDIRECT module,
+and update the ipv4 range initialization in the nft_redir module.
+
+Replace a bare hex constant for 127.0.0.1 with a macro.
+
+Remove `WARN_ON`.  `nf_nat_setup_info` calls `nf_ct_is_confirmed`:
+
+       /* Can't setup nat info for confirmed ct. */
+       if (nf_ct_is_confirmed(ct))
+               return NF_ACCEPT;
+
+This means that `ct` cannot be null or the kernel will crash, and
+implies that `ctinfo` is `IP_CT_NEW` or `IP_CT_RELATED`.
+
+nft_redir has separate ipv4 and ipv6 call-backs which share much of
+their code, and an inet one containing a switch that calls one of the
+others based on the family of the packet.  Merge the ipv4 and ipv6
+ones into the inet one in order to get rid of the duplicate code.
+
+Const-qualify the `priv` pointer since we don't need to write through
+it.
+
+Assign `priv->flags` to the range instead of OR-ing it in.
+
+Set the `NF_NAT_RANGE_PROTO_SPECIFIED` flag once during init, rather
+than on every eval.
+
+Signed-off-by: Jeremy Sowden <jeremy@azazel.net>
+Signed-off-by: Florian Westphal <fw@strlen.de>
+Stable-dep-of: 80abbe8a8263 ("netfilter: nat: fix ipv6 nat redirect with mapped and scoped addresses")
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ include/net/netfilter/nf_nat_redirect.h |  3 +-
+ net/netfilter/nf_nat_redirect.c         | 71 ++++++++++-----------
+ net/netfilter/nft_redir.c               | 84 +++++++++----------------
+ net/netfilter/xt_REDIRECT.c             | 10 ++-
+ 4 files changed, 72 insertions(+), 96 deletions(-)
+
+diff --git a/include/net/netfilter/nf_nat_redirect.h b/include/net/netfilter/nf_nat_redirect.h
+index 2418653a66db1..279380de904c8 100644
+--- a/include/net/netfilter/nf_nat_redirect.h
++++ b/include/net/netfilter/nf_nat_redirect.h
+@@ -6,8 +6,7 @@
+ #include <uapi/linux/netfilter/nf_nat.h>
+ unsigned int
+-nf_nat_redirect_ipv4(struct sk_buff *skb,
+-                   const struct nf_nat_ipv4_multi_range_compat *mr,
++nf_nat_redirect_ipv4(struct sk_buff *skb, const struct nf_nat_range2 *range,
+                    unsigned int hooknum);
+ unsigned int
+ nf_nat_redirect_ipv6(struct sk_buff *skb, const struct nf_nat_range2 *range,
+diff --git a/net/netfilter/nf_nat_redirect.c b/net/netfilter/nf_nat_redirect.c
+index f91579c821e9a..6616ba5d0b049 100644
+--- a/net/netfilter/nf_nat_redirect.c
++++ b/net/netfilter/nf_nat_redirect.c
+@@ -10,6 +10,7 @@
+ #include <linux/if.h>
+ #include <linux/inetdevice.h>
++#include <linux/in.h>
+ #include <linux/ip.h>
+ #include <linux/kernel.h>
+ #include <linux/netdevice.h>
+@@ -24,54 +25,56 @@
+ #include <net/netfilter/nf_nat.h>
+ #include <net/netfilter/nf_nat_redirect.h>
++static unsigned int
++nf_nat_redirect(struct sk_buff *skb, const struct nf_nat_range2 *range,
++              const union nf_inet_addr *newdst)
++{
++      struct nf_nat_range2 newrange;
++      enum ip_conntrack_info ctinfo;
++      struct nf_conn *ct;
++
++      ct = nf_ct_get(skb, &ctinfo);
++
++      memset(&newrange, 0, sizeof(newrange));
++
++      newrange.flags          = range->flags | NF_NAT_RANGE_MAP_IPS;
++      newrange.min_addr       = *newdst;
++      newrange.max_addr       = *newdst;
++      newrange.min_proto      = range->min_proto;
++      newrange.max_proto      = range->max_proto;
++
++      return nf_nat_setup_info(ct, &newrange, NF_NAT_MANIP_DST);
++}
++
+ unsigned int
+-nf_nat_redirect_ipv4(struct sk_buff *skb,
+-                   const struct nf_nat_ipv4_multi_range_compat *mr,
++nf_nat_redirect_ipv4(struct sk_buff *skb, const struct nf_nat_range2 *range,
+                    unsigned int hooknum)
+ {
+-      struct nf_conn *ct;
+-      enum ip_conntrack_info ctinfo;
+-      __be32 newdst;
+-      struct nf_nat_range2 newrange;
++      union nf_inet_addr newdst = {};
+       WARN_ON(hooknum != NF_INET_PRE_ROUTING &&
+               hooknum != NF_INET_LOCAL_OUT);
+-      ct = nf_ct_get(skb, &ctinfo);
+-      WARN_ON(!(ct && (ctinfo == IP_CT_NEW || ctinfo == IP_CT_RELATED)));
+-
+       /* Local packets: make them go to loopback */
+       if (hooknum == NF_INET_LOCAL_OUT) {
+-              newdst = htonl(0x7F000001);
++              newdst.ip = htonl(INADDR_LOOPBACK);
+       } else {
+               const struct in_device *indev;
+-              newdst = 0;
+-
+               indev = __in_dev_get_rcu(skb->dev);
+               if (indev) {
+                       const struct in_ifaddr *ifa;
+                       ifa = rcu_dereference(indev->ifa_list);
+                       if (ifa)
+-                              newdst = ifa->ifa_local;
++                              newdst.ip = ifa->ifa_local;
+               }
+-              if (!newdst)
++              if (!newdst.ip)
+                       return NF_DROP;
+       }
+-      /* Transfer from original range. */
+-      memset(&newrange.min_addr, 0, sizeof(newrange.min_addr));
+-      memset(&newrange.max_addr, 0, sizeof(newrange.max_addr));
+-      newrange.flags       = mr->range[0].flags | NF_NAT_RANGE_MAP_IPS;
+-      newrange.min_addr.ip = newdst;
+-      newrange.max_addr.ip = newdst;
+-      newrange.min_proto   = mr->range[0].min;
+-      newrange.max_proto   = mr->range[0].max;
+-
+-      /* Hand modified range to generic setup. */
+-      return nf_nat_setup_info(ct, &newrange, NF_NAT_MANIP_DST);
++      return nf_nat_redirect(skb, range, &newdst);
+ }
+ EXPORT_SYMBOL_GPL(nf_nat_redirect_ipv4);
+@@ -81,14 +84,10 @@ unsigned int
+ nf_nat_redirect_ipv6(struct sk_buff *skb, const struct nf_nat_range2 *range,
+                    unsigned int hooknum)
+ {
+-      struct nf_nat_range2 newrange;
+-      struct in6_addr newdst;
+-      enum ip_conntrack_info ctinfo;
+-      struct nf_conn *ct;
++      union nf_inet_addr newdst = {};
+-      ct = nf_ct_get(skb, &ctinfo);
+       if (hooknum == NF_INET_LOCAL_OUT) {
+-              newdst = loopback_addr;
++              newdst.in6 = loopback_addr;
+       } else {
+               struct inet6_dev *idev;
+               struct inet6_ifaddr *ifa;
+@@ -98,7 +97,7 @@ nf_nat_redirect_ipv6(struct sk_buff *skb, const struct nf_nat_range2 *range,
+               if (idev != NULL) {
+                       read_lock_bh(&idev->lock);
+                       list_for_each_entry(ifa, &idev->addr_list, if_list) {
+-                              newdst = ifa->addr;
++                              newdst.in6 = ifa->addr;
+                               addr = true;
+                               break;
+                       }
+@@ -109,12 +108,6 @@ nf_nat_redirect_ipv6(struct sk_buff *skb, const struct nf_nat_range2 *range,
+                       return NF_DROP;
+       }
+-      newrange.flags          = range->flags | NF_NAT_RANGE_MAP_IPS;
+-      newrange.min_addr.in6   = newdst;
+-      newrange.max_addr.in6   = newdst;
+-      newrange.min_proto      = range->min_proto;
+-      newrange.max_proto      = range->max_proto;
+-
+-      return nf_nat_setup_info(ct, &newrange, NF_NAT_MANIP_DST);
++      return nf_nat_redirect(skb, range, &newdst);
+ }
+ EXPORT_SYMBOL_GPL(nf_nat_redirect_ipv6);
+diff --git a/net/netfilter/nft_redir.c b/net/netfilter/nft_redir.c
+index 5ed64b2bd15e8..08b408d3e113d 100644
+--- a/net/netfilter/nft_redir.c
++++ b/net/netfilter/nft_redir.c
+@@ -64,6 +64,8 @@ static int nft_redir_init(const struct nft_ctx *ctx,
+               } else {
+                       priv->sreg_proto_max = priv->sreg_proto_min;
+               }
++
++              priv->flags |= NF_NAT_RANGE_PROTO_SPECIFIED;
+       }
+       if (tb[NFTA_REDIR_FLAGS]) {
+@@ -98,25 +100,37 @@ static int nft_redir_dump(struct sk_buff *skb, const struct nft_expr *expr)
+       return -1;
+ }
+-static void nft_redir_ipv4_eval(const struct nft_expr *expr,
+-                              struct nft_regs *regs,
+-                              const struct nft_pktinfo *pkt)
++static void nft_redir_eval(const struct nft_expr *expr,
++                         struct nft_regs *regs,
++                         const struct nft_pktinfo *pkt)
+ {
+-      struct nft_redir *priv = nft_expr_priv(expr);
+-      struct nf_nat_ipv4_multi_range_compat mr;
++      const struct nft_redir *priv = nft_expr_priv(expr);
++      struct nf_nat_range2 range;
+-      memset(&mr, 0, sizeof(mr));
++      memset(&range, 0, sizeof(range));
++      range.flags = priv->flags;
+       if (priv->sreg_proto_min) {
+-              mr.range[0].min.all = (__force __be16)nft_reg_load16(
+-                      &regs->data[priv->sreg_proto_min]);
+-              mr.range[0].max.all = (__force __be16)nft_reg_load16(
+-                      &regs->data[priv->sreg_proto_max]);
+-              mr.range[0].flags |= NF_NAT_RANGE_PROTO_SPECIFIED;
++              range.min_proto.all = (__force __be16)
++                      nft_reg_load16(&regs->data[priv->sreg_proto_min]);
++              range.max_proto.all = (__force __be16)
++                      nft_reg_load16(&regs->data[priv->sreg_proto_max]);
+       }
+-      mr.range[0].flags |= priv->flags;
+-
+-      regs->verdict.code = nf_nat_redirect_ipv4(pkt->skb, &mr, nft_hook(pkt));
++      switch (nft_pf(pkt)) {
++      case NFPROTO_IPV4:
++              regs->verdict.code = nf_nat_redirect_ipv4(pkt->skb, &range,
++                                                        nft_hook(pkt));
++              break;
++#ifdef CONFIG_NF_TABLES_IPV6
++      case NFPROTO_IPV6:
++              regs->verdict.code = nf_nat_redirect_ipv6(pkt->skb, &range,
++                                                        nft_hook(pkt));
++              break;
++#endif
++      default:
++              WARN_ON_ONCE(1);
++              break;
++      }
+ }
+ static void
+@@ -129,7 +143,7 @@ static struct nft_expr_type nft_redir_ipv4_type;
+ static const struct nft_expr_ops nft_redir_ipv4_ops = {
+       .type           = &nft_redir_ipv4_type,
+       .size           = NFT_EXPR_SIZE(sizeof(struct nft_redir)),
+-      .eval           = nft_redir_ipv4_eval,
++      .eval           = nft_redir_eval,
+       .init           = nft_redir_init,
+       .destroy        = nft_redir_ipv4_destroy,
+       .dump           = nft_redir_dump,
+@@ -147,28 +161,6 @@ static struct nft_expr_type nft_redir_ipv4_type __read_mostly = {
+ };
+ #ifdef CONFIG_NF_TABLES_IPV6
+-static void nft_redir_ipv6_eval(const struct nft_expr *expr,
+-                              struct nft_regs *regs,
+-                              const struct nft_pktinfo *pkt)
+-{
+-      struct nft_redir *priv = nft_expr_priv(expr);
+-      struct nf_nat_range2 range;
+-
+-      memset(&range, 0, sizeof(range));
+-      if (priv->sreg_proto_min) {
+-              range.min_proto.all = (__force __be16)nft_reg_load16(
+-                      &regs->data[priv->sreg_proto_min]);
+-              range.max_proto.all = (__force __be16)nft_reg_load16(
+-                      &regs->data[priv->sreg_proto_max]);
+-              range.flags |= NF_NAT_RANGE_PROTO_SPECIFIED;
+-      }
+-
+-      range.flags |= priv->flags;
+-
+-      regs->verdict.code =
+-              nf_nat_redirect_ipv6(pkt->skb, &range, nft_hook(pkt));
+-}
+-
+ static void
+ nft_redir_ipv6_destroy(const struct nft_ctx *ctx, const struct nft_expr *expr)
+ {
+@@ -179,7 +171,7 @@ static struct nft_expr_type nft_redir_ipv6_type;
+ static const struct nft_expr_ops nft_redir_ipv6_ops = {
+       .type           = &nft_redir_ipv6_type,
+       .size           = NFT_EXPR_SIZE(sizeof(struct nft_redir)),
+-      .eval           = nft_redir_ipv6_eval,
++      .eval           = nft_redir_eval,
+       .init           = nft_redir_init,
+       .destroy        = nft_redir_ipv6_destroy,
+       .dump           = nft_redir_dump,
+@@ -198,20 +190,6 @@ static struct nft_expr_type nft_redir_ipv6_type __read_mostly = {
+ #endif
+ #ifdef CONFIG_NF_TABLES_INET
+-static void nft_redir_inet_eval(const struct nft_expr *expr,
+-                              struct nft_regs *regs,
+-                              const struct nft_pktinfo *pkt)
+-{
+-      switch (nft_pf(pkt)) {
+-      case NFPROTO_IPV4:
+-              return nft_redir_ipv4_eval(expr, regs, pkt);
+-      case NFPROTO_IPV6:
+-              return nft_redir_ipv6_eval(expr, regs, pkt);
+-      }
+-
+-      WARN_ON_ONCE(1);
+-}
+-
+ static void
+ nft_redir_inet_destroy(const struct nft_ctx *ctx, const struct nft_expr *expr)
+ {
+@@ -222,7 +200,7 @@ static struct nft_expr_type nft_redir_inet_type;
+ static const struct nft_expr_ops nft_redir_inet_ops = {
+       .type           = &nft_redir_inet_type,
+       .size           = NFT_EXPR_SIZE(sizeof(struct nft_redir)),
+-      .eval           = nft_redir_inet_eval,
++      .eval           = nft_redir_eval,
+       .init           = nft_redir_init,
+       .destroy        = nft_redir_inet_destroy,
+       .dump           = nft_redir_dump,
+diff --git a/net/netfilter/xt_REDIRECT.c b/net/netfilter/xt_REDIRECT.c
+index 353ca7801251a..ff66b56a3f97d 100644
+--- a/net/netfilter/xt_REDIRECT.c
++++ b/net/netfilter/xt_REDIRECT.c
+@@ -46,7 +46,6 @@ static void redirect_tg_destroy(const struct xt_tgdtor_param *par)
+       nf_ct_netns_put(par->net, par->family);
+ }
+-/* FIXME: Take multiple ranges --RR */
+ static int redirect_tg4_check(const struct xt_tgchk_param *par)
+ {
+       const struct nf_nat_ipv4_multi_range_compat *mr = par->targinfo;
+@@ -65,7 +64,14 @@ static int redirect_tg4_check(const struct xt_tgchk_param *par)
+ static unsigned int
+ redirect_tg4(struct sk_buff *skb, const struct xt_action_param *par)
+ {
+-      return nf_nat_redirect_ipv4(skb, par->targinfo, xt_hooknum(par));
++      const struct nf_nat_ipv4_multi_range_compat *mr = par->targinfo;
++      struct nf_nat_range2 range = {
++              .flags       = mr->range[0].flags,
++              .min_proto   = mr->range[0].min,
++              .max_proto   = mr->range[0].max,
++      };
++
++      return nf_nat_redirect_ipv4(skb, &range, xt_hooknum(par));
+ }
+ static struct xt_target redirect_tg_reg[] __read_mostly = {
+-- 
+2.42.0
+
diff --git a/queue-6.1/netfilter-xt_recent-fix-increase-ipv6-literal-buffer.patch b/queue-6.1/netfilter-xt_recent-fix-increase-ipv6-literal-buffer.patch
new file mode 100644 (file)
index 0000000..69dfab9
--- /dev/null
@@ -0,0 +1,49 @@
+From 128703facde1966f223048219a98e742fabbc063 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Sun, 5 Nov 2023 11:56:00 -0800
+Subject: netfilter: xt_recent: fix (increase) ipv6 literal buffer length
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+From: Maciej Żenczykowski <zenczykowski@gmail.com>
+
+[ Upstream commit 7b308feb4fd2d1c06919445c65c8fbf8e9fd1781 ]
+
+in6_pton() supports 'low-32-bit dot-decimal representation'
+(this is useful with DNS64/NAT64 networks for example):
+
+  # echo +aaaa:bbbb:cccc:dddd:eeee:ffff:1.2.3.4 > /proc/self/net/xt_recent/DEFAULT
+  # cat /proc/self/net/xt_recent/DEFAULT
+  src=aaaa:bbbb:cccc:dddd:eeee:ffff:0102:0304 ttl: 0 last_seen: 9733848829 oldest_pkt: 1 9733848829
+
+but the provided buffer is too short:
+
+  # echo +aaaa:bbbb:cccc:dddd:eeee:ffff:255.255.255.255 > /proc/self/net/xt_recent/DEFAULT
+  -bash: echo: write error: Invalid argument
+
+Fixes: 079aa88fe717 ("netfilter: xt_recent: IPv6 support")
+Signed-off-by: Maciej Żenczykowski <zenczykowski@gmail.com>
+Reviewed-by: Simon Horman <horms@kernel.org>
+Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ net/netfilter/xt_recent.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+diff --git a/net/netfilter/xt_recent.c b/net/netfilter/xt_recent.c
+index 7ddb9a78e3fc8..ef93e0d3bee04 100644
+--- a/net/netfilter/xt_recent.c
++++ b/net/netfilter/xt_recent.c
+@@ -561,7 +561,7 @@ recent_mt_proc_write(struct file *file, const char __user *input,
+ {
+       struct recent_table *t = pde_data(file_inode(file));
+       struct recent_entry *e;
+-      char buf[sizeof("+b335:1d35:1e55:dead:c0de:1715:5afe:c0de")];
++      char buf[sizeof("+b335:1d35:1e55:dead:c0de:1715:255.255.255.255")];
+       const char *c = buf;
+       union nf_inet_addr addr = {};
+       u_int16_t family;
+-- 
+2.42.0
+
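What changed is only the literal that sizes the parse buffer; a stand-alone
sketch (plain C, outside the kernel) compares the old and new sizing patterns
against the input that used to fail:

#include <stdio.h>

int main(void)
{
        /* Old sizing literal: full-hex IPv6 address plus the leading '+'. */
        printf("old buffer: %zu bytes\n",
               sizeof("+b335:1d35:1e55:dead:c0de:1715:5afe:c0de"));
        /* New sizing literal: leaves room for a dot-decimal low-32-bit tail. */
        printf("new buffer: %zu bytes\n",
               sizeof("+b335:1d35:1e55:dead:c0de:1715:255.255.255.255"));
        /* The previously rejected input needs the larger buffer. */
        printf("input:      %zu bytes\n",
               sizeof("+aaaa:bbbb:cccc:dddd:eeee:ffff:255.255.255.255"));
        return 0; /* 41, 47 and 47 bytes including the trailing NUL */
}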
diff --git a/queue-6.1/nvme-fix-error-handling-for-io_uring-nvme-passthroug.patch b/queue-6.1/nvme-fix-error-handling-for-io_uring-nvme-passthroug.patch
new file mode 100644 (file)
index 0000000..0e53148
--- /dev/null
@@ -0,0 +1,46 @@
+From 939f52cda3d6f05a0dcbfbbdfa7f6378eb95d8e0 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Thu, 19 Oct 2023 00:54:30 +0530
+Subject: nvme: fix error-handling for io_uring nvme-passthrough
+
+From: Anuj Gupta <anuj20.g@samsung.com>
+
+[ Upstream commit 1147dd0503564fa0e03489a039f9e0c748a03db4 ]
+
+The driver may return an error before submitting the command to the device.
+Ensure that such an error is propagated up.
+
+Fixes: 456cba386e94 ("nvme: wire-up uring-cmd support for io-passthru on char-device.")
+Signed-off-by: Anuj Gupta <anuj20.g@samsung.com>
+Signed-off-by: Kanchan Joshi <joshi.k@samsung.com>
+Reviewed-by: Niklas Cassel <niklas.cassel@wdc.com>
+Reviewed-by: Christoph Hellwig <hch@lst.de>
+Signed-off-by: Keith Busch <kbusch@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/nvme/host/ioctl.c | 7 +++++--
+ 1 file changed, 5 insertions(+), 2 deletions(-)
+
+diff --git a/drivers/nvme/host/ioctl.c b/drivers/nvme/host/ioctl.c
+index b33004a4bcb5a..91e6d03475798 100644
+--- a/drivers/nvme/host/ioctl.c
++++ b/drivers/nvme/host/ioctl.c
+@@ -435,10 +435,13 @@ static enum rq_end_io_ret nvme_uring_cmd_end_io(struct request *req,
+       void *cookie = READ_ONCE(ioucmd->cookie);
+       req->bio = pdu->bio;
+-      if (nvme_req(req)->flags & NVME_REQ_CANCELLED)
++      if (nvme_req(req)->flags & NVME_REQ_CANCELLED) {
+               pdu->nvme_status = -EINTR;
+-      else
++      } else {
+               pdu->nvme_status = nvme_req(req)->status;
++              if (!pdu->nvme_status)
++                      pdu->nvme_status = blk_status_to_errno(err);
++      }
+       pdu->u.result = le64_to_cpu(nvme_req(req)->result.u64);
+       /*
+-- 
+2.42.0
+
diff --git a/queue-6.1/octeontx2-pf-fix-error-codes.patch b/queue-6.1/octeontx2-pf-fix-error-codes.patch
new file mode 100644 (file)
index 0000000..569d6c9
--- /dev/null
@@ -0,0 +1,69 @@
+From 2e089a867a9f6ed8df0c8b4385a9e45f09c3cc30 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Fri, 27 Oct 2023 07:49:52 +0530
+Subject: octeontx2-pf: Fix error codes
+
+From: Ratheesh Kannoth <rkannoth@marvell.com>
+
+[ Upstream commit 96b9a68d1a6e4f889d453874c9e359aa720b520f ]
+
+Some of the error codes were wrong. Fix them.
+
+Fixes: 51afe9026d0c ("octeontx2-pf: NIX TX overwrites SQ_CTX_HW_S[SQ_INT]")
+Signed-off-by: Ratheesh Kannoth <rkannoth@marvell.com>
+Reviewed-by: Wojciech Drewek <wojciech.drewek@intel.com>
+Link: https://lore.kernel.org/r/20231027021953.1819959-1-rkannoth@marvell.com
+Signed-off-by: Paolo Abeni <pabeni@redhat.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ .../marvell/octeontx2/nic/otx2_struct.h       | 34 +++++++++----------
+ 1 file changed, 17 insertions(+), 17 deletions(-)
+
+diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_struct.h b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_struct.h
+index fa37b9f312cae..4e5899d8fa2e6 100644
+--- a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_struct.h
++++ b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_struct.h
+@@ -318,23 +318,23 @@ enum nix_snd_status_e {
+       NIX_SND_STATUS_EXT_ERR = 0x6,
+       NIX_SND_STATUS_JUMP_FAULT = 0x7,
+       NIX_SND_STATUS_JUMP_POISON = 0x8,
+-      NIX_SND_STATUS_CRC_ERR = 0x9,
+-      NIX_SND_STATUS_IMM_ERR = 0x10,
+-      NIX_SND_STATUS_SG_ERR = 0x11,
+-      NIX_SND_STATUS_MEM_ERR = 0x12,
+-      NIX_SND_STATUS_INVALID_SUBDC = 0x13,
+-      NIX_SND_STATUS_SUBDC_ORDER_ERR = 0x14,
+-      NIX_SND_STATUS_DATA_FAULT = 0x15,
+-      NIX_SND_STATUS_DATA_POISON = 0x16,
+-      NIX_SND_STATUS_NPC_DROP_ACTION = 0x17,
+-      NIX_SND_STATUS_LOCK_VIOL = 0x18,
+-      NIX_SND_STATUS_NPC_UCAST_CHAN_ERR = 0x19,
+-      NIX_SND_STATUS_NPC_MCAST_CHAN_ERR = 0x20,
+-      NIX_SND_STATUS_NPC_MCAST_ABORT = 0x21,
+-      NIX_SND_STATUS_NPC_VTAG_PTR_ERR = 0x22,
+-      NIX_SND_STATUS_NPC_VTAG_SIZE_ERR = 0x23,
+-      NIX_SND_STATUS_SEND_MEM_FAULT = 0x24,
+-      NIX_SND_STATUS_SEND_STATS_ERR = 0x25,
++      NIX_SND_STATUS_CRC_ERR = 0x10,
++      NIX_SND_STATUS_IMM_ERR = 0x11,
++      NIX_SND_STATUS_SG_ERR = 0x12,
++      NIX_SND_STATUS_MEM_ERR = 0x13,
++      NIX_SND_STATUS_INVALID_SUBDC = 0x14,
++      NIX_SND_STATUS_SUBDC_ORDER_ERR = 0x15,
++      NIX_SND_STATUS_DATA_FAULT = 0x16,
++      NIX_SND_STATUS_DATA_POISON = 0x17,
++      NIX_SND_STATUS_NPC_DROP_ACTION = 0x20,
++      NIX_SND_STATUS_LOCK_VIOL = 0x21,
++      NIX_SND_STATUS_NPC_UCAST_CHAN_ERR = 0x22,
++      NIX_SND_STATUS_NPC_MCAST_CHAN_ERR = 0x23,
++      NIX_SND_STATUS_NPC_MCAST_ABORT = 0x24,
++      NIX_SND_STATUS_NPC_VTAG_PTR_ERR = 0x25,
++      NIX_SND_STATUS_NPC_VTAG_SIZE_ERR = 0x26,
++      NIX_SND_STATUS_SEND_MEM_FAULT = 0x27,
++      NIX_SND_STATUS_SEND_STATS_ERR = 0x28,
+       NIX_SND_STATUS_MAX,
+ };
+-- 
+2.42.0
+
diff --git a/queue-6.1/octeontx2-pf-fix-holes-in-error-code.patch b/queue-6.1/octeontx2-pf-fix-holes-in-error-code.patch
new file mode 100644 (file)
index 0000000..df4f3e7
--- /dev/null
@@ -0,0 +1,156 @@
+From 5a8654a938e41485de1b43de81286f0f4a47f6ff Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Fri, 27 Oct 2023 07:49:53 +0530
+Subject: octeontx2-pf: Fix holes in error code
+
+From: Ratheesh Kannoth <rkannoth@marvell.com>
+
+[ Upstream commit 7aeeb2cb7a2570bb69a87ad14018b03e06ce5be5 ]
+
+Error code strings are not getting printed properly because of holes
+in the error code enumeration. Print the numeric error code as well.
+
+Fixes: 51afe9026d0c ("octeontx2-pf: NIX TX overwrites SQ_CTX_HW_S[SQ_INT]")
+Signed-off-by: Ratheesh Kannoth <rkannoth@marvell.com>
+Reviewed-by: Wojciech Drewek <wojciech.drewek@intel.com>
+Link: https://lore.kernel.org/r/20231027021953.1819959-2-rkannoth@marvell.com
+Signed-off-by: Paolo Abeni <pabeni@redhat.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ .../ethernet/marvell/octeontx2/nic/otx2_pf.c  | 80 +++++++++++--------
+ 1 file changed, 46 insertions(+), 34 deletions(-)
+
+diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_pf.c b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_pf.c
+index 17e546d0d7e55..101d79a0bb436 100644
+--- a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_pf.c
++++ b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_pf.c
+@@ -1194,31 +1194,32 @@ static char *nix_mnqerr_e_str[NIX_MNQERR_MAX] = {
+ };
+ static char *nix_snd_status_e_str[NIX_SND_STATUS_MAX] =  {
+-      "NIX_SND_STATUS_GOOD",
+-      "NIX_SND_STATUS_SQ_CTX_FAULT",
+-      "NIX_SND_STATUS_SQ_CTX_POISON",
+-      "NIX_SND_STATUS_SQB_FAULT",
+-      "NIX_SND_STATUS_SQB_POISON",
+-      "NIX_SND_STATUS_HDR_ERR",
+-      "NIX_SND_STATUS_EXT_ERR",
+-      "NIX_SND_STATUS_JUMP_FAULT",
+-      "NIX_SND_STATUS_JUMP_POISON",
+-      "NIX_SND_STATUS_CRC_ERR",
+-      "NIX_SND_STATUS_IMM_ERR",
+-      "NIX_SND_STATUS_SG_ERR",
+-      "NIX_SND_STATUS_MEM_ERR",
+-      "NIX_SND_STATUS_INVALID_SUBDC",
+-      "NIX_SND_STATUS_SUBDC_ORDER_ERR",
+-      "NIX_SND_STATUS_DATA_FAULT",
+-      "NIX_SND_STATUS_DATA_POISON",
+-      "NIX_SND_STATUS_NPC_DROP_ACTION",
+-      "NIX_SND_STATUS_LOCK_VIOL",
+-      "NIX_SND_STATUS_NPC_UCAST_CHAN_ERR",
+-      "NIX_SND_STATUS_NPC_MCAST_CHAN_ERR",
+-      "NIX_SND_STATUS_NPC_MCAST_ABORT",
+-      "NIX_SND_STATUS_NPC_VTAG_PTR_ERR",
+-      "NIX_SND_STATUS_NPC_VTAG_SIZE_ERR",
+-      "NIX_SND_STATUS_SEND_STATS_ERR",
++      [NIX_SND_STATUS_GOOD] = "NIX_SND_STATUS_GOOD",
++      [NIX_SND_STATUS_SQ_CTX_FAULT] = "NIX_SND_STATUS_SQ_CTX_FAULT",
++      [NIX_SND_STATUS_SQ_CTX_POISON] = "NIX_SND_STATUS_SQ_CTX_POISON",
++      [NIX_SND_STATUS_SQB_FAULT] = "NIX_SND_STATUS_SQB_FAULT",
++      [NIX_SND_STATUS_SQB_POISON] = "NIX_SND_STATUS_SQB_POISON",
++      [NIX_SND_STATUS_HDR_ERR] = "NIX_SND_STATUS_HDR_ERR",
++      [NIX_SND_STATUS_EXT_ERR] = "NIX_SND_STATUS_EXT_ERR",
++      [NIX_SND_STATUS_JUMP_FAULT] = "NIX_SND_STATUS_JUMP_FAULT",
++      [NIX_SND_STATUS_JUMP_POISON] = "NIX_SND_STATUS_JUMP_POISON",
++      [NIX_SND_STATUS_CRC_ERR] = "NIX_SND_STATUS_CRC_ERR",
++      [NIX_SND_STATUS_IMM_ERR] = "NIX_SND_STATUS_IMM_ERR",
++      [NIX_SND_STATUS_SG_ERR] = "NIX_SND_STATUS_SG_ERR",
++      [NIX_SND_STATUS_MEM_ERR] = "NIX_SND_STATUS_MEM_ERR",
++      [NIX_SND_STATUS_INVALID_SUBDC] = "NIX_SND_STATUS_INVALID_SUBDC",
++      [NIX_SND_STATUS_SUBDC_ORDER_ERR] = "NIX_SND_STATUS_SUBDC_ORDER_ERR",
++      [NIX_SND_STATUS_DATA_FAULT] = "NIX_SND_STATUS_DATA_FAULT",
++      [NIX_SND_STATUS_DATA_POISON] = "NIX_SND_STATUS_DATA_POISON",
++      [NIX_SND_STATUS_NPC_DROP_ACTION] = "NIX_SND_STATUS_NPC_DROP_ACTION",
++      [NIX_SND_STATUS_LOCK_VIOL] = "NIX_SND_STATUS_LOCK_VIOL",
++      [NIX_SND_STATUS_NPC_UCAST_CHAN_ERR] = "NIX_SND_STAT_NPC_UCAST_CHAN_ERR",
++      [NIX_SND_STATUS_NPC_MCAST_CHAN_ERR] = "NIX_SND_STAT_NPC_MCAST_CHAN_ERR",
++      [NIX_SND_STATUS_NPC_MCAST_ABORT] = "NIX_SND_STATUS_NPC_MCAST_ABORT",
++      [NIX_SND_STATUS_NPC_VTAG_PTR_ERR] = "NIX_SND_STATUS_NPC_VTAG_PTR_ERR",
++      [NIX_SND_STATUS_NPC_VTAG_SIZE_ERR] = "NIX_SND_STATUS_NPC_VTAG_SIZE_ERR",
++      [NIX_SND_STATUS_SEND_MEM_FAULT] = "NIX_SND_STATUS_SEND_MEM_FAULT",
++      [NIX_SND_STATUS_SEND_STATS_ERR] = "NIX_SND_STATUS_SEND_STATS_ERR",
+ };
+ static irqreturn_t otx2_q_intr_handler(int irq, void *data)
+@@ -1238,14 +1239,16 @@ static irqreturn_t otx2_q_intr_handler(int irq, void *data)
+                       continue;
+               if (val & BIT_ULL(42)) {
+-                      netdev_err(pf->netdev, "CQ%lld: error reading NIX_LF_CQ_OP_INT, NIX_LF_ERR_INT 0x%llx\n",
++                      netdev_err(pf->netdev,
++                                 "CQ%lld: error reading NIX_LF_CQ_OP_INT, NIX_LF_ERR_INT 0x%llx\n",
+                                  qidx, otx2_read64(pf, NIX_LF_ERR_INT));
+               } else {
+                       if (val & BIT_ULL(NIX_CQERRINT_DOOR_ERR))
+                               netdev_err(pf->netdev, "CQ%lld: Doorbell error",
+                                          qidx);
+                       if (val & BIT_ULL(NIX_CQERRINT_CQE_FAULT))
+-                              netdev_err(pf->netdev, "CQ%lld: Memory fault on CQE write to LLC/DRAM",
++                              netdev_err(pf->netdev,
++                                         "CQ%lld: Memory fault on CQE write to LLC/DRAM",
+                                          qidx);
+               }
+@@ -1268,7 +1271,8 @@ static irqreturn_t otx2_q_intr_handler(int irq, void *data)
+                            (val & NIX_SQINT_BITS));
+               if (val & BIT_ULL(42)) {
+-                      netdev_err(pf->netdev, "SQ%lld: error reading NIX_LF_SQ_OP_INT, NIX_LF_ERR_INT 0x%llx\n",
++                      netdev_err(pf->netdev,
++                                 "SQ%lld: error reading NIX_LF_SQ_OP_INT, NIX_LF_ERR_INT 0x%llx\n",
+                                  qidx, otx2_read64(pf, NIX_LF_ERR_INT));
+                       goto done;
+               }
+@@ -1278,8 +1282,11 @@ static irqreturn_t otx2_q_intr_handler(int irq, void *data)
+                       goto chk_mnq_err_dbg;
+               sq_op_err_code = FIELD_GET(GENMASK(7, 0), sq_op_err_dbg);
+-              netdev_err(pf->netdev, "SQ%lld: NIX_LF_SQ_OP_ERR_DBG(%llx)  err=%s\n",
+-                         qidx, sq_op_err_dbg, nix_sqoperr_e_str[sq_op_err_code]);
++              netdev_err(pf->netdev,
++                         "SQ%lld: NIX_LF_SQ_OP_ERR_DBG(0x%llx)  err=%s(%#x)\n",
++                         qidx, sq_op_err_dbg,
++                         nix_sqoperr_e_str[sq_op_err_code],
++                         sq_op_err_code);
+               otx2_write64(pf, NIX_LF_SQ_OP_ERR_DBG, BIT_ULL(44));
+@@ -1296,16 +1303,21 @@ static irqreturn_t otx2_q_intr_handler(int irq, void *data)
+                       goto chk_snd_err_dbg;
+               mnq_err_code = FIELD_GET(GENMASK(7, 0), mnq_err_dbg);
+-              netdev_err(pf->netdev, "SQ%lld: NIX_LF_MNQ_ERR_DBG(%llx)  err=%s\n",
+-                         qidx, mnq_err_dbg,  nix_mnqerr_e_str[mnq_err_code]);
++              netdev_err(pf->netdev,
++                         "SQ%lld: NIX_LF_MNQ_ERR_DBG(0x%llx)  err=%s(%#x)\n",
++                         qidx, mnq_err_dbg,  nix_mnqerr_e_str[mnq_err_code],
++                         mnq_err_code);
+               otx2_write64(pf, NIX_LF_MNQ_ERR_DBG, BIT_ULL(44));
+ chk_snd_err_dbg:
+               snd_err_dbg = otx2_read64(pf, NIX_LF_SEND_ERR_DBG);
+               if (snd_err_dbg & BIT(44)) {
+                       snd_err_code = FIELD_GET(GENMASK(7, 0), snd_err_dbg);
+-                      netdev_err(pf->netdev, "SQ%lld: NIX_LF_SND_ERR_DBG:0x%llx err=%s\n",
+-                                 qidx, snd_err_dbg, nix_snd_status_e_str[snd_err_code]);
++                      netdev_err(pf->netdev,
++                                 "SQ%lld: NIX_LF_SND_ERR_DBG:0x%llx err=%s(%#x)\n",
++                                 qidx, snd_err_dbg,
++                                 nix_snd_status_e_str[snd_err_code],
++                                 snd_err_code);
+                       otx2_write64(pf, NIX_LF_SEND_ERR_DBG, BIT_ULL(44));
+               }
+-- 
+2.42.0
+
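The fix relies on C designated array initializers: each string is tied to its
enum value, so holes in the numbering no longer shift every following entry.
A stand-alone sketch with a hypothetical enum (not the real NIX_SND_STATUS_*
values) shows the difference:

#include <stdio.h>

/* Hypothetical error enum with a hole, mirroring the problem above. */
enum err_code { ERR_GOOD = 0x0, ERR_FAULT = 0x1, ERR_CRC = 0x10, ERR_MAX };

/* Positional table: "ERR_CRC" lands at index 2, not at 0x10. */
static const char *positional[] = { "ERR_GOOD", "ERR_FAULT", "ERR_CRC" };

/* Designated table: every string sits at its enum value, holes stay NULL. */
static const char *designated[ERR_MAX] = {
        [ERR_GOOD]  = "ERR_GOOD",
        [ERR_FAULT] = "ERR_FAULT",
        [ERR_CRC]   = "ERR_CRC",
};

int main(void)
{
        /* Indexing the positional table with ERR_CRC would read past its
         * three entries; the designated table returns the right string. */
        printf("designated[ERR_CRC] = %s\n", designated[ERR_CRC]);
        printf("positional has only %zu entries\n",
               sizeof(positional) / sizeof(positional[0]));
        return 0;
}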
diff --git a/queue-6.1/octeontx2-pf-free-pending-and-dropped-sqes.patch b/queue-6.1/octeontx2-pf-free-pending-and-dropped-sqes.patch
new file mode 100644 (file)
index 0000000..5c9a481
--- /dev/null
@@ -0,0 +1,162 @@
+From e11cc39744ac02b64560bce825fdfe5810ae0645 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Tue, 31 Oct 2023 16:53:45 +0530
+Subject: octeontx2-pf: Free pending and dropped SQEs
+
+From: Geetha sowjanya <gakula@marvell.com>
+
+[ Upstream commit 3423ca23e08bf285a324237abe88e7e7d9becfe6 ]
+
+On interface down, the pending SQEs in the NIX get dropped
+or drained out during the SMQ flush. But the skbs pointed to by these
+SQEs never get freed or reported to the stack, as the respective CQEs
+never get added.
+This patch fixes the issue by freeing all valid skbs in the SQ SG list.
+
+Fixes: b1bc8457e9d0 ("octeontx2-pf: Cleanup all receive buffers in SG descriptor")
+Signed-off-by: Geetha sowjanya <gakula@marvell.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ .../marvell/octeontx2/nic/otx2_common.c       | 15 +++----
+ .../marvell/octeontx2/nic/otx2_common.h       |  1 +
+ .../ethernet/marvell/octeontx2/nic/otx2_pf.c  |  1 +
+ .../marvell/octeontx2/nic/otx2_txrx.c         | 42 +++++++++++++++++++
+ 4 files changed, 49 insertions(+), 10 deletions(-)
+
+diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.c b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.c
+index c76dad78c26eb..0f896f606c3e6 100644
+--- a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.c
++++ b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.c
+@@ -797,7 +797,6 @@ void otx2_sqb_flush(struct otx2_nic *pfvf)
+       int qidx, sqe_tail, sqe_head;
+       struct otx2_snd_queue *sq;
+       u64 incr, *ptr, val;
+-      int timeout = 1000;
+       ptr = (u64 *)otx2_get_regaddr(pfvf, NIX_LF_SQ_OP_STATUS);
+       for (qidx = 0; qidx < otx2_get_total_tx_queues(pfvf); qidx++) {
+@@ -806,15 +805,11 @@ void otx2_sqb_flush(struct otx2_nic *pfvf)
+                       continue;
+               incr = (u64)qidx << 32;
+-              while (timeout) {
+-                      val = otx2_atomic64_add(incr, ptr);
+-                      sqe_head = (val >> 20) & 0x3F;
+-                      sqe_tail = (val >> 28) & 0x3F;
+-                      if (sqe_head == sqe_tail)
+-                              break;
+-                      usleep_range(1, 3);
+-                      timeout--;
+-              }
++              val = otx2_atomic64_add(incr, ptr);
++              sqe_head = (val >> 20) & 0x3F;
++              sqe_tail = (val >> 28) & 0x3F;
++              if (sqe_head != sqe_tail)
++                      usleep_range(50, 60);
+       }
+ }
+diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.h b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.h
+index 876a7b51b8e51..efd66224b3dbf 100644
+--- a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.h
++++ b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.h
+@@ -933,6 +933,7 @@ int otx2_txschq_config(struct otx2_nic *pfvf, int lvl, int prio, bool pfc_en);
+ int otx2_txsch_alloc(struct otx2_nic *pfvf);
+ void otx2_txschq_stop(struct otx2_nic *pfvf);
+ void otx2_txschq_free_one(struct otx2_nic *pfvf, u16 lvl, u16 schq);
++void otx2_free_pending_sqe(struct otx2_nic *pfvf);
+ void otx2_sqb_flush(struct otx2_nic *pfvf);
+ int otx2_alloc_rbuf(struct otx2_nic *pfvf, struct otx2_pool *pool,
+                   dma_addr_t *dma);
+diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_pf.c b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_pf.c
+index c558c9b64f5be..c724131172f3f 100644
+--- a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_pf.c
++++ b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_pf.c
+@@ -1596,6 +1596,7 @@ static void otx2_free_hw_resources(struct otx2_nic *pf)
+               else
+                       otx2_cleanup_tx_cqes(pf, cq);
+       }
++      otx2_free_pending_sqe(pf);
+       otx2_free_sq_res(pf);
+diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_txrx.c b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_txrx.c
+index d005434e1e037..20d801d30c732 100644
+--- a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_txrx.c
++++ b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_txrx.c
+@@ -1224,9 +1224,11 @@ void otx2_cleanup_rx_cqes(struct otx2_nic *pfvf, struct otx2_cq_queue *cq)
+ void otx2_cleanup_tx_cqes(struct otx2_nic *pfvf, struct otx2_cq_queue *cq)
+ {
++      int tx_pkts = 0, tx_bytes = 0;
+       struct sk_buff *skb = NULL;
+       struct otx2_snd_queue *sq;
+       struct nix_cqe_tx_s *cqe;
++      struct netdev_queue *txq;
+       int processed_cqe = 0;
+       struct sg_list *sg;
+       int qidx;
+@@ -1247,12 +1249,20 @@ void otx2_cleanup_tx_cqes(struct otx2_nic *pfvf, struct otx2_cq_queue *cq)
+               sg = &sq->sg[cqe->comp.sqe_id];
+               skb = (struct sk_buff *)sg->skb;
+               if (skb) {
++                      tx_bytes += skb->len;
++                      tx_pkts++;
+                       otx2_dma_unmap_skb_frags(pfvf, sg);
+                       dev_kfree_skb_any(skb);
+                       sg->skb = (u64)NULL;
+               }
+       }
++      if (likely(tx_pkts)) {
++              if (qidx >= pfvf->hw.tx_queues)
++                      qidx -= pfvf->hw.xdp_queues;
++              txq = netdev_get_tx_queue(pfvf->netdev, qidx);
++              netdev_tx_completed_queue(txq, tx_pkts, tx_bytes);
++      }
+       /* Free CQEs to HW */
+       otx2_write64(pfvf, NIX_LF_CQ_OP_DOOR,
+                    ((u64)cq->cq_idx << 32) | processed_cqe);
+@@ -1279,6 +1289,38 @@ int otx2_rxtx_enable(struct otx2_nic *pfvf, bool enable)
+       return err;
+ }
++void otx2_free_pending_sqe(struct otx2_nic *pfvf)
++{
++      int tx_pkts = 0, tx_bytes = 0;
++      struct sk_buff *skb = NULL;
++      struct otx2_snd_queue *sq;
++      struct netdev_queue *txq;
++      struct sg_list *sg;
++      int sq_idx, sqe;
++
++      for (sq_idx = 0; sq_idx < pfvf->hw.tx_queues; sq_idx++) {
++              sq = &pfvf->qset.sq[sq_idx];
++              for (sqe = 0; sqe < sq->sqe_cnt; sqe++) {
++                      sg = &sq->sg[sqe];
++                      skb = (struct sk_buff *)sg->skb;
++                      if (skb) {
++                              tx_bytes += skb->len;
++                              tx_pkts++;
++                              otx2_dma_unmap_skb_frags(pfvf, sg);
++                              dev_kfree_skb_any(skb);
++                              sg->skb = (u64)NULL;
++                      }
++              }
++
++              if (!tx_pkts)
++                      continue;
++              txq = netdev_get_tx_queue(pfvf->netdev, sq_idx);
++              netdev_tx_completed_queue(txq, tx_pkts, tx_bytes);
++              tx_pkts = 0;
++              tx_bytes = 0;
++      }
++}
++
+ static void otx2_xdp_sqe_add_sg(struct otx2_snd_queue *sq, u64 dma_addr,
+                               int len, int *offset)
+ {
+-- 
+2.42.0
+
diff --git a/queue-6.1/octeontx2-pf-qos-send-queues-management.patch b/queue-6.1/octeontx2-pf-qos-send-queues-management.patch
new file mode 100644 (file)
index 0000000..da2f5e6
--- /dev/null
@@ -0,0 +1,874 @@
+From f6a2d4a39e969d2c49f8cceb5825a7a2d740ea15 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Sat, 13 May 2023 14:21:38 +0530
+Subject: octeontx2-pf: qos send queues management
+
+From: Subbaraya Sundeep <sbhatta@marvell.com>
+
+[ Upstream commit ab6dddd2a669a0ecc2ce07485c7a15fadbb5a0aa ]
+
+In the current implementation, the number of Send queues (SQs) is
+decided at device probe time and is equal to the number of online
+CPUs. These SQs are allocated and deallocated in the interface open and
+close calls respectively.
+
+This patch defines new APIs for initializing and deinitializing Send
+queues dynamically and allocates additional transmit queues for the
+QOS feature.
+
+Signed-off-by: Subbaraya Sundeep <sbhatta@marvell.com>
+Signed-off-by: Hariprasad Kelam <hkelam@marvell.com>
+Signed-off-by: Sunil Kovvuri Goutham <sgoutham@marvell.com>
+Reviewed-by: Simon Horman <simon.horman@corigine.com>
+Reviewed-by: Jacob Keller <jacob.e.keller@intel.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Stable-dep-of: 3423ca23e08b ("octeontx2-pf: Free pending and dropped SQEs")
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ .../marvell/octeontx2/af/rvu_debugfs.c        |   5 +
+ .../ethernet/marvell/octeontx2/nic/Makefile   |   2 +-
+ .../marvell/octeontx2/nic/otx2_common.c       |  43 ++-
+ .../marvell/octeontx2/nic/otx2_common.h       |  39 ++-
+ .../ethernet/marvell/octeontx2/nic/otx2_pf.c  |  44 ++-
+ .../marvell/octeontx2/nic/otx2_txrx.c         |  24 +-
+ .../marvell/octeontx2/nic/otx2_txrx.h         |   3 +-
+ .../ethernet/marvell/octeontx2/nic/otx2_vf.c  |   7 +-
+ .../net/ethernet/marvell/octeontx2/nic/qos.h  |  19 ++
+ .../ethernet/marvell/octeontx2/nic/qos_sq.c   | 282 ++++++++++++++++++
+ 10 files changed, 426 insertions(+), 42 deletions(-)
+ create mode 100644 drivers/net/ethernet/marvell/octeontx2/nic/qos.h
+ create mode 100644 drivers/net/ethernet/marvell/octeontx2/nic/qos_sq.c
+
+diff --git a/drivers/net/ethernet/marvell/octeontx2/af/rvu_debugfs.c b/drivers/net/ethernet/marvell/octeontx2/af/rvu_debugfs.c
+index aadc352c2ffbd..5c9dc3f9262f5 100644
+--- a/drivers/net/ethernet/marvell/octeontx2/af/rvu_debugfs.c
++++ b/drivers/net/ethernet/marvell/octeontx2/af/rvu_debugfs.c
+@@ -1222,6 +1222,11 @@ static int rvu_dbg_npa_ctx_display(struct seq_file *m, void *unused, int ctype)
+       for (aura = id; aura < max_id; aura++) {
+               aq_req.aura_id = aura;
++
++              /* Skip if queue is uninitialized */
++              if (ctype == NPA_AQ_CTYPE_POOL && !test_bit(aura, pfvf->pool_bmap))
++                      continue;
++
+               seq_printf(m, "======%s : %d=======\n",
+                          (ctype == NPA_AQ_CTYPE_AURA) ? "AURA" : "POOL",
+                       aq_req.aura_id);
+diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/Makefile b/drivers/net/ethernet/marvell/octeontx2/nic/Makefile
+index 73fdb87986148..3d31ddf7c652e 100644
+--- a/drivers/net/ethernet/marvell/octeontx2/nic/Makefile
++++ b/drivers/net/ethernet/marvell/octeontx2/nic/Makefile
+@@ -8,7 +8,7 @@ obj-$(CONFIG_OCTEONTX2_VF) += rvu_nicvf.o otx2_ptp.o
+ rvu_nicpf-y := otx2_pf.o otx2_common.o otx2_txrx.o otx2_ethtool.o \
+                otx2_flows.o otx2_tc.o cn10k.o otx2_dmac_flt.o \
+-               otx2_devlink.o
++               otx2_devlink.o qos_sq.o
+ rvu_nicvf-y := otx2_vf.o otx2_devlink.o
+ rvu_nicpf-$(CONFIG_DCB) += otx2_dcbnl.o
+diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.c b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.c
+index 2575c207150e1..c76dad78c26eb 100644
+--- a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.c
++++ b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.c
+@@ -513,8 +513,8 @@ void otx2_config_irq_coalescing(struct otx2_nic *pfvf, int qidx)
+                    (pfvf->hw.cq_ecount_wait - 1));
+ }
+-int __otx2_alloc_rbuf(struct otx2_nic *pfvf, struct otx2_pool *pool,
+-                    dma_addr_t *dma)
++static int __otx2_alloc_rbuf(struct otx2_nic *pfvf, struct otx2_pool *pool,
++                           dma_addr_t *dma)
+ {
+       u8 *buf;
+@@ -532,8 +532,8 @@ int __otx2_alloc_rbuf(struct otx2_nic *pfvf, struct otx2_pool *pool,
+       return 0;
+ }
+-static int otx2_alloc_rbuf(struct otx2_nic *pfvf, struct otx2_pool *pool,
+-                         dma_addr_t *dma)
++int otx2_alloc_rbuf(struct otx2_nic *pfvf, struct otx2_pool *pool,
++                  dma_addr_t *dma)
+ {
+       int ret;
+@@ -795,11 +795,16 @@ void otx2_txschq_stop(struct otx2_nic *pfvf)
+ void otx2_sqb_flush(struct otx2_nic *pfvf)
+ {
+       int qidx, sqe_tail, sqe_head;
++      struct otx2_snd_queue *sq;
+       u64 incr, *ptr, val;
+       int timeout = 1000;
+       ptr = (u64 *)otx2_get_regaddr(pfvf, NIX_LF_SQ_OP_STATUS);
+-      for (qidx = 0; qidx < pfvf->hw.non_qos_queues; qidx++) {
++      for (qidx = 0; qidx < otx2_get_total_tx_queues(pfvf); qidx++) {
++              sq = &pfvf->qset.sq[qidx];
++              if (!sq->sqb_ptrs)
++                      continue;
++
+               incr = (u64)qidx << 32;
+               while (timeout) {
+                       val = otx2_atomic64_add(incr, ptr);
+@@ -899,7 +904,7 @@ int otx2_sq_aq_init(void *dev, u16 qidx, u16 sqb_aura)
+       return otx2_sync_mbox_msg(&pfvf->mbox);
+ }
+-static int otx2_sq_init(struct otx2_nic *pfvf, u16 qidx, u16 sqb_aura)
++int otx2_sq_init(struct otx2_nic *pfvf, u16 qidx, u16 sqb_aura)
+ {
+       struct otx2_qset *qset = &pfvf->qset;
+       struct otx2_snd_queue *sq;
+@@ -972,9 +977,17 @@ static int otx2_cq_init(struct otx2_nic *pfvf, u16 qidx)
+               cq->cint_idx = qidx - pfvf->hw.rx_queues;
+               cq->cqe_cnt = qset->sqe_cnt;
+       } else {
+-              cq->cq_type = CQ_XDP;
+-              cq->cint_idx = qidx - non_xdp_queues;
+-              cq->cqe_cnt = qset->sqe_cnt;
++              if (pfvf->hw.xdp_queues &&
++                  qidx < non_xdp_queues + pfvf->hw.xdp_queues) {
++                      cq->cq_type = CQ_XDP;
++                      cq->cint_idx = qidx - non_xdp_queues;
++                      cq->cqe_cnt = qset->sqe_cnt;
++              } else {
++                      cq->cq_type = CQ_QOS;
++                      cq->cint_idx = qidx - non_xdp_queues -
++                                     pfvf->hw.xdp_queues;
++                      cq->cqe_cnt = qset->sqe_cnt;
++              }
+       }
+       cq->cqe_size = pfvf->qset.xqe_size;
+@@ -1132,7 +1145,7 @@ int otx2_config_nix(struct otx2_nic *pfvf)
+       /* Set RQ/SQ/CQ counts */
+       nixlf->rq_cnt = pfvf->hw.rx_queues;
+-      nixlf->sq_cnt = pfvf->hw.non_qos_queues;
++      nixlf->sq_cnt = otx2_get_total_tx_queues(pfvf);
+       nixlf->cq_cnt = pfvf->qset.cq_cnt;
+       nixlf->rss_sz = MAX_RSS_INDIR_TBL_SIZE;
+       nixlf->rss_grps = MAX_RSS_GROUPS;
+@@ -1170,7 +1183,7 @@ void otx2_sq_free_sqbs(struct otx2_nic *pfvf)
+       int sqb, qidx;
+       u64 iova, pa;
+-      for (qidx = 0; qidx < hw->non_qos_queues; qidx++) {
++      for (qidx = 0; qidx < otx2_get_total_tx_queues(pfvf); qidx++) {
+               sq = &qset->sq[qidx];
+               if (!sq->sqb_ptrs)
+                       continue;
+@@ -1238,8 +1251,8 @@ void otx2_aura_pool_free(struct otx2_nic *pfvf)
+       pfvf->qset.pool = NULL;
+ }
+-static int otx2_aura_init(struct otx2_nic *pfvf, int aura_id,
+-                        int pool_id, int numptrs)
++int otx2_aura_init(struct otx2_nic *pfvf, int aura_id,
++                 int pool_id, int numptrs)
+ {
+       struct npa_aq_enq_req *aq;
+       struct otx2_pool *pool;
+@@ -1315,8 +1328,8 @@ static int otx2_aura_init(struct otx2_nic *pfvf, int aura_id,
+       return 0;
+ }
+-static int otx2_pool_init(struct otx2_nic *pfvf, u16 pool_id,
+-                        int stack_pages, int numptrs, int buf_size)
++int otx2_pool_init(struct otx2_nic *pfvf, u16 pool_id,
++                 int stack_pages, int numptrs, int buf_size)
+ {
+       struct npa_aq_enq_req *aq;
+       struct otx2_pool *pool;
+diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.h b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.h
+index 6c81d09798914..876a7b51b8e51 100644
+--- a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.h
++++ b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.h
+@@ -27,6 +27,7 @@
+ #include "otx2_txrx.h"
+ #include "otx2_devlink.h"
+ #include <rvu_trace.h>
++#include "qos.h"
+ /* PCI device IDs */
+ #define PCI_DEVID_OCTEONTX2_RVU_PF              0xA063
+@@ -186,6 +187,7 @@ struct otx2_hw {
+       u16                     rx_queues;
+       u16                     tx_queues;
+       u16                     xdp_queues;
++      u16                     tc_tx_queues;
+       u16                     non_qos_queues; /* tx queues plus xdp queues */
+       u16                     max_queues;
+       u16                     pool_cnt;
+@@ -498,6 +500,8 @@ struct otx2_nic {
+       u16                     pfc_schq_list[NIX_TXSCH_LVL_CNT][MAX_TXSCHQ_PER_FUNC];
+       bool                    pfc_alloc_status[NIX_PF_PFC_PRIO_MAX];
+ #endif
++      /* qos */
++      struct otx2_qos         qos;
+       /* napi event count. It is needed for adaptive irq coalescing. */
+       u32 napi_events;
+@@ -742,8 +746,7 @@ static inline void cn10k_aura_freeptr(void *dev, int aura, u64 buf)
+ /* Alloc pointer from pool/aura */
+ static inline u64 otx2_aura_allocptr(struct otx2_nic *pfvf, int aura)
+ {
+-      u64 *ptr = (u64 *)otx2_get_regaddr(pfvf,
+-                         NPA_LF_AURA_OP_ALLOCX(0));
++      u64 *ptr = (__force u64 *)otx2_get_regaddr(pfvf, NPA_LF_AURA_OP_ALLOCX(0));
+       u64 incr = (u64)aura | BIT_ULL(63);
+       return otx2_atomic64_add(incr, ptr);
+@@ -885,12 +888,23 @@ static inline void otx2_dma_unmap_page(struct otx2_nic *pfvf,
+ static inline u16 otx2_get_smq_idx(struct otx2_nic *pfvf, u16 qidx)
+ {
++      u16 smq;
+ #ifdef CONFIG_DCB
+       if (qidx < NIX_PF_PFC_PRIO_MAX && pfvf->pfc_alloc_status[qidx])
+               return pfvf->pfc_schq_list[NIX_TXSCH_LVL_SMQ][qidx];
+ #endif
++      /* check if qidx falls under QOS queues */
++      if (qidx >= pfvf->hw.non_qos_queues)
++              smq = pfvf->qos.qid_to_sqmap[qidx - pfvf->hw.non_qos_queues];
++      else
++              smq = pfvf->hw.txschq_list[NIX_TXSCH_LVL_SMQ][0];
+-      return pfvf->hw.txschq_list[NIX_TXSCH_LVL_SMQ][0];
++      return smq;
++}
++
++static inline u16 otx2_get_total_tx_queues(struct otx2_nic *pfvf)
++{
++      return pfvf->hw.non_qos_queues + pfvf->hw.tc_tx_queues;
+ }
+ /* MSI-X APIs */
+@@ -920,17 +934,22 @@ int otx2_txsch_alloc(struct otx2_nic *pfvf);
+ void otx2_txschq_stop(struct otx2_nic *pfvf);
+ void otx2_txschq_free_one(struct otx2_nic *pfvf, u16 lvl, u16 schq);
+ void otx2_sqb_flush(struct otx2_nic *pfvf);
+-int __otx2_alloc_rbuf(struct otx2_nic *pfvf, struct otx2_pool *pool,
+-                    dma_addr_t *dma);
++int otx2_alloc_rbuf(struct otx2_nic *pfvf, struct otx2_pool *pool,
++                  dma_addr_t *dma);
+ int otx2_rxtx_enable(struct otx2_nic *pfvf, bool enable);
+ void otx2_ctx_disable(struct mbox *mbox, int type, bool npa);
+ int otx2_nix_config_bp(struct otx2_nic *pfvf, bool enable);
+ void otx2_cleanup_rx_cqes(struct otx2_nic *pfvf, struct otx2_cq_queue *cq);
+ void otx2_cleanup_tx_cqes(struct otx2_nic *pfvf, struct otx2_cq_queue *cq);
++int otx2_sq_init(struct otx2_nic *pfvf, u16 qidx, u16 sqb_aura);
+ int otx2_sq_aq_init(void *dev, u16 qidx, u16 sqb_aura);
+ int cn10k_sq_aq_init(void *dev, u16 qidx, u16 sqb_aura);
+ int otx2_alloc_buffer(struct otx2_nic *pfvf, struct otx2_cq_queue *cq,
+                     dma_addr_t *dma);
++int otx2_pool_init(struct otx2_nic *pfvf, u16 pool_id,
++                 int stack_pages, int numptrs, int buf_size);
++int otx2_aura_init(struct otx2_nic *pfvf, int aura_id,
++                 int pool_id, int numptrs);
+ /* RSS configuration APIs*/
+ int otx2_rss_init(struct otx2_nic *pfvf);
+@@ -1038,4 +1057,14 @@ static inline void cn10k_handle_mcs_event(struct otx2_nic *pfvf,
+ {}
+ #endif /* CONFIG_MACSEC */
++/* qos support */
++static inline void otx2_qos_init(struct otx2_nic *pfvf, int qos_txqs)
++{
++      struct otx2_hw *hw = &pfvf->hw;
++
++      hw->tc_tx_queues = qos_txqs;
++}
++
++u16 otx2_select_queue(struct net_device *netdev, struct sk_buff *skb,
++                    struct net_device *sb_dev);
+ #endif /* OTX2_COMMON_H */
+diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_pf.c b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_pf.c
+index 545984a86f235..c558c9b64f5be 100644
+--- a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_pf.c
++++ b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_pf.c
+@@ -23,6 +23,7 @@
+ #include "otx2_struct.h"
+ #include "otx2_ptp.h"
+ #include "cn10k.h"
++#include "qos.h"
+ #include <rvu_trace.h>
+ #define DRV_NAME      "rvu_nicpf"
+@@ -1225,6 +1226,7 @@ static char *nix_snd_status_e_str[NIX_SND_STATUS_MAX] =  {
+ static irqreturn_t otx2_q_intr_handler(int irq, void *data)
+ {
+       struct otx2_nic *pf = data;
++      struct otx2_snd_queue *sq;
+       u64 val, *ptr;
+       u64 qidx = 0;
+@@ -1256,10 +1258,14 @@ static irqreturn_t otx2_q_intr_handler(int irq, void *data)
+       }
+       /* SQ */
+-      for (qidx = 0; qidx < pf->hw.non_qos_queues; qidx++) {
++      for (qidx = 0; qidx < otx2_get_total_tx_queues(pf); qidx++) {
+               u64 sq_op_err_dbg, mnq_err_dbg, snd_err_dbg;
+               u8 sq_op_err_code, mnq_err_code, snd_err_code;
++              sq = &pf->qset.sq[qidx];
++              if (!sq->sqb_ptrs)
++                      continue;
++
+               /* Below debug registers captures first errors corresponding to
+                * those registers. We don't have to check against SQ qid as
+                * these are fatal errors.
+@@ -1391,7 +1397,7 @@ static void otx2_free_sq_res(struct otx2_nic *pf)
+       otx2_ctx_disable(&pf->mbox, NIX_AQ_CTYPE_SQ, false);
+       /* Free SQB pointers */
+       otx2_sq_free_sqbs(pf);
+-      for (qidx = 0; qidx < pf->hw.non_qos_queues; qidx++) {
++      for (qidx = 0; qidx < otx2_get_total_tx_queues(pf); qidx++) {
+               sq = &qset->sq[qidx];
+               qmem_free(pf->dev, sq->sqe);
+               qmem_free(pf->dev, sq->tso_hdrs);
+@@ -1441,7 +1447,7 @@ static int otx2_init_hw_resources(struct otx2_nic *pf)
+        * so, aura count = pool count.
+        */
+       hw->rqpool_cnt = hw->rx_queues;
+-      hw->sqpool_cnt = hw->non_qos_queues;
++      hw->sqpool_cnt = otx2_get_total_tx_queues(pf);
+       hw->pool_cnt = hw->rqpool_cnt + hw->sqpool_cnt;
+       /* Maximum hardware supported transmit length */
+@@ -1694,11 +1700,14 @@ int otx2_open(struct net_device *netdev)
+       netif_carrier_off(netdev);
+-      pf->qset.cq_cnt = pf->hw.rx_queues + pf->hw.non_qos_queues;
+       /* RQ and SQs are mapped to different CQs,
+        * so find out max CQ IRQs (i.e CINTs) needed.
+        */
+-      pf->hw.cint_cnt = max(pf->hw.rx_queues, pf->hw.tx_queues);
++      pf->hw.cint_cnt = max3(pf->hw.rx_queues, pf->hw.tx_queues,
++                             pf->hw.tc_tx_queues);
++
++      pf->qset.cq_cnt = pf->hw.rx_queues + otx2_get_total_tx_queues(pf);
++
+       qset->napi = kcalloc(pf->hw.cint_cnt, sizeof(*cq_poll), GFP_KERNEL);
+       if (!qset->napi)
+               return -ENOMEM;
+@@ -1749,6 +1758,11 @@ int otx2_open(struct net_device *netdev)
+               else
+                       cq_poll->cq_ids[CQ_XDP] = CINT_INVALID_CQ;
++              cq_poll->cq_ids[CQ_QOS] = (qidx < pf->hw.tc_tx_queues) ?
++                                        (qidx + pf->hw.rx_queues +
++                                         pf->hw.non_qos_queues) :
++                                        CINT_INVALID_CQ;
++
+               cq_poll->dev = (void *)pf;
+               cq_poll->dim.mode = DIM_CQ_PERIOD_MODE_START_FROM_CQE;
+               INIT_WORK(&cq_poll->dim.work, otx2_dim_work);
+@@ -1953,6 +1967,12 @@ static netdev_tx_t otx2_xmit(struct sk_buff *skb, struct net_device *netdev)
+       int qidx = skb_get_queue_mapping(skb);
+       struct otx2_snd_queue *sq;
+       struct netdev_queue *txq;
++      int sq_idx;
++
++      /* XDP SQs are not mapped with TXQs
++       * advance qid to derive correct sq mapped with QOS
++       */
++      sq_idx = (qidx >= pf->hw.tx_queues) ? (qidx + pf->hw.xdp_queues) : qidx;
+       /* Check for minimum and maximum packet length */
+       if (skb->len <= ETH_HLEN ||
+@@ -1961,7 +1981,7 @@ static netdev_tx_t otx2_xmit(struct sk_buff *skb, struct net_device *netdev)
+               return NETDEV_TX_OK;
+       }
+-      sq = &pf->qset.sq[qidx];
++      sq = &pf->qset.sq[sq_idx];
+       txq = netdev_get_tx_queue(netdev, qidx);
+       if (!otx2_sq_append_skb(netdev, sq, skb, qidx)) {
+@@ -1979,8 +1999,8 @@ static netdev_tx_t otx2_xmit(struct sk_buff *skb, struct net_device *netdev)
+       return NETDEV_TX_OK;
+ }
+-static u16 otx2_select_queue(struct net_device *netdev, struct sk_buff *skb,
+-                           struct net_device *sb_dev)
++u16 otx2_select_queue(struct net_device *netdev, struct sk_buff *skb,
++                    struct net_device *sb_dev)
+ {
+ #ifdef CONFIG_DCB
+       struct otx2_nic *pf = netdev_priv(netdev);
+@@ -2002,6 +2022,7 @@ static u16 otx2_select_queue(struct net_device *netdev, struct sk_buff *skb,
+ #endif
+       return netdev_pick_tx(netdev, skb, NULL);
+ }
++EXPORT_SYMBOL(otx2_select_queue);
+ static netdev_features_t otx2_fix_features(struct net_device *dev,
+                                          netdev_features_t features)
+@@ -2715,10 +2736,10 @@ static void otx2_sriov_vfcfg_cleanup(struct otx2_nic *pf)
+ static int otx2_probe(struct pci_dev *pdev, const struct pci_device_id *id)
+ {
+       struct device *dev = &pdev->dev;
++      int err, qcount, qos_txqs;
+       struct net_device *netdev;
+       struct otx2_nic *pf;
+       struct otx2_hw *hw;
+-      int err, qcount;
+       int num_vec;
+       err = pcim_enable_device(pdev);
+@@ -2743,8 +2764,9 @@ static int otx2_probe(struct pci_dev *pdev, const struct pci_device_id *id)
+       /* Set number of queues */
+       qcount = min_t(int, num_online_cpus(), OTX2_MAX_CQ_CNT);
++      qos_txqs = min_t(int, qcount, OTX2_QOS_MAX_LEAF_NODES);
+-      netdev = alloc_etherdev_mqs(sizeof(*pf), qcount, qcount);
++      netdev = alloc_etherdev_mqs(sizeof(*pf), qcount + qos_txqs, qcount);
+       if (!netdev) {
+               err = -ENOMEM;
+               goto err_release_regions;
+@@ -2931,6 +2953,8 @@ static int otx2_probe(struct pci_dev *pdev, const struct pci_device_id *id)
+               goto err_pf_sriov_init;
+ #endif
++      otx2_qos_init(pf, qos_txqs);
++
+       return 0;
+ err_pf_sriov_init:
+diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_txrx.c b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_txrx.c
+index 5704fb75fa477..d005434e1e037 100644
+--- a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_txrx.c
++++ b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_txrx.c
+@@ -468,12 +468,13 @@ static int otx2_tx_napi_handler(struct otx2_nic *pfvf,
+                       break;
+               }
+-              if (cq->cq_type == CQ_XDP) {
++              qidx = cq->cq_idx - pfvf->hw.rx_queues;
++
++              if (cq->cq_type == CQ_XDP)
+                       otx2_xdp_snd_pkt_handler(pfvf, sq, cqe);
+-              } else {
+-                      otx2_snd_pkt_handler(pfvf, cq, sq, cqe, budget,
+-                                           &tx_pkts, &tx_bytes);
+-              }
++              else
++                      otx2_snd_pkt_handler(pfvf, cq, &pfvf->qset.sq[qidx],
++                                           cqe, budget, &tx_pkts, &tx_bytes);
+               cqe->hdr.cqe_type = NIX_XQE_TYPE_INVALID;
+               processed_cqe++;
+@@ -490,7 +491,11 @@ static int otx2_tx_napi_handler(struct otx2_nic *pfvf,
+       if (likely(tx_pkts)) {
+               struct netdev_queue *txq;
+-              txq = netdev_get_tx_queue(pfvf->netdev, cq->cint_idx);
++              qidx = cq->cq_idx - pfvf->hw.rx_queues;
++
++              if (qidx >= pfvf->hw.tx_queues)
++                      qidx -= pfvf->hw.xdp_queues;
++              txq = netdev_get_tx_queue(pfvf->netdev, qidx);
+               netdev_tx_completed_queue(txq, tx_pkts, tx_bytes);
+               /* Check if queue was stopped earlier due to ring full */
+               smp_mb();
+@@ -738,7 +743,8 @@ static void otx2_sqe_add_hdr(struct otx2_nic *pfvf, struct otx2_snd_queue *sq,
+               sqe_hdr->aura = sq->aura_id;
+               /* Post a CQE Tx after pkt transmission */
+               sqe_hdr->pnc = 1;
+-              sqe_hdr->sq = qidx;
++              sqe_hdr->sq = (qidx >=  pfvf->hw.tx_queues) ?
++                             qidx + pfvf->hw.xdp_queues : qidx;
+       }
+       sqe_hdr->total = skb->len;
+       /* Set SQE identifier which will be used later for freeing SKB */
+@@ -1223,8 +1229,10 @@ void otx2_cleanup_tx_cqes(struct otx2_nic *pfvf, struct otx2_cq_queue *cq)
+       struct nix_cqe_tx_s *cqe;
+       int processed_cqe = 0;
+       struct sg_list *sg;
++      int qidx;
+-      sq = &pfvf->qset.sq[cq->cint_idx];
++      qidx = cq->cq_idx - pfvf->hw.rx_queues;
++      sq = &pfvf->qset.sq[qidx];
+       if (otx2_nix_cq_op_status(pfvf, cq) || !cq->pend_cqe)
+               return;
+diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_txrx.h b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_txrx.h
+index 93cac2c2664c2..7ab6db9a986fa 100644
+--- a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_txrx.h
++++ b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_txrx.h
+@@ -102,7 +102,8 @@ enum cq_type {
+       CQ_RX,
+       CQ_TX,
+       CQ_XDP,
+-      CQS_PER_CINT = 3, /* RQ + SQ + XDP */
++      CQ_QOS,
++      CQS_PER_CINT = 4, /* RQ + SQ + XDP + QOS_SQ */
+ };
+ struct otx2_cq_poll {
+diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_vf.c b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_vf.c
+index ad90f8f2aad1f..404855bccb4b6 100644
+--- a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_vf.c
++++ b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_vf.c
+@@ -475,6 +475,7 @@ static const struct net_device_ops otx2vf_netdev_ops = {
+       .ndo_open = otx2vf_open,
+       .ndo_stop = otx2vf_stop,
+       .ndo_start_xmit = otx2vf_xmit,
++      .ndo_select_queue = otx2_select_queue,
+       .ndo_set_rx_mode = otx2vf_set_rx_mode,
+       .ndo_set_mac_address = otx2_set_mac_address,
+       .ndo_change_mtu = otx2vf_change_mtu,
+@@ -520,10 +521,10 @@ static int otx2vf_probe(struct pci_dev *pdev, const struct pci_device_id *id)
+ {
+       int num_vec = pci_msix_vec_count(pdev);
+       struct device *dev = &pdev->dev;
++      int err, qcount, qos_txqs;
+       struct net_device *netdev;
+       struct otx2_nic *vf;
+       struct otx2_hw *hw;
+-      int err, qcount;
+       err = pcim_enable_device(pdev);
+       if (err) {
+@@ -546,7 +547,8 @@ static int otx2vf_probe(struct pci_dev *pdev, const struct pci_device_id *id)
+       pci_set_master(pdev);
+       qcount = num_online_cpus();
+-      netdev = alloc_etherdev_mqs(sizeof(*vf), qcount, qcount);
++      qos_txqs = min_t(int, qcount, OTX2_QOS_MAX_LEAF_NODES);
++      netdev = alloc_etherdev_mqs(sizeof(*vf), qcount + qos_txqs, qcount);
+       if (!netdev) {
+               err = -ENOMEM;
+               goto err_release_regions;
+@@ -695,6 +697,7 @@ static int otx2vf_probe(struct pci_dev *pdev, const struct pci_device_id *id)
+       if (err)
+               goto err_shutdown_tc;
+ #endif
++      otx2_qos_init(vf, qos_txqs);
+       return 0;
+diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/qos.h b/drivers/net/ethernet/marvell/octeontx2/nic/qos.h
+new file mode 100644
+index 0000000000000..73a62d092e99a
+--- /dev/null
++++ b/drivers/net/ethernet/marvell/octeontx2/nic/qos.h
+@@ -0,0 +1,19 @@
++/* SPDX-License-Identifier: GPL-2.0 */
++/* Marvell RVU Ethernet driver
++ *
++ * Copyright (C) 2023 Marvell.
++ *
++ */
++#ifndef OTX2_QOS_H
++#define OTX2_QOS_H
++
++#define OTX2_QOS_MAX_LEAF_NODES                16
++
++int otx2_qos_enable_sq(struct otx2_nic *pfvf, int qidx, u16 smq);
++void otx2_qos_disable_sq(struct otx2_nic *pfvf, int qidx, u16 mdq);
++
++struct otx2_qos {
++             u16 qid_to_sqmap[OTX2_QOS_MAX_LEAF_NODES];
++      };
++
++#endif
+diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/qos_sq.c b/drivers/net/ethernet/marvell/octeontx2/nic/qos_sq.c
+new file mode 100644
+index 0000000000000..e142d43f5a62c
+--- /dev/null
++++ b/drivers/net/ethernet/marvell/octeontx2/nic/qos_sq.c
+@@ -0,0 +1,282 @@
++// SPDX-License-Identifier: GPL-2.0
++/* Marvell RVU Physical Function ethernet driver
++ *
++ * Copyright (C) 2023 Marvell.
++ *
++ */
++
++#include <linux/netdevice.h>
++#include <net/tso.h>
++
++#include "cn10k.h"
++#include "otx2_reg.h"
++#include "otx2_common.h"
++#include "otx2_txrx.h"
++#include "otx2_struct.h"
++
++#define OTX2_QOS_MAX_LEAF_NODES 16
++
++static void otx2_qos_aura_pool_free(struct otx2_nic *pfvf, int pool_id)
++{
++      struct otx2_pool *pool;
++
++      if (!pfvf->qset.pool)
++              return;
++
++      pool = &pfvf->qset.pool[pool_id];
++      qmem_free(pfvf->dev, pool->stack);
++      qmem_free(pfvf->dev, pool->fc_addr);
++      pool->stack = NULL;
++      pool->fc_addr = NULL;
++}
++
++static int otx2_qos_sq_aura_pool_init(struct otx2_nic *pfvf, int qidx)
++{
++      struct otx2_qset *qset = &pfvf->qset;
++      int pool_id, stack_pages, num_sqbs;
++      struct otx2_hw *hw = &pfvf->hw;
++      struct otx2_snd_queue *sq;
++      struct otx2_pool *pool;
++      dma_addr_t bufptr;
++      int err, ptr;
++      u64 iova, pa;
++
++      /* Calculate number of SQBs needed.
++       *
++       * For a 128byte SQE, and 4K size SQB, 31 SQEs will fit in one SQB.
++       * Last SQE is used for pointing to next SQB.
++       */
++      num_sqbs = (hw->sqb_size / 128) - 1;
++      num_sqbs = (qset->sqe_cnt + num_sqbs) / num_sqbs;
++
++      /* Get no of stack pages needed */
++      stack_pages =
++              (num_sqbs + hw->stack_pg_ptrs - 1) / hw->stack_pg_ptrs;
++
++      pool_id = otx2_get_pool_idx(pfvf, AURA_NIX_SQ, qidx);
++      pool = &pfvf->qset.pool[pool_id];
++
++      /* Initialize aura context */
++      err = otx2_aura_init(pfvf, pool_id, pool_id, num_sqbs);
++      if (err)
++              return err;
++
++      /* Initialize pool context */
++      err = otx2_pool_init(pfvf, pool_id, stack_pages,
++                           num_sqbs, hw->sqb_size);
++      if (err)
++              goto aura_free;
++
++      /* Flush accumulated messages */
++      err = otx2_sync_mbox_msg(&pfvf->mbox);
++      if (err)
++              goto pool_free;
++
++      /* Allocate pointers and free them to aura/pool */
++      sq = &qset->sq[qidx];
++      sq->sqb_count = 0;
++      sq->sqb_ptrs = kcalloc(num_sqbs, sizeof(*sq->sqb_ptrs), GFP_KERNEL);
++      if (!sq->sqb_ptrs) {
++              err = -ENOMEM;
++              goto pool_free;
++      }
++
++      for (ptr = 0; ptr < num_sqbs; ptr++) {
++              err = otx2_alloc_rbuf(pfvf, pool, &bufptr);
++              if (err)
++                      goto sqb_free;
++              pfvf->hw_ops->aura_freeptr(pfvf, pool_id, bufptr);
++              sq->sqb_ptrs[sq->sqb_count++] = (u64)bufptr;
++      }
++
++      return 0;
++
++sqb_free:
++      while (ptr--) {
++              if (!sq->sqb_ptrs[ptr])
++                      continue;
++              iova = sq->sqb_ptrs[ptr];
++              pa = otx2_iova_to_phys(pfvf->iommu_domain, iova);
++              dma_unmap_page_attrs(pfvf->dev, iova, hw->sqb_size,
++                                   DMA_FROM_DEVICE,
++                                   DMA_ATTR_SKIP_CPU_SYNC);
++              put_page(virt_to_page(phys_to_virt(pa)));
++              otx2_aura_allocptr(pfvf, pool_id);
++      }
++      sq->sqb_count = 0;
++      kfree(sq->sqb_ptrs);
++pool_free:
++      qmem_free(pfvf->dev, pool->stack);
++aura_free:
++      qmem_free(pfvf->dev, pool->fc_addr);
++      otx2_mbox_reset(&pfvf->mbox.mbox, 0);
++      return err;
++}
++
++static void otx2_qos_sq_free_sqbs(struct otx2_nic *pfvf, int qidx)
++{
++      struct otx2_qset *qset = &pfvf->qset;
++      struct otx2_hw *hw = &pfvf->hw;
++      struct otx2_snd_queue *sq;
++      u64 iova, pa;
++      int sqb;
++
++      sq = &qset->sq[qidx];
++      if (!sq->sqb_ptrs)
++              return;
++      for (sqb = 0; sqb < sq->sqb_count; sqb++) {
++              if (!sq->sqb_ptrs[sqb])
++                      continue;
++              iova = sq->sqb_ptrs[sqb];
++              pa = otx2_iova_to_phys(pfvf->iommu_domain, iova);
++              dma_unmap_page_attrs(pfvf->dev, iova, hw->sqb_size,
++                                   DMA_FROM_DEVICE,
++                                   DMA_ATTR_SKIP_CPU_SYNC);
++              put_page(virt_to_page(phys_to_virt(pa)));
++      }
++
++      sq->sqb_count = 0;
++
++      sq = &qset->sq[qidx];
++      qmem_free(pfvf->dev, sq->sqe);
++      qmem_free(pfvf->dev, sq->tso_hdrs);
++      kfree(sq->sg);
++      kfree(sq->sqb_ptrs);
++      qmem_free(pfvf->dev, sq->timestamps);
++
++      memset((void *)sq, 0, sizeof(*sq));
++}
++
++/* send queue id */
++static void otx2_qos_sqb_flush(struct otx2_nic *pfvf, int qidx)
++{
++      int sqe_tail, sqe_head;
++      u64 incr, *ptr, val;
++
++      ptr = (__force u64 *)otx2_get_regaddr(pfvf, NIX_LF_SQ_OP_STATUS);
++      incr = (u64)qidx << 32;
++      val = otx2_atomic64_add(incr, ptr);
++      sqe_head = (val >> 20) & 0x3F;
++      sqe_tail = (val >> 28) & 0x3F;
++      if (sqe_head != sqe_tail)
++              usleep_range(50, 60);
++}
++
++static int otx2_qos_ctx_disable(struct otx2_nic *pfvf, u16 qidx, int aura_id)
++{
++      struct nix_cn10k_aq_enq_req *cn10k_sq_aq;
++      struct npa_aq_enq_req *aura_aq;
++      struct npa_aq_enq_req *pool_aq;
++      struct nix_aq_enq_req *sq_aq;
++
++      if (test_bit(CN10K_LMTST, &pfvf->hw.cap_flag)) {
++              cn10k_sq_aq = otx2_mbox_alloc_msg_nix_cn10k_aq_enq(&pfvf->mbox);
++              if (!cn10k_sq_aq)
++                      return -ENOMEM;
++              cn10k_sq_aq->qidx = qidx;
++              cn10k_sq_aq->sq.ena = 0;
++              cn10k_sq_aq->sq_mask.ena = 1;
++              cn10k_sq_aq->ctype = NIX_AQ_CTYPE_SQ;
++              cn10k_sq_aq->op = NIX_AQ_INSTOP_WRITE;
++      } else {
++              sq_aq = otx2_mbox_alloc_msg_nix_aq_enq(&pfvf->mbox);
++              if (!sq_aq)
++                      return -ENOMEM;
++              sq_aq->qidx = qidx;
++              sq_aq->sq.ena = 0;
++              sq_aq->sq_mask.ena = 1;
++              sq_aq->ctype = NIX_AQ_CTYPE_SQ;
++              sq_aq->op = NIX_AQ_INSTOP_WRITE;
++      }
++
++      aura_aq = otx2_mbox_alloc_msg_npa_aq_enq(&pfvf->mbox);
++      if (!aura_aq) {
++              otx2_mbox_reset(&pfvf->mbox.mbox, 0);
++              return -ENOMEM;
++      }
++
++      aura_aq->aura_id = aura_id;
++      aura_aq->aura.ena = 0;
++      aura_aq->aura_mask.ena = 1;
++      aura_aq->ctype = NPA_AQ_CTYPE_AURA;
++      aura_aq->op = NPA_AQ_INSTOP_WRITE;
++
++      pool_aq = otx2_mbox_alloc_msg_npa_aq_enq(&pfvf->mbox);
++      if (!pool_aq) {
++              otx2_mbox_reset(&pfvf->mbox.mbox, 0);
++              return -ENOMEM;
++      }
++
++      pool_aq->aura_id = aura_id;
++      pool_aq->pool.ena = 0;
++      pool_aq->pool_mask.ena = 1;
++
++      pool_aq->ctype = NPA_AQ_CTYPE_POOL;
++      pool_aq->op = NPA_AQ_INSTOP_WRITE;
++
++      return otx2_sync_mbox_msg(&pfvf->mbox);
++}
++
++int otx2_qos_enable_sq(struct otx2_nic *pfvf, int qidx, u16 smq)
++{
++      struct otx2_hw *hw = &pfvf->hw;
++      int pool_id, sq_idx, err;
++
++      if (pfvf->flags & OTX2_FLAG_INTF_DOWN)
++              return -EPERM;
++
++      sq_idx = hw->non_qos_queues + qidx;
++
++      mutex_lock(&pfvf->mbox.lock);
++      err = otx2_qos_sq_aura_pool_init(pfvf, sq_idx);
++      if (err)
++              goto out;
++
++      pool_id = otx2_get_pool_idx(pfvf, AURA_NIX_SQ, sq_idx);
++      pfvf->qos.qid_to_sqmap[qidx] = smq;
++      err = otx2_sq_init(pfvf, sq_idx, pool_id);
++      if (err)
++              goto out;
++out:
++      mutex_unlock(&pfvf->mbox.lock);
++      return err;
++}
++
++void otx2_qos_disable_sq(struct otx2_nic *pfvf, int qidx, u16 mdq)
++{
++      struct otx2_qset *qset = &pfvf->qset;
++      struct otx2_hw *hw = &pfvf->hw;
++      struct otx2_snd_queue *sq;
++      struct otx2_cq_queue *cq;
++      int pool_id, sq_idx;
++
++      sq_idx = hw->non_qos_queues + qidx;
++
++      /* If the DOWN flag is set SQs are already freed */
++      if (pfvf->flags & OTX2_FLAG_INTF_DOWN)
++              return;
++
++      sq = &pfvf->qset.sq[sq_idx];
++      if (!sq->sqb_ptrs)
++              return;
++
++      if (sq_idx < hw->non_qos_queues ||
++          sq_idx >= otx2_get_total_tx_queues(pfvf)) {
++              netdev_err(pfvf->netdev, "Send Queue is not a QoS queue\n");
++              return;
++      }
++
++      cq = &qset->cq[pfvf->hw.rx_queues + sq_idx];
++      pool_id = otx2_get_pool_idx(pfvf, AURA_NIX_SQ, sq_idx);
++
++      otx2_qos_sqb_flush(pfvf, sq_idx);
++      otx2_smq_flush(pfvf, otx2_get_smq_idx(pfvf, sq_idx));
++      otx2_cleanup_tx_cqes(pfvf, cq);
++
++      mutex_lock(&pfvf->mbox.lock);
++      otx2_qos_ctx_disable(pfvf, sq_idx, pool_id);
++      mutex_unlock(&pfvf->mbox.lock);
++
++      otx2_qos_sq_free_sqbs(pfvf, sq_idx);
++      otx2_qos_aura_pool_free(pfvf, pool_id);
++}
+-- 
+2.42.0
+
diff --git a/queue-6.1/octeontx2-pf-rename-tot_tx_queues-to-non_qos_queues.patch b/queue-6.1/octeontx2-pf-rename-tot_tx_queues-to-non_qos_queues.patch
new file mode 100644 (file)
index 0000000..40ba572
--- /dev/null
@@ -0,0 +1,184 @@
+From e00182d3cffef2a3d2f81c12a80094332e4d9a8b Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Sat, 13 May 2023 14:21:37 +0530
+Subject: octeontx2-pf: Rename tot_tx_queues to non_qos_queues
+
+From: Hariprasad Kelam <hkelam@marvell.com>
+
+[ Upstream commit 508c58f76ca510956625c945f9b8eb104f2c8208 ]
+
+The current implementation is such that tot_tx_queues contains both
+XDP queues and normal TX queues, which are allocated in the interface
+open path and deallocated in the interface down path respectively.
+
+With the addition of QoS, where send queues are allocated/deallocated
+upon user request, QoS send queues won't be part of tot_tx_queues. So
+this patch renames tot_tx_queues to non_qos_queues.
+
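+A rough sketch of the resulting bookkeeping (the demo_* names and struct
+are illustrative, not the driver's; the fields mirror otx2_hw, with
+tc_tx_queues coming from the companion QoS patch in this series):
+
+#include <linux/types.h>
+
+struct demo_hw {
+	u16 tx_queues;		/* regular TX queues */
+	u16 xdp_queues;		/* XDP TX queues */
+	u16 tc_tx_queues;	/* QoS send queues (added by the QoS patch) */
+};
+
+static inline u16 demo_non_qos_queues(const struct demo_hw *hw)
+{
+	/* allocated at interface open, freed at interface down */
+	return hw->tx_queues + hw->xdp_queues;
+}
+
+static inline u16 demo_total_tx_queues(const struct demo_hw *hw)
+{
+	/* QoS send queues are added on top, only on user request */
+	return demo_non_qos_queues(hw) + hw->tc_tx_queues;
+}
+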
+Signed-off-by: Hariprasad Kelam <hkelam@marvell.com>
+Reviewed-by: Simon Horman <simon.horman@corigine.com>
+Reviewed-by: Jacob Keller <jacob.e.keller@intel.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Stable-dep-of: 3423ca23e08b ("octeontx2-pf: Free pending and dropped SQEs")
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ .../ethernet/marvell/octeontx2/nic/otx2_common.c   | 12 ++++++------
+ .../ethernet/marvell/octeontx2/nic/otx2_common.h   |  2 +-
+ .../net/ethernet/marvell/octeontx2/nic/otx2_pf.c   | 14 +++++++-------
+ .../net/ethernet/marvell/octeontx2/nic/otx2_vf.c   |  2 +-
+ 4 files changed, 15 insertions(+), 15 deletions(-)
+
+diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.c b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.c
+index 011355e73696e..2575c207150e1 100644
+--- a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.c
++++ b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.c
+@@ -799,7 +799,7 @@ void otx2_sqb_flush(struct otx2_nic *pfvf)
+       int timeout = 1000;
+       ptr = (u64 *)otx2_get_regaddr(pfvf, NIX_LF_SQ_OP_STATUS);
+-      for (qidx = 0; qidx < pfvf->hw.tot_tx_queues; qidx++) {
++      for (qidx = 0; qidx < pfvf->hw.non_qos_queues; qidx++) {
+               incr = (u64)qidx << 32;
+               while (timeout) {
+                       val = otx2_atomic64_add(incr, ptr);
+@@ -1085,7 +1085,7 @@ int otx2_config_nix_queues(struct otx2_nic *pfvf)
+       }
+       /* Initialize TX queues */
+-      for (qidx = 0; qidx < pfvf->hw.tot_tx_queues; qidx++) {
++      for (qidx = 0; qidx < pfvf->hw.non_qos_queues; qidx++) {
+               u16 sqb_aura = otx2_get_pool_idx(pfvf, AURA_NIX_SQ, qidx);
+               err = otx2_sq_init(pfvf, qidx, sqb_aura);
+@@ -1132,7 +1132,7 @@ int otx2_config_nix(struct otx2_nic *pfvf)
+       /* Set RQ/SQ/CQ counts */
+       nixlf->rq_cnt = pfvf->hw.rx_queues;
+-      nixlf->sq_cnt = pfvf->hw.tot_tx_queues;
++      nixlf->sq_cnt = pfvf->hw.non_qos_queues;
+       nixlf->cq_cnt = pfvf->qset.cq_cnt;
+       nixlf->rss_sz = MAX_RSS_INDIR_TBL_SIZE;
+       nixlf->rss_grps = MAX_RSS_GROUPS;
+@@ -1170,7 +1170,7 @@ void otx2_sq_free_sqbs(struct otx2_nic *pfvf)
+       int sqb, qidx;
+       u64 iova, pa;
+-      for (qidx = 0; qidx < hw->tot_tx_queues; qidx++) {
++      for (qidx = 0; qidx < hw->non_qos_queues; qidx++) {
+               sq = &qset->sq[qidx];
+               if (!sq->sqb_ptrs)
+                       continue;
+@@ -1386,7 +1386,7 @@ int otx2_sq_aura_pool_init(struct otx2_nic *pfvf)
+       stack_pages =
+               (num_sqbs + hw->stack_pg_ptrs - 1) / hw->stack_pg_ptrs;
+-      for (qidx = 0; qidx < hw->tot_tx_queues; qidx++) {
++      for (qidx = 0; qidx < hw->non_qos_queues; qidx++) {
+               pool_id = otx2_get_pool_idx(pfvf, AURA_NIX_SQ, qidx);
+               /* Initialize aura context */
+               err = otx2_aura_init(pfvf, pool_id, pool_id, num_sqbs);
+@@ -1406,7 +1406,7 @@ int otx2_sq_aura_pool_init(struct otx2_nic *pfvf)
+               goto fail;
+       /* Allocate pointers and free them to aura/pool */
+-      for (qidx = 0; qidx < hw->tot_tx_queues; qidx++) {
++      for (qidx = 0; qidx < hw->non_qos_queues; qidx++) {
+               pool_id = otx2_get_pool_idx(pfvf, AURA_NIX_SQ, qidx);
+               pool = &pfvf->qset.pool[pool_id];
+diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.h b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.h
+index 8a9793b06769f..6c81d09798914 100644
+--- a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.h
++++ b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.h
+@@ -186,7 +186,7 @@ struct otx2_hw {
+       u16                     rx_queues;
+       u16                     tx_queues;
+       u16                     xdp_queues;
+-      u16                     tot_tx_queues;
++      u16                     non_qos_queues; /* tx queues plus xdp queues */
+       u16                     max_queues;
+       u16                     pool_cnt;
+       u16                     rqpool_cnt;
+diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_pf.c b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_pf.c
+index 101d79a0bb436..545984a86f235 100644
+--- a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_pf.c
++++ b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_pf.c
+@@ -1256,7 +1256,7 @@ static irqreturn_t otx2_q_intr_handler(int irq, void *data)
+       }
+       /* SQ */
+-      for (qidx = 0; qidx < pf->hw.tot_tx_queues; qidx++) {
++      for (qidx = 0; qidx < pf->hw.non_qos_queues; qidx++) {
+               u64 sq_op_err_dbg, mnq_err_dbg, snd_err_dbg;
+               u8 sq_op_err_code, mnq_err_code, snd_err_code;
+@@ -1391,7 +1391,7 @@ static void otx2_free_sq_res(struct otx2_nic *pf)
+       otx2_ctx_disable(&pf->mbox, NIX_AQ_CTYPE_SQ, false);
+       /* Free SQB pointers */
+       otx2_sq_free_sqbs(pf);
+-      for (qidx = 0; qidx < pf->hw.tot_tx_queues; qidx++) {
++      for (qidx = 0; qidx < pf->hw.non_qos_queues; qidx++) {
+               sq = &qset->sq[qidx];
+               qmem_free(pf->dev, sq->sqe);
+               qmem_free(pf->dev, sq->tso_hdrs);
+@@ -1441,7 +1441,7 @@ static int otx2_init_hw_resources(struct otx2_nic *pf)
+        * so, aura count = pool count.
+        */
+       hw->rqpool_cnt = hw->rx_queues;
+-      hw->sqpool_cnt = hw->tot_tx_queues;
++      hw->sqpool_cnt = hw->non_qos_queues;
+       hw->pool_cnt = hw->rqpool_cnt + hw->sqpool_cnt;
+       /* Maximum hardware supported transmit length */
+@@ -1694,7 +1694,7 @@ int otx2_open(struct net_device *netdev)
+       netif_carrier_off(netdev);
+-      pf->qset.cq_cnt = pf->hw.rx_queues + pf->hw.tot_tx_queues;
++      pf->qset.cq_cnt = pf->hw.rx_queues + pf->hw.non_qos_queues;
+       /* RQ and SQs are mapped to different CQs,
+        * so find out max CQ IRQs (i.e CINTs) needed.
+        */
+@@ -1714,7 +1714,7 @@ int otx2_open(struct net_device *netdev)
+       if (!qset->cq)
+               goto err_free_mem;
+-      qset->sq = kcalloc(pf->hw.tot_tx_queues,
++      qset->sq = kcalloc(pf->hw.non_qos_queues,
+                          sizeof(struct otx2_snd_queue), GFP_KERNEL);
+       if (!qset->sq)
+               goto err_free_mem;
+@@ -2532,7 +2532,7 @@ static int otx2_xdp_setup(struct otx2_nic *pf, struct bpf_prog *prog)
+       else
+               pf->hw.xdp_queues = 0;
+-      pf->hw.tot_tx_queues += pf->hw.xdp_queues;
++      pf->hw.non_qos_queues += pf->hw.xdp_queues;
+       if (if_up)
+               otx2_open(pf->netdev);
+@@ -2763,7 +2763,7 @@ static int otx2_probe(struct pci_dev *pdev, const struct pci_device_id *id)
+       hw->pdev = pdev;
+       hw->rx_queues = qcount;
+       hw->tx_queues = qcount;
+-      hw->tot_tx_queues = qcount;
++      hw->non_qos_queues = qcount;
+       hw->max_queues = qcount;
+       hw->rbuf_len = OTX2_DEFAULT_RBUF_LEN;
+       /* Use CQE of 128 byte descriptor size by default */
+diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_vf.c b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_vf.c
+index f8f0c01f62a14..ad90f8f2aad1f 100644
+--- a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_vf.c
++++ b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_vf.c
+@@ -566,7 +566,7 @@ static int otx2vf_probe(struct pci_dev *pdev, const struct pci_device_id *id)
+       hw->rx_queues = qcount;
+       hw->tx_queues = qcount;
+       hw->max_queues = qcount;
+-      hw->tot_tx_queues = qcount;
++      hw->non_qos_queues = qcount;
+       hw->rbuf_len = OTX2_DEFAULT_RBUF_LEN;
+       /* Use CQE of 128 byte descriptor size by default */
+       hw->xqe_size = 128;
+-- 
+2.42.0
+
diff --git a/queue-6.1/pwm-brcmstb-utilize-appropriate-clock-apis-in-suspen.patch b/queue-6.1/pwm-brcmstb-utilize-appropriate-clock-apis-in-suspen.patch
new file mode 100644 (file)
index 0000000..1d397d7
--- /dev/null
@@ -0,0 +1,51 @@
+From 4034c635c55102451912c2907d82937d7443bef6 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Wed, 4 Oct 2023 10:54:14 -0700
+Subject: pwm: brcmstb: Utilize appropriate clock APIs in suspend/resume
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+From: Florian Fainelli <florian.fainelli@broadcom.com>
+
+[ Upstream commit e9bc4411548aaa738905d37851a0146c16b3bb21 ]
+
+The suspend/resume functions currently use clk_disable()/clk_enable()
+respectively, which may be no-ops with certain clock providers such as
+SCMI. Fix this to use clk_disable_unprepare() and clk_prepare_enable()
+respectively, as we should.
+
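+For context, a minimal sketch of the intended pairing, assuming a
+hypothetical driver whose clock must be fully gated across suspend (the
+demo_* names are made up; only the clk_*() calls matter):
+
+#include <linux/clk.h>
+#include <linux/device.h>
+
+struct demo_pwm { struct clk *clk; };
+
+static int demo_suspend(struct device *dev)
+{
+	struct demo_pwm *p = dev_get_drvdata(dev);
+
+	/* clk_disable() alone can be a no-op for providers (e.g. SCMI)
+	 * that only act on prepare/unprepare, so drop both counts. */
+	clk_disable_unprepare(p->clk);
+	return 0;
+}
+
+static int demo_resume(struct device *dev)
+{
+	struct demo_pwm *p = dev_get_drvdata(dev);
+
+	return clk_prepare_enable(p->clk);
+}
+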
+Fixes: 3a9f5957020f ("pwm: Add Broadcom BCM7038 PWM controller support")
+Signed-off-by: Florian Fainelli <florian.fainelli@broadcom.com>
+Acked-by: Uwe Kleine-König <u.kleine-koenig@pengutronix.de>
+Signed-off-by: Thierry Reding <thierry.reding@gmail.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/pwm/pwm-brcmstb.c | 4 ++--
+ 1 file changed, 2 insertions(+), 2 deletions(-)
+
+diff --git a/drivers/pwm/pwm-brcmstb.c b/drivers/pwm/pwm-brcmstb.c
+index 3db3f96edf78d..6afd34d651c77 100644
+--- a/drivers/pwm/pwm-brcmstb.c
++++ b/drivers/pwm/pwm-brcmstb.c
+@@ -290,7 +290,7 @@ static int brcmstb_pwm_suspend(struct device *dev)
+ {
+       struct brcmstb_pwm *p = dev_get_drvdata(dev);
+-      clk_disable(p->clk);
++      clk_disable_unprepare(p->clk);
+       return 0;
+ }
+@@ -299,7 +299,7 @@ static int brcmstb_pwm_resume(struct device *dev)
+ {
+       struct brcmstb_pwm *p = dev_get_drvdata(dev);
+-      clk_enable(p->clk);
++      clk_prepare_enable(p->clk);
+       return 0;
+ }
+-- 
+2.42.0
+
diff --git a/queue-6.1/pwm-sti-reduce-number-of-allocations-and-drop-usage-.patch b/queue-6.1/pwm-sti-reduce-number-of-allocations-and-drop-usage-.patch
new file mode 100644 (file)
index 0000000..ab1ae5f
--- /dev/null
@@ -0,0 +1,115 @@
+From f21ab36aee4e32227e5dc76c74ef752c2193b133 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Wed, 5 Jul 2023 10:06:48 +0200
+Subject: pwm: sti: Reduce number of allocations and drop usage of chip_data
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+From: Uwe Kleine-König <u.kleine-koenig@pengutronix.de>
+
+[ Upstream commit 2d6812b41e0d832919d72c72ebddf361df53ba1b ]
+
+Instead of using one allocation per capture channel, use a single one. Also
+store it in driver data instead of chip data.
+
+This has several advantages:
+
+ - driver data isn't cleared when pwm_put() is called
+ - Reduces memory fragmentation
+
+Also register the PWM chip only after the per-capture-channel data is
+initialized, as the capture callback relies on this initialization and
+might be called even before pwmchip_add() returns.
+
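+A minimal sketch of the ordering constraint this implies (illustrative
+demo_* names, assuming an interrupt/capture path dereferences the
+per-channel data as soon as the chip is visible):
+
+#include <linux/mutex.h>
+#include <linux/pwm.h>
+#include <linux/wait.h>
+
+struct demo_ddata {
+	wait_queue_head_t wait;
+	struct mutex lock;
+};
+
+static int demo_register(struct pwm_chip *chip, struct demo_ddata *ddata,
+			 unsigned int n)
+{
+	unsigned int i;
+
+	/* Initialize everything a callback may touch before registering:
+	 * pwmchip_add() can expose the chip to users immediately. */
+	for (i = 0; i < n; i++) {
+		init_waitqueue_head(&ddata[i].wait);
+		mutex_init(&ddata[i].lock);
+	}
+
+	return pwmchip_add(chip);
+}
+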
+It would still be better to have struct sti_pwm_compat_data and the
+per-channel data struct sti_cpt_ddata in a single memory chunk, but that's
+not easily possible because the number of capture channels isn't known yet
+when the driver data struct is allocated.
+
+Fixes: e926b12c611c ("pwm: Clear chip_data in pwm_put()")
+Reported-by: George Stark <gnstark@sberdevices.ru>
+Fixes: c97267ae831d ("pwm: sti: Add PWM capture callback")
+Link: https://lore.kernel.org/r/20230705080650.2353391-7-u.kleine-koenig@pengutronix.de
+Signed-off-by: Uwe Kleine-König <u.kleine-koenig@pengutronix.de>
+Signed-off-by: Thierry Reding <thierry.reding@gmail.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/pwm/pwm-sti.c | 29 ++++++++++++++---------------
+ 1 file changed, 14 insertions(+), 15 deletions(-)
+
+diff --git a/drivers/pwm/pwm-sti.c b/drivers/pwm/pwm-sti.c
+index 44b1f93256b36..652fdb8dc7bfa 100644
+--- a/drivers/pwm/pwm-sti.c
++++ b/drivers/pwm/pwm-sti.c
+@@ -79,6 +79,7 @@ struct sti_pwm_compat_data {
+       unsigned int cpt_num_devs;
+       unsigned int max_pwm_cnt;
+       unsigned int max_prescale;
++      struct sti_cpt_ddata *ddata;
+ };
+ struct sti_pwm_chip {
+@@ -314,7 +315,7 @@ static int sti_pwm_capture(struct pwm_chip *chip, struct pwm_device *pwm,
+ {
+       struct sti_pwm_chip *pc = to_sti_pwmchip(chip);
+       struct sti_pwm_compat_data *cdata = pc->cdata;
+-      struct sti_cpt_ddata *ddata = pwm_get_chip_data(pwm);
++      struct sti_cpt_ddata *ddata = &cdata->ddata[pwm->hwpwm];
+       struct device *dev = pc->dev;
+       unsigned int effective_ticks;
+       unsigned long long high, low;
+@@ -440,7 +441,7 @@ static irqreturn_t sti_pwm_interrupt(int irq, void *data)
+       while (cpt_int_stat) {
+               devicenum = ffs(cpt_int_stat) - 1;
+-              ddata = pwm_get_chip_data(&pc->chip.pwms[devicenum]);
++              ddata = &pc->cdata->ddata[devicenum];
+               /*
+                * Capture input:
+@@ -638,30 +639,28 @@ static int sti_pwm_probe(struct platform_device *pdev)
+                       dev_err(dev, "failed to prepare clock\n");
+                       return ret;
+               }
++
++              cdata->ddata = devm_kzalloc(dev, cdata->cpt_num_devs * sizeof(*cdata->ddata), GFP_KERNEL);
++              if (!cdata->ddata)
++                      return -ENOMEM;
+       }
+       pc->chip.dev = dev;
+       pc->chip.ops = &sti_pwm_ops;
+       pc->chip.npwm = pc->cdata->pwm_num_devs;
+-      ret = pwmchip_add(&pc->chip);
+-      if (ret < 0) {
+-              clk_unprepare(pc->pwm_clk);
+-              clk_unprepare(pc->cpt_clk);
+-              return ret;
+-      }
+-
+       for (i = 0; i < cdata->cpt_num_devs; i++) {
+-              struct sti_cpt_ddata *ddata;
+-
+-              ddata = devm_kzalloc(dev, sizeof(*ddata), GFP_KERNEL);
+-              if (!ddata)
+-                      return -ENOMEM;
++              struct sti_cpt_ddata *ddata = &cdata->ddata[i];
+               init_waitqueue_head(&ddata->wait);
+               mutex_init(&ddata->lock);
++      }
+-              pwm_set_chip_data(&pc->chip.pwms[i], ddata);
++      ret = pwmchip_add(&pc->chip);
++      if (ret < 0) {
++              clk_unprepare(pc->pwm_clk);
++              clk_unprepare(pc->cpt_clk);
++              return ret;
+       }
+       platform_set_drvdata(pdev, pc);
+-- 
+2.42.0
+
diff --git a/queue-6.1/r8169-respect-userspace-disabling-iff_multicast.patch b/queue-6.1/r8169-respect-userspace-disabling-iff_multicast.patch
new file mode 100644 (file)
index 0000000..64cec14
--- /dev/null
@@ -0,0 +1,42 @@
+From 43453f1f795cd8c5e12ff2302a4d1e1c177f1f55 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Sun, 5 Nov 2023 23:43:36 +0100
+Subject: r8169: respect userspace disabling IFF_MULTICAST
+
+From: Heiner Kallweit <hkallweit1@gmail.com>
+
+[ Upstream commit 8999ce4cfc87e61b4143ec2e7b93d8e92e11fa7f ]
+
+So far we ignore the setting of IFF_MULTICAST. Fix this and clear the
+AcceptMulticast bit if IFF_MULTICAST isn't set.
+
+Note: Based on the implementations I've seen, it doesn't seem to be 100%
+clear what a driver is supposed to do if IFF_ALLMULTI is set but
+IFF_MULTICAST is not. This patch is based on the understanding that
+IFF_MULTICAST has precedence.
+
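+Expressed as a stand-alone sketch (the DEMO_* bits are placeholders for
+the chip's real AcceptMulticast/AcceptAllPhys register bits), the
+precedence this patch assumes is:
+
+#include <linux/bits.h>
+#include <linux/if.h>
+#include <linux/types.h>
+
+#define DEMO_ACCEPT_MULTICAST	BIT(0)
+#define DEMO_ACCEPT_ALLPHYS	BIT(1)
+
+static u32 demo_rx_mode(unsigned int flags, bool mc_filter_overflow)
+{
+	u32 rx_mode = DEMO_ACCEPT_MULTICAST;	/* multicast on by default */
+
+	if (flags & IFF_PROMISC)
+		rx_mode |= DEMO_ACCEPT_ALLPHYS;		/* accept everything */
+	else if (!(flags & IFF_MULTICAST))
+		rx_mode &= ~DEMO_ACCEPT_MULTICAST;	/* userspace disabled it */
+	else if ((flags & IFF_ALLMULTI) || mc_filter_overflow)
+		rx_mode |= DEMO_ACCEPT_MULTICAST;	/* all multicast, no hash filter */
+
+	return rx_mode;
+}
+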
+Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2")
+Signed-off-by: Heiner Kallweit <hkallweit1@gmail.com>
+Link: https://lore.kernel.org/r/4a57ba02-d52d-4369-9f14-3565e6c1f7dc@gmail.com
+Signed-off-by: Jakub Kicinski <kuba@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/net/ethernet/realtek/r8169_main.c | 2 ++
+ 1 file changed, 2 insertions(+)
+
+diff --git a/drivers/net/ethernet/realtek/r8169_main.c b/drivers/net/ethernet/realtek/r8169_main.c
+index c56d3538889b6..d14706265d9cb 100644
+--- a/drivers/net/ethernet/realtek/r8169_main.c
++++ b/drivers/net/ethernet/realtek/r8169_main.c
+@@ -2512,6 +2512,8 @@ static void rtl_set_rx_mode(struct net_device *dev)
+       if (dev->flags & IFF_PROMISC) {
+               rx_mode |= AcceptAllPhys;
++      } else if (!(dev->flags & IFF_MULTICAST)) {
++              rx_mode &= ~AcceptMulticast;
+       } else if (netdev_mc_count(dev) > MC_FILTER_LIMIT ||
+                  dev->flags & IFF_ALLMULTI ||
+                  tp->mac_version == RTL_GIGA_MAC_VER_35 ||
+-- 
+2.42.0
+
diff --git a/queue-6.1/risc-v-don-t-fail-in-riscv_of_parent_hartid-for-disa.patch b/queue-6.1/risc-v-don-t-fail-in-riscv_of_parent_hartid-for-disa.patch
new file mode 100644 (file)
index 0000000..0e9d021
--- /dev/null
@@ -0,0 +1,56 @@
+From fe1e50b0024a7bd3424894e00fc66fd5271b2ec1 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Fri, 27 Oct 2023 21:12:53 +0530
+Subject: RISC-V: Don't fail in riscv_of_parent_hartid() for disabled HARTs
+
+From: Anup Patel <apatel@ventanamicro.com>
+
+[ Upstream commit c4676f8dc1e12e68d6511f9ed89707fdad4c962c ]
+
+The riscv_of_processor_hartid() used by riscv_of_parent_hartid() fails
+for HARTs disabled in the DT. This results in the following warning
+thrown by the RISC-V INTC driver for the E-core on SiFive boards:
+
+[    0.000000] riscv-intc: unable to find hart id for /cpus/cpu@0/interrupt-controller
+
+riscv_of_parent_hartid() is only expected to read the hartid from the
+DT, so call of_get_cpu_hwid() directly instead of going through
+riscv_of_processor_hartid().
+
+Fixes: ad635e723e17 ("riscv: cpu: Add 64bit hartid support on RV64")
+Signed-off-by: Anup Patel <apatel@ventanamicro.com>
+Reviewed-by: Atish Patra <atishp@rivosinc.com>
+Link: https://lore.kernel.org/r/20231027154254.355853-2-apatel@ventanamicro.com
+Signed-off-by: Palmer Dabbelt <palmer@rivosinc.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ arch/riscv/kernel/cpu.c | 11 ++++++-----
+ 1 file changed, 6 insertions(+), 5 deletions(-)
+
+diff --git a/arch/riscv/kernel/cpu.c b/arch/riscv/kernel/cpu.c
+index 852ecccd8920f..0f76181dc634d 100644
+--- a/arch/riscv/kernel/cpu.c
++++ b/arch/riscv/kernel/cpu.c
+@@ -57,13 +57,14 @@ int riscv_of_processor_hartid(struct device_node *node, unsigned long *hart)
+  */
+ int riscv_of_parent_hartid(struct device_node *node, unsigned long *hartid)
+ {
+-      int rc;
+-
+       for (; node; node = node->parent) {
+               if (of_device_is_compatible(node, "riscv")) {
+-                      rc = riscv_of_processor_hartid(node, hartid);
+-                      if (!rc)
+-                              return 0;
++                      *hartid = (unsigned long)of_get_cpu_hwid(node, 0);
++                      if (*hartid == ~0UL) {
++                              pr_warn("Found CPU without hart ID\n");
++                              return -ENODEV;
++                      }
++                      return 0;
+               }
+       }
+-- 
+2.42.0
+
diff --git a/queue-6.1/selftests-pmtu.sh-fix-result-checking.patch b/queue-6.1/selftests-pmtu.sh-fix-result-checking.patch
new file mode 100644 (file)
index 0000000..bca9c5e
--- /dev/null
@@ -0,0 +1,41 @@
+From 17a251fbf65b4aaa37dadd8958676d8a677fde8a Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Tue, 31 Oct 2023 11:47:32 +0800
+Subject: selftests: pmtu.sh: fix result checking
+
+From: Hangbin Liu <liuhangbin@gmail.com>
+
+[ Upstream commit 63e201916b27260218e528a2f8758be47f99bbf4 ]
+
+In the PMTU test, when all previous tests are skipped and the new test
+passes, the exit code is set to 0. However, the existing check
+[ $exitcode=$ksft_skip ] has no spaces around '=', so the shell evaluates
+it as a test of a single non-empty string, which is always true.
+
+Consequently, regardless of how many tests have failed, if the latest test
+passes, the PMTU test will report a pass.
+
+Fixes: 2a9d3716b810 ("selftests: pmtu.sh: improve the test result processing")
+Signed-off-by: Hangbin Liu <liuhangbin@gmail.com>
+Acked-by: Po-Hsu Lin <po-hsu.lin@canonical.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ tools/testing/selftests/net/pmtu.sh | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+diff --git a/tools/testing/selftests/net/pmtu.sh b/tools/testing/selftests/net/pmtu.sh
+index dfe3d287f01d2..0d705fdcf3b76 100755
+--- a/tools/testing/selftests/net/pmtu.sh
++++ b/tools/testing/selftests/net/pmtu.sh
+@@ -2013,7 +2013,7 @@ run_test() {
+       case $ret in
+               0)
+                       all_skipped=false
+-                      [ $exitcode=$ksft_skip ] && exitcode=0
++                      [ $exitcode -eq $ksft_skip ] && exitcode=0
+               ;;
+               $ksft_skip)
+                       [ $all_skipped = true ] && exitcode=$ksft_skip
+-- 
+2.42.0
+
index 36c9103aa8a660344de50c4f0f5738676ca4b9cb..fd0c2c183250386b15335b994b05377f648e718c 100644 (file)
@@ -329,3 +329,39 @@ media-cadence-csi2rx-unregister-v4l2-async-notifier.patch
 media-dvb-usb-v2-af9035-fix-missing-unlock.patch
 media-cec-meson-always-include-meson-sub-directory-i.patch
 regmap-prevent-noinc-writes-from-clobbering-cache.patch
+pwm-sti-reduce-number-of-allocations-and-drop-usage-.patch
+pwm-brcmstb-utilize-appropriate-clock-apis-in-suspen.patch
+input-synaptics-rmi4-fix-use-after-free-in-rmi_unreg.patch
+watchdog-ixp4xx-make-sure-restart-always-works.patch
+llc-verify-mac-len-before-reading-mac-header.patch
+hsr-prevent-use-after-free-in-prp_create_tagged_fram.patch
+tipc-change-nla_policy-for-bearer-related-names-to-n.patch
+bpf-check-map-usercnt-after-timer-timer-is-assigned.patch
+inet-shrink-struct-flowi_common.patch
+octeontx2-pf-fix-error-codes.patch
+octeontx2-pf-fix-holes-in-error-code.patch
+net-page_pool-add-missing-free_percpu-when-page_pool.patch
+dccp-call-security_inet_conn_request-after-setting-i.patch
+dccp-tcp-call-security_inet_conn_request-after-setti.patch
+net-r8169-disable-multicast-filter-for-rtl8168h-and-.patch
+fix-termination-state-for-idr_for_each_entry_ul.patch
+net-stmmac-xgmac-enable-support-for-multiple-flexibl.patch
+selftests-pmtu.sh-fix-result-checking.patch
+octeontx2-pf-rename-tot_tx_queues-to-non_qos_queues.patch
+octeontx2-pf-qos-send-queues-management.patch
+octeontx2-pf-free-pending-and-dropped-sqes.patch
+net-smc-fix-dangling-sock-under-state-smc_appfinclos.patch
+net-smc-allow-cdc-msg-send-rather-than-drop-it-with-.patch
+net-smc-put-sk-reference-if-close-work-was-canceled.patch
+nvme-fix-error-handling-for-io_uring-nvme-passthroug.patch
+tg3-power-down-device-only-on-system_power_off.patch
+nbd-fix-uaf-in-nbd_open.patch
+blk-core-use-pr_warn_ratelimited-in-bio_check_ro.patch
+virtio-vsock-replace-virtio_vsock_pkt-with-sk_buff.patch
+vsock-virtio-remove-socket-from-connected-bound-list.patch
+r8169-respect-userspace-disabling-iff_multicast.patch
+i2c-iproc-handle-invalid-slave-state.patch
+netfilter-xt_recent-fix-increase-ipv6-literal-buffer.patch
+netfilter-nft_redir-use-struct-nf_nat_range2-through.patch
+netfilter-nat-fix-ipv6-nat-redirect-with-mapped-and-.patch
+risc-v-don-t-fail-in-riscv_of_parent_hartid-for-disa.patch
diff --git a/queue-6.1/tg3-power-down-device-only-on-system_power_off.patch b/queue-6.1/tg3-power-down-device-only-on-system_power_off.patch
new file mode 100644 (file)
index 0000000..f6369ca
--- /dev/null
@@ -0,0 +1,46 @@
+From bb2266f47afc23ff00d5cf74034978d21d28dfd0 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Fri, 3 Nov 2023 13:50:29 +0200
+Subject: tg3: power down device only on SYSTEM_POWER_OFF
+
+From: George Shuklin <george.shuklin@gmail.com>
+
+[ Upstream commit 9fc3bc7643341dc5be7d269f3d3dbe441d8d7ac3 ]
+
+Dell R650xs servers hang on reboot if the tg3 driver calls
+tg3_power_down().
+
+This happens only if network adapters (BCM5720 for R650xs) were
+initialized using SNP (e.g. by booting ipxe.efi).
+
+The actual problem is on Dell's side, but this fix allows the servers
+to come back alive after a reboot.
+
+Signed-off-by: George Shuklin <george.shuklin@gmail.com>
+Fixes: 2ca1c94ce0b6 ("tg3: Disable tg3 device on system reboot to avoid triggering AER")
+Reviewed-by: Pavan Chebbi <pavan.chebbi@broadcom.com>
+Reviewed-by: Michael Chan <michael.chan@broadcom.com>
+Link: https://lore.kernel.org/r/20231103115029.83273-1-george.shuklin@gmail.com
+Signed-off-by: Jakub Kicinski <kuba@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/net/ethernet/broadcom/tg3.c | 3 ++-
+ 1 file changed, 2 insertions(+), 1 deletion(-)
+
+diff --git a/drivers/net/ethernet/broadcom/tg3.c b/drivers/net/ethernet/broadcom/tg3.c
+index 9609041016776..85570e40c8e9b 100644
+--- a/drivers/net/ethernet/broadcom/tg3.c
++++ b/drivers/net/ethernet/broadcom/tg3.c
+@@ -18086,7 +18086,8 @@ static void tg3_shutdown(struct pci_dev *pdev)
+       if (netif_running(dev))
+               dev_close(dev);
+-      tg3_power_down(tp);
++      if (system_state == SYSTEM_POWER_OFF)
++              tg3_power_down(tp);
+       rtnl_unlock();
+-- 
+2.42.0
+
diff --git a/queue-6.1/tipc-change-nla_policy-for-bearer-related-names-to-n.patch b/queue-6.1/tipc-change-nla_policy-for-bearer-related-names-to-n.patch
new file mode 100644 (file)
index 0000000..db0295a
--- /dev/null
@@ -0,0 +1,111 @@
+From 8159bf4cc90607234a192c9a6170a78a50801d0d Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Mon, 30 Oct 2023 16:55:40 +0900
+Subject: tipc: Change nla_policy for bearer-related names to NLA_NUL_STRING
+
+From: Shigeru Yoshida <syoshida@redhat.com>
+
+[ Upstream commit 19b3f72a41a8751e26bffc093bb7e1cef29ad579 ]
+
+syzbot reported the following uninit-value access issue [1]:
+
+=====================================================
+BUG: KMSAN: uninit-value in strlen lib/string.c:418 [inline]
+BUG: KMSAN: uninit-value in strstr+0xb8/0x2f0 lib/string.c:756
+ strlen lib/string.c:418 [inline]
+ strstr+0xb8/0x2f0 lib/string.c:756
+ tipc_nl_node_reset_link_stats+0x3ea/0xb50 net/tipc/node.c:2595
+ genl_family_rcv_msg_doit net/netlink/genetlink.c:971 [inline]
+ genl_family_rcv_msg net/netlink/genetlink.c:1051 [inline]
+ genl_rcv_msg+0x11ec/0x1290 net/netlink/genetlink.c:1066
+ netlink_rcv_skb+0x371/0x650 net/netlink/af_netlink.c:2545
+ genl_rcv+0x40/0x60 net/netlink/genetlink.c:1075
+ netlink_unicast_kernel net/netlink/af_netlink.c:1342 [inline]
+ netlink_unicast+0xf47/0x1250 net/netlink/af_netlink.c:1368
+ netlink_sendmsg+0x1238/0x13d0 net/netlink/af_netlink.c:1910
+ sock_sendmsg_nosec net/socket.c:730 [inline]
+ sock_sendmsg net/socket.c:753 [inline]
+ ____sys_sendmsg+0x9c2/0xd60 net/socket.c:2541
+ ___sys_sendmsg+0x28d/0x3c0 net/socket.c:2595
+ __sys_sendmsg net/socket.c:2624 [inline]
+ __do_sys_sendmsg net/socket.c:2633 [inline]
+ __se_sys_sendmsg net/socket.c:2631 [inline]
+ __x64_sys_sendmsg+0x307/0x490 net/socket.c:2631
+ do_syscall_x64 arch/x86/entry/common.c:50 [inline]
+ do_syscall_64+0x41/0xc0 arch/x86/entry/common.c:80
+ entry_SYSCALL_64_after_hwframe+0x63/0xcd
+
+Uninit was created at:
+ slab_post_alloc_hook+0x12f/0xb70 mm/slab.h:767
+ slab_alloc_node mm/slub.c:3478 [inline]
+ kmem_cache_alloc_node+0x577/0xa80 mm/slub.c:3523
+ kmalloc_reserve+0x13d/0x4a0 net/core/skbuff.c:559
+ __alloc_skb+0x318/0x740 net/core/skbuff.c:650
+ alloc_skb include/linux/skbuff.h:1286 [inline]
+ netlink_alloc_large_skb net/netlink/af_netlink.c:1214 [inline]
+ netlink_sendmsg+0xb34/0x13d0 net/netlink/af_netlink.c:1885
+ sock_sendmsg_nosec net/socket.c:730 [inline]
+ sock_sendmsg net/socket.c:753 [inline]
+ ____sys_sendmsg+0x9c2/0xd60 net/socket.c:2541
+ ___sys_sendmsg+0x28d/0x3c0 net/socket.c:2595
+ __sys_sendmsg net/socket.c:2624 [inline]
+ __do_sys_sendmsg net/socket.c:2633 [inline]
+ __se_sys_sendmsg net/socket.c:2631 [inline]
+ __x64_sys_sendmsg+0x307/0x490 net/socket.c:2631
+ do_syscall_x64 arch/x86/entry/common.c:50 [inline]
+ do_syscall_64+0x41/0xc0 arch/x86/entry/common.c:80
+ entry_SYSCALL_64_after_hwframe+0x63/0xcd
+
+TIPC bearer-related names, including link names, must be null-terminated
+strings. If a link name that is not null-terminated is passed through
+netlink, strstr() and similar functions can read past the buffer. This
+causes the above issue.
+
+This patch changes the nla_policy for bearer-related names from NLA_STRING
+to NLA_NUL_STRING. This resolves the issue by ensuring that only
+null-terminated strings are accepted as bearer-related names.
+
+syzbot reported a similar uninit-value issue related to bearer names [2].
+The root cause of that issue is that a non-null-terminated bearer name was
+passed. This patch resolves that issue as well.
+
+Fixes: 7be57fc69184 ("tipc: add link get/dump to new netlink api")
+Fixes: 0655f6a8635b ("tipc: add bearer disable/enable to new netlink api")
+Reported-and-tested-by: syzbot+5138ca807af9d2b42574@syzkaller.appspotmail.com
+Closes: https://syzkaller.appspot.com/bug?extid=5138ca807af9d2b42574 [1]
+Reported-and-tested-by: syzbot+9425c47dccbcb4c17d51@syzkaller.appspotmail.com
+Closes: https://syzkaller.appspot.com/bug?extid=9425c47dccbcb4c17d51 [2]
+Signed-off-by: Shigeru Yoshida <syoshida@redhat.com>
+Reviewed-by: Jiri Pirko <jiri@nvidia.com>
+Link: https://lore.kernel.org/r/20231030075540.3784537-1-syoshida@redhat.com
+Signed-off-by: Jakub Kicinski <kuba@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ net/tipc/netlink.c | 4 ++--
+ 1 file changed, 2 insertions(+), 2 deletions(-)
+
+diff --git a/net/tipc/netlink.c b/net/tipc/netlink.c
+index e8fd257c0e688..1a9a5bdaccf4f 100644
+--- a/net/tipc/netlink.c
++++ b/net/tipc/netlink.c
+@@ -88,7 +88,7 @@ const struct nla_policy tipc_nl_net_policy[TIPC_NLA_NET_MAX + 1] = {
+ const struct nla_policy tipc_nl_link_policy[TIPC_NLA_LINK_MAX + 1] = {
+       [TIPC_NLA_LINK_UNSPEC]          = { .type = NLA_UNSPEC },
+-      [TIPC_NLA_LINK_NAME]            = { .type = NLA_STRING,
++      [TIPC_NLA_LINK_NAME]            = { .type = NLA_NUL_STRING,
+                                           .len = TIPC_MAX_LINK_NAME },
+       [TIPC_NLA_LINK_MTU]             = { .type = NLA_U32 },
+       [TIPC_NLA_LINK_BROADCAST]       = { .type = NLA_FLAG },
+@@ -125,7 +125,7 @@ const struct nla_policy tipc_nl_prop_policy[TIPC_NLA_PROP_MAX + 1] = {
+ const struct nla_policy tipc_nl_bearer_policy[TIPC_NLA_BEARER_MAX + 1]        = {
+       [TIPC_NLA_BEARER_UNSPEC]        = { .type = NLA_UNSPEC },
+-      [TIPC_NLA_BEARER_NAME]          = { .type = NLA_STRING,
++      [TIPC_NLA_BEARER_NAME]          = { .type = NLA_NUL_STRING,
+                                           .len = TIPC_MAX_BEARER_NAME },
+       [TIPC_NLA_BEARER_PROP]          = { .type = NLA_NESTED },
+       [TIPC_NLA_BEARER_DOMAIN]        = { .type = NLA_U32 }
+-- 
+2.42.0
+
diff --git a/queue-6.1/virtio-vsock-replace-virtio_vsock_pkt-with-sk_buff.patch b/queue-6.1/virtio-vsock-replace-virtio_vsock_pkt-with-sk_buff.patch
new file mode 100644 (file)
index 0000000..82faff2
--- /dev/null
@@ -0,0 +1,1983 @@
+From dc5f3dc5e6910cd026685601ab84ffd77ceafc09 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Fri, 13 Jan 2023 22:21:37 +0000
+Subject: virtio/vsock: replace virtio_vsock_pkt with sk_buff
+
+From: Bobby Eshleman <bobby.eshleman@bytedance.com>
+
+[ Upstream commit 71dc9ec9ac7d3eee785cdc986c3daeb821381e20 ]
+
+This commit changes virtio/vsock to use sk_buff instead of
+virtio_vsock_pkt. Beyond better conforming to other net code, using
+sk_buff allows vsock to use sk_buff-dependent features in the future
+(such as sockmap) and improves throughput.
+
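+One way to carry the virtio header next to the payload in a single
+sk_buff (a sketch of the general idea; the demo_* helpers are
+hypothetical, the patch's own accessors appear in the diff below):
+
+#include <linux/skbuff.h>
+#include <linux/virtio_vsock.h>
+
+/* Hand back a typed pointer to the headroom reserved for the wire header. */
+static inline struct virtio_vsock_hdr *demo_vsock_hdr(struct sk_buff *skb)
+{
+	return (struct virtio_vsock_hdr *)skb->head;
+}
+
+static struct sk_buff *demo_vsock_alloc(unsigned int payload_len, gfp_t mask)
+{
+	struct sk_buff *skb;
+
+	skb = alloc_skb(sizeof(struct virtio_vsock_hdr) + payload_len, mask);
+	if (!skb)
+		return NULL;
+
+	/* Header lives in the headroom; skb->data/len cover only payload. */
+	skb_reserve(skb, sizeof(struct virtio_vsock_hdr));
+	return skb;
+}
+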
+This patch introduces the following performance changes:
+
+Tool: Uperf
+Env: Phys Host + L1 Guest
+Payload: 64k
+Threads: 16
+Test Runs: 10
+Type: SOCK_STREAM
+Before: commit b7bfaa761d760 ("Linux 6.2-rc3")
+
+Before
+------
+g2h: 16.77Gb/s
+h2g: 10.56Gb/s
+
+After
+-----
+g2h: 21.04Gb/s
+h2g: 10.76Gb/s
+
+Signed-off-by: Bobby Eshleman <bobby.eshleman@bytedance.com>
+Reviewed-by: Stefano Garzarella <sgarzare@redhat.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Stable-dep-of: 3a5cc90a4d17 ("vsock/virtio: remove socket from connected/bound list on shutdown")
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/vhost/vsock.c                   | 214 +++++-------
+ include/linux/virtio_vsock.h            | 129 ++++++--
+ net/vmw_vsock/virtio_transport.c        | 149 +++------
+ net/vmw_vsock/virtio_transport_common.c | 422 +++++++++++++-----------
+ net/vmw_vsock/vsock_loopback.c          |  51 +--
+ 5 files changed, 498 insertions(+), 467 deletions(-)
+
+diff --git a/drivers/vhost/vsock.c b/drivers/vhost/vsock.c
+index a2b3743723639..1f3b89c885cca 100644
+--- a/drivers/vhost/vsock.c
++++ b/drivers/vhost/vsock.c
+@@ -51,8 +51,7 @@ struct vhost_vsock {
+       struct hlist_node hash;
+       struct vhost_work send_pkt_work;
+-      spinlock_t send_pkt_list_lock;
+-      struct list_head send_pkt_list; /* host->guest pending packets */
++      struct sk_buff_head send_pkt_queue; /* host->guest pending packets */
+       atomic_t queued_replies;
+@@ -108,40 +107,31 @@ vhost_transport_do_send_pkt(struct vhost_vsock *vsock,
+       vhost_disable_notify(&vsock->dev, vq);
+       do {
+-              struct virtio_vsock_pkt *pkt;
++              struct virtio_vsock_hdr *hdr;
++              size_t iov_len, payload_len;
+               struct iov_iter iov_iter;
++              u32 flags_to_restore = 0;
++              struct sk_buff *skb;
+               unsigned out, in;
+               size_t nbytes;
+-              size_t iov_len, payload_len;
+               int head;
+-              u32 flags_to_restore = 0;
+-              spin_lock_bh(&vsock->send_pkt_list_lock);
+-              if (list_empty(&vsock->send_pkt_list)) {
+-                      spin_unlock_bh(&vsock->send_pkt_list_lock);
++              skb = virtio_vsock_skb_dequeue(&vsock->send_pkt_queue);
++
++              if (!skb) {
+                       vhost_enable_notify(&vsock->dev, vq);
+                       break;
+               }
+-              pkt = list_first_entry(&vsock->send_pkt_list,
+-                                     struct virtio_vsock_pkt, list);
+-              list_del_init(&pkt->list);
+-              spin_unlock_bh(&vsock->send_pkt_list_lock);
+-
+               head = vhost_get_vq_desc(vq, vq->iov, ARRAY_SIZE(vq->iov),
+                                        &out, &in, NULL, NULL);
+               if (head < 0) {
+-                      spin_lock_bh(&vsock->send_pkt_list_lock);
+-                      list_add(&pkt->list, &vsock->send_pkt_list);
+-                      spin_unlock_bh(&vsock->send_pkt_list_lock);
++                      virtio_vsock_skb_queue_head(&vsock->send_pkt_queue, skb);
+                       break;
+               }
+               if (head == vq->num) {
+-                      spin_lock_bh(&vsock->send_pkt_list_lock);
+-                      list_add(&pkt->list, &vsock->send_pkt_list);
+-                      spin_unlock_bh(&vsock->send_pkt_list_lock);
+-
++                      virtio_vsock_skb_queue_head(&vsock->send_pkt_queue, skb);
+                       /* We cannot finish yet if more buffers snuck in while
+                        * re-enabling notify.
+                        */
+@@ -153,26 +143,27 @@ vhost_transport_do_send_pkt(struct vhost_vsock *vsock,
+               }
+               if (out) {
+-                      virtio_transport_free_pkt(pkt);
++                      kfree_skb(skb);
+                       vq_err(vq, "Expected 0 output buffers, got %u\n", out);
+                       break;
+               }
+               iov_len = iov_length(&vq->iov[out], in);
+-              if (iov_len < sizeof(pkt->hdr)) {
+-                      virtio_transport_free_pkt(pkt);
++              if (iov_len < sizeof(*hdr)) {
++                      kfree_skb(skb);
+                       vq_err(vq, "Buffer len [%zu] too small\n", iov_len);
+                       break;
+               }
+               iov_iter_init(&iov_iter, ITER_DEST, &vq->iov[out], in, iov_len);
+-              payload_len = pkt->len - pkt->off;
++              payload_len = skb->len;
++              hdr = virtio_vsock_hdr(skb);
+               /* If the packet is greater than the space available in the
+                * buffer, we split it using multiple buffers.
+                */
+-              if (payload_len > iov_len - sizeof(pkt->hdr)) {
+-                      payload_len = iov_len - sizeof(pkt->hdr);
++              if (payload_len > iov_len - sizeof(*hdr)) {
++                      payload_len = iov_len - sizeof(*hdr);
+                       /* As we are copying pieces of large packet's buffer to
+                        * small rx buffers, headers of packets in rx queue are
+@@ -185,31 +176,30 @@ vhost_transport_do_send_pkt(struct vhost_vsock *vsock,
+                        * bits set. After initialized header will be copied to
+                        * rx buffer, these required bits will be restored.
+                        */
+-                      if (le32_to_cpu(pkt->hdr.flags) & VIRTIO_VSOCK_SEQ_EOM) {
+-                              pkt->hdr.flags &= ~cpu_to_le32(VIRTIO_VSOCK_SEQ_EOM);
++                      if (le32_to_cpu(hdr->flags) & VIRTIO_VSOCK_SEQ_EOM) {
++                              hdr->flags &= ~cpu_to_le32(VIRTIO_VSOCK_SEQ_EOM);
+                               flags_to_restore |= VIRTIO_VSOCK_SEQ_EOM;
+-                              if (le32_to_cpu(pkt->hdr.flags) & VIRTIO_VSOCK_SEQ_EOR) {
+-                                      pkt->hdr.flags &= ~cpu_to_le32(VIRTIO_VSOCK_SEQ_EOR);
++                              if (le32_to_cpu(hdr->flags) & VIRTIO_VSOCK_SEQ_EOR) {
++                                      hdr->flags &= ~cpu_to_le32(VIRTIO_VSOCK_SEQ_EOR);
+                                       flags_to_restore |= VIRTIO_VSOCK_SEQ_EOR;
+                               }
+                       }
+               }
+               /* Set the correct length in the header */
+-              pkt->hdr.len = cpu_to_le32(payload_len);
++              hdr->len = cpu_to_le32(payload_len);
+-              nbytes = copy_to_iter(&pkt->hdr, sizeof(pkt->hdr), &iov_iter);
+-              if (nbytes != sizeof(pkt->hdr)) {
+-                      virtio_transport_free_pkt(pkt);
++              nbytes = copy_to_iter(hdr, sizeof(*hdr), &iov_iter);
++              if (nbytes != sizeof(*hdr)) {
++                      kfree_skb(skb);
+                       vq_err(vq, "Faulted on copying pkt hdr\n");
+                       break;
+               }
+-              nbytes = copy_to_iter(pkt->buf + pkt->off, payload_len,
+-                                    &iov_iter);
++              nbytes = copy_to_iter(skb->data, payload_len, &iov_iter);
+               if (nbytes != payload_len) {
+-                      virtio_transport_free_pkt(pkt);
++                      kfree_skb(skb);
+                       vq_err(vq, "Faulted on copying pkt buf\n");
+                       break;
+               }
+@@ -217,31 +207,28 @@ vhost_transport_do_send_pkt(struct vhost_vsock *vsock,
+               /* Deliver to monitoring devices all packets that we
+                * will transmit.
+                */
+-              virtio_transport_deliver_tap_pkt(pkt);
++              virtio_transport_deliver_tap_pkt(skb);
+-              vhost_add_used(vq, head, sizeof(pkt->hdr) + payload_len);
++              vhost_add_used(vq, head, sizeof(*hdr) + payload_len);
+               added = true;
+-              pkt->off += payload_len;
++              skb_pull(skb, payload_len);
+               total_len += payload_len;
+               /* If we didn't send all the payload we can requeue the packet
+                * to send it with the next available buffer.
+                */
+-              if (pkt->off < pkt->len) {
+-                      pkt->hdr.flags |= cpu_to_le32(flags_to_restore);
++              if (skb->len > 0) {
++                      hdr->flags |= cpu_to_le32(flags_to_restore);
+-                      /* We are queueing the same virtio_vsock_pkt to handle
++                      /* We are queueing the same skb to handle
+                        * the remaining bytes, and we want to deliver it
+                        * to monitoring devices in the next iteration.
+                        */
+-                      pkt->tap_delivered = false;
+-
+-                      spin_lock_bh(&vsock->send_pkt_list_lock);
+-                      list_add(&pkt->list, &vsock->send_pkt_list);
+-                      spin_unlock_bh(&vsock->send_pkt_list_lock);
++                      virtio_vsock_skb_clear_tap_delivered(skb);
++                      virtio_vsock_skb_queue_head(&vsock->send_pkt_queue, skb);
+               } else {
+-                      if (pkt->reply) {
++                      if (virtio_vsock_skb_reply(skb)) {
+                               int val;
+                               val = atomic_dec_return(&vsock->queued_replies);
+@@ -253,7 +240,7 @@ vhost_transport_do_send_pkt(struct vhost_vsock *vsock,
+                                       restart_tx = true;
+                       }
+-                      virtio_transport_free_pkt(pkt);
++                      consume_skb(skb);
+               }
+       } while(likely(!vhost_exceeds_weight(vq, ++pkts, total_len)));
+       if (added)
+@@ -278,28 +265,26 @@ static void vhost_transport_send_pkt_work(struct vhost_work *work)
+ }
+ static int
+-vhost_transport_send_pkt(struct virtio_vsock_pkt *pkt)
++vhost_transport_send_pkt(struct sk_buff *skb)
+ {
++      struct virtio_vsock_hdr *hdr = virtio_vsock_hdr(skb);
+       struct vhost_vsock *vsock;
+-      int len = pkt->len;
++      int len = skb->len;
+       rcu_read_lock();
+       /* Find the vhost_vsock according to guest context id  */
+-      vsock = vhost_vsock_get(le64_to_cpu(pkt->hdr.dst_cid));
++      vsock = vhost_vsock_get(le64_to_cpu(hdr->dst_cid));
+       if (!vsock) {
+               rcu_read_unlock();
+-              virtio_transport_free_pkt(pkt);
++              kfree_skb(skb);
+               return -ENODEV;
+       }
+-      if (pkt->reply)
++      if (virtio_vsock_skb_reply(skb))
+               atomic_inc(&vsock->queued_replies);
+-      spin_lock_bh(&vsock->send_pkt_list_lock);
+-      list_add_tail(&pkt->list, &vsock->send_pkt_list);
+-      spin_unlock_bh(&vsock->send_pkt_list_lock);
+-
++      virtio_vsock_skb_queue_tail(&vsock->send_pkt_queue, skb);
+       vhost_work_queue(&vsock->dev, &vsock->send_pkt_work);
+       rcu_read_unlock();
+@@ -310,10 +295,8 @@ static int
+ vhost_transport_cancel_pkt(struct vsock_sock *vsk)
+ {
+       struct vhost_vsock *vsock;
+-      struct virtio_vsock_pkt *pkt, *n;
+       int cnt = 0;
+       int ret = -ENODEV;
+-      LIST_HEAD(freeme);
+       rcu_read_lock();
+@@ -322,20 +305,7 @@ vhost_transport_cancel_pkt(struct vsock_sock *vsk)
+       if (!vsock)
+               goto out;
+-      spin_lock_bh(&vsock->send_pkt_list_lock);
+-      list_for_each_entry_safe(pkt, n, &vsock->send_pkt_list, list) {
+-              if (pkt->vsk != vsk)
+-                      continue;
+-              list_move(&pkt->list, &freeme);
+-      }
+-      spin_unlock_bh(&vsock->send_pkt_list_lock);
+-
+-      list_for_each_entry_safe(pkt, n, &freeme, list) {
+-              if (pkt->reply)
+-                      cnt++;
+-              list_del(&pkt->list);
+-              virtio_transport_free_pkt(pkt);
+-      }
++      cnt = virtio_transport_purge_skbs(vsk, &vsock->send_pkt_queue);
+       if (cnt) {
+               struct vhost_virtqueue *tx_vq = &vsock->vqs[VSOCK_VQ_TX];
+@@ -352,12 +322,14 @@ vhost_transport_cancel_pkt(struct vsock_sock *vsk)
+       return ret;
+ }
+-static struct virtio_vsock_pkt *
+-vhost_vsock_alloc_pkt(struct vhost_virtqueue *vq,
++static struct sk_buff *
++vhost_vsock_alloc_skb(struct vhost_virtqueue *vq,
+                     unsigned int out, unsigned int in)
+ {
+-      struct virtio_vsock_pkt *pkt;
++      struct virtio_vsock_hdr *hdr;
+       struct iov_iter iov_iter;
++      struct sk_buff *skb;
++      size_t payload_len;
+       size_t nbytes;
+       size_t len;
+@@ -366,50 +338,48 @@ vhost_vsock_alloc_pkt(struct vhost_virtqueue *vq,
+               return NULL;
+       }
+-      pkt = kzalloc(sizeof(*pkt), GFP_KERNEL);
+-      if (!pkt)
++      len = iov_length(vq->iov, out);
++
++      /* len contains both payload and hdr */
++      skb = virtio_vsock_alloc_skb(len, GFP_KERNEL);
++      if (!skb)
+               return NULL;
+-      len = iov_length(vq->iov, out);
+       iov_iter_init(&iov_iter, ITER_SOURCE, vq->iov, out, len);
+-      nbytes = copy_from_iter(&pkt->hdr, sizeof(pkt->hdr), &iov_iter);
+-      if (nbytes != sizeof(pkt->hdr)) {
++      hdr = virtio_vsock_hdr(skb);
++      nbytes = copy_from_iter(hdr, sizeof(*hdr), &iov_iter);
++      if (nbytes != sizeof(*hdr)) {
+               vq_err(vq, "Expected %zu bytes for pkt->hdr, got %zu bytes\n",
+-                     sizeof(pkt->hdr), nbytes);
+-              kfree(pkt);
++                     sizeof(*hdr), nbytes);
++              kfree_skb(skb);
+               return NULL;
+       }
+-      pkt->len = le32_to_cpu(pkt->hdr.len);
++      payload_len = le32_to_cpu(hdr->len);
+       /* No payload */
+-      if (!pkt->len)
+-              return pkt;
++      if (!payload_len)
++              return skb;
+-      /* The pkt is too big */
+-      if (pkt->len > VIRTIO_VSOCK_MAX_PKT_BUF_SIZE) {
+-              kfree(pkt);
++      /* The pkt is too big or the length in the header is invalid */
++      if (payload_len > VIRTIO_VSOCK_MAX_PKT_BUF_SIZE ||
++          payload_len + sizeof(*hdr) > len) {
++              kfree_skb(skb);
+               return NULL;
+       }
+-      pkt->buf = kvmalloc(pkt->len, GFP_KERNEL);
+-      if (!pkt->buf) {
+-              kfree(pkt);
+-              return NULL;
+-      }
++      virtio_vsock_skb_rx_put(skb);
+-      pkt->buf_len = pkt->len;
+-
+-      nbytes = copy_from_iter(pkt->buf, pkt->len, &iov_iter);
+-      if (nbytes != pkt->len) {
+-              vq_err(vq, "Expected %u byte payload, got %zu bytes\n",
+-                     pkt->len, nbytes);
+-              virtio_transport_free_pkt(pkt);
++      nbytes = copy_from_iter(skb->data, payload_len, &iov_iter);
++      if (nbytes != payload_len) {
++              vq_err(vq, "Expected %zu byte payload, got %zu bytes\n",
++                     payload_len, nbytes);
++              kfree_skb(skb);
+               return NULL;
+       }
+-      return pkt;
++      return skb;
+ }
+ /* Is there space left for replies to rx packets? */
+@@ -496,9 +466,9 @@ static void vhost_vsock_handle_tx_kick(struct vhost_work *work)
+                                                 poll.work);
+       struct vhost_vsock *vsock = container_of(vq->dev, struct vhost_vsock,
+                                                dev);
+-      struct virtio_vsock_pkt *pkt;
+       int head, pkts = 0, total_len = 0;
+       unsigned int out, in;
++      struct sk_buff *skb;
+       bool added = false;
+       mutex_lock(&vq->mutex);
+@@ -511,6 +481,8 @@ static void vhost_vsock_handle_tx_kick(struct vhost_work *work)
+       vhost_disable_notify(&vsock->dev, vq);
+       do {
++              struct virtio_vsock_hdr *hdr;
++
+               if (!vhost_vsock_more_replies(vsock)) {
+                       /* Stop tx until the device processes already
+                        * pending replies.  Leave tx virtqueue
+@@ -532,24 +504,26 @@ static void vhost_vsock_handle_tx_kick(struct vhost_work *work)
+                       break;
+               }
+-              pkt = vhost_vsock_alloc_pkt(vq, out, in);
+-              if (!pkt) {
++              skb = vhost_vsock_alloc_skb(vq, out, in);
++              if (!skb) {
+                       vq_err(vq, "Faulted on pkt\n");
+                       continue;
+               }
+-              total_len += sizeof(pkt->hdr) + pkt->len;
++              total_len += sizeof(*hdr) + skb->len;
+               /* Deliver to monitoring devices all received packets */
+-              virtio_transport_deliver_tap_pkt(pkt);
++              virtio_transport_deliver_tap_pkt(skb);
++
++              hdr = virtio_vsock_hdr(skb);
+               /* Only accept correctly addressed packets */
+-              if (le64_to_cpu(pkt->hdr.src_cid) == vsock->guest_cid &&
+-                  le64_to_cpu(pkt->hdr.dst_cid) ==
++              if (le64_to_cpu(hdr->src_cid) == vsock->guest_cid &&
++                  le64_to_cpu(hdr->dst_cid) ==
+                   vhost_transport_get_local_cid())
+-                      virtio_transport_recv_pkt(&vhost_transport, pkt);
++                      virtio_transport_recv_pkt(&vhost_transport, skb);
+               else
+-                      virtio_transport_free_pkt(pkt);
++                      kfree_skb(skb);
+               vhost_add_used(vq, head, 0);
+               added = true;
+@@ -693,8 +667,7 @@ static int vhost_vsock_dev_open(struct inode *inode, struct file *file)
+                      VHOST_VSOCK_WEIGHT, true, NULL);
+       file->private_data = vsock;
+-      spin_lock_init(&vsock->send_pkt_list_lock);
+-      INIT_LIST_HEAD(&vsock->send_pkt_list);
++      skb_queue_head_init(&vsock->send_pkt_queue);
+       vhost_work_init(&vsock->send_pkt_work, vhost_transport_send_pkt_work);
+       return 0;
+@@ -760,16 +733,7 @@ static int vhost_vsock_dev_release(struct inode *inode, struct file *file)
+       vhost_vsock_flush(vsock);
+       vhost_dev_stop(&vsock->dev);
+-      spin_lock_bh(&vsock->send_pkt_list_lock);
+-      while (!list_empty(&vsock->send_pkt_list)) {
+-              struct virtio_vsock_pkt *pkt;
+-
+-              pkt = list_first_entry(&vsock->send_pkt_list,
+-                              struct virtio_vsock_pkt, list);
+-              list_del_init(&pkt->list);
+-              virtio_transport_free_pkt(pkt);
+-      }
+-      spin_unlock_bh(&vsock->send_pkt_list_lock);
++      virtio_vsock_skb_queue_purge(&vsock->send_pkt_queue);
+       vhost_dev_cleanup(&vsock->dev);
+       kfree(vsock->dev.vqs);
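With the hunks above, the vhost transport no longer keeps its own send_pkt_list protected by send_pkt_list_lock; pending packets sit on an sk_buff_head (send_pkt_queue), and the per-packet state that used to live in struct virtio_vsock_pkt (reply, tap_delivered, off/len) moves into the skb control block and into skb->data/skb->len. A minimal sketch, not part of the patch, of the requeue-on-partial-send pattern the worker follows; the function name is a placeholder, the helpers are the ones this patch adds to include/linux/virtio_vsock.h:

/* Hypothetical helper: put back whatever part of an skb was not copied
 * into the guest's rx buffers. skb_pull() leaves the skb pointing at the
 * unsent payload; clearing tap_delivered lets the remainder be mirrored
 * to monitoring devices again on the next pass.
 */
static void example_requeue_partial(struct vhost_vsock *vsock,
                                    struct sk_buff *skb, size_t sent)
{
        skb_pull(skb, sent);
        if (skb->len > 0) {
                virtio_vsock_skb_clear_tap_delivered(skb);
                virtio_vsock_skb_queue_head(&vsock->send_pkt_queue, skb);
        } else {
                consume_skb(skb);
        }
}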
+diff --git a/include/linux/virtio_vsock.h b/include/linux/virtio_vsock.h
+index 35d7eedb5e8e4..3f9c166113063 100644
+--- a/include/linux/virtio_vsock.h
++++ b/include/linux/virtio_vsock.h
+@@ -7,6 +7,109 @@
+ #include <net/sock.h>
+ #include <net/af_vsock.h>
++#define VIRTIO_VSOCK_SKB_HEADROOM (sizeof(struct virtio_vsock_hdr))
++
++struct virtio_vsock_skb_cb {
++      bool reply;
++      bool tap_delivered;
++};
++
++#define VIRTIO_VSOCK_SKB_CB(skb) ((struct virtio_vsock_skb_cb *)((skb)->cb))
++
++static inline struct virtio_vsock_hdr *virtio_vsock_hdr(struct sk_buff *skb)
++{
++      return (struct virtio_vsock_hdr *)skb->head;
++}
++
++static inline bool virtio_vsock_skb_reply(struct sk_buff *skb)
++{
++      return VIRTIO_VSOCK_SKB_CB(skb)->reply;
++}
++
++static inline void virtio_vsock_skb_set_reply(struct sk_buff *skb)
++{
++      VIRTIO_VSOCK_SKB_CB(skb)->reply = true;
++}
++
++static inline bool virtio_vsock_skb_tap_delivered(struct sk_buff *skb)
++{
++      return VIRTIO_VSOCK_SKB_CB(skb)->tap_delivered;
++}
++
++static inline void virtio_vsock_skb_set_tap_delivered(struct sk_buff *skb)
++{
++      VIRTIO_VSOCK_SKB_CB(skb)->tap_delivered = true;
++}
++
++static inline void virtio_vsock_skb_clear_tap_delivered(struct sk_buff *skb)
++{
++      VIRTIO_VSOCK_SKB_CB(skb)->tap_delivered = false;
++}
++
++static inline void virtio_vsock_skb_rx_put(struct sk_buff *skb)
++{
++      u32 len;
++
++      len = le32_to_cpu(virtio_vsock_hdr(skb)->len);
++
++      if (len > 0)
++              skb_put(skb, len);
++}
++
++static inline struct sk_buff *virtio_vsock_alloc_skb(unsigned int size, gfp_t mask)
++{
++      struct sk_buff *skb;
++
++      if (size < VIRTIO_VSOCK_SKB_HEADROOM)
++              return NULL;
++
++      skb = alloc_skb(size, mask);
++      if (!skb)
++              return NULL;
++
++      skb_reserve(skb, VIRTIO_VSOCK_SKB_HEADROOM);
++      return skb;
++}
++
++static inline void
++virtio_vsock_skb_queue_head(struct sk_buff_head *list, struct sk_buff *skb)
++{
++      spin_lock_bh(&list->lock);
++      __skb_queue_head(list, skb);
++      spin_unlock_bh(&list->lock);
++}
++
++static inline void
++virtio_vsock_skb_queue_tail(struct sk_buff_head *list, struct sk_buff *skb)
++{
++      spin_lock_bh(&list->lock);
++      __skb_queue_tail(list, skb);
++      spin_unlock_bh(&list->lock);
++}
++
++static inline struct sk_buff *virtio_vsock_skb_dequeue(struct sk_buff_head *list)
++{
++      struct sk_buff *skb;
++
++      spin_lock_bh(&list->lock);
++      skb = __skb_dequeue(list);
++      spin_unlock_bh(&list->lock);
++
++      return skb;
++}
++
++static inline void virtio_vsock_skb_queue_purge(struct sk_buff_head *list)
++{
++      spin_lock_bh(&list->lock);
++      __skb_queue_purge(list);
++      spin_unlock_bh(&list->lock);
++}
++
++static inline size_t virtio_vsock_skb_len(struct sk_buff *skb)
++{
++      return (size_t)(skb_end_pointer(skb) - skb->head);
++}
++
+ #define VIRTIO_VSOCK_DEFAULT_RX_BUF_SIZE      (1024 * 4)
+ #define VIRTIO_VSOCK_MAX_BUF_SIZE             0xFFFFFFFFUL
+ #define VIRTIO_VSOCK_MAX_PKT_BUF_SIZE         (1024 * 64)
+@@ -35,23 +138,10 @@ struct virtio_vsock_sock {
+       u32 last_fwd_cnt;
+       u32 rx_bytes;
+       u32 buf_alloc;
+-      struct list_head rx_queue;
++      struct sk_buff_head rx_queue;
+       u32 msg_count;
+ };
+-struct virtio_vsock_pkt {
+-      struct virtio_vsock_hdr hdr;
+-      struct list_head list;
+-      /* socket refcnt not held, only use for cancellation */
+-      struct vsock_sock *vsk;
+-      void *buf;
+-      u32 buf_len;
+-      u32 len;
+-      u32 off;
+-      bool reply;
+-      bool tap_delivered;
+-};
+-
+ struct virtio_vsock_pkt_info {
+       u32 remote_cid, remote_port;
+       struct vsock_sock *vsk;
+@@ -68,7 +158,7 @@ struct virtio_transport {
+       struct vsock_transport transport;
+       /* Takes ownership of the packet */
+-      int (*send_pkt)(struct virtio_vsock_pkt *pkt);
++      int (*send_pkt)(struct sk_buff *skb);
+ };
+ ssize_t
+@@ -149,11 +239,10 @@ virtio_transport_dgram_enqueue(struct vsock_sock *vsk,
+ void virtio_transport_destruct(struct vsock_sock *vsk);
+ void virtio_transport_recv_pkt(struct virtio_transport *t,
+-                             struct virtio_vsock_pkt *pkt);
+-void virtio_transport_free_pkt(struct virtio_vsock_pkt *pkt);
+-void virtio_transport_inc_tx_pkt(struct virtio_vsock_sock *vvs, struct virtio_vsock_pkt *pkt);
++                             struct sk_buff *skb);
++void virtio_transport_inc_tx_pkt(struct virtio_vsock_sock *vvs, struct sk_buff *skb);
+ u32 virtio_transport_get_credit(struct virtio_vsock_sock *vvs, u32 wanted);
+ void virtio_transport_put_credit(struct virtio_vsock_sock *vvs, u32 credit);
+-void virtio_transport_deliver_tap_pkt(struct virtio_vsock_pkt *pkt);
+-
++void virtio_transport_deliver_tap_pkt(struct sk_buff *skb);
++int virtio_transport_purge_skbs(void *vsk, struct sk_buff_head *list);
+ #endif /* _LINUX_VIRTIO_VSOCK_H */
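The new header block above is the whole replacement API: the virtio_vsock header is stored in the skb headroom and reached through virtio_vsock_hdr(), the old reply/tap_delivered booleans live in skb->cb via struct virtio_vsock_skb_cb, and the *_queue_head/tail/dequeue/purge helpers wrap an sk_buff_head using its built-in lock. A hedged usage sketch of how a caller might assemble and queue an RW packet with these helpers; the function name and queue argument are illustrative, not taken from the patch:

/* Sketch only; relies on the helpers declared in include/linux/virtio_vsock.h above. */
static int example_queue_rw_pkt(struct sk_buff_head *queue,
                                const void *payload, u32 len)
{
        struct virtio_vsock_hdr *hdr;
        struct sk_buff *skb;

        /* Allocation size covers the header kept in the headroom plus the payload. */
        skb = virtio_vsock_alloc_skb(VIRTIO_VSOCK_SKB_HEADROOM + len, GFP_KERNEL);
        if (!skb)
                return -ENOMEM;

        hdr = virtio_vsock_hdr(skb);
        memset(hdr, 0, sizeof(*hdr));
        hdr->op  = cpu_to_le16(VIRTIO_VSOCK_OP_RW);
        hdr->len = cpu_to_le32(len);
        memcpy(skb_put(skb, len), payload, len);

        /* Takes queue->lock internally, so callers need no extra spinlock. */
        virtio_vsock_skb_queue_tail(queue, skb);
        return 0;
}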
+diff --git a/net/vmw_vsock/virtio_transport.c b/net/vmw_vsock/virtio_transport.c
+index 460e7fbb42da3..16575ea836590 100644
+--- a/net/vmw_vsock/virtio_transport.c
++++ b/net/vmw_vsock/virtio_transport.c
+@@ -42,8 +42,7 @@ struct virtio_vsock {
+       bool tx_run;
+       struct work_struct send_pkt_work;
+-      spinlock_t send_pkt_list_lock;
+-      struct list_head send_pkt_list;
++      struct sk_buff_head send_pkt_queue;
+       atomic_t queued_replies;
+@@ -101,41 +100,31 @@ virtio_transport_send_pkt_work(struct work_struct *work)
+       vq = vsock->vqs[VSOCK_VQ_TX];
+       for (;;) {
+-              struct virtio_vsock_pkt *pkt;
+               struct scatterlist hdr, buf, *sgs[2];
+               int ret, in_sg = 0, out_sg = 0;
++              struct sk_buff *skb;
+               bool reply;
+-              spin_lock_bh(&vsock->send_pkt_list_lock);
+-              if (list_empty(&vsock->send_pkt_list)) {
+-                      spin_unlock_bh(&vsock->send_pkt_list_lock);
++              skb = virtio_vsock_skb_dequeue(&vsock->send_pkt_queue);
++              if (!skb)
+                       break;
+-              }
+-
+-              pkt = list_first_entry(&vsock->send_pkt_list,
+-                                     struct virtio_vsock_pkt, list);
+-              list_del_init(&pkt->list);
+-              spin_unlock_bh(&vsock->send_pkt_list_lock);
+-              virtio_transport_deliver_tap_pkt(pkt);
++              virtio_transport_deliver_tap_pkt(skb);
++              reply = virtio_vsock_skb_reply(skb);
+-              reply = pkt->reply;
+-
+-              sg_init_one(&hdr, &pkt->hdr, sizeof(pkt->hdr));
++              sg_init_one(&hdr, virtio_vsock_hdr(skb), sizeof(*virtio_vsock_hdr(skb)));
+               sgs[out_sg++] = &hdr;
+-              if (pkt->buf) {
+-                      sg_init_one(&buf, pkt->buf, pkt->len);
++              if (skb->len > 0) {
++                      sg_init_one(&buf, skb->data, skb->len);
+                       sgs[out_sg++] = &buf;
+               }
+-              ret = virtqueue_add_sgs(vq, sgs, out_sg, in_sg, pkt, GFP_KERNEL);
++              ret = virtqueue_add_sgs(vq, sgs, out_sg, in_sg, skb, GFP_KERNEL);
+               /* Usually this means that there is no more space available in
+                * the vq
+                */
+               if (ret < 0) {
+-                      spin_lock_bh(&vsock->send_pkt_list_lock);
+-                      list_add(&pkt->list, &vsock->send_pkt_list);
+-                      spin_unlock_bh(&vsock->send_pkt_list_lock);
++                      virtio_vsock_skb_queue_head(&vsock->send_pkt_queue, skb);
+                       break;
+               }
+@@ -164,32 +153,32 @@ virtio_transport_send_pkt_work(struct work_struct *work)
+ }
+ static int
+-virtio_transport_send_pkt(struct virtio_vsock_pkt *pkt)
++virtio_transport_send_pkt(struct sk_buff *skb)
+ {
++      struct virtio_vsock_hdr *hdr;
+       struct virtio_vsock *vsock;
+-      int len = pkt->len;
++      int len = skb->len;
++
++      hdr = virtio_vsock_hdr(skb);
+       rcu_read_lock();
+       vsock = rcu_dereference(the_virtio_vsock);
+       if (!vsock) {
+-              virtio_transport_free_pkt(pkt);
++              kfree_skb(skb);
+               len = -ENODEV;
+               goto out_rcu;
+       }
+-      if (le64_to_cpu(pkt->hdr.dst_cid) == vsock->guest_cid) {
+-              virtio_transport_free_pkt(pkt);
++      if (le64_to_cpu(hdr->dst_cid) == vsock->guest_cid) {
++              kfree_skb(skb);
+               len = -ENODEV;
+               goto out_rcu;
+       }
+-      if (pkt->reply)
++      if (virtio_vsock_skb_reply(skb))
+               atomic_inc(&vsock->queued_replies);
+-      spin_lock_bh(&vsock->send_pkt_list_lock);
+-      list_add_tail(&pkt->list, &vsock->send_pkt_list);
+-      spin_unlock_bh(&vsock->send_pkt_list_lock);
+-
++      virtio_vsock_skb_queue_tail(&vsock->send_pkt_queue, skb);
+       queue_work(virtio_vsock_workqueue, &vsock->send_pkt_work);
+ out_rcu:
+@@ -201,9 +190,7 @@ static int
+ virtio_transport_cancel_pkt(struct vsock_sock *vsk)
+ {
+       struct virtio_vsock *vsock;
+-      struct virtio_vsock_pkt *pkt, *n;
+       int cnt = 0, ret;
+-      LIST_HEAD(freeme);
+       rcu_read_lock();
+       vsock = rcu_dereference(the_virtio_vsock);
+@@ -212,20 +199,7 @@ virtio_transport_cancel_pkt(struct vsock_sock *vsk)
+               goto out_rcu;
+       }
+-      spin_lock_bh(&vsock->send_pkt_list_lock);
+-      list_for_each_entry_safe(pkt, n, &vsock->send_pkt_list, list) {
+-              if (pkt->vsk != vsk)
+-                      continue;
+-              list_move(&pkt->list, &freeme);
+-      }
+-      spin_unlock_bh(&vsock->send_pkt_list_lock);
+-
+-      list_for_each_entry_safe(pkt, n, &freeme, list) {
+-              if (pkt->reply)
+-                      cnt++;
+-              list_del(&pkt->list);
+-              virtio_transport_free_pkt(pkt);
+-      }
++      cnt = virtio_transport_purge_skbs(vsk, &vsock->send_pkt_queue);
+       if (cnt) {
+               struct virtqueue *rx_vq = vsock->vqs[VSOCK_VQ_RX];
+@@ -246,38 +220,28 @@ virtio_transport_cancel_pkt(struct vsock_sock *vsk)
+ static void virtio_vsock_rx_fill(struct virtio_vsock *vsock)
+ {
+-      int buf_len = VIRTIO_VSOCK_DEFAULT_RX_BUF_SIZE;
+-      struct virtio_vsock_pkt *pkt;
+-      struct scatterlist hdr, buf, *sgs[2];
++      int total_len = VIRTIO_VSOCK_DEFAULT_RX_BUF_SIZE + VIRTIO_VSOCK_SKB_HEADROOM;
++      struct scatterlist pkt, *p;
+       struct virtqueue *vq;
++      struct sk_buff *skb;
+       int ret;
+       vq = vsock->vqs[VSOCK_VQ_RX];
+       do {
+-              pkt = kzalloc(sizeof(*pkt), GFP_KERNEL);
+-              if (!pkt)
++              skb = virtio_vsock_alloc_skb(total_len, GFP_KERNEL);
++              if (!skb)
+                       break;
+-              pkt->buf = kmalloc(buf_len, GFP_KERNEL);
+-              if (!pkt->buf) {
+-                      virtio_transport_free_pkt(pkt);
++              memset(skb->head, 0, VIRTIO_VSOCK_SKB_HEADROOM);
++              sg_init_one(&pkt, virtio_vsock_hdr(skb), total_len);
++              p = &pkt;
++              ret = virtqueue_add_sgs(vq, &p, 0, 1, skb, GFP_KERNEL);
++              if (ret < 0) {
++                      kfree_skb(skb);
+                       break;
+               }
+-              pkt->buf_len = buf_len;
+-              pkt->len = buf_len;
+-
+-              sg_init_one(&hdr, &pkt->hdr, sizeof(pkt->hdr));
+-              sgs[0] = &hdr;
+-
+-              sg_init_one(&buf, pkt->buf, buf_len);
+-              sgs[1] = &buf;
+-              ret = virtqueue_add_sgs(vq, sgs, 0, 2, pkt, GFP_KERNEL);
+-              if (ret) {
+-                      virtio_transport_free_pkt(pkt);
+-                      break;
+-              }
+               vsock->rx_buf_nr++;
+       } while (vq->num_free);
+       if (vsock->rx_buf_nr > vsock->rx_buf_max_nr)
+@@ -299,12 +263,12 @@ static void virtio_transport_tx_work(struct work_struct *work)
+               goto out;
+       do {
+-              struct virtio_vsock_pkt *pkt;
++              struct sk_buff *skb;
+               unsigned int len;
+               virtqueue_disable_cb(vq);
+-              while ((pkt = virtqueue_get_buf(vq, &len)) != NULL) {
+-                      virtio_transport_free_pkt(pkt);
++              while ((skb = virtqueue_get_buf(vq, &len)) != NULL) {
++                      consume_skb(skb);
+                       added = true;
+               }
+       } while (!virtqueue_enable_cb(vq));
+@@ -529,7 +493,7 @@ static void virtio_transport_rx_work(struct work_struct *work)
+       do {
+               virtqueue_disable_cb(vq);
+               for (;;) {
+-                      struct virtio_vsock_pkt *pkt;
++                      struct sk_buff *skb;
+                       unsigned int len;
+                       if (!virtio_transport_more_replies(vsock)) {
+@@ -540,23 +504,22 @@ static void virtio_transport_rx_work(struct work_struct *work)
+                               goto out;
+                       }
+-                      pkt = virtqueue_get_buf(vq, &len);
+-                      if (!pkt) {
++                      skb = virtqueue_get_buf(vq, &len);
++                      if (!skb)
+                               break;
+-                      }
+                       vsock->rx_buf_nr--;
+                       /* Drop short/long packets */
+-                      if (unlikely(len < sizeof(pkt->hdr) ||
+-                                   len > sizeof(pkt->hdr) + pkt->len)) {
+-                              virtio_transport_free_pkt(pkt);
++                      if (unlikely(len < sizeof(struct virtio_vsock_hdr) ||
++                                   len > virtio_vsock_skb_len(skb))) {
++                              kfree_skb(skb);
+                               continue;
+                       }
+-                      pkt->len = len - sizeof(pkt->hdr);
+-                      virtio_transport_deliver_tap_pkt(pkt);
+-                      virtio_transport_recv_pkt(&virtio_transport, pkt);
++                      virtio_vsock_skb_rx_put(skb);
++                      virtio_transport_deliver_tap_pkt(skb);
++                      virtio_transport_recv_pkt(&virtio_transport, skb);
+               }
+       } while (!virtqueue_enable_cb(vq));
+@@ -624,7 +587,7 @@ static void virtio_vsock_vqs_start(struct virtio_vsock *vsock)
+ static void virtio_vsock_vqs_del(struct virtio_vsock *vsock)
+ {
+       struct virtio_device *vdev = vsock->vdev;
+-      struct virtio_vsock_pkt *pkt;
++      struct sk_buff *skb;
+       /* Reset all connected sockets when the VQs disappear */
+       vsock_for_each_connected_socket(&virtio_transport.transport,
+@@ -651,23 +614,16 @@ static void virtio_vsock_vqs_del(struct virtio_vsock *vsock)
+       virtio_reset_device(vdev);
+       mutex_lock(&vsock->rx_lock);
+-      while ((pkt = virtqueue_detach_unused_buf(vsock->vqs[VSOCK_VQ_RX])))
+-              virtio_transport_free_pkt(pkt);
++      while ((skb = virtqueue_detach_unused_buf(vsock->vqs[VSOCK_VQ_RX])))
++              kfree_skb(skb);
+       mutex_unlock(&vsock->rx_lock);
+       mutex_lock(&vsock->tx_lock);
+-      while ((pkt = virtqueue_detach_unused_buf(vsock->vqs[VSOCK_VQ_TX])))
+-              virtio_transport_free_pkt(pkt);
++      while ((skb = virtqueue_detach_unused_buf(vsock->vqs[VSOCK_VQ_TX])))
++              kfree_skb(skb);
+       mutex_unlock(&vsock->tx_lock);
+-      spin_lock_bh(&vsock->send_pkt_list_lock);
+-      while (!list_empty(&vsock->send_pkt_list)) {
+-              pkt = list_first_entry(&vsock->send_pkt_list,
+-                                     struct virtio_vsock_pkt, list);
+-              list_del(&pkt->list);
+-              virtio_transport_free_pkt(pkt);
+-      }
+-      spin_unlock_bh(&vsock->send_pkt_list_lock);
++      virtio_vsock_skb_queue_purge(&vsock->send_pkt_queue);
+       /* Delete virtqueues and flush outstanding callbacks if any */
+       vdev->config->del_vqs(vdev);
+@@ -704,8 +660,7 @@ static int virtio_vsock_probe(struct virtio_device *vdev)
+       mutex_init(&vsock->tx_lock);
+       mutex_init(&vsock->rx_lock);
+       mutex_init(&vsock->event_lock);
+-      spin_lock_init(&vsock->send_pkt_list_lock);
+-      INIT_LIST_HEAD(&vsock->send_pkt_list);
++      skb_queue_head_init(&vsock->send_pkt_queue);
+       INIT_WORK(&vsock->rx_work, virtio_transport_rx_work);
+       INIT_WORK(&vsock->tx_work, virtio_transport_tx_work);
+       INIT_WORK(&vsock->event_work, virtio_transport_event_work);
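On the guest rx path this file now posts one contiguous buffer per skb (header in the headroom, payload behind it) instead of a two-entry scatterlist, so a completed buffer comes back from virtqueue_get_buf() with skb->len still zero. A brief sketch, with a placeholder function name, of the completion-side checks the rx worker above performs before handing the skb to virtio_transport_recv_pkt():

static struct sk_buff *example_complete_rx(struct virtqueue *vq)
{
        struct sk_buff *skb;
        unsigned int len;

        skb = virtqueue_get_buf(vq, &len);
        if (!skb)
                return NULL;

        /* Drop buffers shorter than a header or longer than the skb itself. */
        if (len < sizeof(struct virtio_vsock_hdr) ||
            len > virtio_vsock_skb_len(skb)) {
                kfree_skb(skb);
                return NULL;
        }

        /* Expose hdr->len bytes of payload through skb->data/skb->len. */
        virtio_vsock_skb_rx_put(skb);
        return skb;
}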
+diff --git a/net/vmw_vsock/virtio_transport_common.c b/net/vmw_vsock/virtio_transport_common.c
+index a9980e9b93040..a1581c77cf84a 100644
+--- a/net/vmw_vsock/virtio_transport_common.c
++++ b/net/vmw_vsock/virtio_transport_common.c
+@@ -37,53 +37,56 @@ virtio_transport_get_ops(struct vsock_sock *vsk)
+       return container_of(t, struct virtio_transport, transport);
+ }
+-static struct virtio_vsock_pkt *
+-virtio_transport_alloc_pkt(struct virtio_vsock_pkt_info *info,
++/* Returns a new packet on success, otherwise returns NULL.
++ *
++ * If NULL is returned, errp is set to a negative errno.
++ */
++static struct sk_buff *
++virtio_transport_alloc_skb(struct virtio_vsock_pkt_info *info,
+                          size_t len,
+                          u32 src_cid,
+                          u32 src_port,
+                          u32 dst_cid,
+                          u32 dst_port)
+ {
+-      struct virtio_vsock_pkt *pkt;
++      const size_t skb_len = VIRTIO_VSOCK_SKB_HEADROOM + len;
++      struct virtio_vsock_hdr *hdr;
++      struct sk_buff *skb;
++      void *payload;
+       int err;
+-      pkt = kzalloc(sizeof(*pkt), GFP_KERNEL);
+-      if (!pkt)
++      skb = virtio_vsock_alloc_skb(skb_len, GFP_KERNEL);
++      if (!skb)
+               return NULL;
+-      pkt->hdr.type           = cpu_to_le16(info->type);
+-      pkt->hdr.op             = cpu_to_le16(info->op);
+-      pkt->hdr.src_cid        = cpu_to_le64(src_cid);
+-      pkt->hdr.dst_cid        = cpu_to_le64(dst_cid);
+-      pkt->hdr.src_port       = cpu_to_le32(src_port);
+-      pkt->hdr.dst_port       = cpu_to_le32(dst_port);
+-      pkt->hdr.flags          = cpu_to_le32(info->flags);
+-      pkt->len                = len;
+-      pkt->hdr.len            = cpu_to_le32(len);
+-      pkt->reply              = info->reply;
+-      pkt->vsk                = info->vsk;
++      hdr = virtio_vsock_hdr(skb);
++      hdr->type       = cpu_to_le16(info->type);
++      hdr->op         = cpu_to_le16(info->op);
++      hdr->src_cid    = cpu_to_le64(src_cid);
++      hdr->dst_cid    = cpu_to_le64(dst_cid);
++      hdr->src_port   = cpu_to_le32(src_port);
++      hdr->dst_port   = cpu_to_le32(dst_port);
++      hdr->flags      = cpu_to_le32(info->flags);
++      hdr->len        = cpu_to_le32(len);
+       if (info->msg && len > 0) {
+-              pkt->buf = kmalloc(len, GFP_KERNEL);
+-              if (!pkt->buf)
+-                      goto out_pkt;
+-
+-              pkt->buf_len = len;
+-
+-              err = memcpy_from_msg(pkt->buf, info->msg, len);
++              payload = skb_put(skb, len);
++              err = memcpy_from_msg(payload, info->msg, len);
+               if (err)
+                       goto out;
+               if (msg_data_left(info->msg) == 0 &&
+                   info->type == VIRTIO_VSOCK_TYPE_SEQPACKET) {
+-                      pkt->hdr.flags |= cpu_to_le32(VIRTIO_VSOCK_SEQ_EOM);
++                      hdr->flags |= cpu_to_le32(VIRTIO_VSOCK_SEQ_EOM);
+                       if (info->msg->msg_flags & MSG_EOR)
+-                              pkt->hdr.flags |= cpu_to_le32(VIRTIO_VSOCK_SEQ_EOR);
++                              hdr->flags |= cpu_to_le32(VIRTIO_VSOCK_SEQ_EOR);
+               }
+       }
++      if (info->reply)
++              virtio_vsock_skb_set_reply(skb);
++
+       trace_virtio_transport_alloc_pkt(src_cid, src_port,
+                                        dst_cid, dst_port,
+                                        len,
+@@ -91,19 +94,18 @@ virtio_transport_alloc_pkt(struct virtio_vsock_pkt_info *info,
+                                        info->op,
+                                        info->flags);
+-      return pkt;
++      return skb;
+ out:
+-      kfree(pkt->buf);
+-out_pkt:
+-      kfree(pkt);
++      kfree_skb(skb);
+       return NULL;
+ }
+ /* Packet capture */
+ static struct sk_buff *virtio_transport_build_skb(void *opaque)
+ {
+-      struct virtio_vsock_pkt *pkt = opaque;
++      struct virtio_vsock_hdr *pkt_hdr;
++      struct sk_buff *pkt = opaque;
+       struct af_vsockmon_hdr *hdr;
+       struct sk_buff *skb;
+       size_t payload_len;
+@@ -113,10 +115,11 @@ static struct sk_buff *virtio_transport_build_skb(void *opaque)
+        * the payload length from the header and the buffer pointer taking
+        * care of the offset in the original packet.
+        */
+-      payload_len = le32_to_cpu(pkt->hdr.len);
+-      payload_buf = pkt->buf + pkt->off;
++      pkt_hdr = virtio_vsock_hdr(pkt);
++      payload_len = pkt->len;
++      payload_buf = pkt->data;
+-      skb = alloc_skb(sizeof(*hdr) + sizeof(pkt->hdr) + payload_len,
++      skb = alloc_skb(sizeof(*hdr) + sizeof(*pkt_hdr) + payload_len,
+                       GFP_ATOMIC);
+       if (!skb)
+               return NULL;
+@@ -124,16 +127,16 @@ static struct sk_buff *virtio_transport_build_skb(void *opaque)
+       hdr = skb_put(skb, sizeof(*hdr));
+       /* pkt->hdr is little-endian so no need to byteswap here */
+-      hdr->src_cid = pkt->hdr.src_cid;
+-      hdr->src_port = pkt->hdr.src_port;
+-      hdr->dst_cid = pkt->hdr.dst_cid;
+-      hdr->dst_port = pkt->hdr.dst_port;
++      hdr->src_cid = pkt_hdr->src_cid;
++      hdr->src_port = pkt_hdr->src_port;
++      hdr->dst_cid = pkt_hdr->dst_cid;
++      hdr->dst_port = pkt_hdr->dst_port;
+       hdr->transport = cpu_to_le16(AF_VSOCK_TRANSPORT_VIRTIO);
+-      hdr->len = cpu_to_le16(sizeof(pkt->hdr));
++      hdr->len = cpu_to_le16(sizeof(*pkt_hdr));
+       memset(hdr->reserved, 0, sizeof(hdr->reserved));
+-      switch (le16_to_cpu(pkt->hdr.op)) {
++      switch (le16_to_cpu(pkt_hdr->op)) {
+       case VIRTIO_VSOCK_OP_REQUEST:
+       case VIRTIO_VSOCK_OP_RESPONSE:
+               hdr->op = cpu_to_le16(AF_VSOCK_OP_CONNECT);
+@@ -154,7 +157,7 @@ static struct sk_buff *virtio_transport_build_skb(void *opaque)
+               break;
+       }
+-      skb_put_data(skb, &pkt->hdr, sizeof(pkt->hdr));
++      skb_put_data(skb, pkt_hdr, sizeof(*pkt_hdr));
+       if (payload_len) {
+               skb_put_data(skb, payload_buf, payload_len);
+@@ -163,13 +166,13 @@ static struct sk_buff *virtio_transport_build_skb(void *opaque)
+       return skb;
+ }
+-void virtio_transport_deliver_tap_pkt(struct virtio_vsock_pkt *pkt)
++void virtio_transport_deliver_tap_pkt(struct sk_buff *skb)
+ {
+-      if (pkt->tap_delivered)
++      if (virtio_vsock_skb_tap_delivered(skb))
+               return;
+-      vsock_deliver_tap(virtio_transport_build_skb, pkt);
+-      pkt->tap_delivered = true;
++      vsock_deliver_tap(virtio_transport_build_skb, skb);
++      virtio_vsock_skb_set_tap_delivered(skb);
+ }
+ EXPORT_SYMBOL_GPL(virtio_transport_deliver_tap_pkt);
+@@ -192,8 +195,8 @@ static int virtio_transport_send_pkt_info(struct vsock_sock *vsk,
+       u32 src_cid, src_port, dst_cid, dst_port;
+       const struct virtio_transport *t_ops;
+       struct virtio_vsock_sock *vvs;
+-      struct virtio_vsock_pkt *pkt;
+       u32 pkt_len = info->pkt_len;
++      struct sk_buff *skb;
+       info->type = virtio_transport_get_type(sk_vsock(vsk));
+@@ -224,42 +227,47 @@ static int virtio_transport_send_pkt_info(struct vsock_sock *vsk,
+       if (pkt_len == 0 && info->op == VIRTIO_VSOCK_OP_RW)
+               return pkt_len;
+-      pkt = virtio_transport_alloc_pkt(info, pkt_len,
++      skb = virtio_transport_alloc_skb(info, pkt_len,
+                                        src_cid, src_port,
+                                        dst_cid, dst_port);
+-      if (!pkt) {
++      if (!skb) {
+               virtio_transport_put_credit(vvs, pkt_len);
+               return -ENOMEM;
+       }
+-      virtio_transport_inc_tx_pkt(vvs, pkt);
++      virtio_transport_inc_tx_pkt(vvs, skb);
+-      return t_ops->send_pkt(pkt);
++      return t_ops->send_pkt(skb);
+ }
+ static bool virtio_transport_inc_rx_pkt(struct virtio_vsock_sock *vvs,
+-                                      struct virtio_vsock_pkt *pkt)
++                                      struct sk_buff *skb)
+ {
+-      if (vvs->rx_bytes + pkt->len > vvs->buf_alloc)
++      if (vvs->rx_bytes + skb->len > vvs->buf_alloc)
+               return false;
+-      vvs->rx_bytes += pkt->len;
++      vvs->rx_bytes += skb->len;
+       return true;
+ }
+ static void virtio_transport_dec_rx_pkt(struct virtio_vsock_sock *vvs,
+-                                      struct virtio_vsock_pkt *pkt)
++                                      struct sk_buff *skb)
+ {
+-      vvs->rx_bytes -= pkt->len;
+-      vvs->fwd_cnt += pkt->len;
++      int len;
++
++      len = skb_headroom(skb) - sizeof(struct virtio_vsock_hdr) - skb->len;
++      vvs->rx_bytes -= len;
++      vvs->fwd_cnt += len;
+ }
+-void virtio_transport_inc_tx_pkt(struct virtio_vsock_sock *vvs, struct virtio_vsock_pkt *pkt)
++void virtio_transport_inc_tx_pkt(struct virtio_vsock_sock *vvs, struct sk_buff *skb)
+ {
++      struct virtio_vsock_hdr *hdr = virtio_vsock_hdr(skb);
++
+       spin_lock_bh(&vvs->rx_lock);
+       vvs->last_fwd_cnt = vvs->fwd_cnt;
+-      pkt->hdr.fwd_cnt = cpu_to_le32(vvs->fwd_cnt);
+-      pkt->hdr.buf_alloc = cpu_to_le32(vvs->buf_alloc);
++      hdr->fwd_cnt = cpu_to_le32(vvs->fwd_cnt);
++      hdr->buf_alloc = cpu_to_le32(vvs->buf_alloc);
+       spin_unlock_bh(&vvs->rx_lock);
+ }
+ EXPORT_SYMBOL_GPL(virtio_transport_inc_tx_pkt);
+@@ -303,29 +311,29 @@ virtio_transport_stream_do_peek(struct vsock_sock *vsk,
+                               size_t len)
+ {
+       struct virtio_vsock_sock *vvs = vsk->trans;
+-      struct virtio_vsock_pkt *pkt;
+       size_t bytes, total = 0, off;
++      struct sk_buff *skb, *tmp;
+       int err = -EFAULT;
+       spin_lock_bh(&vvs->rx_lock);
+-      list_for_each_entry(pkt, &vvs->rx_queue, list) {
+-              off = pkt->off;
++      skb_queue_walk_safe(&vvs->rx_queue, skb,  tmp) {
++              off = 0;
+               if (total == len)
+                       break;
+-              while (total < len && off < pkt->len) {
++              while (total < len && off < skb->len) {
+                       bytes = len - total;
+-                      if (bytes > pkt->len - off)
+-                              bytes = pkt->len - off;
++                      if (bytes > skb->len - off)
++                              bytes = skb->len - off;
+                       /* sk_lock is held by caller so no one else can dequeue.
+                        * Unlock rx_lock since memcpy_to_msg() may sleep.
+                        */
+                       spin_unlock_bh(&vvs->rx_lock);
+-                      err = memcpy_to_msg(msg, pkt->buf + off, bytes);
++                      err = memcpy_to_msg(msg, skb->data + off, bytes);
+                       if (err)
+                               goto out;
+@@ -352,37 +360,38 @@ virtio_transport_stream_do_dequeue(struct vsock_sock *vsk,
+                                  size_t len)
+ {
+       struct virtio_vsock_sock *vvs = vsk->trans;
+-      struct virtio_vsock_pkt *pkt;
+       size_t bytes, total = 0;
+-      u32 free_space;
++      struct sk_buff *skb;
+       int err = -EFAULT;
++      u32 free_space;
+       spin_lock_bh(&vvs->rx_lock);
+-      while (total < len && !list_empty(&vvs->rx_queue)) {
+-              pkt = list_first_entry(&vvs->rx_queue,
+-                                     struct virtio_vsock_pkt, list);
++      while (total < len && !skb_queue_empty(&vvs->rx_queue)) {
++              skb = __skb_dequeue(&vvs->rx_queue);
+               bytes = len - total;
+-              if (bytes > pkt->len - pkt->off)
+-                      bytes = pkt->len - pkt->off;
++              if (bytes > skb->len)
++                      bytes = skb->len;
+               /* sk_lock is held by caller so no one else can dequeue.
+                * Unlock rx_lock since memcpy_to_msg() may sleep.
+                */
+               spin_unlock_bh(&vvs->rx_lock);
+-              err = memcpy_to_msg(msg, pkt->buf + pkt->off, bytes);
++              err = memcpy_to_msg(msg, skb->data, bytes);
+               if (err)
+                       goto out;
+               spin_lock_bh(&vvs->rx_lock);
+               total += bytes;
+-              pkt->off += bytes;
+-              if (pkt->off == pkt->len) {
+-                      virtio_transport_dec_rx_pkt(vvs, pkt);
+-                      list_del(&pkt->list);
+-                      virtio_transport_free_pkt(pkt);
++              skb_pull(skb, bytes);
++
++              if (skb->len == 0) {
++                      virtio_transport_dec_rx_pkt(vvs, skb);
++                      consume_skb(skb);
++              } else {
++                      __skb_queue_head(&vvs->rx_queue, skb);
+               }
+       }
+@@ -414,10 +423,10 @@ static int virtio_transport_seqpacket_do_dequeue(struct vsock_sock *vsk,
+                                                int flags)
+ {
+       struct virtio_vsock_sock *vvs = vsk->trans;
+-      struct virtio_vsock_pkt *pkt;
+       int dequeued_len = 0;
+       size_t user_buf_len = msg_data_left(msg);
+       bool msg_ready = false;
++      struct sk_buff *skb;
+       spin_lock_bh(&vvs->rx_lock);
+@@ -427,13 +436,18 @@ static int virtio_transport_seqpacket_do_dequeue(struct vsock_sock *vsk,
+       }
+       while (!msg_ready) {
+-              pkt = list_first_entry(&vvs->rx_queue, struct virtio_vsock_pkt, list);
++              struct virtio_vsock_hdr *hdr;
++
++              skb = __skb_dequeue(&vvs->rx_queue);
++              if (!skb)
++                      break;
++              hdr = virtio_vsock_hdr(skb);
+               if (dequeued_len >= 0) {
+                       size_t pkt_len;
+                       size_t bytes_to_copy;
+-                      pkt_len = (size_t)le32_to_cpu(pkt->hdr.len);
++                      pkt_len = (size_t)le32_to_cpu(hdr->len);
+                       bytes_to_copy = min(user_buf_len, pkt_len);
+                       if (bytes_to_copy) {
+@@ -444,7 +458,7 @@ static int virtio_transport_seqpacket_do_dequeue(struct vsock_sock *vsk,
+                                */
+                               spin_unlock_bh(&vvs->rx_lock);
+-                              err = memcpy_to_msg(msg, pkt->buf, bytes_to_copy);
++                              err = memcpy_to_msg(msg, skb->data, bytes_to_copy);
+                               if (err) {
+                                       /* Copy of message failed. Rest of
+                                        * fragments will be freed without copy.
+@@ -452,6 +466,7 @@ static int virtio_transport_seqpacket_do_dequeue(struct vsock_sock *vsk,
+                                       dequeued_len = err;
+                               } else {
+                                       user_buf_len -= bytes_to_copy;
++                                      skb_pull(skb, bytes_to_copy);
+                               }
+                               spin_lock_bh(&vvs->rx_lock);
+@@ -461,17 +476,16 @@ static int virtio_transport_seqpacket_do_dequeue(struct vsock_sock *vsk,
+                               dequeued_len += pkt_len;
+               }
+-              if (le32_to_cpu(pkt->hdr.flags) & VIRTIO_VSOCK_SEQ_EOM) {
++              if (le32_to_cpu(hdr->flags) & VIRTIO_VSOCK_SEQ_EOM) {
+                       msg_ready = true;
+                       vvs->msg_count--;
+-                      if (le32_to_cpu(pkt->hdr.flags) & VIRTIO_VSOCK_SEQ_EOR)
++                      if (le32_to_cpu(hdr->flags) & VIRTIO_VSOCK_SEQ_EOR)
+                               msg->msg_flags |= MSG_EOR;
+               }
+-              virtio_transport_dec_rx_pkt(vvs, pkt);
+-              list_del(&pkt->list);
+-              virtio_transport_free_pkt(pkt);
++              virtio_transport_dec_rx_pkt(vvs, skb);
++              kfree_skb(skb);
+       }
+       spin_unlock_bh(&vvs->rx_lock);
+@@ -609,7 +623,7 @@ int virtio_transport_do_socket_init(struct vsock_sock *vsk,
+       spin_lock_init(&vvs->rx_lock);
+       spin_lock_init(&vvs->tx_lock);
+-      INIT_LIST_HEAD(&vvs->rx_queue);
++      skb_queue_head_init(&vvs->rx_queue);
+       return 0;
+ }
+@@ -806,16 +820,16 @@ void virtio_transport_destruct(struct vsock_sock *vsk)
+ EXPORT_SYMBOL_GPL(virtio_transport_destruct);
+ static int virtio_transport_reset(struct vsock_sock *vsk,
+-                                struct virtio_vsock_pkt *pkt)
++                                struct sk_buff *skb)
+ {
+       struct virtio_vsock_pkt_info info = {
+               .op = VIRTIO_VSOCK_OP_RST,
+-              .reply = !!pkt,
++              .reply = !!skb,
+               .vsk = vsk,
+       };
+       /* Send RST only if the original pkt is not a RST pkt */
+-      if (pkt && le16_to_cpu(pkt->hdr.op) == VIRTIO_VSOCK_OP_RST)
++      if (skb && le16_to_cpu(virtio_vsock_hdr(skb)->op) == VIRTIO_VSOCK_OP_RST)
+               return 0;
+       return virtio_transport_send_pkt_info(vsk, &info);
+@@ -825,29 +839,30 @@ static int virtio_transport_reset(struct vsock_sock *vsk,
+  * attempt was made to connect to a socket that does not exist.
+  */
+ static int virtio_transport_reset_no_sock(const struct virtio_transport *t,
+-                                        struct virtio_vsock_pkt *pkt)
++                                        struct sk_buff *skb)
+ {
+-      struct virtio_vsock_pkt *reply;
++      struct virtio_vsock_hdr *hdr = virtio_vsock_hdr(skb);
+       struct virtio_vsock_pkt_info info = {
+               .op = VIRTIO_VSOCK_OP_RST,
+-              .type = le16_to_cpu(pkt->hdr.type),
++              .type = le16_to_cpu(hdr->type),
+               .reply = true,
+       };
++      struct sk_buff *reply;
+       /* Send RST only if the original pkt is not a RST pkt */
+-      if (le16_to_cpu(pkt->hdr.op) == VIRTIO_VSOCK_OP_RST)
++      if (le16_to_cpu(hdr->op) == VIRTIO_VSOCK_OP_RST)
+               return 0;
+-      reply = virtio_transport_alloc_pkt(&info, 0,
+-                                         le64_to_cpu(pkt->hdr.dst_cid),
+-                                         le32_to_cpu(pkt->hdr.dst_port),
+-                                         le64_to_cpu(pkt->hdr.src_cid),
+-                                         le32_to_cpu(pkt->hdr.src_port));
++      reply = virtio_transport_alloc_skb(&info, 0,
++                                         le64_to_cpu(hdr->dst_cid),
++                                         le32_to_cpu(hdr->dst_port),
++                                         le64_to_cpu(hdr->src_cid),
++                                         le32_to_cpu(hdr->src_port));
+       if (!reply)
+               return -ENOMEM;
+       if (!t) {
+-              virtio_transport_free_pkt(reply);
++              kfree_skb(reply);
+               return -ENOTCONN;
+       }
+@@ -858,16 +873,11 @@ static int virtio_transport_reset_no_sock(const struct virtio_transport *t,
+ static void virtio_transport_remove_sock(struct vsock_sock *vsk)
+ {
+       struct virtio_vsock_sock *vvs = vsk->trans;
+-      struct virtio_vsock_pkt *pkt, *tmp;
+       /* We don't need to take rx_lock, as the socket is closing and we are
+        * removing it.
+        */
+-      list_for_each_entry_safe(pkt, tmp, &vvs->rx_queue, list) {
+-              list_del(&pkt->list);
+-              virtio_transport_free_pkt(pkt);
+-      }
+-
++      __skb_queue_purge(&vvs->rx_queue);
+       vsock_remove_sock(vsk);
+ }
+@@ -981,13 +991,14 @@ EXPORT_SYMBOL_GPL(virtio_transport_release);
+ static int
+ virtio_transport_recv_connecting(struct sock *sk,
+-                               struct virtio_vsock_pkt *pkt)
++                               struct sk_buff *skb)
+ {
++      struct virtio_vsock_hdr *hdr = virtio_vsock_hdr(skb);
+       struct vsock_sock *vsk = vsock_sk(sk);
+-      int err;
+       int skerr;
++      int err;
+-      switch (le16_to_cpu(pkt->hdr.op)) {
++      switch (le16_to_cpu(hdr->op)) {
+       case VIRTIO_VSOCK_OP_RESPONSE:
+               sk->sk_state = TCP_ESTABLISHED;
+               sk->sk_socket->state = SS_CONNECTED;
+@@ -1008,7 +1019,7 @@ virtio_transport_recv_connecting(struct sock *sk,
+       return 0;
+ destroy:
+-      virtio_transport_reset(vsk, pkt);
++      virtio_transport_reset(vsk, skb);
+       sk->sk_state = TCP_CLOSE;
+       sk->sk_err = skerr;
+       sk_error_report(sk);
+@@ -1017,34 +1028,37 @@ virtio_transport_recv_connecting(struct sock *sk,
+ static void
+ virtio_transport_recv_enqueue(struct vsock_sock *vsk,
+-                            struct virtio_vsock_pkt *pkt)
++                            struct sk_buff *skb)
+ {
+       struct virtio_vsock_sock *vvs = vsk->trans;
+       bool can_enqueue, free_pkt = false;
++      struct virtio_vsock_hdr *hdr;
++      u32 len;
+-      pkt->len = le32_to_cpu(pkt->hdr.len);
+-      pkt->off = 0;
++      hdr = virtio_vsock_hdr(skb);
++      len = le32_to_cpu(hdr->len);
+       spin_lock_bh(&vvs->rx_lock);
+-      can_enqueue = virtio_transport_inc_rx_pkt(vvs, pkt);
++      can_enqueue = virtio_transport_inc_rx_pkt(vvs, skb);
+       if (!can_enqueue) {
+               free_pkt = true;
+               goto out;
+       }
+-      if (le32_to_cpu(pkt->hdr.flags) & VIRTIO_VSOCK_SEQ_EOM)
++      if (le32_to_cpu(hdr->flags) & VIRTIO_VSOCK_SEQ_EOM)
+               vvs->msg_count++;
+       /* Try to copy small packets into the buffer of last packet queued,
+        * to avoid wasting memory queueing the entire buffer with a small
+        * payload.
+        */
+-      if (pkt->len <= GOOD_COPY_LEN && !list_empty(&vvs->rx_queue)) {
+-              struct virtio_vsock_pkt *last_pkt;
++      if (len <= GOOD_COPY_LEN && !skb_queue_empty(&vvs->rx_queue)) {
++              struct virtio_vsock_hdr *last_hdr;
++              struct sk_buff *last_skb;
+-              last_pkt = list_last_entry(&vvs->rx_queue,
+-                                         struct virtio_vsock_pkt, list);
++              last_skb = skb_peek_tail(&vvs->rx_queue);
++              last_hdr = virtio_vsock_hdr(last_skb);
+               /* If there is space in the last packet queued, we copy the
+                * new packet in its buffer. We avoid this if the last packet
+@@ -1052,35 +1066,35 @@ virtio_transport_recv_enqueue(struct vsock_sock *vsk,
+                * delimiter of SEQPACKET message, so 'pkt' is the first packet
+                * of a new message.
+                */
+-              if ((pkt->len <= last_pkt->buf_len - last_pkt->len) &&
+-                  !(le32_to_cpu(last_pkt->hdr.flags) & VIRTIO_VSOCK_SEQ_EOM)) {
+-                      memcpy(last_pkt->buf + last_pkt->len, pkt->buf,
+-                             pkt->len);
+-                      last_pkt->len += pkt->len;
++              if (skb->len < skb_tailroom(last_skb) &&
++                  !(le32_to_cpu(last_hdr->flags) & VIRTIO_VSOCK_SEQ_EOM)) {
++                      memcpy(skb_put(last_skb, skb->len), skb->data, skb->len);
+                       free_pkt = true;
+-                      last_pkt->hdr.flags |= pkt->hdr.flags;
++                      last_hdr->flags |= hdr->flags;
++                      last_hdr->len = cpu_to_le32(last_skb->len);
+                       goto out;
+               }
+       }
+-      list_add_tail(&pkt->list, &vvs->rx_queue);
++      __skb_queue_tail(&vvs->rx_queue, skb);
+ out:
+       spin_unlock_bh(&vvs->rx_lock);
+       if (free_pkt)
+-              virtio_transport_free_pkt(pkt);
++              kfree_skb(skb);
+ }
+ static int
+ virtio_transport_recv_connected(struct sock *sk,
+-                              struct virtio_vsock_pkt *pkt)
++                              struct sk_buff *skb)
+ {
++      struct virtio_vsock_hdr *hdr = virtio_vsock_hdr(skb);
+       struct vsock_sock *vsk = vsock_sk(sk);
+       int err = 0;
+-      switch (le16_to_cpu(pkt->hdr.op)) {
++      switch (le16_to_cpu(hdr->op)) {
+       case VIRTIO_VSOCK_OP_RW:
+-              virtio_transport_recv_enqueue(vsk, pkt);
++              virtio_transport_recv_enqueue(vsk, skb);
+               vsock_data_ready(sk);
+               return err;
+       case VIRTIO_VSOCK_OP_CREDIT_REQUEST:
+@@ -1090,18 +1104,17 @@ virtio_transport_recv_connected(struct sock *sk,
+               sk->sk_write_space(sk);
+               break;
+       case VIRTIO_VSOCK_OP_SHUTDOWN:
+-              if (le32_to_cpu(pkt->hdr.flags) & VIRTIO_VSOCK_SHUTDOWN_RCV)
++              if (le32_to_cpu(hdr->flags) & VIRTIO_VSOCK_SHUTDOWN_RCV)
+                       vsk->peer_shutdown |= RCV_SHUTDOWN;
+-              if (le32_to_cpu(pkt->hdr.flags) & VIRTIO_VSOCK_SHUTDOWN_SEND)
++              if (le32_to_cpu(hdr->flags) & VIRTIO_VSOCK_SHUTDOWN_SEND)
+                       vsk->peer_shutdown |= SEND_SHUTDOWN;
+               if (vsk->peer_shutdown == SHUTDOWN_MASK &&
+                   vsock_stream_has_data(vsk) <= 0 &&
+                   !sock_flag(sk, SOCK_DONE)) {
+                       (void)virtio_transport_reset(vsk, NULL);
+-
+                       virtio_transport_do_close(vsk, true);
+               }
+-              if (le32_to_cpu(pkt->hdr.flags))
++              if (le32_to_cpu(virtio_vsock_hdr(skb)->flags))
+                       sk->sk_state_change(sk);
+               break;
+       case VIRTIO_VSOCK_OP_RST:
+@@ -1112,28 +1125,30 @@ virtio_transport_recv_connected(struct sock *sk,
+               break;
+       }
+-      virtio_transport_free_pkt(pkt);
++      kfree_skb(skb);
+       return err;
+ }
+ static void
+ virtio_transport_recv_disconnecting(struct sock *sk,
+-                                  struct virtio_vsock_pkt *pkt)
++                                  struct sk_buff *skb)
+ {
++      struct virtio_vsock_hdr *hdr = virtio_vsock_hdr(skb);
+       struct vsock_sock *vsk = vsock_sk(sk);
+-      if (le16_to_cpu(pkt->hdr.op) == VIRTIO_VSOCK_OP_RST)
++      if (le16_to_cpu(hdr->op) == VIRTIO_VSOCK_OP_RST)
+               virtio_transport_do_close(vsk, true);
+ }
+ static int
+ virtio_transport_send_response(struct vsock_sock *vsk,
+-                             struct virtio_vsock_pkt *pkt)
++                             struct sk_buff *skb)
+ {
++      struct virtio_vsock_hdr *hdr = virtio_vsock_hdr(skb);
+       struct virtio_vsock_pkt_info info = {
+               .op = VIRTIO_VSOCK_OP_RESPONSE,
+-              .remote_cid = le64_to_cpu(pkt->hdr.src_cid),
+-              .remote_port = le32_to_cpu(pkt->hdr.src_port),
++              .remote_cid = le64_to_cpu(hdr->src_cid),
++              .remote_port = le32_to_cpu(hdr->src_port),
+               .reply = true,
+               .vsk = vsk,
+       };
+@@ -1142,8 +1157,9 @@ virtio_transport_send_response(struct vsock_sock *vsk,
+ }
+ static bool virtio_transport_space_update(struct sock *sk,
+-                                        struct virtio_vsock_pkt *pkt)
++                                        struct sk_buff *skb)
+ {
++      struct virtio_vsock_hdr *hdr = virtio_vsock_hdr(skb);
+       struct vsock_sock *vsk = vsock_sk(sk);
+       struct virtio_vsock_sock *vvs = vsk->trans;
+       bool space_available;
+@@ -1158,8 +1174,8 @@ static bool virtio_transport_space_update(struct sock *sk,
+       /* buf_alloc and fwd_cnt is always included in the hdr */
+       spin_lock_bh(&vvs->tx_lock);
+-      vvs->peer_buf_alloc = le32_to_cpu(pkt->hdr.buf_alloc);
+-      vvs->peer_fwd_cnt = le32_to_cpu(pkt->hdr.fwd_cnt);
++      vvs->peer_buf_alloc = le32_to_cpu(hdr->buf_alloc);
++      vvs->peer_fwd_cnt = le32_to_cpu(hdr->fwd_cnt);
+       space_available = virtio_transport_has_space(vsk);
+       spin_unlock_bh(&vvs->tx_lock);
+       return space_available;
+@@ -1167,27 +1183,28 @@ static bool virtio_transport_space_update(struct sock *sk,
+ /* Handle server socket */
+ static int
+-virtio_transport_recv_listen(struct sock *sk, struct virtio_vsock_pkt *pkt,
++virtio_transport_recv_listen(struct sock *sk, struct sk_buff *skb,
+                            struct virtio_transport *t)
+ {
++      struct virtio_vsock_hdr *hdr = virtio_vsock_hdr(skb);
+       struct vsock_sock *vsk = vsock_sk(sk);
+       struct vsock_sock *vchild;
+       struct sock *child;
+       int ret;
+-      if (le16_to_cpu(pkt->hdr.op) != VIRTIO_VSOCK_OP_REQUEST) {
+-              virtio_transport_reset_no_sock(t, pkt);
++      if (le16_to_cpu(hdr->op) != VIRTIO_VSOCK_OP_REQUEST) {
++              virtio_transport_reset_no_sock(t, skb);
+               return -EINVAL;
+       }
+       if (sk_acceptq_is_full(sk)) {
+-              virtio_transport_reset_no_sock(t, pkt);
++              virtio_transport_reset_no_sock(t, skb);
+               return -ENOMEM;
+       }
+       child = vsock_create_connected(sk);
+       if (!child) {
+-              virtio_transport_reset_no_sock(t, pkt);
++              virtio_transport_reset_no_sock(t, skb);
+               return -ENOMEM;
+       }
+@@ -1198,10 +1215,10 @@ virtio_transport_recv_listen(struct sock *sk, struct virtio_vsock_pkt *pkt,
+       child->sk_state = TCP_ESTABLISHED;
+       vchild = vsock_sk(child);
+-      vsock_addr_init(&vchild->local_addr, le64_to_cpu(pkt->hdr.dst_cid),
+-                      le32_to_cpu(pkt->hdr.dst_port));
+-      vsock_addr_init(&vchild->remote_addr, le64_to_cpu(pkt->hdr.src_cid),
+-                      le32_to_cpu(pkt->hdr.src_port));
++      vsock_addr_init(&vchild->local_addr, le64_to_cpu(hdr->dst_cid),
++                      le32_to_cpu(hdr->dst_port));
++      vsock_addr_init(&vchild->remote_addr, le64_to_cpu(hdr->src_cid),
++                      le32_to_cpu(hdr->src_port));
+       ret = vsock_assign_transport(vchild, vsk);
+       /* Transport assigned (looking at remote_addr) must be the same
+@@ -1209,17 +1226,17 @@ virtio_transport_recv_listen(struct sock *sk, struct virtio_vsock_pkt *pkt,
+        */
+       if (ret || vchild->transport != &t->transport) {
+               release_sock(child);
+-              virtio_transport_reset_no_sock(t, pkt);
++              virtio_transport_reset_no_sock(t, skb);
+               sock_put(child);
+               return ret;
+       }
+-      if (virtio_transport_space_update(child, pkt))
++      if (virtio_transport_space_update(child, skb))
+               child->sk_write_space(child);
+       vsock_insert_connected(vchild);
+       vsock_enqueue_accept(sk, child);
+-      virtio_transport_send_response(vchild, pkt);
++      virtio_transport_send_response(vchild, skb);
+       release_sock(child);
+@@ -1237,29 +1254,30 @@ static bool virtio_transport_valid_type(u16 type)
+  * lock.
+  */
+ void virtio_transport_recv_pkt(struct virtio_transport *t,
+-                             struct virtio_vsock_pkt *pkt)
++                             struct sk_buff *skb)
+ {
++      struct virtio_vsock_hdr *hdr = virtio_vsock_hdr(skb);
+       struct sockaddr_vm src, dst;
+       struct vsock_sock *vsk;
+       struct sock *sk;
+       bool space_available;
+-      vsock_addr_init(&src, le64_to_cpu(pkt->hdr.src_cid),
+-                      le32_to_cpu(pkt->hdr.src_port));
+-      vsock_addr_init(&dst, le64_to_cpu(pkt->hdr.dst_cid),
+-                      le32_to_cpu(pkt->hdr.dst_port));
++      vsock_addr_init(&src, le64_to_cpu(hdr->src_cid),
++                      le32_to_cpu(hdr->src_port));
++      vsock_addr_init(&dst, le64_to_cpu(hdr->dst_cid),
++                      le32_to_cpu(hdr->dst_port));
+       trace_virtio_transport_recv_pkt(src.svm_cid, src.svm_port,
+                                       dst.svm_cid, dst.svm_port,
+-                                      le32_to_cpu(pkt->hdr.len),
+-                                      le16_to_cpu(pkt->hdr.type),
+-                                      le16_to_cpu(pkt->hdr.op),
+-                                      le32_to_cpu(pkt->hdr.flags),
+-                                      le32_to_cpu(pkt->hdr.buf_alloc),
+-                                      le32_to_cpu(pkt->hdr.fwd_cnt));
+-
+-      if (!virtio_transport_valid_type(le16_to_cpu(pkt->hdr.type))) {
+-              (void)virtio_transport_reset_no_sock(t, pkt);
++                                      le32_to_cpu(hdr->len),
++                                      le16_to_cpu(hdr->type),
++                                      le16_to_cpu(hdr->op),
++                                      le32_to_cpu(hdr->flags),
++                                      le32_to_cpu(hdr->buf_alloc),
++                                      le32_to_cpu(hdr->fwd_cnt));
++
++      if (!virtio_transport_valid_type(le16_to_cpu(hdr->type))) {
++              (void)virtio_transport_reset_no_sock(t, skb);
+               goto free_pkt;
+       }
+@@ -1270,13 +1288,13 @@ void virtio_transport_recv_pkt(struct virtio_transport *t,
+       if (!sk) {
+               sk = vsock_find_bound_socket(&dst);
+               if (!sk) {
+-                      (void)virtio_transport_reset_no_sock(t, pkt);
++                      (void)virtio_transport_reset_no_sock(t, skb);
+                       goto free_pkt;
+               }
+       }
+-      if (virtio_transport_get_type(sk) != le16_to_cpu(pkt->hdr.type)) {
+-              (void)virtio_transport_reset_no_sock(t, pkt);
++      if (virtio_transport_get_type(sk) != le16_to_cpu(hdr->type)) {
++              (void)virtio_transport_reset_no_sock(t, skb);
+               sock_put(sk);
+               goto free_pkt;
+       }
+@@ -1287,13 +1305,13 @@ void virtio_transport_recv_pkt(struct virtio_transport *t,
+       /* Check if sk has been closed before lock_sock */
+       if (sock_flag(sk, SOCK_DONE)) {
+-              (void)virtio_transport_reset_no_sock(t, pkt);
++              (void)virtio_transport_reset_no_sock(t, skb);
+               release_sock(sk);
+               sock_put(sk);
+               goto free_pkt;
+       }
+-      space_available = virtio_transport_space_update(sk, pkt);
++      space_available = virtio_transport_space_update(sk, skb);
+       /* Update CID in case it has changed after a transport reset event */
+       if (vsk->local_addr.svm_cid != VMADDR_CID_ANY)
+@@ -1304,23 +1322,23 @@ void virtio_transport_recv_pkt(struct virtio_transport *t,
+       switch (sk->sk_state) {
+       case TCP_LISTEN:
+-              virtio_transport_recv_listen(sk, pkt, t);
+-              virtio_transport_free_pkt(pkt);
++              virtio_transport_recv_listen(sk, skb, t);
++              kfree_skb(skb);
+               break;
+       case TCP_SYN_SENT:
+-              virtio_transport_recv_connecting(sk, pkt);
+-              virtio_transport_free_pkt(pkt);
++              virtio_transport_recv_connecting(sk, skb);
++              kfree_skb(skb);
+               break;
+       case TCP_ESTABLISHED:
+-              virtio_transport_recv_connected(sk, pkt);
++              virtio_transport_recv_connected(sk, skb);
+               break;
+       case TCP_CLOSING:
+-              virtio_transport_recv_disconnecting(sk, pkt);
+-              virtio_transport_free_pkt(pkt);
++              virtio_transport_recv_disconnecting(sk, skb);
++              kfree_skb(skb);
+               break;
+       default:
+-              (void)virtio_transport_reset_no_sock(t, pkt);
+-              virtio_transport_free_pkt(pkt);
++              (void)virtio_transport_reset_no_sock(t, skb);
++              kfree_skb(skb);
+               break;
+       }
+@@ -1333,16 +1351,42 @@ void virtio_transport_recv_pkt(struct virtio_transport *t,
+       return;
+ free_pkt:
+-      virtio_transport_free_pkt(pkt);
++      kfree_skb(skb);
+ }
+ EXPORT_SYMBOL_GPL(virtio_transport_recv_pkt);
+-void virtio_transport_free_pkt(struct virtio_vsock_pkt *pkt)
++/* Remove skbs found in a queue that have a vsk that matches.
++ *
++ * Each skb is freed.
++ *
++ * Returns the count of skbs that were reply packets.
++ */
++int virtio_transport_purge_skbs(void *vsk, struct sk_buff_head *queue)
+ {
+-      kvfree(pkt->buf);
+-      kfree(pkt);
++      struct sk_buff_head freeme;
++      struct sk_buff *skb, *tmp;
++      int cnt = 0;
++
++      skb_queue_head_init(&freeme);
++
++      spin_lock_bh(&queue->lock);
++      skb_queue_walk_safe(queue, skb, tmp) {
++              if (vsock_sk(skb->sk) != vsk)
++                      continue;
++
++              __skb_unlink(skb, queue);
++              __skb_queue_tail(&freeme, skb);
++
++              if (virtio_vsock_skb_reply(skb))
++                      cnt++;
++      }
++      spin_unlock_bh(&queue->lock);
++
++      __skb_queue_purge(&freeme);
++
++      return cnt;
+ }
+-EXPORT_SYMBOL_GPL(virtio_transport_free_pkt);
++EXPORT_SYMBOL_GPL(virtio_transport_purge_skbs);
+ MODULE_LICENSE("GPL v2");
+ MODULE_AUTHOR("Asias He");
+diff --git a/net/vmw_vsock/vsock_loopback.c b/net/vmw_vsock/vsock_loopback.c
+index 169a8cf65b390..671e03240fc52 100644
+--- a/net/vmw_vsock/vsock_loopback.c
++++ b/net/vmw_vsock/vsock_loopback.c
+@@ -16,7 +16,7 @@ struct vsock_loopback {
+       struct workqueue_struct *workqueue;
+       spinlock_t pkt_list_lock; /* protects pkt_list */
+-      struct list_head pkt_list;
++      struct sk_buff_head pkt_queue;
+       struct work_struct pkt_work;
+ };
+@@ -27,13 +27,13 @@ static u32 vsock_loopback_get_local_cid(void)
+       return VMADDR_CID_LOCAL;
+ }
+-static int vsock_loopback_send_pkt(struct virtio_vsock_pkt *pkt)
++static int vsock_loopback_send_pkt(struct sk_buff *skb)
+ {
+       struct vsock_loopback *vsock = &the_vsock_loopback;
+-      int len = pkt->len;
++      int len = skb->len;
+       spin_lock_bh(&vsock->pkt_list_lock);
+-      list_add_tail(&pkt->list, &vsock->pkt_list);
++      skb_queue_tail(&vsock->pkt_queue, skb);
+       spin_unlock_bh(&vsock->pkt_list_lock);
+       queue_work(vsock->workqueue, &vsock->pkt_work);
+@@ -44,21 +44,8 @@ static int vsock_loopback_send_pkt(struct virtio_vsock_pkt *pkt)
+ static int vsock_loopback_cancel_pkt(struct vsock_sock *vsk)
+ {
+       struct vsock_loopback *vsock = &the_vsock_loopback;
+-      struct virtio_vsock_pkt *pkt, *n;
+-      LIST_HEAD(freeme);
+-      spin_lock_bh(&vsock->pkt_list_lock);
+-      list_for_each_entry_safe(pkt, n, &vsock->pkt_list, list) {
+-              if (pkt->vsk != vsk)
+-                      continue;
+-              list_move(&pkt->list, &freeme);
+-      }
+-      spin_unlock_bh(&vsock->pkt_list_lock);
+-
+-      list_for_each_entry_safe(pkt, n, &freeme, list) {
+-              list_del(&pkt->list);
+-              virtio_transport_free_pkt(pkt);
+-      }
++      virtio_transport_purge_skbs(vsk, &vsock->pkt_queue);
+       return 0;
+ }
+@@ -121,20 +108,18 @@ static void vsock_loopback_work(struct work_struct *work)
+ {
+       struct vsock_loopback *vsock =
+               container_of(work, struct vsock_loopback, pkt_work);
+-      LIST_HEAD(pkts);
++      struct sk_buff_head pkts;
++      struct sk_buff *skb;
++
++      skb_queue_head_init(&pkts);
+       spin_lock_bh(&vsock->pkt_list_lock);
+-      list_splice_init(&vsock->pkt_list, &pkts);
++      skb_queue_splice_init(&vsock->pkt_queue, &pkts);
+       spin_unlock_bh(&vsock->pkt_list_lock);
+-      while (!list_empty(&pkts)) {
+-              struct virtio_vsock_pkt *pkt;
+-
+-              pkt = list_first_entry(&pkts, struct virtio_vsock_pkt, list);
+-              list_del_init(&pkt->list);
+-
+-              virtio_transport_deliver_tap_pkt(pkt);
+-              virtio_transport_recv_pkt(&loopback_transport, pkt);
++      while ((skb = __skb_dequeue(&pkts))) {
++              virtio_transport_deliver_tap_pkt(skb);
++              virtio_transport_recv_pkt(&loopback_transport, skb);
+       }
+ }
+@@ -148,7 +133,7 @@ static int __init vsock_loopback_init(void)
+               return -ENOMEM;
+       spin_lock_init(&vsock->pkt_list_lock);
+-      INIT_LIST_HEAD(&vsock->pkt_list);
++      skb_queue_head_init(&vsock->pkt_queue);
+       INIT_WORK(&vsock->pkt_work, vsock_loopback_work);
+       ret = vsock_core_register(&loopback_transport.transport,
+@@ -166,19 +151,13 @@ static int __init vsock_loopback_init(void)
+ static void __exit vsock_loopback_exit(void)
+ {
+       struct vsock_loopback *vsock = &the_vsock_loopback;
+-      struct virtio_vsock_pkt *pkt;
+       vsock_core_unregister(&loopback_transport.transport);
+       flush_work(&vsock->pkt_work);
+       spin_lock_bh(&vsock->pkt_list_lock);
+-      while (!list_empty(&vsock->pkt_list)) {
+-              pkt = list_first_entry(&vsock->pkt_list,
+-                                     struct virtio_vsock_pkt, list);
+-              list_del(&pkt->list);
+-              virtio_transport_free_pkt(pkt);
+-      }
++      virtio_vsock_skb_queue_purge(&vsock->pkt_queue);
+       spin_unlock_bh(&vsock->pkt_list_lock);
+       destroy_workqueue(vsock->workqueue);
+-- 
+2.42.0
+
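For orientation, a minimal sketch (not part of the patch above) of how a transport could call the new virtio_transport_purge_skbs() helper from its cancel_pkt callback, much like the vsock_loopback conversion does; the my_transport names are hypothetical.

#include <linux/skbuff.h>
#include <linux/virtio_vsock.h>
#include <net/af_vsock.h>

/* Hypothetical per-transport state holding not-yet-transmitted skbs. */
struct my_transport {
	struct sk_buff_head tx_queue;
};

static struct my_transport my_transport;

/* Called when a socket is torn down: drop every queued skb it owns.
 * virtio_transport_purge_skbs() takes the queue lock internally and
 * returns how many of the freed skbs were reply packets.
 */
static int my_transport_cancel_pkt(struct vsock_sock *vsk)
{
	return virtio_transport_purge_skbs(vsk, &my_transport.tx_queue);
}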
diff --git a/queue-6.1/vsock-virtio-remove-socket-from-connected-bound-list.patch b/queue-6.1/vsock-virtio-remove-socket-from-connected-bound-list.patch
new file mode 100644 (file)
index 0000000..012b78e
--- /dev/null
@@ -0,0 +1,75 @@
+From 1ee503bd1f558d4497498a5f553aa2e0961bfa18 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Fri, 3 Nov 2023 18:55:48 +0100
+Subject: vsock/virtio: remove socket from connected/bound list on shutdown
+
+From: Filippo Storniolo <f.storniolo95@gmail.com>
+
+[ Upstream commit 3a5cc90a4d1756072619fe511d07621bdef7f120 ]
+
+If the same remote peer, using the same port, tries to connect
+to a server on a listening port more than once, the server will
+reject the connection, causing a "connection reset by peer"
+error on the remote peer. This is due to the presence of a
+dangling socket from a previous connection in both the connected
+and bound socket lists.
+The inconsistency of the above lists only occurs when the remote
+peer disconnects and the server remains active.
+
+This bug does not occur when the server socket is closed:
+virtio_transport_release() will eventually schedule a call to
+virtio_transport_do_close() and the latter will remove the socket
+from the bound and connected socket lists and clear the sk_buff.
+
+However, virtio_transport_do_close() will only perform the above
+actions if it has been scheduled, and this will not happen
+if the server is processing the shutdown message from a remote peer.
+
+To fix this, introduce a call to vsock_remove_sock()
+when the server is handling a client disconnect.
+This is to remove the socket from the bound and connected socket
+lists without clearing the sk_buff.
+
+Fixes: 06a8fc78367d ("VSOCK: Introduce virtio_vsock_common.ko")
+Reported-by: Daan De Meyer <daan.j.demeyer@gmail.com>
+Tested-by: Daan De Meyer <daan.j.demeyer@gmail.com>
+Co-developed-by: Luigi Leonardi <luigi.leonardi@outlook.com>
+Signed-off-by: Luigi Leonardi <luigi.leonardi@outlook.com>
+Signed-off-by: Filippo Storniolo <f.storniolo95@gmail.com>
+Reviewed-by: Stefano Garzarella <sgarzare@redhat.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ net/vmw_vsock/virtio_transport_common.c | 16 +++++++++++-----
+ 1 file changed, 11 insertions(+), 5 deletions(-)
+
+diff --git a/net/vmw_vsock/virtio_transport_common.c b/net/vmw_vsock/virtio_transport_common.c
+index a1581c77cf84a..73e5093928325 100644
+--- a/net/vmw_vsock/virtio_transport_common.c
++++ b/net/vmw_vsock/virtio_transport_common.c
+@@ -1108,11 +1108,17 @@ virtio_transport_recv_connected(struct sock *sk,
+                       vsk->peer_shutdown |= RCV_SHUTDOWN;
+               if (le32_to_cpu(hdr->flags) & VIRTIO_VSOCK_SHUTDOWN_SEND)
+                       vsk->peer_shutdown |= SEND_SHUTDOWN;
+-              if (vsk->peer_shutdown == SHUTDOWN_MASK &&
+-                  vsock_stream_has_data(vsk) <= 0 &&
+-                  !sock_flag(sk, SOCK_DONE)) {
+-                      (void)virtio_transport_reset(vsk, NULL);
+-                      virtio_transport_do_close(vsk, true);
++              if (vsk->peer_shutdown == SHUTDOWN_MASK) {
++                      if (vsock_stream_has_data(vsk) <= 0 && !sock_flag(sk, SOCK_DONE)) {
++                              (void)virtio_transport_reset(vsk, NULL);
++                              virtio_transport_do_close(vsk, true);
++                      }
++                      /* Remove this socket anyway because the remote peer sent
++                       * the shutdown. This way a new connection will succeed
++                       * if the remote peer uses the same source port,
++                       * even if the old socket is still unreleased, but now disconnected.
++                       */
++                      vsock_remove_sock(vsk);
+               }
+               if (le32_to_cpu(virtio_vsock_hdr(skb)->flags))
+                       sk->sk_state_change(sk);
+-- 
+2.42.0
+
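To make the failure mode described above concrete, here is a minimal userspace sketch of the reproduction scenario (an illustration under assumptions, not taken from the patch): a guest client pins its source port with bind() and connects twice while the server keeps listening; before the fix the second attempt fails with "connection reset by peer". The CID and port values are hypothetical.

#include <stdio.h>
#include <unistd.h>
#include <sys/socket.h>
#include <linux/vm_sockets.h>

/* Hypothetical values: adjust to the actual host CID and server port. */
#define SERVER_CID   2
#define SERVER_PORT  1234
#define CLIENT_PORT  4321

static void connect_once(void)
{
	struct sockaddr_vm local = {
		.svm_family = AF_VSOCK,
		.svm_cid = VMADDR_CID_ANY,
		.svm_port = CLIENT_PORT,	/* pin the client source port */
	};
	struct sockaddr_vm remote = {
		.svm_family = AF_VSOCK,
		.svm_cid = SERVER_CID,
		.svm_port = SERVER_PORT,
	};
	int fd = socket(AF_VSOCK, SOCK_STREAM, 0);

	if (fd < 0) {
		perror("socket");
		return;
	}
	if (bind(fd, (struct sockaddr *)&local, sizeof(local)) < 0 ||
	    connect(fd, (struct sockaddr *)&remote, sizeof(remote)) < 0)
		perror("bind/connect");
	close(fd);	/* client disconnects; the server stays up */
}

int main(void)
{
	connect_once();
	connect_once();	/* rejected before the fix, succeeds after it */
	return 0;
}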
diff --git a/queue-6.1/watchdog-ixp4xx-make-sure-restart-always-works.patch b/queue-6.1/watchdog-ixp4xx-make-sure-restart-always-works.patch
new file mode 100644 (file)
index 0000000..20d7e62
--- /dev/null
@@ -0,0 +1,88 @@
+From 9156bf060d4bf888e7dae1dfad5025795047337f Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Tue, 26 Sep 2023 11:13:44 +0200
+Subject: watchdog: ixp4xx: Make sure restart always works
+
+From: Linus Walleij <linus.walleij@linaro.org>
+
+[ Upstream commit b4075ecfe348a44209534c75ad72392c63a489a6 ]
+
+The IXP4xx watchdog in early "A0" silicon is unreliable and
+cannot be registered; however, for some systems, such as the
+USRobotics USR8200, the watchdog is the only restart option,
+so implement a "dummy" watchdog that only supports restart
+in this case.
+
+Fixes: 1aea522809e6 ("watchdog: ixp4xx: Implement restart")
+Signed-off-by: Linus Walleij <linus.walleij@linaro.org>
+Reviewed-by: Guenter Roeck <linux@roeck-us.net>
+Link: https://lore.kernel.org/r/20230926-ixp4xx-wdt-restart-v2-1-15cf4639b423@linaro.org
+Signed-off-by: Guenter Roeck <linux@roeck-us.net>
+Signed-off-by: Wim Van Sebroeck <wim@linux-watchdog.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/watchdog/ixp4xx_wdt.c | 28 +++++++++++++++++++++++++---
+ 1 file changed, 25 insertions(+), 3 deletions(-)
+
+diff --git a/drivers/watchdog/ixp4xx_wdt.c b/drivers/watchdog/ixp4xx_wdt.c
+index 281a48d9889fc..0fc91e9c4a773 100644
+--- a/drivers/watchdog/ixp4xx_wdt.c
++++ b/drivers/watchdog/ixp4xx_wdt.c
+@@ -105,6 +105,25 @@ static const struct watchdog_ops ixp4xx_wdt_ops = {
+       .owner = THIS_MODULE,
+ };
++/*
++ * The A0 version of the IXP422 had a bug in the watchdog making
++ * it useless, but we still need to use it to restart the system
++ * as it is the only way, so in this special case we register a
++ * "dummy" watchdog that doesn't really work, but will support
++ * the restart operation.
++ */
++static int ixp4xx_wdt_dummy(struct watchdog_device *wdd)
++{
++      return 0;
++}
++
++static const struct watchdog_ops ixp4xx_wdt_restart_only_ops = {
++      .start = ixp4xx_wdt_dummy,
++      .stop = ixp4xx_wdt_dummy,
++      .restart = ixp4xx_wdt_restart,
++      .owner = THIS_MODULE,
++};
++
+ static const struct watchdog_info ixp4xx_wdt_info = {
+       .options = WDIOF_KEEPALIVEPING
+               | WDIOF_MAGICCLOSE
+@@ -120,14 +139,17 @@ static void ixp4xx_clock_action(void *d)
+ static int ixp4xx_wdt_probe(struct platform_device *pdev)
+ {
++      static const struct watchdog_ops *iwdt_ops;
+       struct device *dev = &pdev->dev;
+       struct ixp4xx_wdt *iwdt;
+       struct clk *clk;
+       int ret;
+       if (!(read_cpuid_id() & 0xf) && !cpu_is_ixp46x()) {
+-              dev_err(dev, "Rev. A0 IXP42x CPU detected - watchdog disabled\n");
+-              return -ENODEV;
++              dev_info(dev, "Rev. A0 IXP42x CPU detected - only restart supported\n");
++              iwdt_ops = &ixp4xx_wdt_restart_only_ops;
++      } else {
++              iwdt_ops = &ixp4xx_wdt_ops;
+       }
+       iwdt = devm_kzalloc(dev, sizeof(*iwdt), GFP_KERNEL);
+@@ -153,7 +175,7 @@ static int ixp4xx_wdt_probe(struct platform_device *pdev)
+               iwdt->rate = IXP4XX_TIMER_FREQ;
+       iwdt->wdd.info = &ixp4xx_wdt_info;
+-      iwdt->wdd.ops = &ixp4xx_wdt_ops;
++      iwdt->wdd.ops = iwdt_ops;
+       iwdt->wdd.min_timeout = 1;
+       iwdt->wdd.max_timeout = U32_MAX / iwdt->rate;
+       iwdt->wdd.parent = dev;
+-- 
+2.42.0
+
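As a side note, a condensed sketch of the restart-only pattern the patch above relies on (the driver names and the register poke are hypothetical): the start/stop callbacks do nothing, but providing ops->restart lets the watchdog core still register a restart handler for the device.

#include <linux/module.h>
#include <linux/watchdog.h>

static int dummy_wdt_nop(struct watchdog_device *wdd)
{
	return 0;	/* no usable watchdog to start or stop */
}

static int dummy_wdt_restart(struct watchdog_device *wdd,
			     unsigned long action, void *data)
{
	/* Hypothetical: write whatever register actually resets the SoC. */
	return 0;
}

static const struct watchdog_ops dummy_wdt_restart_only_ops = {
	.owner   = THIS_MODULE,
	.start   = dummy_wdt_nop,
	.stop    = dummy_wdt_nop,
	.restart = dummy_wdt_restart,
};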