From: Sasha Levin Date: Sun, 12 Nov 2023 02:50:12 +0000 (-0500) Subject: Fixes for 6.1 X-Git-Tag: v4.14.330~61 X-Git-Url: http://git.ipfire.org/gitweb.cgi?a=commitdiff_plain;h=5e7d656cd16215c93f569fa6f5684d8dfb0e61ad;p=thirdparty%2Fkernel%2Fstable-queue.git Fixes for 6.1 Signed-off-by: Sasha Levin --- diff --git a/queue-6.1/blk-core-use-pr_warn_ratelimited-in-bio_check_ro.patch b/queue-6.1/blk-core-use-pr_warn_ratelimited-in-bio_check_ro.patch new file mode 100644 index 00000000000..070e9d22935 --- /dev/null +++ b/queue-6.1/blk-core-use-pr_warn_ratelimited-in-bio_check_ro.patch @@ -0,0 +1,43 @@ +From 0542337a6f654f25f6109d7fd0f7d620af585d19 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Tue, 7 Nov 2023 19:12:47 +0800 +Subject: blk-core: use pr_warn_ratelimited() in bio_check_ro() + +From: Yu Kuai + +[ Upstream commit 1b0a151c10a6d823f033023b9fdd9af72a89591b ] + +If one of the underlying disks of raid or dm is set to read-only, then +each io will generate new log, which will cause message storm. This +environment is indeed problematic, however we can't make sure our +naive custormer won't do this, hence use pr_warn_ratelimited() to +prevent message storm in this case. + +Signed-off-by: Yu Kuai +Fixes: 57e95e4670d1 ("block: fix and cleanup bio_check_ro") +Signed-off-by: Ye Bin +Link: https://lore.kernel.org/r/20231107111247.2157820-1-yukuai1@huaweicloud.com +Signed-off-by: Jens Axboe +Signed-off-by: Sasha Levin +--- + block/blk-core.c | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +diff --git a/block/blk-core.c b/block/blk-core.c +index ebb7a1689b261..6eaf2b0ad7cca 100644 +--- a/block/blk-core.c ++++ b/block/blk-core.c +@@ -490,8 +490,8 @@ static inline void bio_check_ro(struct bio *bio) + if (op_is_write(bio_op(bio)) && bdev_read_only(bio->bi_bdev)) { + if (op_is_flush(bio->bi_opf) && !bio_sectors(bio)) + return; +- pr_warn("Trying to write to read-only block-device %pg\n", +- bio->bi_bdev); ++ pr_warn_ratelimited("Trying to write to read-only block-device %pg\n", ++ bio->bi_bdev); + /* Older lvm-tools actually trigger this */ + } + } +-- +2.42.0 + diff --git a/queue-6.1/bpf-check-map-usercnt-after-timer-timer-is-assigned.patch b/queue-6.1/bpf-check-map-usercnt-after-timer-timer-is-assigned.patch new file mode 100644 index 00000000000..03c599e17c1 --- /dev/null +++ b/queue-6.1/bpf-check-map-usercnt-after-timer-timer-is-assigned.patch @@ -0,0 +1,113 @@ +From ef364ef17a004ff8e3a33e4168c585acfdfa4568 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Mon, 30 Oct 2023 14:36:16 +0800 +Subject: bpf: Check map->usercnt after timer->timer is assigned + +From: Hou Tao + +[ Upstream commit fd381ce60a2d79cc967506208085336d3d268ae0 ] + +When there are concurrent uref release and bpf timer init operations, +the following sequence diagram is possible. It will break the guarantee +provided by bpf_timer: bpf_timer will still be alive after userspace +application releases or unpins the map. It also will lead to kmemleak +for old kernel version which doesn't release bpf_timer when map is +released. 
+ +bpf program X: + +bpf_timer_init() + lock timer->lock + read timer->timer as NULL + read map->usercnt != 0 + + process Y: + + close(map_fd) + // put last uref + bpf_map_put_uref() + atomic_dec_and_test(map->usercnt) + array_map_free_timers() + bpf_timer_cancel_and_free() + // just return + read timer->timer is NULL + + t = bpf_map_kmalloc_node() + timer->timer = t + unlock timer->lock + +Fix the problem by checking map->usercnt after timer->timer is assigned, +so when there are concurrent uref release and bpf timer init, either +bpf_timer_cancel_and_free() from uref release reads a no-NULL timer +or the newly-added atomic64_read() returns a zero usercnt. + +Because atomic_dec_and_test(map->usercnt) and READ_ONCE(timer->timer) +in bpf_timer_cancel_and_free() are not protected by a lock, so add +a memory barrier to guarantee the order between map->usercnt and +timer->timer. Also use WRITE_ONCE(timer->timer, x) to match the lockless +read of timer->timer in bpf_timer_cancel_and_free(). + +Reported-by: Hsin-Wei Hung +Closes: https://lore.kernel.org/bpf/CABcoxUaT2k9hWsS1tNgXyoU3E-=PuOgMn737qK984fbFmfYixQ@mail.gmail.com +Fixes: b00628b1c7d5 ("bpf: Introduce bpf timers.") +Signed-off-by: Hou Tao +Link: https://lore.kernel.org/r/20231030063616.1653024-1-houtao@huaweicloud.com +Signed-off-by: Alexei Starovoitov +Signed-off-by: Sasha Levin +--- + kernel/bpf/helpers.c | 25 ++++++++++++++++--------- + 1 file changed, 16 insertions(+), 9 deletions(-) + +diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c +index a6b04faed282b..6212e4ae084bb 100644 +--- a/kernel/bpf/helpers.c ++++ b/kernel/bpf/helpers.c +@@ -1156,13 +1156,6 @@ BPF_CALL_3(bpf_timer_init, struct bpf_timer_kern *, timer, struct bpf_map *, map + ret = -EBUSY; + goto out; + } +- if (!atomic64_read(&map->usercnt)) { +- /* maps with timers must be either held by user space +- * or pinned in bpffs. +- */ +- ret = -EPERM; +- goto out; +- } + /* allocate hrtimer via map_kmalloc to use memcg accounting */ + t = bpf_map_kmalloc_node(map, sizeof(*t), GFP_ATOMIC, map->numa_node); + if (!t) { +@@ -1175,7 +1168,21 @@ BPF_CALL_3(bpf_timer_init, struct bpf_timer_kern *, timer, struct bpf_map *, map + rcu_assign_pointer(t->callback_fn, NULL); + hrtimer_init(&t->timer, clockid, HRTIMER_MODE_REL_SOFT); + t->timer.function = bpf_timer_cb; +- timer->timer = t; ++ WRITE_ONCE(timer->timer, t); ++ /* Guarantee the order between timer->timer and map->usercnt. So ++ * when there are concurrent uref release and bpf timer init, either ++ * bpf_timer_cancel_and_free() called by uref release reads a no-NULL ++ * timer or atomic64_read() below returns a zero usercnt. ++ */ ++ smp_mb(); ++ if (!atomic64_read(&map->usercnt)) { ++ /* maps with timers must be either held by user space ++ * or pinned in bpffs. ++ */ ++ WRITE_ONCE(timer->timer, NULL); ++ kfree(t); ++ ret = -EPERM; ++ } + out: + __bpf_spin_unlock_irqrestore(&timer->lock); + return ret; +@@ -1343,7 +1350,7 @@ void bpf_timer_cancel_and_free(void *val) + /* The subsequent bpf_timer_start/cancel() helpers won't be able to use + * this timer, since it won't be initialized. 
+ */ +- timer->timer = NULL; ++ WRITE_ONCE(timer->timer, NULL); + out: + __bpf_spin_unlock_irqrestore(&timer->lock); + if (!t) +-- +2.42.0 + diff --git a/queue-6.1/dccp-call-security_inet_conn_request-after-setting-i.patch b/queue-6.1/dccp-call-security_inet_conn_request-after-setting-i.patch new file mode 100644 index 00000000000..6b935eed89d --- /dev/null +++ b/queue-6.1/dccp-call-security_inet_conn_request-after-setting-i.patch @@ -0,0 +1,59 @@ +From d55df215ba76d0171b2e4892577406bf603ac157 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Mon, 30 Oct 2023 13:10:41 -0700 +Subject: dccp: Call security_inet_conn_request() after setting IPv4 addresses. + +From: Kuniyuki Iwashima + +[ Upstream commit fa2df45af13091f76b89adb84a28f13818d5d631 ] + +Initially, commit 4237c75c0a35 ("[MLSXFRM]: Auto-labeling of child +sockets") introduced security_inet_conn_request() in some functions +where reqsk is allocated. The hook is added just after the allocation, +so reqsk's IPv4 remote address was not initialised then. + +However, SELinux/Smack started to read it in netlbl_req_setattr() +after the cited commits. + +This bug was partially fixed by commit 284904aa7946 ("lsm: Relocate +the IPv4 security_inet_conn_request() hooks"). + +This patch fixes the last bug in DCCPv4. + +Fixes: 389fb800ac8b ("netlabel: Label incoming TCP connections correctly in SELinux") +Fixes: 07feee8f812f ("netlabel: Cleanup the Smack/NetLabel code to fix incoming TCP connections") +Signed-off-by: Kuniyuki Iwashima +Acked-by: Paul Moore +Signed-off-by: Paolo Abeni +Signed-off-by: Sasha Levin +--- + net/dccp/ipv4.c | 6 +++--- + 1 file changed, 3 insertions(+), 3 deletions(-) + +diff --git a/net/dccp/ipv4.c b/net/dccp/ipv4.c +index 247179d4c8865..9fe6d96797169 100644 +--- a/net/dccp/ipv4.c ++++ b/net/dccp/ipv4.c +@@ -628,9 +628,6 @@ int dccp_v4_conn_request(struct sock *sk, struct sk_buff *skb) + if (dccp_parse_options(sk, dreq, skb)) + goto drop_and_free; + +- if (security_inet_conn_request(sk, skb, req)) +- goto drop_and_free; +- + ireq = inet_rsk(req); + sk_rcv_saddr_set(req_to_sk(req), ip_hdr(skb)->daddr); + sk_daddr_set(req_to_sk(req), ip_hdr(skb)->saddr); +@@ -638,6 +635,9 @@ int dccp_v4_conn_request(struct sock *sk, struct sk_buff *skb) + ireq->ireq_family = AF_INET; + ireq->ir_iif = READ_ONCE(sk->sk_bound_dev_if); + ++ if (security_inet_conn_request(sk, skb, req)) ++ goto drop_and_free; ++ + /* + * Step 3: Process LISTEN state + * +-- +2.42.0 + diff --git a/queue-6.1/dccp-tcp-call-security_inet_conn_request-after-setti.patch b/queue-6.1/dccp-tcp-call-security_inet_conn_request-after-setti.patch new file mode 100644 index 00000000000..0bc57bec00e --- /dev/null +++ b/queue-6.1/dccp-tcp-call-security_inet_conn_request-after-setti.patch @@ -0,0 +1,85 @@ +From 41c12a435b1d8d7850233fc50e0898436c63ae67 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Mon, 30 Oct 2023 13:10:42 -0700 +Subject: dccp/tcp: Call security_inet_conn_request() after setting IPv6 + addresses. + +From: Kuniyuki Iwashima + +[ Upstream commit 23be1e0e2a83a8543214d2599a31d9a2185a796b ] + +Initially, commit 4237c75c0a35 ("[MLSXFRM]: Auto-labeling of child +sockets") introduced security_inet_conn_request() in some functions +where reqsk is allocated. The hook is added just after the allocation, +so reqsk's IPv6 remote address was not initialised then. + +However, SELinux/Smack started to read it in netlbl_req_setattr() +after commit e1adea927080 ("calipso: Allow request sockets to be +relabelled by the lsm."). 
+ +Commit 284904aa7946 ("lsm: Relocate the IPv4 security_inet_conn_request() +hooks") fixed that kind of issue only in TCPv4 because IPv6 labeling was +not supported at that time. Finally, the same issue was introduced again +in IPv6. + +Let's apply the same fix on DCCPv6 and TCPv6. + +Fixes: e1adea927080 ("calipso: Allow request sockets to be relabelled by the lsm.") +Signed-off-by: Kuniyuki Iwashima +Acked-by: Paul Moore +Signed-off-by: Paolo Abeni +Signed-off-by: Sasha Levin +--- + net/dccp/ipv6.c | 6 +++--- + net/ipv6/syncookies.c | 7 ++++--- + 2 files changed, 7 insertions(+), 6 deletions(-) + +diff --git a/net/dccp/ipv6.c b/net/dccp/ipv6.c +index 6fb34eaf1237a..e0b0bf75a46c2 100644 +--- a/net/dccp/ipv6.c ++++ b/net/dccp/ipv6.c +@@ -359,15 +359,15 @@ static int dccp_v6_conn_request(struct sock *sk, struct sk_buff *skb) + if (dccp_parse_options(sk, dreq, skb)) + goto drop_and_free; + +- if (security_inet_conn_request(sk, skb, req)) +- goto drop_and_free; +- + ireq = inet_rsk(req); + ireq->ir_v6_rmt_addr = ipv6_hdr(skb)->saddr; + ireq->ir_v6_loc_addr = ipv6_hdr(skb)->daddr; + ireq->ireq_family = AF_INET6; + ireq->ir_mark = inet_request_mark(sk, skb); + ++ if (security_inet_conn_request(sk, skb, req)) ++ goto drop_and_free; ++ + if (ipv6_opt_accepted(sk, skb, IP6CB(skb)) || + np->rxopt.bits.rxinfo || np->rxopt.bits.rxoinfo || + np->rxopt.bits.rxhlim || np->rxopt.bits.rxohlim) { +diff --git a/net/ipv6/syncookies.c b/net/ipv6/syncookies.c +index 5014aa6634527..8698b49dfc8de 100644 +--- a/net/ipv6/syncookies.c ++++ b/net/ipv6/syncookies.c +@@ -180,14 +180,15 @@ struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb) + treq = tcp_rsk(req); + treq->tfo_listener = false; + +- if (security_inet_conn_request(sk, skb, req)) +- goto out_free; +- + req->mss = mss; + ireq->ir_rmt_port = th->source; + ireq->ir_num = ntohs(th->dest); + ireq->ir_v6_rmt_addr = ipv6_hdr(skb)->saddr; + ireq->ir_v6_loc_addr = ipv6_hdr(skb)->daddr; ++ ++ if (security_inet_conn_request(sk, skb, req)) ++ goto out_free; ++ + if (ipv6_opt_accepted(sk, skb, &TCP_SKB_CB(skb)->header.h6) || + np->rxopt.bits.rxinfo || np->rxopt.bits.rxoinfo || + np->rxopt.bits.rxhlim || np->rxopt.bits.rxohlim) { +-- +2.42.0 + diff --git a/queue-6.1/fix-termination-state-for-idr_for_each_entry_ul.patch b/queue-6.1/fix-termination-state-for-idr_for_each_entry_ul.patch new file mode 100644 index 00000000000..b8398c8b19a --- /dev/null +++ b/queue-6.1/fix-termination-state-for-idr_for_each_entry_ul.patch @@ -0,0 +1,64 @@ +From 3be08cb79ea133375cf3b3ab3bd02118f119bb6f Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Tue, 24 Oct 2023 09:53:33 +1100 +Subject: Fix termination state for idr_for_each_entry_ul() + +From: NeilBrown + +[ Upstream commit e8ae8ad479e2d037daa33756e5e72850a7bd37a9 ] + +The comment for idr_for_each_entry_ul() states + + after normal termination @entry is left with the value NULL + +This is not correct in the case where UINT_MAX has an entry in the idr. +In that case @entry will be non-NULL after termination. +No current code depends on the documentation being correct, but to +save future code we should fix it. + +Also fix idr_for_each_entry_continue_ul(). While this is not documented +as leaving @entry as NULL, the mellanox driver appears to depend on +it doing so. So make that explicit in the documentation as well as in +the code. + +Fixes: e33d2b74d805 ("idr: fix overflow case for idr_for_each_entry_ul()") +Cc: Matthew Wilcox +Cc: Chris Mi +Cc: Cong Wang +Signed-off-by: NeilBrown +Signed-off-by: David S. 
Miller +Signed-off-by: Sasha Levin +--- + include/linux/idr.h | 6 ++++-- + 1 file changed, 4 insertions(+), 2 deletions(-) + +diff --git a/include/linux/idr.h b/include/linux/idr.h +index a0dce14090a9e..da5f5fa4a3a6a 100644 +--- a/include/linux/idr.h ++++ b/include/linux/idr.h +@@ -200,7 +200,7 @@ static inline void idr_preload_end(void) + */ + #define idr_for_each_entry_ul(idr, entry, tmp, id) \ + for (tmp = 0, id = 0; \ +- tmp <= id && ((entry) = idr_get_next_ul(idr, &(id))) != NULL; \ ++ ((entry) = tmp <= id ? idr_get_next_ul(idr, &(id)) : NULL) != NULL; \ + tmp = id, ++id) + + /** +@@ -224,10 +224,12 @@ static inline void idr_preload_end(void) + * @id: Entry ID. + * + * Continue to iterate over entries, continuing after the current position. ++ * After normal termination @entry is left with the value NULL. This ++ * is convenient for a "not found" value. + */ + #define idr_for_each_entry_continue_ul(idr, entry, tmp, id) \ + for (tmp = id; \ +- tmp <= id && ((entry) = idr_get_next_ul(idr, &(id))) != NULL; \ ++ ((entry) = tmp <= id ? idr_get_next_ul(idr, &(id)) : NULL) != NULL; \ + tmp = id, ++id) + + /* +-- +2.42.0 + diff --git a/queue-6.1/hsr-prevent-use-after-free-in-prp_create_tagged_fram.patch b/queue-6.1/hsr-prevent-use-after-free-in-prp_create_tagged_fram.patch new file mode 100644 index 00000000000..12aa62e5d8e --- /dev/null +++ b/queue-6.1/hsr-prevent-use-after-free-in-prp_create_tagged_fram.patch @@ -0,0 +1,42 @@ +From 9e7063b06e42ddfb71d39500f0b6e465401d09d1 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Fri, 27 Oct 2023 15:19:01 +0300 +Subject: hsr: Prevent use after free in prp_create_tagged_frame() + +From: Dan Carpenter + +[ Upstream commit 876f8ab52363f649bcc74072157dfd7adfbabc0d ] + +The prp_fill_rct() function can fail. In that situation, it frees the +skb and returns NULL. Meanwhile on the success path, it returns the +original skb. So it's straight forward to fix bug by using the returned +value. 
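+
+As a rough sketch of the fixed shape (fill_trailer() and tag_frame()
+are hypothetical names used for illustration, not the actual hsr code):
+
+	static struct sk_buff *tag_frame(struct sk_buff *skb)
+	{
+		/* fill_trailer() frees skb and returns NULL on failure,
+		 * or returns skb on success.  Returning the local "skb"
+		 * after the call would hand a freed pointer back on the
+		 * error path, so propagate the helper's result instead.
+		 */
+		return fill_trailer(skb);
+	}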
+ +Fixes: 451d8123f897 ("net: prp: add packet handling support") +Signed-off-by: Dan Carpenter +Acked-by: Paolo Abeni +Link: https://lore.kernel.org/r/57af1f28-7f57-4a96-bcd3-b7a0f2340845@moroto.mountain +Signed-off-by: Jakub Kicinski +Signed-off-by: Sasha Levin +--- + net/hsr/hsr_forward.c | 4 +--- + 1 file changed, 1 insertion(+), 3 deletions(-) + +diff --git a/net/hsr/hsr_forward.c b/net/hsr/hsr_forward.c +index b71dab630a873..80cdc6f6b34c9 100644 +--- a/net/hsr/hsr_forward.c ++++ b/net/hsr/hsr_forward.c +@@ -342,9 +342,7 @@ struct sk_buff *prp_create_tagged_frame(struct hsr_frame_info *frame, + skb = skb_copy_expand(frame->skb_std, 0, + skb_tailroom(frame->skb_std) + HSR_HLEN, + GFP_ATOMIC); +- prp_fill_rct(skb, frame, port); +- +- return skb; ++ return prp_fill_rct(skb, frame, port); + } + + static void hsr_deliver_master(struct sk_buff *skb, struct net_device *dev, +-- +2.42.0 + diff --git a/queue-6.1/i2c-iproc-handle-invalid-slave-state.patch b/queue-6.1/i2c-iproc-handle-invalid-slave-state.patch new file mode 100644 index 00000000000..d34e57aa197 --- /dev/null +++ b/queue-6.1/i2c-iproc-handle-invalid-slave-state.patch @@ -0,0 +1,200 @@ +From ea9042fe39247c12add88da66f6ccda2b3b6f98f Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Thu, 24 Aug 2023 14:23:51 -0700 +Subject: i2c: iproc: handle invalid slave state + +From: Roman Bacik + +[ Upstream commit ba15a14399c262f91ce30c19fcbdc952262dd1be ] + +Add the code to handle an invalid state when both bits S_RX_EVENT +(indicating a transaction) and S_START_BUSY (indicating the end +of transaction - transition of START_BUSY from 1 to 0) are set in +the interrupt status register during a slave read. + +Signed-off-by: Roman Bacik +Fixes: 1ca1b4516088 ("i2c: iproc: handle Master aborted error") +Acked-by: Ray Jui +Signed-off-by: Wolfram Sang +Signed-off-by: Sasha Levin +--- + drivers/i2c/busses/i2c-bcm-iproc.c | 133 ++++++++++++++++------------- + 1 file changed, 75 insertions(+), 58 deletions(-) + +diff --git a/drivers/i2c/busses/i2c-bcm-iproc.c b/drivers/i2c/busses/i2c-bcm-iproc.c +index 30a2a3200bed9..86a080f24d8a2 100644 +--- a/drivers/i2c/busses/i2c-bcm-iproc.c ++++ b/drivers/i2c/busses/i2c-bcm-iproc.c +@@ -316,26 +316,44 @@ static void bcm_iproc_i2c_slave_init( + iproc_i2c_wr_reg(iproc_i2c, IE_OFFSET, val); + } + +-static void bcm_iproc_i2c_check_slave_status( +- struct bcm_iproc_i2c_dev *iproc_i2c) ++static bool bcm_iproc_i2c_check_slave_status ++ (struct bcm_iproc_i2c_dev *iproc_i2c, u32 status) + { + u32 val; ++ bool recover = false; + +- val = iproc_i2c_rd_reg(iproc_i2c, S_CMD_OFFSET); +- /* status is valid only when START_BUSY is cleared after it was set */ +- if (val & BIT(S_CMD_START_BUSY_SHIFT)) +- return; ++ /* check slave transmit status only if slave is transmitting */ ++ if (!iproc_i2c->slave_rx_only) { ++ val = iproc_i2c_rd_reg(iproc_i2c, S_CMD_OFFSET); ++ /* status is valid only when START_BUSY is cleared */ ++ if (!(val & BIT(S_CMD_START_BUSY_SHIFT))) { ++ val = (val >> S_CMD_STATUS_SHIFT) & S_CMD_STATUS_MASK; ++ if (val == S_CMD_STATUS_TIMEOUT || ++ val == S_CMD_STATUS_MASTER_ABORT) { ++ dev_warn(iproc_i2c->device, ++ (val == S_CMD_STATUS_TIMEOUT) ? 
++ "slave random stretch time timeout\n" : ++ "Master aborted read transaction\n"); ++ recover = true; ++ } ++ } ++ } ++ ++ /* RX_EVENT is not valid when START_BUSY is set */ ++ if ((status & BIT(IS_S_RX_EVENT_SHIFT)) && ++ (status & BIT(IS_S_START_BUSY_SHIFT))) { ++ dev_warn(iproc_i2c->device, "Slave aborted read transaction\n"); ++ recover = true; ++ } + +- val = (val >> S_CMD_STATUS_SHIFT) & S_CMD_STATUS_MASK; +- if (val == S_CMD_STATUS_TIMEOUT || val == S_CMD_STATUS_MASTER_ABORT) { +- dev_err(iproc_i2c->device, (val == S_CMD_STATUS_TIMEOUT) ? +- "slave random stretch time timeout\n" : +- "Master aborted read transaction\n"); ++ if (recover) { + /* re-initialize i2c for recovery */ + bcm_iproc_i2c_enable_disable(iproc_i2c, false); + bcm_iproc_i2c_slave_init(iproc_i2c, true); + bcm_iproc_i2c_enable_disable(iproc_i2c, true); + } ++ ++ return recover; + } + + static void bcm_iproc_i2c_slave_read(struct bcm_iproc_i2c_dev *iproc_i2c) +@@ -420,48 +438,6 @@ static bool bcm_iproc_i2c_slave_isr(struct bcm_iproc_i2c_dev *iproc_i2c, + u32 val; + u8 value; + +- /* +- * Slave events in case of master-write, master-write-read and, +- * master-read +- * +- * Master-write : only IS_S_RX_EVENT_SHIFT event +- * Master-write-read: both IS_S_RX_EVENT_SHIFT and IS_S_RD_EVENT_SHIFT +- * events +- * Master-read : both IS_S_RX_EVENT_SHIFT and IS_S_RD_EVENT_SHIFT +- * events or only IS_S_RD_EVENT_SHIFT +- * +- * iproc has a slave rx fifo size of 64 bytes. Rx fifo full interrupt +- * (IS_S_RX_FIFO_FULL_SHIFT) will be generated when RX fifo becomes +- * full. This can happen if Master issues write requests of more than +- * 64 bytes. +- */ +- if (status & BIT(IS_S_RX_EVENT_SHIFT) || +- status & BIT(IS_S_RD_EVENT_SHIFT) || +- status & BIT(IS_S_RX_FIFO_FULL_SHIFT)) { +- /* disable slave interrupts */ +- val = iproc_i2c_rd_reg(iproc_i2c, IE_OFFSET); +- val &= ~iproc_i2c->slave_int_mask; +- iproc_i2c_wr_reg(iproc_i2c, IE_OFFSET, val); +- +- if (status & BIT(IS_S_RD_EVENT_SHIFT)) +- /* Master-write-read request */ +- iproc_i2c->slave_rx_only = false; +- else +- /* Master-write request only */ +- iproc_i2c->slave_rx_only = true; +- +- /* schedule tasklet to read data later */ +- tasklet_schedule(&iproc_i2c->slave_rx_tasklet); +- +- /* +- * clear only IS_S_RX_EVENT_SHIFT and +- * IS_S_RX_FIFO_FULL_SHIFT interrupt. 
+- */ +- val = BIT(IS_S_RX_EVENT_SHIFT); +- if (status & BIT(IS_S_RX_FIFO_FULL_SHIFT)) +- val |= BIT(IS_S_RX_FIFO_FULL_SHIFT); +- iproc_i2c_wr_reg(iproc_i2c, IS_OFFSET, val); +- } + + if (status & BIT(IS_S_TX_UNDERRUN_SHIFT)) { + iproc_i2c->tx_underrun++; +@@ -493,8 +469,9 @@ static bool bcm_iproc_i2c_slave_isr(struct bcm_iproc_i2c_dev *iproc_i2c, + * less than PKT_LENGTH bytes were output on the SMBUS + */ + iproc_i2c->slave_int_mask &= ~BIT(IE_S_TX_UNDERRUN_SHIFT); +- iproc_i2c_wr_reg(iproc_i2c, IE_OFFSET, +- iproc_i2c->slave_int_mask); ++ val = iproc_i2c_rd_reg(iproc_i2c, IE_OFFSET); ++ val &= ~BIT(IE_S_TX_UNDERRUN_SHIFT); ++ iproc_i2c_wr_reg(iproc_i2c, IE_OFFSET, val); + + /* End of SMBUS for Master Read */ + val = BIT(S_TX_WR_STATUS_SHIFT); +@@ -515,9 +492,49 @@ static bool bcm_iproc_i2c_slave_isr(struct bcm_iproc_i2c_dev *iproc_i2c, + BIT(IS_S_START_BUSY_SHIFT)); + } + +- /* check slave transmit status only if slave is transmitting */ +- if (!iproc_i2c->slave_rx_only) +- bcm_iproc_i2c_check_slave_status(iproc_i2c); ++ /* if the controller has been reset, immediately return from the ISR */ ++ if (bcm_iproc_i2c_check_slave_status(iproc_i2c, status)) ++ return true; ++ ++ /* ++ * Slave events in case of master-write, master-write-read and, ++ * master-read ++ * ++ * Master-write : only IS_S_RX_EVENT_SHIFT event ++ * Master-write-read: both IS_S_RX_EVENT_SHIFT and IS_S_RD_EVENT_SHIFT ++ * events ++ * Master-read : both IS_S_RX_EVENT_SHIFT and IS_S_RD_EVENT_SHIFT ++ * events or only IS_S_RD_EVENT_SHIFT ++ * ++ * iproc has a slave rx fifo size of 64 bytes. Rx fifo full interrupt ++ * (IS_S_RX_FIFO_FULL_SHIFT) will be generated when RX fifo becomes ++ * full. This can happen if Master issues write requests of more than ++ * 64 bytes. ++ */ ++ if (status & BIT(IS_S_RX_EVENT_SHIFT) || ++ status & BIT(IS_S_RD_EVENT_SHIFT) || ++ status & BIT(IS_S_RX_FIFO_FULL_SHIFT)) { ++ /* disable slave interrupts */ ++ val = iproc_i2c_rd_reg(iproc_i2c, IE_OFFSET); ++ val &= ~iproc_i2c->slave_int_mask; ++ iproc_i2c_wr_reg(iproc_i2c, IE_OFFSET, val); ++ ++ if (status & BIT(IS_S_RD_EVENT_SHIFT)) ++ /* Master-write-read request */ ++ iproc_i2c->slave_rx_only = false; ++ else ++ /* Master-write request only */ ++ iproc_i2c->slave_rx_only = true; ++ ++ /* schedule tasklet to read data later */ ++ tasklet_schedule(&iproc_i2c->slave_rx_tasklet); ++ ++ /* clear IS_S_RX_FIFO_FULL_SHIFT interrupt */ ++ if (status & BIT(IS_S_RX_FIFO_FULL_SHIFT)) { ++ val = BIT(IS_S_RX_FIFO_FULL_SHIFT); ++ iproc_i2c_wr_reg(iproc_i2c, IS_OFFSET, val); ++ } ++ } + + return true; + } +-- +2.42.0 + diff --git a/queue-6.1/inet-shrink-struct-flowi_common.patch b/queue-6.1/inet-shrink-struct-flowi_common.patch new file mode 100644 index 00000000000..ca831d23b4d --- /dev/null +++ b/queue-6.1/inet-shrink-struct-flowi_common.patch @@ -0,0 +1,44 @@ +From 54f549733fa56fa6f5de1e4198c516777a13b2da Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Wed, 25 Oct 2023 14:10:37 +0000 +Subject: inet: shrink struct flowi_common + +From: Eric Dumazet + +[ Upstream commit 1726483b79a72e0150734d5367e4a0238bf8fcff ] + +I am looking at syzbot reports triggering kernel stack overflows +involving a cascade of ipvlan devices. + +We can save 8 bytes in struct flowi_common. + +This patch alone will not fix the issue, but is a start. 
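+
+The saving comes from alignment padding: flowic_tun_key holds a 64-bit
+tunnel id and is therefore 8-byte aligned, so the placement of the lone
+__u32 hash around it determines how much padding the compiler inserts.
+A minimal userspace sketch of the effect (simplified stand-in fields,
+not the exact flowi layout):
+
+	#include <stdint.h>
+	#include <stdio.h>
+
+	struct tun { uint64_t id; };	/* 8-byte aligned, like flowi_tunnel */
+
+	struct before { uint32_t uid; struct tun t; uint32_t hash; };
+	struct after  { uint32_t uid; uint32_t hash; struct tun t; };
+
+	int main(void)
+	{
+		/* on a typical 64-bit ABI this prints 24 and 16 */
+		printf("%zu %zu\n", sizeof(struct before),
+		       sizeof(struct after));
+		return 0;
+	}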
+ +Fixes: 24ba14406c5c ("route: Add multipath_hash in flowi_common to make user-define hash") +Signed-off-by: Eric Dumazet +Cc: wenxu +Reviewed-by: David Ahern +Link: https://lore.kernel.org/r/20231025141037.3448203-1-edumazet@google.com +Signed-off-by: Paolo Abeni +Signed-off-by: Sasha Levin +--- + include/net/flow.h | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/include/net/flow.h b/include/net/flow.h +index 2f0da4f0318b5..079cc493fe67d 100644 +--- a/include/net/flow.h ++++ b/include/net/flow.h +@@ -39,8 +39,8 @@ struct flowi_common { + #define FLOWI_FLAG_KNOWN_NH 0x02 + __u32 flowic_secid; + kuid_t flowic_uid; +- struct flowi_tunnel flowic_tun_key; + __u32 flowic_multipath_hash; ++ struct flowi_tunnel flowic_tun_key; + }; + + union flowi_uli { +-- +2.42.0 + diff --git a/queue-6.1/input-synaptics-rmi4-fix-use-after-free-in-rmi_unreg.patch b/queue-6.1/input-synaptics-rmi4-fix-use-after-free-in-rmi_unreg.patch new file mode 100644 index 00000000000..77c203d8be0 --- /dev/null +++ b/queue-6.1/input-synaptics-rmi4-fix-use-after-free-in-rmi_unreg.patch @@ -0,0 +1,43 @@ +From 52f66df7f9d6a80f301b583c80168ff716396f9b Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Sun, 29 Oct 2023 02:53:36 +0000 +Subject: Input: synaptics-rmi4 - fix use after free in + rmi_unregister_function() + +From: Dan Carpenter + +[ Upstream commit eb988e46da2e4eae89f5337e047ce372fe33d5b1 ] + +The put_device() calls rmi_release_function() which frees "fn" so the +dereference on the next line "fn->num_of_irqs" is a use after free. +Move the put_device() to the end to fix this. + +Fixes: 24d28e4f1271 ("Input: synaptics-rmi4 - convert irq distribution to irq_domain") +Signed-off-by: Dan Carpenter +Link: https://lore.kernel.org/r/706efd36-7561-42f3-adfa-dd1d0bd4f5a1@moroto.mountain +Signed-off-by: Dmitry Torokhov +Signed-off-by: Sasha Levin +--- + drivers/input/rmi4/rmi_bus.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/drivers/input/rmi4/rmi_bus.c b/drivers/input/rmi4/rmi_bus.c +index 50a0134b6901b..e6557d5f50ce5 100644 +--- a/drivers/input/rmi4/rmi_bus.c ++++ b/drivers/input/rmi4/rmi_bus.c +@@ -277,11 +277,11 @@ void rmi_unregister_function(struct rmi_function *fn) + + device_del(&fn->dev); + of_node_put(fn->dev.of_node); +- put_device(&fn->dev); + + for (i = 0; i < fn->num_of_irqs; i++) + irq_dispose_mapping(fn->irq[i]); + ++ put_device(&fn->dev); + } + + /** +-- +2.42.0 + diff --git a/queue-6.1/llc-verify-mac-len-before-reading-mac-header.patch b/queue-6.1/llc-verify-mac-len-before-reading-mac-header.patch new file mode 100644 index 00000000000..aef9eb41f5b --- /dev/null +++ b/queue-6.1/llc-verify-mac-len-before-reading-mac-header.patch @@ -0,0 +1,113 @@ +From e59a5f2fb7ab319630f42646ebae0d856244f914 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Wed, 25 Oct 2023 19:42:38 -0400 +Subject: llc: verify mac len before reading mac header + +From: Willem de Bruijn + +[ Upstream commit 7b3ba18703a63f6fd487183b9262b08e5632da1b ] + +LLC reads the mac header with eth_hdr without verifying that the skb +has an Ethernet header. + +Syzbot was able to enter llc_rcv on a tun device. Tun can insert +packets without mac len and with user configurable skb->protocol +(passing a tun_pi header when not configuring IFF_NO_PI). 
+ + BUG: KMSAN: uninit-value in llc_station_ac_send_test_r net/llc/llc_station.c:81 [inline] + BUG: KMSAN: uninit-value in llc_station_rcv+0x6fb/0x1290 net/llc/llc_station.c:111 + llc_station_ac_send_test_r net/llc/llc_station.c:81 [inline] + llc_station_rcv+0x6fb/0x1290 net/llc/llc_station.c:111 + llc_rcv+0xc5d/0x14a0 net/llc/llc_input.c:218 + __netif_receive_skb_one_core net/core/dev.c:5523 [inline] + __netif_receive_skb+0x1a6/0x5a0 net/core/dev.c:5637 + netif_receive_skb_internal net/core/dev.c:5723 [inline] + netif_receive_skb+0x58/0x660 net/core/dev.c:5782 + tun_rx_batched+0x3ee/0x980 drivers/net/tun.c:1555 + tun_get_user+0x54c5/0x69c0 drivers/net/tun.c:2002 + +Add a mac_len test before all three eth_hdr(skb) calls under net/llc. + +There are further uses in include/net/llc_pdu.h. All these are +protected by a test skb->protocol == ETH_P_802_2. Which does not +protect against this tun scenario. + +But the mac_len test added in this patch in llc_fixup_skb will +indirectly protect those too. That is called from llc_rcv before any +other LLC code. + +It is tempting to just add a blanket mac_len check in llc_rcv, but +not sure whether that could break valid LLC paths that do not assume +an Ethernet header. 802.2 LLC may be used on top of non-802.3 +protocols in principle. The below referenced commit shows that used +to, on top of Token Ring. + +At least one of the three eth_hdr uses goes back to before the start +of git history. But the one that syzbot exercises is introduced in +this commit. That commit is old enough (2008), that effectively all +stable kernels should receive this. + +Fixes: f83f1768f833 ("[LLC]: skb allocation size for responses") +Reported-by: syzbot+a8c7be6dee0de1b669cc@syzkaller.appspotmail.com +Signed-off-by: Willem de Bruijn +Link: https://lore.kernel.org/r/20231025234251.3796495-1-willemdebruijn.kernel@gmail.com +Signed-off-by: Jakub Kicinski +Signed-off-by: Sasha Levin +--- + net/llc/llc_input.c | 10 ++++++++-- + net/llc/llc_s_ac.c | 3 +++ + net/llc/llc_station.c | 3 +++ + 3 files changed, 14 insertions(+), 2 deletions(-) + +diff --git a/net/llc/llc_input.c b/net/llc/llc_input.c +index 7cac441862e21..51bccfb00a9cd 100644 +--- a/net/llc/llc_input.c ++++ b/net/llc/llc_input.c +@@ -127,8 +127,14 @@ static inline int llc_fixup_skb(struct sk_buff *skb) + skb->transport_header += llc_len; + skb_pull(skb, llc_len); + if (skb->protocol == htons(ETH_P_802_2)) { +- __be16 pdulen = eth_hdr(skb)->h_proto; +- s32 data_size = ntohs(pdulen) - llc_len; ++ __be16 pdulen; ++ s32 data_size; ++ ++ if (skb->mac_len < ETH_HLEN) ++ return 0; ++ ++ pdulen = eth_hdr(skb)->h_proto; ++ data_size = ntohs(pdulen) - llc_len; + + if (data_size < 0 || + !pskb_may_pull(skb, data_size)) +diff --git a/net/llc/llc_s_ac.c b/net/llc/llc_s_ac.c +index 79d1cef8f15a9..06fb8e6944b06 100644 +--- a/net/llc/llc_s_ac.c ++++ b/net/llc/llc_s_ac.c +@@ -153,6 +153,9 @@ int llc_sap_action_send_test_r(struct llc_sap *sap, struct sk_buff *skb) + int rc = 1; + u32 data_size; + ++ if (skb->mac_len < ETH_HLEN) ++ return 1; ++ + llc_pdu_decode_sa(skb, mac_da); + llc_pdu_decode_da(skb, mac_sa); + llc_pdu_decode_ssap(skb, &dsap); +diff --git a/net/llc/llc_station.c b/net/llc/llc_station.c +index 05c6ae0920534..f506542925109 100644 +--- a/net/llc/llc_station.c ++++ b/net/llc/llc_station.c +@@ -76,6 +76,9 @@ static int llc_station_ac_send_test_r(struct sk_buff *skb) + u32 data_size; + struct sk_buff *nskb; + ++ if (skb->mac_len < ETH_HLEN) ++ goto out; ++ + /* The test request command is type U (llc_len = 3) */ + data_size 
= ntohs(eth_hdr(skb)->h_proto) - 3; + nskb = llc_alloc_frame(NULL, skb->dev, LLC_PDU_TYPE_U, data_size); +-- +2.42.0 + diff --git a/queue-6.1/nbd-fix-uaf-in-nbd_open.patch b/queue-6.1/nbd-fix-uaf-in-nbd_open.patch new file mode 100644 index 00000000000..981ba86998f --- /dev/null +++ b/queue-6.1/nbd-fix-uaf-in-nbd_open.patch @@ -0,0 +1,73 @@ +From 77347a8505e3a84797ef1b8474922e2feefe48bf Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Tue, 7 Nov 2023 18:34:35 +0800 +Subject: nbd: fix uaf in nbd_open + +From: Li Lingfeng + +[ Upstream commit 327462725b0f759f093788dfbcb2f1fd132f956b ] + +Commit 4af5f2e03013 ("nbd: use blk_mq_alloc_disk and +blk_cleanup_disk") cleans up disk by blk_cleanup_disk() and it won't set +disk->private_data as NULL as before. UAF may be triggered in nbd_open() +if someone tries to open nbd device right after nbd_put() since nbd has +been free in nbd_dev_remove(). + +Fix this by implementing ->free_disk and free private data in it. + +Fixes: 4af5f2e03013 ("nbd: use blk_mq_alloc_disk and blk_cleanup_disk") +Signed-off-by: Li Lingfeng +Reviewed-by: Josef Bacik +Link: https://lore.kernel.org/r/20231107103435.2074904-1-lilingfeng@huaweicloud.com +Signed-off-by: Jens Axboe +Signed-off-by: Sasha Levin +--- + drivers/block/nbd.c | 11 +++++++++-- + 1 file changed, 9 insertions(+), 2 deletions(-) + +diff --git a/drivers/block/nbd.c b/drivers/block/nbd.c +index 7718c81e1dba8..e94d2ff6b1223 100644 +--- a/drivers/block/nbd.c ++++ b/drivers/block/nbd.c +@@ -250,7 +250,6 @@ static void nbd_dev_remove(struct nbd_device *nbd) + struct gendisk *disk = nbd->disk; + + del_gendisk(disk); +- put_disk(disk); + blk_mq_free_tag_set(&nbd->tag_set); + + /* +@@ -261,7 +260,7 @@ static void nbd_dev_remove(struct nbd_device *nbd) + idr_remove(&nbd_index_idr, nbd->index); + mutex_unlock(&nbd_index_mutex); + destroy_workqueue(nbd->recv_workq); +- kfree(nbd); ++ put_disk(disk); + } + + static void nbd_dev_remove_work(struct work_struct *work) +@@ -1608,6 +1607,13 @@ static void nbd_release(struct gendisk *disk, fmode_t mode) + nbd_put(nbd); + } + ++static void nbd_free_disk(struct gendisk *disk) ++{ ++ struct nbd_device *nbd = disk->private_data; ++ ++ kfree(nbd); ++} ++ + static const struct block_device_operations nbd_fops = + { + .owner = THIS_MODULE, +@@ -1615,6 +1621,7 @@ static const struct block_device_operations nbd_fops = + .release = nbd_release, + .ioctl = nbd_ioctl, + .compat_ioctl = nbd_ioctl, ++ .free_disk = nbd_free_disk, + }; + + #if IS_ENABLED(CONFIG_DEBUG_FS) +-- +2.42.0 + diff --git a/queue-6.1/net-page_pool-add-missing-free_percpu-when-page_pool.patch b/queue-6.1/net-page_pool-add-missing-free_percpu-when-page_pool.patch new file mode 100644 index 00000000000..e9fd73636a5 --- /dev/null +++ b/queue-6.1/net-page_pool-add-missing-free_percpu-when-page_pool.patch @@ -0,0 +1,48 @@ +From f6f08cbd9ad20a06e17adec789b23e8478d73984 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Mon, 30 Oct 2023 17:12:56 +0800 +Subject: net: page_pool: add missing free_percpu when page_pool_init fail + +From: Jian Shen + +[ Upstream commit 8ffbd1669ed1d58939d6e878dffaa2f60bf961a4 ] + +When ptr_ring_init() returns failure in page_pool_init(), free_percpu() +is not called to free pool->recycle_stats, which may cause memory +leak. 
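+
+The usual idiom, sketched here with a made-up foo driver rather than
+the actual page_pool code: every allocation made before a failing
+initialization step has to be unwound before the error is returned.
+
+	static int foo_init(struct foo *f)
+	{
+		/* illustration only: foo, foo_stats and ring_init() are
+		 * hypothetical; alloc_percpu()/free_percpu() are the
+		 * real percpu allocation helpers
+		 */
+		f->stats = alloc_percpu(struct foo_stats);
+		if (!f->stats)
+			return -ENOMEM;
+
+		if (ring_init(&f->ring) < 0) {
+			/* undo the earlier allocation before bailing out */
+			free_percpu(f->stats);
+			return -ENOMEM;
+		}
+
+		return 0;
+	}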
+ +Fixes: ad6fa1e1ab1b ("page_pool: Add recycle stats") +Signed-off-by: Jian Shen +Signed-off-by: Jijie Shao +Reviewed-by: Yunsheng Lin +Reviewed-by: Jiri Pirko +Reviewed-by: Somnath Kotur +Reviewed-by: Ilias Apalodimas +Link: https://lore.kernel.org/r/20231030091256.2915394-1-shaojijie@huawei.com +Signed-off-by: Paolo Abeni +Signed-off-by: Sasha Levin +--- + net/core/page_pool.c | 6 +++++- + 1 file changed, 5 insertions(+), 1 deletion(-) + +diff --git a/net/core/page_pool.c b/net/core/page_pool.c +index 2396c99bedeaa..caf6d950d54ad 100644 +--- a/net/core/page_pool.c ++++ b/net/core/page_pool.c +@@ -209,8 +209,12 @@ static int page_pool_init(struct page_pool *pool, + return -ENOMEM; + #endif + +- if (ptr_ring_init(&pool->ring, ring_qsize, GFP_KERNEL) < 0) ++ if (ptr_ring_init(&pool->ring, ring_qsize, GFP_KERNEL) < 0) { ++#ifdef CONFIG_PAGE_POOL_STATS ++ free_percpu(pool->recycle_stats); ++#endif + return -ENOMEM; ++ } + + atomic_set(&pool->pages_state_release_cnt, 0); + +-- +2.42.0 + diff --git a/queue-6.1/net-r8169-disable-multicast-filter-for-rtl8168h-and-.patch b/queue-6.1/net-r8169-disable-multicast-filter-for-rtl8168h-and-.patch new file mode 100644 index 00000000000..488cf72443d --- /dev/null +++ b/queue-6.1/net-r8169-disable-multicast-filter-for-rtl8168h-and-.patch @@ -0,0 +1,43 @@ +From 86470605f1916f8312661e33a42580217c7c6cb2 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Mon, 30 Oct 2023 16:50:14 -0400 +Subject: net: r8169: Disable multicast filter for RTL8168H and RTL8107E + +From: Patrick Thompson + +[ Upstream commit efa5f1311c4998e9e6317c52bc5ee93b3a0f36df ] + +RTL8168H and RTL8107E ethernet adapters erroneously filter unicast +eapol packets unless allmulti is enabled. These devices correspond to +RTL_GIGA_MAC_VER_46 and VER_48. Add an exception for VER_46 and VER_48 +in the same way that VER_35 has an exception. + +Fixes: 6e1d0b898818 ("r8169:add support for RTL8168H and RTL8107E") +Signed-off-by: Patrick Thompson +Reviewed-by: Jacob Keller +Reviewed-by: Heiner Kallweit +Link: https://lore.kernel.org/r/20231030205031.177855-1-ptf@google.com +Signed-off-by: Paolo Abeni +Signed-off-by: Sasha Levin +--- + drivers/net/ethernet/realtek/r8169_main.c | 4 +++- + 1 file changed, 3 insertions(+), 1 deletion(-) + +diff --git a/drivers/net/ethernet/realtek/r8169_main.c b/drivers/net/ethernet/realtek/r8169_main.c +index 94f902d8e975f..c56d3538889b6 100644 +--- a/drivers/net/ethernet/realtek/r8169_main.c ++++ b/drivers/net/ethernet/realtek/r8169_main.c +@@ -2514,7 +2514,9 @@ static void rtl_set_rx_mode(struct net_device *dev) + rx_mode |= AcceptAllPhys; + } else if (netdev_mc_count(dev) > MC_FILTER_LIMIT || + dev->flags & IFF_ALLMULTI || +- tp->mac_version == RTL_GIGA_MAC_VER_35) { ++ tp->mac_version == RTL_GIGA_MAC_VER_35 || ++ tp->mac_version == RTL_GIGA_MAC_VER_46 || ++ tp->mac_version == RTL_GIGA_MAC_VER_48) { + /* accept all multicasts */ + } else if (netdev_mc_empty(dev)) { + rx_mode &= ~AcceptMulticast; +-- +2.42.0 + diff --git a/queue-6.1/net-smc-allow-cdc-msg-send-rather-than-drop-it-with-.patch b/queue-6.1/net-smc-allow-cdc-msg-send-rather-than-drop-it-with-.patch new file mode 100644 index 00000000000..fcd3c595621 --- /dev/null +++ b/queue-6.1/net-smc-allow-cdc-msg-send-rather-than-drop-it-with-.patch @@ -0,0 +1,64 @@ +From 7853abecf41949469a9d1cbeff72cfbb11a80a67 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Fri, 3 Nov 2023 14:07:39 +0800 +Subject: net/smc: allow cdc msg send rather than drop it with NULL sndbuf_desc + +From: D. 
Wythe + +[ Upstream commit c5bf605ba4f9d6fbbb120595ab95002f4716edcb ] + +This patch re-fix the issues mentioned by commit 22a825c541d7 +("net/smc: fix NULL sndbuf_desc in smc_cdc_tx_handler()"). + +Blocking sending message do solve the issues though, but it also +prevents the peer to receive the final message. Besides, in logic, +whether the sndbuf_desc is NULL or not have no impact on the processing +of cdc message sending. + +Hence that, this patch allows the cdc message sending but to check the +sndbuf_desc with care in smc_cdc_tx_handler(). + +Fixes: 22a825c541d7 ("net/smc: fix NULL sndbuf_desc in smc_cdc_tx_handler()") +Signed-off-by: D. Wythe +Reviewed-by: Dust Li +Signed-off-by: David S. Miller +Signed-off-by: Sasha Levin +--- + net/smc/smc_cdc.c | 9 ++++----- + 1 file changed, 4 insertions(+), 5 deletions(-) + +diff --git a/net/smc/smc_cdc.c b/net/smc/smc_cdc.c +index 01bdb7909a14b..3c06625ceb200 100644 +--- a/net/smc/smc_cdc.c ++++ b/net/smc/smc_cdc.c +@@ -28,13 +28,15 @@ static void smc_cdc_tx_handler(struct smc_wr_tx_pend_priv *pnd_snd, + { + struct smc_cdc_tx_pend *cdcpend = (struct smc_cdc_tx_pend *)pnd_snd; + struct smc_connection *conn = cdcpend->conn; ++ struct smc_buf_desc *sndbuf_desc; + struct smc_sock *smc; + int diff; + ++ sndbuf_desc = conn->sndbuf_desc; + smc = container_of(conn, struct smc_sock, conn); + bh_lock_sock(&smc->sk); +- if (!wc_status) { +- diff = smc_curs_diff(cdcpend->conn->sndbuf_desc->len, ++ if (!wc_status && sndbuf_desc) { ++ diff = smc_curs_diff(sndbuf_desc->len, + &cdcpend->conn->tx_curs_fin, + &cdcpend->cursor); + /* sndbuf_space is decreased in smc_sendmsg */ +@@ -114,9 +116,6 @@ int smc_cdc_msg_send(struct smc_connection *conn, + union smc_host_cursor cfed; + int rc; + +- if (unlikely(!READ_ONCE(conn->sndbuf_desc))) +- return -ENOBUFS; +- + smc_cdc_add_pending_send(conn, pend); + + conn->tx_cdc_seq++; +-- +2.42.0 + diff --git a/queue-6.1/net-smc-fix-dangling-sock-under-state-smc_appfinclos.patch b/queue-6.1/net-smc-fix-dangling-sock-under-state-smc_appfinclos.patch new file mode 100644 index 00000000000..fdb706b5d88 --- /dev/null +++ b/queue-6.1/net-smc-fix-dangling-sock-under-state-smc_appfinclos.patch @@ -0,0 +1,111 @@ +From 5f8336187cd90bc38a9010e71ebfce68e99810aa Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Fri, 3 Nov 2023 14:07:38 +0800 +Subject: net/smc: fix dangling sock under state SMC_APPFINCLOSEWAIT + +From: D. Wythe + +[ Upstream commit 5211c9729484c923f8d2e06bd29f9322cc42bb8f ] + +Considering scenario: + + smc_cdc_rx_handler +__smc_release + sock_set_flag +smc_close_active() +sock_set_flag + +__set_bit(DEAD) __set_bit(DONE) + +Dues to __set_bit is not atomic, the DEAD or DONE might be lost. +if the DEAD flag lost, the state SMC_CLOSED will be never be reached +in smc_close_passive_work: + +if (sock_flag(sk, SOCK_DEAD) && + smc_close_sent_any_close(conn)) { + sk->sk_state = SMC_CLOSED; +} else { + /* just shutdown, but not yet closed locally */ + sk->sk_state = SMC_APPFINCLOSEWAIT; +} + +Replace sock_set_flags or __set_bit to set_bit will fix this problem. +Since set_bit is atomic. + +Fixes: b38d732477e4 ("smc: socket closing and linkgroup cleanup") +Signed-off-by: D. Wythe +Reviewed-by: Dust Li +Signed-off-by: David S. 
Miller +Signed-off-by: Sasha Levin +--- + net/smc/af_smc.c | 4 ++-- + net/smc/smc.h | 5 +++++ + net/smc/smc_cdc.c | 2 +- + net/smc/smc_close.c | 2 +- + 4 files changed, 9 insertions(+), 4 deletions(-) + +diff --git a/net/smc/af_smc.c b/net/smc/af_smc.c +index 4ea41d6e36969..d676119984c09 100644 +--- a/net/smc/af_smc.c ++++ b/net/smc/af_smc.c +@@ -274,7 +274,7 @@ static int __smc_release(struct smc_sock *smc) + + if (!smc->use_fallback) { + rc = smc_close_active(smc); +- sock_set_flag(sk, SOCK_DEAD); ++ smc_sock_set_flag(sk, SOCK_DEAD); + sk->sk_shutdown |= SHUTDOWN_MASK; + } else { + if (sk->sk_state != SMC_CLOSED) { +@@ -1710,7 +1710,7 @@ static int smc_clcsock_accept(struct smc_sock *lsmc, struct smc_sock **new_smc) + if (new_clcsock) + sock_release(new_clcsock); + new_sk->sk_state = SMC_CLOSED; +- sock_set_flag(new_sk, SOCK_DEAD); ++ smc_sock_set_flag(new_sk, SOCK_DEAD); + sock_put(new_sk); /* final */ + *new_smc = NULL; + goto out; +diff --git a/net/smc/smc.h b/net/smc/smc.h +index 1d36720fc019c..bcb57e60b2155 100644 +--- a/net/smc/smc.h ++++ b/net/smc/smc.h +@@ -377,4 +377,9 @@ int smc_nl_dump_hs_limitation(struct sk_buff *skb, struct netlink_callback *cb); + int smc_nl_enable_hs_limitation(struct sk_buff *skb, struct genl_info *info); + int smc_nl_disable_hs_limitation(struct sk_buff *skb, struct genl_info *info); + ++static inline void smc_sock_set_flag(struct sock *sk, enum sock_flags flag) ++{ ++ set_bit(flag, &sk->sk_flags); ++} ++ + #endif /* __SMC_H */ +diff --git a/net/smc/smc_cdc.c b/net/smc/smc_cdc.c +index 89105e95b4523..01bdb7909a14b 100644 +--- a/net/smc/smc_cdc.c ++++ b/net/smc/smc_cdc.c +@@ -385,7 +385,7 @@ static void smc_cdc_msg_recv_action(struct smc_sock *smc, + smc->sk.sk_shutdown |= RCV_SHUTDOWN; + if (smc->clcsock && smc->clcsock->sk) + smc->clcsock->sk->sk_shutdown |= RCV_SHUTDOWN; +- sock_set_flag(&smc->sk, SOCK_DONE); ++ smc_sock_set_flag(&smc->sk, SOCK_DONE); + sock_hold(&smc->sk); /* sock_put in close_work */ + if (!queue_work(smc_close_wq, &conn->close_work)) + sock_put(&smc->sk); +diff --git a/net/smc/smc_close.c b/net/smc/smc_close.c +index dbdf03e8aa5b5..449ef454b53be 100644 +--- a/net/smc/smc_close.c ++++ b/net/smc/smc_close.c +@@ -173,7 +173,7 @@ void smc_close_active_abort(struct smc_sock *smc) + break; + } + +- sock_set_flag(sk, SOCK_DEAD); ++ smc_sock_set_flag(sk, SOCK_DEAD); + sk->sk_state_change(sk); + + if (release_clcsock) { +-- +2.42.0 + diff --git a/queue-6.1/net-smc-put-sk-reference-if-close-work-was-canceled.patch b/queue-6.1/net-smc-put-sk-reference-if-close-work-was-canceled.patch new file mode 100644 index 00000000000..cab4fd7b22c --- /dev/null +++ b/queue-6.1/net-smc-put-sk-reference-if-close-work-was-canceled.patch @@ -0,0 +1,40 @@ +From 2daffade9ae01e94c5c2521448a0a5f202d39365 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Fri, 3 Nov 2023 14:07:40 +0800 +Subject: net/smc: put sk reference if close work was canceled + +From: D. Wythe + +[ Upstream commit aa96fbd6d78d9770323b21e2c92bd38821be8852 ] + +Note that we always hold a reference to sock when attempting +to submit close_work. Therefore, if we have successfully +canceled close_work from pending, we MUST release that reference +to avoid potential leaks. + +Fixes: 42bfba9eaa33 ("net/smc: immediate termination for SMCD link groups") +Signed-off-by: D. Wythe +Reviewed-by: Dust Li +Signed-off-by: David S. 
Miller +Signed-off-by: Sasha Levin +--- + net/smc/smc_close.c | 3 ++- + 1 file changed, 2 insertions(+), 1 deletion(-) + +diff --git a/net/smc/smc_close.c b/net/smc/smc_close.c +index 449ef454b53be..10219f55aad14 100644 +--- a/net/smc/smc_close.c ++++ b/net/smc/smc_close.c +@@ -116,7 +116,8 @@ static void smc_close_cancel_work(struct smc_sock *smc) + struct sock *sk = &smc->sk; + + release_sock(sk); +- cancel_work_sync(&smc->conn.close_work); ++ if (cancel_work_sync(&smc->conn.close_work)) ++ sock_put(sk); + cancel_delayed_work_sync(&smc->conn.tx_work); + lock_sock(sk); + } +-- +2.42.0 + diff --git a/queue-6.1/net-stmmac-xgmac-enable-support-for-multiple-flexibl.patch b/queue-6.1/net-stmmac-xgmac-enable-support-for-multiple-flexibl.patch new file mode 100644 index 00000000000..b931db77663 --- /dev/null +++ b/queue-6.1/net-stmmac-xgmac-enable-support-for-multiple-flexibl.patch @@ -0,0 +1,68 @@ +From e4eaab234e08ae1b7f732d7d07c38d25ed439eca Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Tue, 31 Oct 2023 10:27:29 +0800 +Subject: net: stmmac: xgmac: Enable support for multiple Flexible PPS outputs + +From: Furong Xu <0x1207@gmail.com> + +[ Upstream commit db456d90a4c1b43b6251fa4348c8adc59b583274 ] + +From XGMAC Core 3.20 and later, each Flexible PPS has individual PPSEN bit +to select Fixed mode or Flexible mode. The PPSEN must be set, or it stays +in Fixed PPS mode by default. +XGMAC Core prior 3.20, only PPSEN0(bit 4) is writable. PPSEN{1,2,3} are +read-only reserved, and they are already in Flexible mode by default, our +new code always set PPSEN{1,2,3} do not make things worse ;-) + +Fixes: 95eaf3cd0a90 ("net: stmmac: dwxgmac: Add Flexible PPS support") +Reviewed-by: Serge Semin +Reviewed-by: Jacob Keller +Signed-off-by: Furong Xu <0x1207@gmail.com> +Signed-off-by: David S. Miller +Signed-off-by: Sasha Levin +--- + drivers/net/ethernet/stmicro/stmmac/dwxgmac2.h | 2 +- + .../net/ethernet/stmicro/stmmac/dwxgmac2_core.c | 14 +++++++++++++- + 2 files changed, 14 insertions(+), 2 deletions(-) + +diff --git a/drivers/net/ethernet/stmicro/stmmac/dwxgmac2.h b/drivers/net/ethernet/stmicro/stmmac/dwxgmac2.h +index 1913385df6856..880a75bf2eb1f 100644 +--- a/drivers/net/ethernet/stmicro/stmmac/dwxgmac2.h ++++ b/drivers/net/ethernet/stmicro/stmmac/dwxgmac2.h +@@ -222,7 +222,7 @@ + ((val) << XGMAC_PPS_MINIDX(x)) + #define XGMAC_PPSCMD_START 0x2 + #define XGMAC_PPSCMD_STOP 0x5 +-#define XGMAC_PPSEN0 BIT(4) ++#define XGMAC_PPSENx(x) BIT(4 + (x) * 8) + #define XGMAC_PPSx_TARGET_TIME_SEC(x) (0x00000d80 + (x) * 0x10) + #define XGMAC_PPSx_TARGET_TIME_NSEC(x) (0x00000d84 + (x) * 0x10) + #define XGMAC_TRGTBUSY0 BIT(31) +diff --git a/drivers/net/ethernet/stmicro/stmmac/dwxgmac2_core.c b/drivers/net/ethernet/stmicro/stmmac/dwxgmac2_core.c +index c6c4d7948fe5f..f30e08a106cbe 100644 +--- a/drivers/net/ethernet/stmicro/stmmac/dwxgmac2_core.c ++++ b/drivers/net/ethernet/stmicro/stmmac/dwxgmac2_core.c +@@ -1135,7 +1135,19 @@ static int dwxgmac2_flex_pps_config(void __iomem *ioaddr, int index, + + val |= XGMAC_PPSCMDx(index, XGMAC_PPSCMD_START); + val |= XGMAC_TRGTMODSELx(index, XGMAC_PPSCMD_START); +- val |= XGMAC_PPSEN0; ++ ++ /* XGMAC Core has 4 PPS outputs at most. ++ * ++ * Prior XGMAC Core 3.20, Fixed mode or Flexible mode are selectable for ++ * PPS0 only via PPSEN0. PPS{1,2,3} are in Flexible mode by default, ++ * and can not be switched to Fixed mode, since PPSEN{1,2,3} are ++ * read-only reserved to 0. 
++ * But we always set PPSEN{1,2,3} do not make things worse ;-) ++ * ++ * From XGMAC Core 3.20 and later, PPSEN{0,1,2,3} are writable and must ++ * be set, or the PPS outputs stay in Fixed PPS mode by default. ++ */ ++ val |= XGMAC_PPSENx(index); + + writel(cfg->start.tv_sec, ioaddr + XGMAC_PPSx_TARGET_TIME_SEC(index)); + +-- +2.42.0 + diff --git a/queue-6.1/netfilter-nat-fix-ipv6-nat-redirect-with-mapped-and-.patch b/queue-6.1/netfilter-nat-fix-ipv6-nat-redirect-with-mapped-and-.patch new file mode 100644 index 00000000000..4fb459c07e9 --- /dev/null +++ b/queue-6.1/netfilter-nat-fix-ipv6-nat-redirect-with-mapped-and-.patch @@ -0,0 +1,97 @@ +From 7f92464ecb7569ffd1203fc661d6017b7e2e85b8 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Wed, 8 Nov 2023 13:18:53 +0100 +Subject: netfilter: nat: fix ipv6 nat redirect with mapped and scoped + addresses + +From: Florian Westphal + +[ Upstream commit 80abbe8a8263106fe45a4f293b92b5c74cc9cc8a ] + +The ipv6 redirect target was derived from the ipv4 one, i.e. its +identical to a 'dnat' with the first (primary) address assigned to the +network interface. The code has been moved around to make it usable +from nf_tables too, but its still the same as it was back when this +was added in 2012. + +IPv6, however, has different types of addresses, if the 'wrong' address +comes first the redirection does not work. + +In Daniels case, the addresses are: + inet6 ::ffff:192 ... + inet6 2a01: ... + +... so the function attempts to redirect to the mapped address. + +Add more checks before the address is deemed correct: +1. If the packets' daddr is scoped, search for a scoped address too +2. skip tentative addresses +3. skip mapped addresses + +Use the first address that appears to match our needs. + +Reported-by: Daniel Huhardeaux +Closes: https://lore.kernel.org/netfilter/71be06b8-6aa0-4cf9-9e0b-e2839b01b22f@tootai.net/ +Fixes: 115e23ac78f8 ("netfilter: ip6tables: add REDIRECT target") +Signed-off-by: Florian Westphal +Signed-off-by: Pablo Neira Ayuso +Signed-off-by: Sasha Levin +--- + net/netfilter/nf_nat_redirect.c | 27 ++++++++++++++++++++++++++- + 1 file changed, 26 insertions(+), 1 deletion(-) + +diff --git a/net/netfilter/nf_nat_redirect.c b/net/netfilter/nf_nat_redirect.c +index 6616ba5d0b049..5b37487d9d11f 100644 +--- a/net/netfilter/nf_nat_redirect.c ++++ b/net/netfilter/nf_nat_redirect.c +@@ -80,6 +80,26 @@ EXPORT_SYMBOL_GPL(nf_nat_redirect_ipv4); + + static const struct in6_addr loopback_addr = IN6ADDR_LOOPBACK_INIT; + ++static bool nf_nat_redirect_ipv6_usable(const struct inet6_ifaddr *ifa, unsigned int scope) ++{ ++ unsigned int ifa_addr_type = ipv6_addr_type(&ifa->addr); ++ ++ if (ifa_addr_type & IPV6_ADDR_MAPPED) ++ return false; ++ ++ if ((ifa->flags & IFA_F_TENTATIVE) && (!(ifa->flags & IFA_F_OPTIMISTIC))) ++ return false; ++ ++ if (scope) { ++ unsigned int ifa_scope = ifa_addr_type & IPV6_ADDR_SCOPE_MASK; ++ ++ if (!(scope & ifa_scope)) ++ return false; ++ } ++ ++ return true; ++} ++ + unsigned int + nf_nat_redirect_ipv6(struct sk_buff *skb, const struct nf_nat_range2 *range, + unsigned int hooknum) +@@ -89,14 +109,19 @@ nf_nat_redirect_ipv6(struct sk_buff *skb, const struct nf_nat_range2 *range, + if (hooknum == NF_INET_LOCAL_OUT) { + newdst.in6 = loopback_addr; + } else { ++ unsigned int scope = ipv6_addr_scope(&ipv6_hdr(skb)->daddr); + struct inet6_dev *idev; +- struct inet6_ifaddr *ifa; + bool addr = false; + + idev = __in6_dev_get(skb->dev); + if (idev != NULL) { ++ const struct inet6_ifaddr *ifa; ++ + read_lock_bh(&idev->lock); + 
list_for_each_entry(ifa, &idev->addr_list, if_list) { ++ if (!nf_nat_redirect_ipv6_usable(ifa, scope)) ++ continue; ++ + newdst.in6 = ifa->addr; + addr = true; + break; +-- +2.42.0 + diff --git a/queue-6.1/netfilter-nft_redir-use-struct-nf_nat_range2-through.patch b/queue-6.1/netfilter-nft_redir-use-struct-nf_nat_range2-through.patch new file mode 100644 index 00000000000..8e9c4e2b733 --- /dev/null +++ b/queue-6.1/netfilter-nft_redir-use-struct-nf_nat_range2-through.patch @@ -0,0 +1,372 @@ +From 8d64f2d44d2141b8cbca5ef6876f6e12553d3dfb Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Wed, 15 Mar 2023 21:48:01 +0000 +Subject: netfilter: nft_redir: use `struct nf_nat_range2` throughout and + deduplicate eval call-backs + +From: Jeremy Sowden + +[ Upstream commit 6f56ad1b92328997e1b1792047099df6f8d7acb5 ] + +`nf_nat_redirect_ipv4` takes a `struct nf_nat_ipv4_multi_range_compat`, +but converts it internally to a `struct nf_nat_range2`. Change the +function to take the latter, factor out the code now shared with +`nf_nat_redirect_ipv6`, move the conversion to the xt_REDIRECT module, +and update the ipv4 range initialization in the nft_redir module. + +Replace a bare hex constant for 127.0.0.1 with a macro. + +Remove `WARN_ON`. `nf_nat_setup_info` calls `nf_ct_is_confirmed`: + + /* Can't setup nat info for confirmed ct. */ + if (nf_ct_is_confirmed(ct)) + return NF_ACCEPT; + +This means that `ct` cannot be null or the kernel will crash, and +implies that `ctinfo` is `IP_CT_NEW` or `IP_CT_RELATED`. + +nft_redir has separate ipv4 and ipv6 call-backs which share much of +their code, and an inet one switch containing a switch that calls one of +the others based on the family of the packet. Merge the ipv4 and ipv6 +ones into the inet one in order to get rid of the duplicate code. + +Const-qualify the `priv` pointer since we don't need to write through +it. + +Assign `priv->flags` to the range instead of OR-ing it in. + +Set the `NF_NAT_RANGE_PROTO_SPECIFIED` flag once during init, rather +than on every eval. 
+ +Signed-off-by: Jeremy Sowden +Signed-off-by: Florian Westphal +Stable-dep-of: 80abbe8a8263 ("netfilter: nat: fix ipv6 nat redirect with mapped and scoped addresses") +Signed-off-by: Sasha Levin +--- + include/net/netfilter/nf_nat_redirect.h | 3 +- + net/netfilter/nf_nat_redirect.c | 71 ++++++++++----------- + net/netfilter/nft_redir.c | 84 +++++++++---------------- + net/netfilter/xt_REDIRECT.c | 10 ++- + 4 files changed, 72 insertions(+), 96 deletions(-) + +diff --git a/include/net/netfilter/nf_nat_redirect.h b/include/net/netfilter/nf_nat_redirect.h +index 2418653a66db1..279380de904c8 100644 +--- a/include/net/netfilter/nf_nat_redirect.h ++++ b/include/net/netfilter/nf_nat_redirect.h +@@ -6,8 +6,7 @@ + #include + + unsigned int +-nf_nat_redirect_ipv4(struct sk_buff *skb, +- const struct nf_nat_ipv4_multi_range_compat *mr, ++nf_nat_redirect_ipv4(struct sk_buff *skb, const struct nf_nat_range2 *range, + unsigned int hooknum); + unsigned int + nf_nat_redirect_ipv6(struct sk_buff *skb, const struct nf_nat_range2 *range, +diff --git a/net/netfilter/nf_nat_redirect.c b/net/netfilter/nf_nat_redirect.c +index f91579c821e9a..6616ba5d0b049 100644 +--- a/net/netfilter/nf_nat_redirect.c ++++ b/net/netfilter/nf_nat_redirect.c +@@ -10,6 +10,7 @@ + + #include + #include ++#include + #include + #include + #include +@@ -24,54 +25,56 @@ + #include + #include + ++static unsigned int ++nf_nat_redirect(struct sk_buff *skb, const struct nf_nat_range2 *range, ++ const union nf_inet_addr *newdst) ++{ ++ struct nf_nat_range2 newrange; ++ enum ip_conntrack_info ctinfo; ++ struct nf_conn *ct; ++ ++ ct = nf_ct_get(skb, &ctinfo); ++ ++ memset(&newrange, 0, sizeof(newrange)); ++ ++ newrange.flags = range->flags | NF_NAT_RANGE_MAP_IPS; ++ newrange.min_addr = *newdst; ++ newrange.max_addr = *newdst; ++ newrange.min_proto = range->min_proto; ++ newrange.max_proto = range->max_proto; ++ ++ return nf_nat_setup_info(ct, &newrange, NF_NAT_MANIP_DST); ++} ++ + unsigned int +-nf_nat_redirect_ipv4(struct sk_buff *skb, +- const struct nf_nat_ipv4_multi_range_compat *mr, ++nf_nat_redirect_ipv4(struct sk_buff *skb, const struct nf_nat_range2 *range, + unsigned int hooknum) + { +- struct nf_conn *ct; +- enum ip_conntrack_info ctinfo; +- __be32 newdst; +- struct nf_nat_range2 newrange; ++ union nf_inet_addr newdst = {}; + + WARN_ON(hooknum != NF_INET_PRE_ROUTING && + hooknum != NF_INET_LOCAL_OUT); + +- ct = nf_ct_get(skb, &ctinfo); +- WARN_ON(!(ct && (ctinfo == IP_CT_NEW || ctinfo == IP_CT_RELATED))); +- + /* Local packets: make them go to loopback */ + if (hooknum == NF_INET_LOCAL_OUT) { +- newdst = htonl(0x7F000001); ++ newdst.ip = htonl(INADDR_LOOPBACK); + } else { + const struct in_device *indev; + +- newdst = 0; +- + indev = __in_dev_get_rcu(skb->dev); + if (indev) { + const struct in_ifaddr *ifa; + + ifa = rcu_dereference(indev->ifa_list); + if (ifa) +- newdst = ifa->ifa_local; ++ newdst.ip = ifa->ifa_local; + } + +- if (!newdst) ++ if (!newdst.ip) + return NF_DROP; + } + +- /* Transfer from original range. */ +- memset(&newrange.min_addr, 0, sizeof(newrange.min_addr)); +- memset(&newrange.max_addr, 0, sizeof(newrange.max_addr)); +- newrange.flags = mr->range[0].flags | NF_NAT_RANGE_MAP_IPS; +- newrange.min_addr.ip = newdst; +- newrange.max_addr.ip = newdst; +- newrange.min_proto = mr->range[0].min; +- newrange.max_proto = mr->range[0].max; +- +- /* Hand modified range to generic setup. 
*/ +- return nf_nat_setup_info(ct, &newrange, NF_NAT_MANIP_DST); ++ return nf_nat_redirect(skb, range, &newdst); + } + EXPORT_SYMBOL_GPL(nf_nat_redirect_ipv4); + +@@ -81,14 +84,10 @@ unsigned int + nf_nat_redirect_ipv6(struct sk_buff *skb, const struct nf_nat_range2 *range, + unsigned int hooknum) + { +- struct nf_nat_range2 newrange; +- struct in6_addr newdst; +- enum ip_conntrack_info ctinfo; +- struct nf_conn *ct; ++ union nf_inet_addr newdst = {}; + +- ct = nf_ct_get(skb, &ctinfo); + if (hooknum == NF_INET_LOCAL_OUT) { +- newdst = loopback_addr; ++ newdst.in6 = loopback_addr; + } else { + struct inet6_dev *idev; + struct inet6_ifaddr *ifa; +@@ -98,7 +97,7 @@ nf_nat_redirect_ipv6(struct sk_buff *skb, const struct nf_nat_range2 *range, + if (idev != NULL) { + read_lock_bh(&idev->lock); + list_for_each_entry(ifa, &idev->addr_list, if_list) { +- newdst = ifa->addr; ++ newdst.in6 = ifa->addr; + addr = true; + break; + } +@@ -109,12 +108,6 @@ nf_nat_redirect_ipv6(struct sk_buff *skb, const struct nf_nat_range2 *range, + return NF_DROP; + } + +- newrange.flags = range->flags | NF_NAT_RANGE_MAP_IPS; +- newrange.min_addr.in6 = newdst; +- newrange.max_addr.in6 = newdst; +- newrange.min_proto = range->min_proto; +- newrange.max_proto = range->max_proto; +- +- return nf_nat_setup_info(ct, &newrange, NF_NAT_MANIP_DST); ++ return nf_nat_redirect(skb, range, &newdst); + } + EXPORT_SYMBOL_GPL(nf_nat_redirect_ipv6); +diff --git a/net/netfilter/nft_redir.c b/net/netfilter/nft_redir.c +index 5ed64b2bd15e8..08b408d3e113d 100644 +--- a/net/netfilter/nft_redir.c ++++ b/net/netfilter/nft_redir.c +@@ -64,6 +64,8 @@ static int nft_redir_init(const struct nft_ctx *ctx, + } else { + priv->sreg_proto_max = priv->sreg_proto_min; + } ++ ++ priv->flags |= NF_NAT_RANGE_PROTO_SPECIFIED; + } + + if (tb[NFTA_REDIR_FLAGS]) { +@@ -98,25 +100,37 @@ static int nft_redir_dump(struct sk_buff *skb, const struct nft_expr *expr) + return -1; + } + +-static void nft_redir_ipv4_eval(const struct nft_expr *expr, +- struct nft_regs *regs, +- const struct nft_pktinfo *pkt) ++static void nft_redir_eval(const struct nft_expr *expr, ++ struct nft_regs *regs, ++ const struct nft_pktinfo *pkt) + { +- struct nft_redir *priv = nft_expr_priv(expr); +- struct nf_nat_ipv4_multi_range_compat mr; ++ const struct nft_redir *priv = nft_expr_priv(expr); ++ struct nf_nat_range2 range; + +- memset(&mr, 0, sizeof(mr)); ++ memset(&range, 0, sizeof(range)); ++ range.flags = priv->flags; + if (priv->sreg_proto_min) { +- mr.range[0].min.all = (__force __be16)nft_reg_load16( +- ®s->data[priv->sreg_proto_min]); +- mr.range[0].max.all = (__force __be16)nft_reg_load16( +- ®s->data[priv->sreg_proto_max]); +- mr.range[0].flags |= NF_NAT_RANGE_PROTO_SPECIFIED; ++ range.min_proto.all = (__force __be16) ++ nft_reg_load16(®s->data[priv->sreg_proto_min]); ++ range.max_proto.all = (__force __be16) ++ nft_reg_load16(®s->data[priv->sreg_proto_max]); + } + +- mr.range[0].flags |= priv->flags; +- +- regs->verdict.code = nf_nat_redirect_ipv4(pkt->skb, &mr, nft_hook(pkt)); ++ switch (nft_pf(pkt)) { ++ case NFPROTO_IPV4: ++ regs->verdict.code = nf_nat_redirect_ipv4(pkt->skb, &range, ++ nft_hook(pkt)); ++ break; ++#ifdef CONFIG_NF_TABLES_IPV6 ++ case NFPROTO_IPV6: ++ regs->verdict.code = nf_nat_redirect_ipv6(pkt->skb, &range, ++ nft_hook(pkt)); ++ break; ++#endif ++ default: ++ WARN_ON_ONCE(1); ++ break; ++ } + } + + static void +@@ -129,7 +143,7 @@ static struct nft_expr_type nft_redir_ipv4_type; + static const struct nft_expr_ops nft_redir_ipv4_ops = { + .type = 
&nft_redir_ipv4_type, + .size = NFT_EXPR_SIZE(sizeof(struct nft_redir)), +- .eval = nft_redir_ipv4_eval, ++ .eval = nft_redir_eval, + .init = nft_redir_init, + .destroy = nft_redir_ipv4_destroy, + .dump = nft_redir_dump, +@@ -147,28 +161,6 @@ static struct nft_expr_type nft_redir_ipv4_type __read_mostly = { + }; + + #ifdef CONFIG_NF_TABLES_IPV6 +-static void nft_redir_ipv6_eval(const struct nft_expr *expr, +- struct nft_regs *regs, +- const struct nft_pktinfo *pkt) +-{ +- struct nft_redir *priv = nft_expr_priv(expr); +- struct nf_nat_range2 range; +- +- memset(&range, 0, sizeof(range)); +- if (priv->sreg_proto_min) { +- range.min_proto.all = (__force __be16)nft_reg_load16( +- ®s->data[priv->sreg_proto_min]); +- range.max_proto.all = (__force __be16)nft_reg_load16( +- ®s->data[priv->sreg_proto_max]); +- range.flags |= NF_NAT_RANGE_PROTO_SPECIFIED; +- } +- +- range.flags |= priv->flags; +- +- regs->verdict.code = +- nf_nat_redirect_ipv6(pkt->skb, &range, nft_hook(pkt)); +-} +- + static void + nft_redir_ipv6_destroy(const struct nft_ctx *ctx, const struct nft_expr *expr) + { +@@ -179,7 +171,7 @@ static struct nft_expr_type nft_redir_ipv6_type; + static const struct nft_expr_ops nft_redir_ipv6_ops = { + .type = &nft_redir_ipv6_type, + .size = NFT_EXPR_SIZE(sizeof(struct nft_redir)), +- .eval = nft_redir_ipv6_eval, ++ .eval = nft_redir_eval, + .init = nft_redir_init, + .destroy = nft_redir_ipv6_destroy, + .dump = nft_redir_dump, +@@ -198,20 +190,6 @@ static struct nft_expr_type nft_redir_ipv6_type __read_mostly = { + #endif + + #ifdef CONFIG_NF_TABLES_INET +-static void nft_redir_inet_eval(const struct nft_expr *expr, +- struct nft_regs *regs, +- const struct nft_pktinfo *pkt) +-{ +- switch (nft_pf(pkt)) { +- case NFPROTO_IPV4: +- return nft_redir_ipv4_eval(expr, regs, pkt); +- case NFPROTO_IPV6: +- return nft_redir_ipv6_eval(expr, regs, pkt); +- } +- +- WARN_ON_ONCE(1); +-} +- + static void + nft_redir_inet_destroy(const struct nft_ctx *ctx, const struct nft_expr *expr) + { +@@ -222,7 +200,7 @@ static struct nft_expr_type nft_redir_inet_type; + static const struct nft_expr_ops nft_redir_inet_ops = { + .type = &nft_redir_inet_type, + .size = NFT_EXPR_SIZE(sizeof(struct nft_redir)), +- .eval = nft_redir_inet_eval, ++ .eval = nft_redir_eval, + .init = nft_redir_init, + .destroy = nft_redir_inet_destroy, + .dump = nft_redir_dump, +diff --git a/net/netfilter/xt_REDIRECT.c b/net/netfilter/xt_REDIRECT.c +index 353ca7801251a..ff66b56a3f97d 100644 +--- a/net/netfilter/xt_REDIRECT.c ++++ b/net/netfilter/xt_REDIRECT.c +@@ -46,7 +46,6 @@ static void redirect_tg_destroy(const struct xt_tgdtor_param *par) + nf_ct_netns_put(par->net, par->family); + } + +-/* FIXME: Take multiple ranges --RR */ + static int redirect_tg4_check(const struct xt_tgchk_param *par) + { + const struct nf_nat_ipv4_multi_range_compat *mr = par->targinfo; +@@ -65,7 +64,14 @@ static int redirect_tg4_check(const struct xt_tgchk_param *par) + static unsigned int + redirect_tg4(struct sk_buff *skb, const struct xt_action_param *par) + { +- return nf_nat_redirect_ipv4(skb, par->targinfo, xt_hooknum(par)); ++ const struct nf_nat_ipv4_multi_range_compat *mr = par->targinfo; ++ struct nf_nat_range2 range = { ++ .flags = mr->range[0].flags, ++ .min_proto = mr->range[0].min, ++ .max_proto = mr->range[0].max, ++ }; ++ ++ return nf_nat_redirect_ipv4(skb, &range, xt_hooknum(par)); + } + + static struct xt_target redirect_tg_reg[] __read_mostly = { +-- +2.42.0 + diff --git a/queue-6.1/netfilter-xt_recent-fix-increase-ipv6-literal-buffer.patch 
b/queue-6.1/netfilter-xt_recent-fix-increase-ipv6-literal-buffer.patch new file mode 100644 index 00000000000..69dfab985ac --- /dev/null +++ b/queue-6.1/netfilter-xt_recent-fix-increase-ipv6-literal-buffer.patch @@ -0,0 +1,49 @@ +From 128703facde1966f223048219a98e742fabbc063 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Sun, 5 Nov 2023 11:56:00 -0800 +Subject: netfilter: xt_recent: fix (increase) ipv6 literal buffer length +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +From: Maciej Å»enczykowski + +[ Upstream commit 7b308feb4fd2d1c06919445c65c8fbf8e9fd1781 ] + +in6_pton() supports 'low-32-bit dot-decimal representation' +(this is useful with DNS64/NAT64 networks for example): + + # echo +aaaa:bbbb:cccc:dddd:eeee:ffff:1.2.3.4 > /proc/self/net/xt_recent/DEFAULT + # cat /proc/self/net/xt_recent/DEFAULT + src=aaaa:bbbb:cccc:dddd:eeee:ffff:0102:0304 ttl: 0 last_seen: 9733848829 oldest_pkt: 1 9733848829 + +but the provided buffer is too short: + + # echo +aaaa:bbbb:cccc:dddd:eeee:ffff:255.255.255.255 > /proc/self/net/xt_recent/DEFAULT + -bash: echo: write error: Invalid argument + +Fixes: 079aa88fe717 ("netfilter: xt_recent: IPv6 support") +Signed-off-by: Maciej Å»enczykowski +Reviewed-by: Simon Horman +Signed-off-by: Pablo Neira Ayuso +Signed-off-by: Sasha Levin +--- + net/netfilter/xt_recent.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/net/netfilter/xt_recent.c b/net/netfilter/xt_recent.c +index 7ddb9a78e3fc8..ef93e0d3bee04 100644 +--- a/net/netfilter/xt_recent.c ++++ b/net/netfilter/xt_recent.c +@@ -561,7 +561,7 @@ recent_mt_proc_write(struct file *file, const char __user *input, + { + struct recent_table *t = pde_data(file_inode(file)); + struct recent_entry *e; +- char buf[sizeof("+b335:1d35:1e55:dead:c0de:1715:5afe:c0de")]; ++ char buf[sizeof("+b335:1d35:1e55:dead:c0de:1715:255.255.255.255")]; + const char *c = buf; + union nf_inet_addr addr = {}; + u_int16_t family; +-- +2.42.0 + diff --git a/queue-6.1/nvme-fix-error-handling-for-io_uring-nvme-passthroug.patch b/queue-6.1/nvme-fix-error-handling-for-io_uring-nvme-passthroug.patch new file mode 100644 index 00000000000..0e531487ade --- /dev/null +++ b/queue-6.1/nvme-fix-error-handling-for-io_uring-nvme-passthroug.patch @@ -0,0 +1,46 @@ +From 939f52cda3d6f05a0dcbfbbdfa7f6378eb95d8e0 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Thu, 19 Oct 2023 00:54:30 +0530 +Subject: nvme: fix error-handling for io_uring nvme-passthrough + +From: Anuj Gupta + +[ Upstream commit 1147dd0503564fa0e03489a039f9e0c748a03db4 ] + +Driver may return an error before submitting the command to the device. +Ensure that such error is propagated up. 
+ +Fixes: 456cba386e94 ("nvme: wire-up uring-cmd support for io-passthru on char-device.") +Signed-off-by: Anuj Gupta +Signed-off-by: Kanchan Joshi +Reviewed-by: Niklas Cassel +Reviewed-by: Christoph Hellwig +Signed-off-by: Keith Busch +Signed-off-by: Sasha Levin +--- + drivers/nvme/host/ioctl.c | 7 +++++-- + 1 file changed, 5 insertions(+), 2 deletions(-) + +diff --git a/drivers/nvme/host/ioctl.c b/drivers/nvme/host/ioctl.c +index b33004a4bcb5a..91e6d03475798 100644 +--- a/drivers/nvme/host/ioctl.c ++++ b/drivers/nvme/host/ioctl.c +@@ -435,10 +435,13 @@ static enum rq_end_io_ret nvme_uring_cmd_end_io(struct request *req, + void *cookie = READ_ONCE(ioucmd->cookie); + + req->bio = pdu->bio; +- if (nvme_req(req)->flags & NVME_REQ_CANCELLED) ++ if (nvme_req(req)->flags & NVME_REQ_CANCELLED) { + pdu->nvme_status = -EINTR; +- else ++ } else { + pdu->nvme_status = nvme_req(req)->status; ++ if (!pdu->nvme_status) ++ pdu->nvme_status = blk_status_to_errno(err); ++ } + pdu->u.result = le64_to_cpu(nvme_req(req)->result.u64); + + /* +-- +2.42.0 + diff --git a/queue-6.1/octeontx2-pf-fix-error-codes.patch b/queue-6.1/octeontx2-pf-fix-error-codes.patch new file mode 100644 index 00000000000..569d6c9787c --- /dev/null +++ b/queue-6.1/octeontx2-pf-fix-error-codes.patch @@ -0,0 +1,69 @@ +From 2e089a867a9f6ed8df0c8b4385a9e45f09c3cc30 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Fri, 27 Oct 2023 07:49:52 +0530 +Subject: octeontx2-pf: Fix error codes + +From: Ratheesh Kannoth + +[ Upstream commit 96b9a68d1a6e4f889d453874c9e359aa720b520f ] + +Some of error codes were wrong. Fix the same. + +Fixes: 51afe9026d0c ("octeontx2-pf: NIX TX overwrites SQ_CTX_HW_S[SQ_INT]") +Signed-off-by: Ratheesh Kannoth +Reviewed-by: Wojciech Drewek +Link: https://lore.kernel.org/r/20231027021953.1819959-1-rkannoth@marvell.com +Signed-off-by: Paolo Abeni +Signed-off-by: Sasha Levin +--- + .../marvell/octeontx2/nic/otx2_struct.h | 34 +++++++++---------- + 1 file changed, 17 insertions(+), 17 deletions(-) + +diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_struct.h b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_struct.h +index fa37b9f312cae..4e5899d8fa2e6 100644 +--- a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_struct.h ++++ b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_struct.h +@@ -318,23 +318,23 @@ enum nix_snd_status_e { + NIX_SND_STATUS_EXT_ERR = 0x6, + NIX_SND_STATUS_JUMP_FAULT = 0x7, + NIX_SND_STATUS_JUMP_POISON = 0x8, +- NIX_SND_STATUS_CRC_ERR = 0x9, +- NIX_SND_STATUS_IMM_ERR = 0x10, +- NIX_SND_STATUS_SG_ERR = 0x11, +- NIX_SND_STATUS_MEM_ERR = 0x12, +- NIX_SND_STATUS_INVALID_SUBDC = 0x13, +- NIX_SND_STATUS_SUBDC_ORDER_ERR = 0x14, +- NIX_SND_STATUS_DATA_FAULT = 0x15, +- NIX_SND_STATUS_DATA_POISON = 0x16, +- NIX_SND_STATUS_NPC_DROP_ACTION = 0x17, +- NIX_SND_STATUS_LOCK_VIOL = 0x18, +- NIX_SND_STATUS_NPC_UCAST_CHAN_ERR = 0x19, +- NIX_SND_STATUS_NPC_MCAST_CHAN_ERR = 0x20, +- NIX_SND_STATUS_NPC_MCAST_ABORT = 0x21, +- NIX_SND_STATUS_NPC_VTAG_PTR_ERR = 0x22, +- NIX_SND_STATUS_NPC_VTAG_SIZE_ERR = 0x23, +- NIX_SND_STATUS_SEND_MEM_FAULT = 0x24, +- NIX_SND_STATUS_SEND_STATS_ERR = 0x25, ++ NIX_SND_STATUS_CRC_ERR = 0x10, ++ NIX_SND_STATUS_IMM_ERR = 0x11, ++ NIX_SND_STATUS_SG_ERR = 0x12, ++ NIX_SND_STATUS_MEM_ERR = 0x13, ++ NIX_SND_STATUS_INVALID_SUBDC = 0x14, ++ NIX_SND_STATUS_SUBDC_ORDER_ERR = 0x15, ++ NIX_SND_STATUS_DATA_FAULT = 0x16, ++ NIX_SND_STATUS_DATA_POISON = 0x17, ++ NIX_SND_STATUS_NPC_DROP_ACTION = 0x20, ++ NIX_SND_STATUS_LOCK_VIOL = 0x21, ++ NIX_SND_STATUS_NPC_UCAST_CHAN_ERR = 0x22, ++ 
NIX_SND_STATUS_NPC_MCAST_CHAN_ERR = 0x23, ++ NIX_SND_STATUS_NPC_MCAST_ABORT = 0x24, ++ NIX_SND_STATUS_NPC_VTAG_PTR_ERR = 0x25, ++ NIX_SND_STATUS_NPC_VTAG_SIZE_ERR = 0x26, ++ NIX_SND_STATUS_SEND_MEM_FAULT = 0x27, ++ NIX_SND_STATUS_SEND_STATS_ERR = 0x28, + NIX_SND_STATUS_MAX, + }; + +-- +2.42.0 + diff --git a/queue-6.1/octeontx2-pf-fix-holes-in-error-code.patch b/queue-6.1/octeontx2-pf-fix-holes-in-error-code.patch new file mode 100644 index 00000000000..df4f3e7f1ea --- /dev/null +++ b/queue-6.1/octeontx2-pf-fix-holes-in-error-code.patch @@ -0,0 +1,156 @@ +From 5a8654a938e41485de1b43de81286f0f4a47f6ff Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Fri, 27 Oct 2023 07:49:53 +0530 +Subject: octeontx2-pf: Fix holes in error code + +From: Ratheesh Kannoth + +[ Upstream commit 7aeeb2cb7a2570bb69a87ad14018b03e06ce5be5 ] + +Error code strings are not getting printed properly +due to holes. Print error code as well. + +Fixes: 51afe9026d0c ("octeontx2-pf: NIX TX overwrites SQ_CTX_HW_S[SQ_INT]") +Signed-off-by: Ratheesh Kannoth +Reviewed-by: Wojciech Drewek +Link: https://lore.kernel.org/r/20231027021953.1819959-2-rkannoth@marvell.com +Signed-off-by: Paolo Abeni +Signed-off-by: Sasha Levin +--- + .../ethernet/marvell/octeontx2/nic/otx2_pf.c | 80 +++++++++++-------- + 1 file changed, 46 insertions(+), 34 deletions(-) + +diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_pf.c b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_pf.c +index 17e546d0d7e55..101d79a0bb436 100644 +--- a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_pf.c ++++ b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_pf.c +@@ -1194,31 +1194,32 @@ static char *nix_mnqerr_e_str[NIX_MNQERR_MAX] = { + }; + + static char *nix_snd_status_e_str[NIX_SND_STATUS_MAX] = { +- "NIX_SND_STATUS_GOOD", +- "NIX_SND_STATUS_SQ_CTX_FAULT", +- "NIX_SND_STATUS_SQ_CTX_POISON", +- "NIX_SND_STATUS_SQB_FAULT", +- "NIX_SND_STATUS_SQB_POISON", +- "NIX_SND_STATUS_HDR_ERR", +- "NIX_SND_STATUS_EXT_ERR", +- "NIX_SND_STATUS_JUMP_FAULT", +- "NIX_SND_STATUS_JUMP_POISON", +- "NIX_SND_STATUS_CRC_ERR", +- "NIX_SND_STATUS_IMM_ERR", +- "NIX_SND_STATUS_SG_ERR", +- "NIX_SND_STATUS_MEM_ERR", +- "NIX_SND_STATUS_INVALID_SUBDC", +- "NIX_SND_STATUS_SUBDC_ORDER_ERR", +- "NIX_SND_STATUS_DATA_FAULT", +- "NIX_SND_STATUS_DATA_POISON", +- "NIX_SND_STATUS_NPC_DROP_ACTION", +- "NIX_SND_STATUS_LOCK_VIOL", +- "NIX_SND_STATUS_NPC_UCAST_CHAN_ERR", +- "NIX_SND_STATUS_NPC_MCAST_CHAN_ERR", +- "NIX_SND_STATUS_NPC_MCAST_ABORT", +- "NIX_SND_STATUS_NPC_VTAG_PTR_ERR", +- "NIX_SND_STATUS_NPC_VTAG_SIZE_ERR", +- "NIX_SND_STATUS_SEND_STATS_ERR", ++ [NIX_SND_STATUS_GOOD] = "NIX_SND_STATUS_GOOD", ++ [NIX_SND_STATUS_SQ_CTX_FAULT] = "NIX_SND_STATUS_SQ_CTX_FAULT", ++ [NIX_SND_STATUS_SQ_CTX_POISON] = "NIX_SND_STATUS_SQ_CTX_POISON", ++ [NIX_SND_STATUS_SQB_FAULT] = "NIX_SND_STATUS_SQB_FAULT", ++ [NIX_SND_STATUS_SQB_POISON] = "NIX_SND_STATUS_SQB_POISON", ++ [NIX_SND_STATUS_HDR_ERR] = "NIX_SND_STATUS_HDR_ERR", ++ [NIX_SND_STATUS_EXT_ERR] = "NIX_SND_STATUS_EXT_ERR", ++ [NIX_SND_STATUS_JUMP_FAULT] = "NIX_SND_STATUS_JUMP_FAULT", ++ [NIX_SND_STATUS_JUMP_POISON] = "NIX_SND_STATUS_JUMP_POISON", ++ [NIX_SND_STATUS_CRC_ERR] = "NIX_SND_STATUS_CRC_ERR", ++ [NIX_SND_STATUS_IMM_ERR] = "NIX_SND_STATUS_IMM_ERR", ++ [NIX_SND_STATUS_SG_ERR] = "NIX_SND_STATUS_SG_ERR", ++ [NIX_SND_STATUS_MEM_ERR] = "NIX_SND_STATUS_MEM_ERR", ++ [NIX_SND_STATUS_INVALID_SUBDC] = "NIX_SND_STATUS_INVALID_SUBDC", ++ [NIX_SND_STATUS_SUBDC_ORDER_ERR] = "NIX_SND_STATUS_SUBDC_ORDER_ERR", ++ [NIX_SND_STATUS_DATA_FAULT] = 
"NIX_SND_STATUS_DATA_FAULT", ++ [NIX_SND_STATUS_DATA_POISON] = "NIX_SND_STATUS_DATA_POISON", ++ [NIX_SND_STATUS_NPC_DROP_ACTION] = "NIX_SND_STATUS_NPC_DROP_ACTION", ++ [NIX_SND_STATUS_LOCK_VIOL] = "NIX_SND_STATUS_LOCK_VIOL", ++ [NIX_SND_STATUS_NPC_UCAST_CHAN_ERR] = "NIX_SND_STAT_NPC_UCAST_CHAN_ERR", ++ [NIX_SND_STATUS_NPC_MCAST_CHAN_ERR] = "NIX_SND_STAT_NPC_MCAST_CHAN_ERR", ++ [NIX_SND_STATUS_NPC_MCAST_ABORT] = "NIX_SND_STATUS_NPC_MCAST_ABORT", ++ [NIX_SND_STATUS_NPC_VTAG_PTR_ERR] = "NIX_SND_STATUS_NPC_VTAG_PTR_ERR", ++ [NIX_SND_STATUS_NPC_VTAG_SIZE_ERR] = "NIX_SND_STATUS_NPC_VTAG_SIZE_ERR", ++ [NIX_SND_STATUS_SEND_MEM_FAULT] = "NIX_SND_STATUS_SEND_MEM_FAULT", ++ [NIX_SND_STATUS_SEND_STATS_ERR] = "NIX_SND_STATUS_SEND_STATS_ERR", + }; + + static irqreturn_t otx2_q_intr_handler(int irq, void *data) +@@ -1238,14 +1239,16 @@ static irqreturn_t otx2_q_intr_handler(int irq, void *data) + continue; + + if (val & BIT_ULL(42)) { +- netdev_err(pf->netdev, "CQ%lld: error reading NIX_LF_CQ_OP_INT, NIX_LF_ERR_INT 0x%llx\n", ++ netdev_err(pf->netdev, ++ "CQ%lld: error reading NIX_LF_CQ_OP_INT, NIX_LF_ERR_INT 0x%llx\n", + qidx, otx2_read64(pf, NIX_LF_ERR_INT)); + } else { + if (val & BIT_ULL(NIX_CQERRINT_DOOR_ERR)) + netdev_err(pf->netdev, "CQ%lld: Doorbell error", + qidx); + if (val & BIT_ULL(NIX_CQERRINT_CQE_FAULT)) +- netdev_err(pf->netdev, "CQ%lld: Memory fault on CQE write to LLC/DRAM", ++ netdev_err(pf->netdev, ++ "CQ%lld: Memory fault on CQE write to LLC/DRAM", + qidx); + } + +@@ -1268,7 +1271,8 @@ static irqreturn_t otx2_q_intr_handler(int irq, void *data) + (val & NIX_SQINT_BITS)); + + if (val & BIT_ULL(42)) { +- netdev_err(pf->netdev, "SQ%lld: error reading NIX_LF_SQ_OP_INT, NIX_LF_ERR_INT 0x%llx\n", ++ netdev_err(pf->netdev, ++ "SQ%lld: error reading NIX_LF_SQ_OP_INT, NIX_LF_ERR_INT 0x%llx\n", + qidx, otx2_read64(pf, NIX_LF_ERR_INT)); + goto done; + } +@@ -1278,8 +1282,11 @@ static irqreturn_t otx2_q_intr_handler(int irq, void *data) + goto chk_mnq_err_dbg; + + sq_op_err_code = FIELD_GET(GENMASK(7, 0), sq_op_err_dbg); +- netdev_err(pf->netdev, "SQ%lld: NIX_LF_SQ_OP_ERR_DBG(%llx) err=%s\n", +- qidx, sq_op_err_dbg, nix_sqoperr_e_str[sq_op_err_code]); ++ netdev_err(pf->netdev, ++ "SQ%lld: NIX_LF_SQ_OP_ERR_DBG(0x%llx) err=%s(%#x)\n", ++ qidx, sq_op_err_dbg, ++ nix_sqoperr_e_str[sq_op_err_code], ++ sq_op_err_code); + + otx2_write64(pf, NIX_LF_SQ_OP_ERR_DBG, BIT_ULL(44)); + +@@ -1296,16 +1303,21 @@ static irqreturn_t otx2_q_intr_handler(int irq, void *data) + goto chk_snd_err_dbg; + + mnq_err_code = FIELD_GET(GENMASK(7, 0), mnq_err_dbg); +- netdev_err(pf->netdev, "SQ%lld: NIX_LF_MNQ_ERR_DBG(%llx) err=%s\n", +- qidx, mnq_err_dbg, nix_mnqerr_e_str[mnq_err_code]); ++ netdev_err(pf->netdev, ++ "SQ%lld: NIX_LF_MNQ_ERR_DBG(0x%llx) err=%s(%#x)\n", ++ qidx, mnq_err_dbg, nix_mnqerr_e_str[mnq_err_code], ++ mnq_err_code); + otx2_write64(pf, NIX_LF_MNQ_ERR_DBG, BIT_ULL(44)); + + chk_snd_err_dbg: + snd_err_dbg = otx2_read64(pf, NIX_LF_SEND_ERR_DBG); + if (snd_err_dbg & BIT(44)) { + snd_err_code = FIELD_GET(GENMASK(7, 0), snd_err_dbg); +- netdev_err(pf->netdev, "SQ%lld: NIX_LF_SND_ERR_DBG:0x%llx err=%s\n", +- qidx, snd_err_dbg, nix_snd_status_e_str[snd_err_code]); ++ netdev_err(pf->netdev, ++ "SQ%lld: NIX_LF_SND_ERR_DBG:0x%llx err=%s(%#x)\n", ++ qidx, snd_err_dbg, ++ nix_snd_status_e_str[snd_err_code], ++ snd_err_code); + otx2_write64(pf, NIX_LF_SEND_ERR_DBG, BIT_ULL(44)); + } + +-- +2.42.0 + diff --git a/queue-6.1/octeontx2-pf-free-pending-and-dropped-sqes.patch 
b/queue-6.1/octeontx2-pf-free-pending-and-dropped-sqes.patch new file mode 100644 index 00000000000..5c9a481fc26 --- /dev/null +++ b/queue-6.1/octeontx2-pf-free-pending-and-dropped-sqes.patch @@ -0,0 +1,162 @@ +From e11cc39744ac02b64560bce825fdfe5810ae0645 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Tue, 31 Oct 2023 16:53:45 +0530 +Subject: octeontx2-pf: Free pending and dropped SQEs + +From: Geetha sowjanya + +[ Upstream commit 3423ca23e08bf285a324237abe88e7e7d9becfe6 ] + +On interface down, the pending SQEs in the NIX get dropped +or drained out during SMQ flush. But skb's pointed by these +SQEs never get free or updated to the stack as respective CQE +never get added. +This patch fixes the issue by freeing all valid skb's in SQ SG list. + +Fixes: b1bc8457e9d0 ("octeontx2-pf: Cleanup all receive buffers in SG descriptor") +Signed-off-by: Geetha sowjanya +Signed-off-by: David S. Miller +Signed-off-by: Sasha Levin +--- + .../marvell/octeontx2/nic/otx2_common.c | 15 +++---- + .../marvell/octeontx2/nic/otx2_common.h | 1 + + .../ethernet/marvell/octeontx2/nic/otx2_pf.c | 1 + + .../marvell/octeontx2/nic/otx2_txrx.c | 42 +++++++++++++++++++ + 4 files changed, 49 insertions(+), 10 deletions(-) + +diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.c b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.c +index c76dad78c26eb..0f896f606c3e6 100644 +--- a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.c ++++ b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.c +@@ -797,7 +797,6 @@ void otx2_sqb_flush(struct otx2_nic *pfvf) + int qidx, sqe_tail, sqe_head; + struct otx2_snd_queue *sq; + u64 incr, *ptr, val; +- int timeout = 1000; + + ptr = (u64 *)otx2_get_regaddr(pfvf, NIX_LF_SQ_OP_STATUS); + for (qidx = 0; qidx < otx2_get_total_tx_queues(pfvf); qidx++) { +@@ -806,15 +805,11 @@ void otx2_sqb_flush(struct otx2_nic *pfvf) + continue; + + incr = (u64)qidx << 32; +- while (timeout) { +- val = otx2_atomic64_add(incr, ptr); +- sqe_head = (val >> 20) & 0x3F; +- sqe_tail = (val >> 28) & 0x3F; +- if (sqe_head == sqe_tail) +- break; +- usleep_range(1, 3); +- timeout--; +- } ++ val = otx2_atomic64_add(incr, ptr); ++ sqe_head = (val >> 20) & 0x3F; ++ sqe_tail = (val >> 28) & 0x3F; ++ if (sqe_head != sqe_tail) ++ usleep_range(50, 60); + } + } + +diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.h b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.h +index 876a7b51b8e51..efd66224b3dbf 100644 +--- a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.h ++++ b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.h +@@ -933,6 +933,7 @@ int otx2_txschq_config(struct otx2_nic *pfvf, int lvl, int prio, bool pfc_en); + int otx2_txsch_alloc(struct otx2_nic *pfvf); + void otx2_txschq_stop(struct otx2_nic *pfvf); + void otx2_txschq_free_one(struct otx2_nic *pfvf, u16 lvl, u16 schq); ++void otx2_free_pending_sqe(struct otx2_nic *pfvf); + void otx2_sqb_flush(struct otx2_nic *pfvf); + int otx2_alloc_rbuf(struct otx2_nic *pfvf, struct otx2_pool *pool, + dma_addr_t *dma); +diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_pf.c b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_pf.c +index c558c9b64f5be..c724131172f3f 100644 +--- a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_pf.c ++++ b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_pf.c +@@ -1596,6 +1596,7 @@ static void otx2_free_hw_resources(struct otx2_nic *pf) + else + otx2_cleanup_tx_cqes(pf, cq); + } ++ otx2_free_pending_sqe(pf); + + otx2_free_sq_res(pf); + +diff --git 
a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_txrx.c b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_txrx.c +index d005434e1e037..20d801d30c732 100644 +--- a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_txrx.c ++++ b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_txrx.c +@@ -1224,9 +1224,11 @@ void otx2_cleanup_rx_cqes(struct otx2_nic *pfvf, struct otx2_cq_queue *cq) + + void otx2_cleanup_tx_cqes(struct otx2_nic *pfvf, struct otx2_cq_queue *cq) + { ++ int tx_pkts = 0, tx_bytes = 0; + struct sk_buff *skb = NULL; + struct otx2_snd_queue *sq; + struct nix_cqe_tx_s *cqe; ++ struct netdev_queue *txq; + int processed_cqe = 0; + struct sg_list *sg; + int qidx; +@@ -1247,12 +1249,20 @@ void otx2_cleanup_tx_cqes(struct otx2_nic *pfvf, struct otx2_cq_queue *cq) + sg = &sq->sg[cqe->comp.sqe_id]; + skb = (struct sk_buff *)sg->skb; + if (skb) { ++ tx_bytes += skb->len; ++ tx_pkts++; + otx2_dma_unmap_skb_frags(pfvf, sg); + dev_kfree_skb_any(skb); + sg->skb = (u64)NULL; + } + } + ++ if (likely(tx_pkts)) { ++ if (qidx >= pfvf->hw.tx_queues) ++ qidx -= pfvf->hw.xdp_queues; ++ txq = netdev_get_tx_queue(pfvf->netdev, qidx); ++ netdev_tx_completed_queue(txq, tx_pkts, tx_bytes); ++ } + /* Free CQEs to HW */ + otx2_write64(pfvf, NIX_LF_CQ_OP_DOOR, + ((u64)cq->cq_idx << 32) | processed_cqe); +@@ -1279,6 +1289,38 @@ int otx2_rxtx_enable(struct otx2_nic *pfvf, bool enable) + return err; + } + ++void otx2_free_pending_sqe(struct otx2_nic *pfvf) ++{ ++ int tx_pkts = 0, tx_bytes = 0; ++ struct sk_buff *skb = NULL; ++ struct otx2_snd_queue *sq; ++ struct netdev_queue *txq; ++ struct sg_list *sg; ++ int sq_idx, sqe; ++ ++ for (sq_idx = 0; sq_idx < pfvf->hw.tx_queues; sq_idx++) { ++ sq = &pfvf->qset.sq[sq_idx]; ++ for (sqe = 0; sqe < sq->sqe_cnt; sqe++) { ++ sg = &sq->sg[sqe]; ++ skb = (struct sk_buff *)sg->skb; ++ if (skb) { ++ tx_bytes += skb->len; ++ tx_pkts++; ++ otx2_dma_unmap_skb_frags(pfvf, sg); ++ dev_kfree_skb_any(skb); ++ sg->skb = (u64)NULL; ++ } ++ } ++ ++ if (!tx_pkts) ++ continue; ++ txq = netdev_get_tx_queue(pfvf->netdev, sq_idx); ++ netdev_tx_completed_queue(txq, tx_pkts, tx_bytes); ++ tx_pkts = 0; ++ tx_bytes = 0; ++ } ++} ++ + static void otx2_xdp_sqe_add_sg(struct otx2_snd_queue *sq, u64 dma_addr, + int len, int *offset) + { +-- +2.42.0 + diff --git a/queue-6.1/octeontx2-pf-qos-send-queues-management.patch b/queue-6.1/octeontx2-pf-qos-send-queues-management.patch new file mode 100644 index 00000000000..da2f5e68082 --- /dev/null +++ b/queue-6.1/octeontx2-pf-qos-send-queues-management.patch @@ -0,0 +1,874 @@ +From f6a2d4a39e969d2c49f8cceb5825a7a2d740ea15 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Sat, 13 May 2023 14:21:38 +0530 +Subject: octeontx2-pf: qos send queues management + +From: Subbaraya Sundeep + +[ Upstream commit ab6dddd2a669a0ecc2ce07485c7a15fadbb5a0aa ] + +Current implementation is such that the number of Send queues (SQs) +are decided on the device probe which is equal to the number of online +cpus. These SQs are allocated and deallocated in interface open and c +lose calls respectively. + +This patch defines new APIs for initializing and deinitializing Send +queues dynamically and allocates more number of transmit queues for +QOS feature. + +Signed-off-by: Subbaraya Sundeep +Signed-off-by: Hariprasad Kelam +Signed-off-by: Sunil Kovvuri Goutham +Reviewed-by: Simon Horman +Reviewed-by: Jacob Keller +Signed-off-by: David S. 
Miller +Stable-dep-of: 3423ca23e08b ("octeontx2-pf: Free pending and dropped SQEs") +Signed-off-by: Sasha Levin +--- + .../marvell/octeontx2/af/rvu_debugfs.c | 5 + + .../ethernet/marvell/octeontx2/nic/Makefile | 2 +- + .../marvell/octeontx2/nic/otx2_common.c | 43 ++- + .../marvell/octeontx2/nic/otx2_common.h | 39 ++- + .../ethernet/marvell/octeontx2/nic/otx2_pf.c | 44 ++- + .../marvell/octeontx2/nic/otx2_txrx.c | 24 +- + .../marvell/octeontx2/nic/otx2_txrx.h | 3 +- + .../ethernet/marvell/octeontx2/nic/otx2_vf.c | 7 +- + .../net/ethernet/marvell/octeontx2/nic/qos.h | 19 ++ + .../ethernet/marvell/octeontx2/nic/qos_sq.c | 282 ++++++++++++++++++ + 10 files changed, 426 insertions(+), 42 deletions(-) + create mode 100644 drivers/net/ethernet/marvell/octeontx2/nic/qos.h + create mode 100644 drivers/net/ethernet/marvell/octeontx2/nic/qos_sq.c + +diff --git a/drivers/net/ethernet/marvell/octeontx2/af/rvu_debugfs.c b/drivers/net/ethernet/marvell/octeontx2/af/rvu_debugfs.c +index aadc352c2ffbd..5c9dc3f9262f5 100644 +--- a/drivers/net/ethernet/marvell/octeontx2/af/rvu_debugfs.c ++++ b/drivers/net/ethernet/marvell/octeontx2/af/rvu_debugfs.c +@@ -1222,6 +1222,11 @@ static int rvu_dbg_npa_ctx_display(struct seq_file *m, void *unused, int ctype) + + for (aura = id; aura < max_id; aura++) { + aq_req.aura_id = aura; ++ ++ /* Skip if queue is uninitialized */ ++ if (ctype == NPA_AQ_CTYPE_POOL && !test_bit(aura, pfvf->pool_bmap)) ++ continue; ++ + seq_printf(m, "======%s : %d=======\n", + (ctype == NPA_AQ_CTYPE_AURA) ? "AURA" : "POOL", + aq_req.aura_id); +diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/Makefile b/drivers/net/ethernet/marvell/octeontx2/nic/Makefile +index 73fdb87986148..3d31ddf7c652e 100644 +--- a/drivers/net/ethernet/marvell/octeontx2/nic/Makefile ++++ b/drivers/net/ethernet/marvell/octeontx2/nic/Makefile +@@ -8,7 +8,7 @@ obj-$(CONFIG_OCTEONTX2_VF) += rvu_nicvf.o otx2_ptp.o + + rvu_nicpf-y := otx2_pf.o otx2_common.o otx2_txrx.o otx2_ethtool.o \ + otx2_flows.o otx2_tc.o cn10k.o otx2_dmac_flt.o \ +- otx2_devlink.o ++ otx2_devlink.o qos_sq.o + rvu_nicvf-y := otx2_vf.o otx2_devlink.o + + rvu_nicpf-$(CONFIG_DCB) += otx2_dcbnl.o +diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.c b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.c +index 2575c207150e1..c76dad78c26eb 100644 +--- a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.c ++++ b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.c +@@ -513,8 +513,8 @@ void otx2_config_irq_coalescing(struct otx2_nic *pfvf, int qidx) + (pfvf->hw.cq_ecount_wait - 1)); + } + +-int __otx2_alloc_rbuf(struct otx2_nic *pfvf, struct otx2_pool *pool, +- dma_addr_t *dma) ++static int __otx2_alloc_rbuf(struct otx2_nic *pfvf, struct otx2_pool *pool, ++ dma_addr_t *dma) + { + u8 *buf; + +@@ -532,8 +532,8 @@ int __otx2_alloc_rbuf(struct otx2_nic *pfvf, struct otx2_pool *pool, + return 0; + } + +-static int otx2_alloc_rbuf(struct otx2_nic *pfvf, struct otx2_pool *pool, +- dma_addr_t *dma) ++int otx2_alloc_rbuf(struct otx2_nic *pfvf, struct otx2_pool *pool, ++ dma_addr_t *dma) + { + int ret; + +@@ -795,11 +795,16 @@ void otx2_txschq_stop(struct otx2_nic *pfvf) + void otx2_sqb_flush(struct otx2_nic *pfvf) + { + int qidx, sqe_tail, sqe_head; ++ struct otx2_snd_queue *sq; + u64 incr, *ptr, val; + int timeout = 1000; + + ptr = (u64 *)otx2_get_regaddr(pfvf, NIX_LF_SQ_OP_STATUS); +- for (qidx = 0; qidx < pfvf->hw.non_qos_queues; qidx++) { ++ for (qidx = 0; qidx < otx2_get_total_tx_queues(pfvf); qidx++) { ++ sq = &pfvf->qset.sq[qidx]; 
++ if (!sq->sqb_ptrs) ++ continue; ++ + incr = (u64)qidx << 32; + while (timeout) { + val = otx2_atomic64_add(incr, ptr); +@@ -899,7 +904,7 @@ int otx2_sq_aq_init(void *dev, u16 qidx, u16 sqb_aura) + return otx2_sync_mbox_msg(&pfvf->mbox); + } + +-static int otx2_sq_init(struct otx2_nic *pfvf, u16 qidx, u16 sqb_aura) ++int otx2_sq_init(struct otx2_nic *pfvf, u16 qidx, u16 sqb_aura) + { + struct otx2_qset *qset = &pfvf->qset; + struct otx2_snd_queue *sq; +@@ -972,9 +977,17 @@ static int otx2_cq_init(struct otx2_nic *pfvf, u16 qidx) + cq->cint_idx = qidx - pfvf->hw.rx_queues; + cq->cqe_cnt = qset->sqe_cnt; + } else { +- cq->cq_type = CQ_XDP; +- cq->cint_idx = qidx - non_xdp_queues; +- cq->cqe_cnt = qset->sqe_cnt; ++ if (pfvf->hw.xdp_queues && ++ qidx < non_xdp_queues + pfvf->hw.xdp_queues) { ++ cq->cq_type = CQ_XDP; ++ cq->cint_idx = qidx - non_xdp_queues; ++ cq->cqe_cnt = qset->sqe_cnt; ++ } else { ++ cq->cq_type = CQ_QOS; ++ cq->cint_idx = qidx - non_xdp_queues - ++ pfvf->hw.xdp_queues; ++ cq->cqe_cnt = qset->sqe_cnt; ++ } + } + cq->cqe_size = pfvf->qset.xqe_size; + +@@ -1132,7 +1145,7 @@ int otx2_config_nix(struct otx2_nic *pfvf) + + /* Set RQ/SQ/CQ counts */ + nixlf->rq_cnt = pfvf->hw.rx_queues; +- nixlf->sq_cnt = pfvf->hw.non_qos_queues; ++ nixlf->sq_cnt = otx2_get_total_tx_queues(pfvf); + nixlf->cq_cnt = pfvf->qset.cq_cnt; + nixlf->rss_sz = MAX_RSS_INDIR_TBL_SIZE; + nixlf->rss_grps = MAX_RSS_GROUPS; +@@ -1170,7 +1183,7 @@ void otx2_sq_free_sqbs(struct otx2_nic *pfvf) + int sqb, qidx; + u64 iova, pa; + +- for (qidx = 0; qidx < hw->non_qos_queues; qidx++) { ++ for (qidx = 0; qidx < otx2_get_total_tx_queues(pfvf); qidx++) { + sq = &qset->sq[qidx]; + if (!sq->sqb_ptrs) + continue; +@@ -1238,8 +1251,8 @@ void otx2_aura_pool_free(struct otx2_nic *pfvf) + pfvf->qset.pool = NULL; + } + +-static int otx2_aura_init(struct otx2_nic *pfvf, int aura_id, +- int pool_id, int numptrs) ++int otx2_aura_init(struct otx2_nic *pfvf, int aura_id, ++ int pool_id, int numptrs) + { + struct npa_aq_enq_req *aq; + struct otx2_pool *pool; +@@ -1315,8 +1328,8 @@ static int otx2_aura_init(struct otx2_nic *pfvf, int aura_id, + return 0; + } + +-static int otx2_pool_init(struct otx2_nic *pfvf, u16 pool_id, +- int stack_pages, int numptrs, int buf_size) ++int otx2_pool_init(struct otx2_nic *pfvf, u16 pool_id, ++ int stack_pages, int numptrs, int buf_size) + { + struct npa_aq_enq_req *aq; + struct otx2_pool *pool; +diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.h b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.h +index 6c81d09798914..876a7b51b8e51 100644 +--- a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.h ++++ b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.h +@@ -27,6 +27,7 @@ + #include "otx2_txrx.h" + #include "otx2_devlink.h" + #include ++#include "qos.h" + + /* PCI device IDs */ + #define PCI_DEVID_OCTEONTX2_RVU_PF 0xA063 +@@ -186,6 +187,7 @@ struct otx2_hw { + u16 rx_queues; + u16 tx_queues; + u16 xdp_queues; ++ u16 tc_tx_queues; + u16 non_qos_queues; /* tx queues plus xdp queues */ + u16 max_queues; + u16 pool_cnt; +@@ -498,6 +500,8 @@ struct otx2_nic { + u16 pfc_schq_list[NIX_TXSCH_LVL_CNT][MAX_TXSCHQ_PER_FUNC]; + bool pfc_alloc_status[NIX_PF_PFC_PRIO_MAX]; + #endif ++ /* qos */ ++ struct otx2_qos qos; + + /* napi event count. It is needed for adaptive irq coalescing. 
*/ + u32 napi_events; +@@ -742,8 +746,7 @@ static inline void cn10k_aura_freeptr(void *dev, int aura, u64 buf) + /* Alloc pointer from pool/aura */ + static inline u64 otx2_aura_allocptr(struct otx2_nic *pfvf, int aura) + { +- u64 *ptr = (u64 *)otx2_get_regaddr(pfvf, +- NPA_LF_AURA_OP_ALLOCX(0)); ++ u64 *ptr = (__force u64 *)otx2_get_regaddr(pfvf, NPA_LF_AURA_OP_ALLOCX(0)); + u64 incr = (u64)aura | BIT_ULL(63); + + return otx2_atomic64_add(incr, ptr); +@@ -885,12 +888,23 @@ static inline void otx2_dma_unmap_page(struct otx2_nic *pfvf, + + static inline u16 otx2_get_smq_idx(struct otx2_nic *pfvf, u16 qidx) + { ++ u16 smq; + #ifdef CONFIG_DCB + if (qidx < NIX_PF_PFC_PRIO_MAX && pfvf->pfc_alloc_status[qidx]) + return pfvf->pfc_schq_list[NIX_TXSCH_LVL_SMQ][qidx]; + #endif ++ /* check if qidx falls under QOS queues */ ++ if (qidx >= pfvf->hw.non_qos_queues) ++ smq = pfvf->qos.qid_to_sqmap[qidx - pfvf->hw.non_qos_queues]; ++ else ++ smq = pfvf->hw.txschq_list[NIX_TXSCH_LVL_SMQ][0]; + +- return pfvf->hw.txschq_list[NIX_TXSCH_LVL_SMQ][0]; ++ return smq; ++} ++ ++static inline u16 otx2_get_total_tx_queues(struct otx2_nic *pfvf) ++{ ++ return pfvf->hw.non_qos_queues + pfvf->hw.tc_tx_queues; + } + + /* MSI-X APIs */ +@@ -920,17 +934,22 @@ int otx2_txsch_alloc(struct otx2_nic *pfvf); + void otx2_txschq_stop(struct otx2_nic *pfvf); + void otx2_txschq_free_one(struct otx2_nic *pfvf, u16 lvl, u16 schq); + void otx2_sqb_flush(struct otx2_nic *pfvf); +-int __otx2_alloc_rbuf(struct otx2_nic *pfvf, struct otx2_pool *pool, +- dma_addr_t *dma); ++int otx2_alloc_rbuf(struct otx2_nic *pfvf, struct otx2_pool *pool, ++ dma_addr_t *dma); + int otx2_rxtx_enable(struct otx2_nic *pfvf, bool enable); + void otx2_ctx_disable(struct mbox *mbox, int type, bool npa); + int otx2_nix_config_bp(struct otx2_nic *pfvf, bool enable); + void otx2_cleanup_rx_cqes(struct otx2_nic *pfvf, struct otx2_cq_queue *cq); + void otx2_cleanup_tx_cqes(struct otx2_nic *pfvf, struct otx2_cq_queue *cq); ++int otx2_sq_init(struct otx2_nic *pfvf, u16 qidx, u16 sqb_aura); + int otx2_sq_aq_init(void *dev, u16 qidx, u16 sqb_aura); + int cn10k_sq_aq_init(void *dev, u16 qidx, u16 sqb_aura); + int otx2_alloc_buffer(struct otx2_nic *pfvf, struct otx2_cq_queue *cq, + dma_addr_t *dma); ++int otx2_pool_init(struct otx2_nic *pfvf, u16 pool_id, ++ int stack_pages, int numptrs, int buf_size); ++int otx2_aura_init(struct otx2_nic *pfvf, int aura_id, ++ int pool_id, int numptrs); + + /* RSS configuration APIs*/ + int otx2_rss_init(struct otx2_nic *pfvf); +@@ -1038,4 +1057,14 @@ static inline void cn10k_handle_mcs_event(struct otx2_nic *pfvf, + {} + #endif /* CONFIG_MACSEC */ + ++/* qos support */ ++static inline void otx2_qos_init(struct otx2_nic *pfvf, int qos_txqs) ++{ ++ struct otx2_hw *hw = &pfvf->hw; ++ ++ hw->tc_tx_queues = qos_txqs; ++} ++ ++u16 otx2_select_queue(struct net_device *netdev, struct sk_buff *skb, ++ struct net_device *sb_dev); + #endif /* OTX2_COMMON_H */ +diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_pf.c b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_pf.c +index 545984a86f235..c558c9b64f5be 100644 +--- a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_pf.c ++++ b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_pf.c +@@ -23,6 +23,7 @@ + #include "otx2_struct.h" + #include "otx2_ptp.h" + #include "cn10k.h" ++#include "qos.h" + #include + + #define DRV_NAME "rvu_nicpf" +@@ -1225,6 +1226,7 @@ static char *nix_snd_status_e_str[NIX_SND_STATUS_MAX] = { + static irqreturn_t otx2_q_intr_handler(int irq, void *data) + { + struct 
otx2_nic *pf = data; ++ struct otx2_snd_queue *sq; + u64 val, *ptr; + u64 qidx = 0; + +@@ -1256,10 +1258,14 @@ static irqreturn_t otx2_q_intr_handler(int irq, void *data) + } + + /* SQ */ +- for (qidx = 0; qidx < pf->hw.non_qos_queues; qidx++) { ++ for (qidx = 0; qidx < otx2_get_total_tx_queues(pf); qidx++) { + u64 sq_op_err_dbg, mnq_err_dbg, snd_err_dbg; + u8 sq_op_err_code, mnq_err_code, snd_err_code; + ++ sq = &pf->qset.sq[qidx]; ++ if (!sq->sqb_ptrs) ++ continue; ++ + /* Below debug registers captures first errors corresponding to + * those registers. We don't have to check against SQ qid as + * these are fatal errors. +@@ -1391,7 +1397,7 @@ static void otx2_free_sq_res(struct otx2_nic *pf) + otx2_ctx_disable(&pf->mbox, NIX_AQ_CTYPE_SQ, false); + /* Free SQB pointers */ + otx2_sq_free_sqbs(pf); +- for (qidx = 0; qidx < pf->hw.non_qos_queues; qidx++) { ++ for (qidx = 0; qidx < otx2_get_total_tx_queues(pf); qidx++) { + sq = &qset->sq[qidx]; + qmem_free(pf->dev, sq->sqe); + qmem_free(pf->dev, sq->tso_hdrs); +@@ -1441,7 +1447,7 @@ static int otx2_init_hw_resources(struct otx2_nic *pf) + * so, aura count = pool count. + */ + hw->rqpool_cnt = hw->rx_queues; +- hw->sqpool_cnt = hw->non_qos_queues; ++ hw->sqpool_cnt = otx2_get_total_tx_queues(pf); + hw->pool_cnt = hw->rqpool_cnt + hw->sqpool_cnt; + + /* Maximum hardware supported transmit length */ +@@ -1694,11 +1700,14 @@ int otx2_open(struct net_device *netdev) + + netif_carrier_off(netdev); + +- pf->qset.cq_cnt = pf->hw.rx_queues + pf->hw.non_qos_queues; + /* RQ and SQs are mapped to different CQs, + * so find out max CQ IRQs (i.e CINTs) needed. + */ +- pf->hw.cint_cnt = max(pf->hw.rx_queues, pf->hw.tx_queues); ++ pf->hw.cint_cnt = max3(pf->hw.rx_queues, pf->hw.tx_queues, ++ pf->hw.tc_tx_queues); ++ ++ pf->qset.cq_cnt = pf->hw.rx_queues + otx2_get_total_tx_queues(pf); ++ + qset->napi = kcalloc(pf->hw.cint_cnt, sizeof(*cq_poll), GFP_KERNEL); + if (!qset->napi) + return -ENOMEM; +@@ -1749,6 +1758,11 @@ int otx2_open(struct net_device *netdev) + else + cq_poll->cq_ids[CQ_XDP] = CINT_INVALID_CQ; + ++ cq_poll->cq_ids[CQ_QOS] = (qidx < pf->hw.tc_tx_queues) ? ++ (qidx + pf->hw.rx_queues + ++ pf->hw.non_qos_queues) : ++ CINT_INVALID_CQ; ++ + cq_poll->dev = (void *)pf; + cq_poll->dim.mode = DIM_CQ_PERIOD_MODE_START_FROM_CQE; + INIT_WORK(&cq_poll->dim.work, otx2_dim_work); +@@ -1953,6 +1967,12 @@ static netdev_tx_t otx2_xmit(struct sk_buff *skb, struct net_device *netdev) + int qidx = skb_get_queue_mapping(skb); + struct otx2_snd_queue *sq; + struct netdev_queue *txq; ++ int sq_idx; ++ ++ /* XDP SQs are not mapped with TXQs ++ * advance qid to derive correct sq mapped with QOS ++ */ ++ sq_idx = (qidx >= pf->hw.tx_queues) ? 
(qidx + pf->hw.xdp_queues) : qidx; + + /* Check for minimum and maximum packet length */ + if (skb->len <= ETH_HLEN || +@@ -1961,7 +1981,7 @@ static netdev_tx_t otx2_xmit(struct sk_buff *skb, struct net_device *netdev) + return NETDEV_TX_OK; + } + +- sq = &pf->qset.sq[qidx]; ++ sq = &pf->qset.sq[sq_idx]; + txq = netdev_get_tx_queue(netdev, qidx); + + if (!otx2_sq_append_skb(netdev, sq, skb, qidx)) { +@@ -1979,8 +1999,8 @@ static netdev_tx_t otx2_xmit(struct sk_buff *skb, struct net_device *netdev) + return NETDEV_TX_OK; + } + +-static u16 otx2_select_queue(struct net_device *netdev, struct sk_buff *skb, +- struct net_device *sb_dev) ++u16 otx2_select_queue(struct net_device *netdev, struct sk_buff *skb, ++ struct net_device *sb_dev) + { + #ifdef CONFIG_DCB + struct otx2_nic *pf = netdev_priv(netdev); +@@ -2002,6 +2022,7 @@ static u16 otx2_select_queue(struct net_device *netdev, struct sk_buff *skb, + #endif + return netdev_pick_tx(netdev, skb, NULL); + } ++EXPORT_SYMBOL(otx2_select_queue); + + static netdev_features_t otx2_fix_features(struct net_device *dev, + netdev_features_t features) +@@ -2715,10 +2736,10 @@ static void otx2_sriov_vfcfg_cleanup(struct otx2_nic *pf) + static int otx2_probe(struct pci_dev *pdev, const struct pci_device_id *id) + { + struct device *dev = &pdev->dev; ++ int err, qcount, qos_txqs; + struct net_device *netdev; + struct otx2_nic *pf; + struct otx2_hw *hw; +- int err, qcount; + int num_vec; + + err = pcim_enable_device(pdev); +@@ -2743,8 +2764,9 @@ static int otx2_probe(struct pci_dev *pdev, const struct pci_device_id *id) + + /* Set number of queues */ + qcount = min_t(int, num_online_cpus(), OTX2_MAX_CQ_CNT); ++ qos_txqs = min_t(int, qcount, OTX2_QOS_MAX_LEAF_NODES); + +- netdev = alloc_etherdev_mqs(sizeof(*pf), qcount, qcount); ++ netdev = alloc_etherdev_mqs(sizeof(*pf), qcount + qos_txqs, qcount); + if (!netdev) { + err = -ENOMEM; + goto err_release_regions; +@@ -2931,6 +2953,8 @@ static int otx2_probe(struct pci_dev *pdev, const struct pci_device_id *id) + goto err_pf_sriov_init; + #endif + ++ otx2_qos_init(pf, qos_txqs); ++ + return 0; + + err_pf_sriov_init: +diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_txrx.c b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_txrx.c +index 5704fb75fa477..d005434e1e037 100644 +--- a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_txrx.c ++++ b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_txrx.c +@@ -468,12 +468,13 @@ static int otx2_tx_napi_handler(struct otx2_nic *pfvf, + break; + } + +- if (cq->cq_type == CQ_XDP) { ++ qidx = cq->cq_idx - pfvf->hw.rx_queues; ++ ++ if (cq->cq_type == CQ_XDP) + otx2_xdp_snd_pkt_handler(pfvf, sq, cqe); +- } else { +- otx2_snd_pkt_handler(pfvf, cq, sq, cqe, budget, +- &tx_pkts, &tx_bytes); +- } ++ else ++ otx2_snd_pkt_handler(pfvf, cq, &pfvf->qset.sq[qidx], ++ cqe, budget, &tx_pkts, &tx_bytes); + + cqe->hdr.cqe_type = NIX_XQE_TYPE_INVALID; + processed_cqe++; +@@ -490,7 +491,11 @@ static int otx2_tx_napi_handler(struct otx2_nic *pfvf, + if (likely(tx_pkts)) { + struct netdev_queue *txq; + +- txq = netdev_get_tx_queue(pfvf->netdev, cq->cint_idx); ++ qidx = cq->cq_idx - pfvf->hw.rx_queues; ++ ++ if (qidx >= pfvf->hw.tx_queues) ++ qidx -= pfvf->hw.xdp_queues; ++ txq = netdev_get_tx_queue(pfvf->netdev, qidx); + netdev_tx_completed_queue(txq, tx_pkts, tx_bytes); + /* Check if queue was stopped earlier due to ring full */ + smp_mb(); +@@ -738,7 +743,8 @@ static void otx2_sqe_add_hdr(struct otx2_nic *pfvf, struct otx2_snd_queue *sq, + sqe_hdr->aura = sq->aura_id; + /* Post a CQE 
Tx after pkt transmission */ + sqe_hdr->pnc = 1; +- sqe_hdr->sq = qidx; ++ sqe_hdr->sq = (qidx >= pfvf->hw.tx_queues) ? ++ qidx + pfvf->hw.xdp_queues : qidx; + } + sqe_hdr->total = skb->len; + /* Set SQE identifier which will be used later for freeing SKB */ +@@ -1223,8 +1229,10 @@ void otx2_cleanup_tx_cqes(struct otx2_nic *pfvf, struct otx2_cq_queue *cq) + struct nix_cqe_tx_s *cqe; + int processed_cqe = 0; + struct sg_list *sg; ++ int qidx; + +- sq = &pfvf->qset.sq[cq->cint_idx]; ++ qidx = cq->cq_idx - pfvf->hw.rx_queues; ++ sq = &pfvf->qset.sq[qidx]; + + if (otx2_nix_cq_op_status(pfvf, cq) || !cq->pend_cqe) + return; +diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_txrx.h b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_txrx.h +index 93cac2c2664c2..7ab6db9a986fa 100644 +--- a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_txrx.h ++++ b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_txrx.h +@@ -102,7 +102,8 @@ enum cq_type { + CQ_RX, + CQ_TX, + CQ_XDP, +- CQS_PER_CINT = 3, /* RQ + SQ + XDP */ ++ CQ_QOS, ++ CQS_PER_CINT = 4, /* RQ + SQ + XDP + QOS_SQ */ + }; + + struct otx2_cq_poll { +diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_vf.c b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_vf.c +index ad90f8f2aad1f..404855bccb4b6 100644 +--- a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_vf.c ++++ b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_vf.c +@@ -475,6 +475,7 @@ static const struct net_device_ops otx2vf_netdev_ops = { + .ndo_open = otx2vf_open, + .ndo_stop = otx2vf_stop, + .ndo_start_xmit = otx2vf_xmit, ++ .ndo_select_queue = otx2_select_queue, + .ndo_set_rx_mode = otx2vf_set_rx_mode, + .ndo_set_mac_address = otx2_set_mac_address, + .ndo_change_mtu = otx2vf_change_mtu, +@@ -520,10 +521,10 @@ static int otx2vf_probe(struct pci_dev *pdev, const struct pci_device_id *id) + { + int num_vec = pci_msix_vec_count(pdev); + struct device *dev = &pdev->dev; ++ int err, qcount, qos_txqs; + struct net_device *netdev; + struct otx2_nic *vf; + struct otx2_hw *hw; +- int err, qcount; + + err = pcim_enable_device(pdev); + if (err) { +@@ -546,7 +547,8 @@ static int otx2vf_probe(struct pci_dev *pdev, const struct pci_device_id *id) + pci_set_master(pdev); + + qcount = num_online_cpus(); +- netdev = alloc_etherdev_mqs(sizeof(*vf), qcount, qcount); ++ qos_txqs = min_t(int, qcount, OTX2_QOS_MAX_LEAF_NODES); ++ netdev = alloc_etherdev_mqs(sizeof(*vf), qcount + qos_txqs, qcount); + if (!netdev) { + err = -ENOMEM; + goto err_release_regions; +@@ -695,6 +697,7 @@ static int otx2vf_probe(struct pci_dev *pdev, const struct pci_device_id *id) + if (err) + goto err_shutdown_tc; + #endif ++ otx2_qos_init(vf, qos_txqs); + + return 0; + +diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/qos.h b/drivers/net/ethernet/marvell/octeontx2/nic/qos.h +new file mode 100644 +index 0000000000000..73a62d092e99a +--- /dev/null ++++ b/drivers/net/ethernet/marvell/octeontx2/nic/qos.h +@@ -0,0 +1,19 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++/* Marvell RVU Ethernet driver ++ * ++ * Copyright (C) 2023 Marvell. 
++ * ++ */ ++#ifndef OTX2_QOS_H ++#define OTX2_QOS_H ++ ++#define OTX2_QOS_MAX_LEAF_NODES 16 ++ ++int otx2_qos_enable_sq(struct otx2_nic *pfvf, int qidx, u16 smq); ++void otx2_qos_disable_sq(struct otx2_nic *pfvf, int qidx, u16 mdq); ++ ++struct otx2_qos { ++ u16 qid_to_sqmap[OTX2_QOS_MAX_LEAF_NODES]; ++ }; ++ ++#endif +diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/qos_sq.c b/drivers/net/ethernet/marvell/octeontx2/nic/qos_sq.c +new file mode 100644 +index 0000000000000..e142d43f5a62c +--- /dev/null ++++ b/drivers/net/ethernet/marvell/octeontx2/nic/qos_sq.c +@@ -0,0 +1,282 @@ ++// SPDX-License-Identifier: GPL-2.0 ++/* Marvell RVU Physical Function ethernet driver ++ * ++ * Copyright (C) 2023 Marvell. ++ * ++ */ ++ ++#include ++#include ++ ++#include "cn10k.h" ++#include "otx2_reg.h" ++#include "otx2_common.h" ++#include "otx2_txrx.h" ++#include "otx2_struct.h" ++ ++#define OTX2_QOS_MAX_LEAF_NODES 16 ++ ++static void otx2_qos_aura_pool_free(struct otx2_nic *pfvf, int pool_id) ++{ ++ struct otx2_pool *pool; ++ ++ if (!pfvf->qset.pool) ++ return; ++ ++ pool = &pfvf->qset.pool[pool_id]; ++ qmem_free(pfvf->dev, pool->stack); ++ qmem_free(pfvf->dev, pool->fc_addr); ++ pool->stack = NULL; ++ pool->fc_addr = NULL; ++} ++ ++static int otx2_qos_sq_aura_pool_init(struct otx2_nic *pfvf, int qidx) ++{ ++ struct otx2_qset *qset = &pfvf->qset; ++ int pool_id, stack_pages, num_sqbs; ++ struct otx2_hw *hw = &pfvf->hw; ++ struct otx2_snd_queue *sq; ++ struct otx2_pool *pool; ++ dma_addr_t bufptr; ++ int err, ptr; ++ u64 iova, pa; ++ ++ /* Calculate number of SQBs needed. ++ * ++ * For a 128byte SQE, and 4K size SQB, 31 SQEs will fit in one SQB. ++ * Last SQE is used for pointing to next SQB. ++ */ ++ num_sqbs = (hw->sqb_size / 128) - 1; ++ num_sqbs = (qset->sqe_cnt + num_sqbs) / num_sqbs; ++ ++ /* Get no of stack pages needed */ ++ stack_pages = ++ (num_sqbs + hw->stack_pg_ptrs - 1) / hw->stack_pg_ptrs; ++ ++ pool_id = otx2_get_pool_idx(pfvf, AURA_NIX_SQ, qidx); ++ pool = &pfvf->qset.pool[pool_id]; ++ ++ /* Initialize aura context */ ++ err = otx2_aura_init(pfvf, pool_id, pool_id, num_sqbs); ++ if (err) ++ return err; ++ ++ /* Initialize pool context */ ++ err = otx2_pool_init(pfvf, pool_id, stack_pages, ++ num_sqbs, hw->sqb_size); ++ if (err) ++ goto aura_free; ++ ++ /* Flush accumulated messages */ ++ err = otx2_sync_mbox_msg(&pfvf->mbox); ++ if (err) ++ goto pool_free; ++ ++ /* Allocate pointers and free them to aura/pool */ ++ sq = &qset->sq[qidx]; ++ sq->sqb_count = 0; ++ sq->sqb_ptrs = kcalloc(num_sqbs, sizeof(*sq->sqb_ptrs), GFP_KERNEL); ++ if (!sq->sqb_ptrs) { ++ err = -ENOMEM; ++ goto pool_free; ++ } ++ ++ for (ptr = 0; ptr < num_sqbs; ptr++) { ++ err = otx2_alloc_rbuf(pfvf, pool, &bufptr); ++ if (err) ++ goto sqb_free; ++ pfvf->hw_ops->aura_freeptr(pfvf, pool_id, bufptr); ++ sq->sqb_ptrs[sq->sqb_count++] = (u64)bufptr; ++ } ++ ++ return 0; ++ ++sqb_free: ++ while (ptr--) { ++ if (!sq->sqb_ptrs[ptr]) ++ continue; ++ iova = sq->sqb_ptrs[ptr]; ++ pa = otx2_iova_to_phys(pfvf->iommu_domain, iova); ++ dma_unmap_page_attrs(pfvf->dev, iova, hw->sqb_size, ++ DMA_FROM_DEVICE, ++ DMA_ATTR_SKIP_CPU_SYNC); ++ put_page(virt_to_page(phys_to_virt(pa))); ++ otx2_aura_allocptr(pfvf, pool_id); ++ } ++ sq->sqb_count = 0; ++ kfree(sq->sqb_ptrs); ++pool_free: ++ qmem_free(pfvf->dev, pool->stack); ++aura_free: ++ qmem_free(pfvf->dev, pool->fc_addr); ++ otx2_mbox_reset(&pfvf->mbox.mbox, 0); ++ return err; ++} ++ ++static void otx2_qos_sq_free_sqbs(struct otx2_nic *pfvf, int qidx) ++{ ++ struct otx2_qset *qset = 
&pfvf->qset; ++ struct otx2_hw *hw = &pfvf->hw; ++ struct otx2_snd_queue *sq; ++ u64 iova, pa; ++ int sqb; ++ ++ sq = &qset->sq[qidx]; ++ if (!sq->sqb_ptrs) ++ return; ++ for (sqb = 0; sqb < sq->sqb_count; sqb++) { ++ if (!sq->sqb_ptrs[sqb]) ++ continue; ++ iova = sq->sqb_ptrs[sqb]; ++ pa = otx2_iova_to_phys(pfvf->iommu_domain, iova); ++ dma_unmap_page_attrs(pfvf->dev, iova, hw->sqb_size, ++ DMA_FROM_DEVICE, ++ DMA_ATTR_SKIP_CPU_SYNC); ++ put_page(virt_to_page(phys_to_virt(pa))); ++ } ++ ++ sq->sqb_count = 0; ++ ++ sq = &qset->sq[qidx]; ++ qmem_free(pfvf->dev, sq->sqe); ++ qmem_free(pfvf->dev, sq->tso_hdrs); ++ kfree(sq->sg); ++ kfree(sq->sqb_ptrs); ++ qmem_free(pfvf->dev, sq->timestamps); ++ ++ memset((void *)sq, 0, sizeof(*sq)); ++} ++ ++/* send queue id */ ++static void otx2_qos_sqb_flush(struct otx2_nic *pfvf, int qidx) ++{ ++ int sqe_tail, sqe_head; ++ u64 incr, *ptr, val; ++ ++ ptr = (__force u64 *)otx2_get_regaddr(pfvf, NIX_LF_SQ_OP_STATUS); ++ incr = (u64)qidx << 32; ++ val = otx2_atomic64_add(incr, ptr); ++ sqe_head = (val >> 20) & 0x3F; ++ sqe_tail = (val >> 28) & 0x3F; ++ if (sqe_head != sqe_tail) ++ usleep_range(50, 60); ++} ++ ++static int otx2_qos_ctx_disable(struct otx2_nic *pfvf, u16 qidx, int aura_id) ++{ ++ struct nix_cn10k_aq_enq_req *cn10k_sq_aq; ++ struct npa_aq_enq_req *aura_aq; ++ struct npa_aq_enq_req *pool_aq; ++ struct nix_aq_enq_req *sq_aq; ++ ++ if (test_bit(CN10K_LMTST, &pfvf->hw.cap_flag)) { ++ cn10k_sq_aq = otx2_mbox_alloc_msg_nix_cn10k_aq_enq(&pfvf->mbox); ++ if (!cn10k_sq_aq) ++ return -ENOMEM; ++ cn10k_sq_aq->qidx = qidx; ++ cn10k_sq_aq->sq.ena = 0; ++ cn10k_sq_aq->sq_mask.ena = 1; ++ cn10k_sq_aq->ctype = NIX_AQ_CTYPE_SQ; ++ cn10k_sq_aq->op = NIX_AQ_INSTOP_WRITE; ++ } else { ++ sq_aq = otx2_mbox_alloc_msg_nix_aq_enq(&pfvf->mbox); ++ if (!sq_aq) ++ return -ENOMEM; ++ sq_aq->qidx = qidx; ++ sq_aq->sq.ena = 0; ++ sq_aq->sq_mask.ena = 1; ++ sq_aq->ctype = NIX_AQ_CTYPE_SQ; ++ sq_aq->op = NIX_AQ_INSTOP_WRITE; ++ } ++ ++ aura_aq = otx2_mbox_alloc_msg_npa_aq_enq(&pfvf->mbox); ++ if (!aura_aq) { ++ otx2_mbox_reset(&pfvf->mbox.mbox, 0); ++ return -ENOMEM; ++ } ++ ++ aura_aq->aura_id = aura_id; ++ aura_aq->aura.ena = 0; ++ aura_aq->aura_mask.ena = 1; ++ aura_aq->ctype = NPA_AQ_CTYPE_AURA; ++ aura_aq->op = NPA_AQ_INSTOP_WRITE; ++ ++ pool_aq = otx2_mbox_alloc_msg_npa_aq_enq(&pfvf->mbox); ++ if (!pool_aq) { ++ otx2_mbox_reset(&pfvf->mbox.mbox, 0); ++ return -ENOMEM; ++ } ++ ++ pool_aq->aura_id = aura_id; ++ pool_aq->pool.ena = 0; ++ pool_aq->pool_mask.ena = 1; ++ ++ pool_aq->ctype = NPA_AQ_CTYPE_POOL; ++ pool_aq->op = NPA_AQ_INSTOP_WRITE; ++ ++ return otx2_sync_mbox_msg(&pfvf->mbox); ++} ++ ++int otx2_qos_enable_sq(struct otx2_nic *pfvf, int qidx, u16 smq) ++{ ++ struct otx2_hw *hw = &pfvf->hw; ++ int pool_id, sq_idx, err; ++ ++ if (pfvf->flags & OTX2_FLAG_INTF_DOWN) ++ return -EPERM; ++ ++ sq_idx = hw->non_qos_queues + qidx; ++ ++ mutex_lock(&pfvf->mbox.lock); ++ err = otx2_qos_sq_aura_pool_init(pfvf, sq_idx); ++ if (err) ++ goto out; ++ ++ pool_id = otx2_get_pool_idx(pfvf, AURA_NIX_SQ, sq_idx); ++ pfvf->qos.qid_to_sqmap[qidx] = smq; ++ err = otx2_sq_init(pfvf, sq_idx, pool_id); ++ if (err) ++ goto out; ++out: ++ mutex_unlock(&pfvf->mbox.lock); ++ return err; ++} ++ ++void otx2_qos_disable_sq(struct otx2_nic *pfvf, int qidx, u16 mdq) ++{ ++ struct otx2_qset *qset = &pfvf->qset; ++ struct otx2_hw *hw = &pfvf->hw; ++ struct otx2_snd_queue *sq; ++ struct otx2_cq_queue *cq; ++ int pool_id, sq_idx; ++ ++ sq_idx = hw->non_qos_queues + qidx; ++ ++ /* If the DOWN flag is set 
SQs are already freed */ ++ if (pfvf->flags & OTX2_FLAG_INTF_DOWN) ++ return; ++ ++ sq = &pfvf->qset.sq[sq_idx]; ++ if (!sq->sqb_ptrs) ++ return; ++ ++ if (sq_idx < hw->non_qos_queues || ++ sq_idx >= otx2_get_total_tx_queues(pfvf)) { ++ netdev_err(pfvf->netdev, "Send Queue is not a QoS queue\n"); ++ return; ++ } ++ ++ cq = &qset->cq[pfvf->hw.rx_queues + sq_idx]; ++ pool_id = otx2_get_pool_idx(pfvf, AURA_NIX_SQ, sq_idx); ++ ++ otx2_qos_sqb_flush(pfvf, sq_idx); ++ otx2_smq_flush(pfvf, otx2_get_smq_idx(pfvf, sq_idx)); ++ otx2_cleanup_tx_cqes(pfvf, cq); ++ ++ mutex_lock(&pfvf->mbox.lock); ++ otx2_qos_ctx_disable(pfvf, sq_idx, pool_id); ++ mutex_unlock(&pfvf->mbox.lock); ++ ++ otx2_qos_sq_free_sqbs(pfvf, sq_idx); ++ otx2_qos_aura_pool_free(pfvf, pool_id); ++} +-- +2.42.0 + diff --git a/queue-6.1/octeontx2-pf-rename-tot_tx_queues-to-non_qos_queues.patch b/queue-6.1/octeontx2-pf-rename-tot_tx_queues-to-non_qos_queues.patch new file mode 100644 index 00000000000..40ba57237a6 --- /dev/null +++ b/queue-6.1/octeontx2-pf-rename-tot_tx_queues-to-non_qos_queues.patch @@ -0,0 +1,184 @@ +From e00182d3cffef2a3d2f81c12a80094332e4d9a8b Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Sat, 13 May 2023 14:21:37 +0530 +Subject: octeontx2-pf: Rename tot_tx_queues to non_qos_queues + +From: Hariprasad Kelam + +[ Upstream commit 508c58f76ca510956625c945f9b8eb104f2c8208 ] + +current implementation is such that tot_tx_queues contains both +xdp queues and normal tx queues. which will be allocated in interface +open calls and deallocated on interface down calls respectively. + +With addition of QOS, where send quees are allocated/deallacated upon +user request Qos send queues won't be part of tot_tx_queues. So this +patch renames tot_tx_queues to non_qos_queues. + +Signed-off-by: Hariprasad Kelam +Reviewed-by: Simon Horman +Reviewed-by: Jacob Keller +Signed-off-by: David S. 
Miller +Stable-dep-of: 3423ca23e08b ("octeontx2-pf: Free pending and dropped SQEs") +Signed-off-by: Sasha Levin +--- + .../ethernet/marvell/octeontx2/nic/otx2_common.c | 12 ++++++------ + .../ethernet/marvell/octeontx2/nic/otx2_common.h | 2 +- + .../net/ethernet/marvell/octeontx2/nic/otx2_pf.c | 14 +++++++------- + .../net/ethernet/marvell/octeontx2/nic/otx2_vf.c | 2 +- + 4 files changed, 15 insertions(+), 15 deletions(-) + +diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.c b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.c +index 011355e73696e..2575c207150e1 100644 +--- a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.c ++++ b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.c +@@ -799,7 +799,7 @@ void otx2_sqb_flush(struct otx2_nic *pfvf) + int timeout = 1000; + + ptr = (u64 *)otx2_get_regaddr(pfvf, NIX_LF_SQ_OP_STATUS); +- for (qidx = 0; qidx < pfvf->hw.tot_tx_queues; qidx++) { ++ for (qidx = 0; qidx < pfvf->hw.non_qos_queues; qidx++) { + incr = (u64)qidx << 32; + while (timeout) { + val = otx2_atomic64_add(incr, ptr); +@@ -1085,7 +1085,7 @@ int otx2_config_nix_queues(struct otx2_nic *pfvf) + } + + /* Initialize TX queues */ +- for (qidx = 0; qidx < pfvf->hw.tot_tx_queues; qidx++) { ++ for (qidx = 0; qidx < pfvf->hw.non_qos_queues; qidx++) { + u16 sqb_aura = otx2_get_pool_idx(pfvf, AURA_NIX_SQ, qidx); + + err = otx2_sq_init(pfvf, qidx, sqb_aura); +@@ -1132,7 +1132,7 @@ int otx2_config_nix(struct otx2_nic *pfvf) + + /* Set RQ/SQ/CQ counts */ + nixlf->rq_cnt = pfvf->hw.rx_queues; +- nixlf->sq_cnt = pfvf->hw.tot_tx_queues; ++ nixlf->sq_cnt = pfvf->hw.non_qos_queues; + nixlf->cq_cnt = pfvf->qset.cq_cnt; + nixlf->rss_sz = MAX_RSS_INDIR_TBL_SIZE; + nixlf->rss_grps = MAX_RSS_GROUPS; +@@ -1170,7 +1170,7 @@ void otx2_sq_free_sqbs(struct otx2_nic *pfvf) + int sqb, qidx; + u64 iova, pa; + +- for (qidx = 0; qidx < hw->tot_tx_queues; qidx++) { ++ for (qidx = 0; qidx < hw->non_qos_queues; qidx++) { + sq = &qset->sq[qidx]; + if (!sq->sqb_ptrs) + continue; +@@ -1386,7 +1386,7 @@ int otx2_sq_aura_pool_init(struct otx2_nic *pfvf) + stack_pages = + (num_sqbs + hw->stack_pg_ptrs - 1) / hw->stack_pg_ptrs; + +- for (qidx = 0; qidx < hw->tot_tx_queues; qidx++) { ++ for (qidx = 0; qidx < hw->non_qos_queues; qidx++) { + pool_id = otx2_get_pool_idx(pfvf, AURA_NIX_SQ, qidx); + /* Initialize aura context */ + err = otx2_aura_init(pfvf, pool_id, pool_id, num_sqbs); +@@ -1406,7 +1406,7 @@ int otx2_sq_aura_pool_init(struct otx2_nic *pfvf) + goto fail; + + /* Allocate pointers and free them to aura/pool */ +- for (qidx = 0; qidx < hw->tot_tx_queues; qidx++) { ++ for (qidx = 0; qidx < hw->non_qos_queues; qidx++) { + pool_id = otx2_get_pool_idx(pfvf, AURA_NIX_SQ, qidx); + pool = &pfvf->qset.pool[pool_id]; + +diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.h b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.h +index 8a9793b06769f..6c81d09798914 100644 +--- a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.h ++++ b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_common.h +@@ -186,7 +186,7 @@ struct otx2_hw { + u16 rx_queues; + u16 tx_queues; + u16 xdp_queues; +- u16 tot_tx_queues; ++ u16 non_qos_queues; /* tx queues plus xdp queues */ + u16 max_queues; + u16 pool_cnt; + u16 rqpool_cnt; +diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_pf.c b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_pf.c +index 101d79a0bb436..545984a86f235 100644 +--- a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_pf.c ++++ 
b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_pf.c +@@ -1256,7 +1256,7 @@ static irqreturn_t otx2_q_intr_handler(int irq, void *data) + } + + /* SQ */ +- for (qidx = 0; qidx < pf->hw.tot_tx_queues; qidx++) { ++ for (qidx = 0; qidx < pf->hw.non_qos_queues; qidx++) { + u64 sq_op_err_dbg, mnq_err_dbg, snd_err_dbg; + u8 sq_op_err_code, mnq_err_code, snd_err_code; + +@@ -1391,7 +1391,7 @@ static void otx2_free_sq_res(struct otx2_nic *pf) + otx2_ctx_disable(&pf->mbox, NIX_AQ_CTYPE_SQ, false); + /* Free SQB pointers */ + otx2_sq_free_sqbs(pf); +- for (qidx = 0; qidx < pf->hw.tot_tx_queues; qidx++) { ++ for (qidx = 0; qidx < pf->hw.non_qos_queues; qidx++) { + sq = &qset->sq[qidx]; + qmem_free(pf->dev, sq->sqe); + qmem_free(pf->dev, sq->tso_hdrs); +@@ -1441,7 +1441,7 @@ static int otx2_init_hw_resources(struct otx2_nic *pf) + * so, aura count = pool count. + */ + hw->rqpool_cnt = hw->rx_queues; +- hw->sqpool_cnt = hw->tot_tx_queues; ++ hw->sqpool_cnt = hw->non_qos_queues; + hw->pool_cnt = hw->rqpool_cnt + hw->sqpool_cnt; + + /* Maximum hardware supported transmit length */ +@@ -1694,7 +1694,7 @@ int otx2_open(struct net_device *netdev) + + netif_carrier_off(netdev); + +- pf->qset.cq_cnt = pf->hw.rx_queues + pf->hw.tot_tx_queues; ++ pf->qset.cq_cnt = pf->hw.rx_queues + pf->hw.non_qos_queues; + /* RQ and SQs are mapped to different CQs, + * so find out max CQ IRQs (i.e CINTs) needed. + */ +@@ -1714,7 +1714,7 @@ int otx2_open(struct net_device *netdev) + if (!qset->cq) + goto err_free_mem; + +- qset->sq = kcalloc(pf->hw.tot_tx_queues, ++ qset->sq = kcalloc(pf->hw.non_qos_queues, + sizeof(struct otx2_snd_queue), GFP_KERNEL); + if (!qset->sq) + goto err_free_mem; +@@ -2532,7 +2532,7 @@ static int otx2_xdp_setup(struct otx2_nic *pf, struct bpf_prog *prog) + else + pf->hw.xdp_queues = 0; + +- pf->hw.tot_tx_queues += pf->hw.xdp_queues; ++ pf->hw.non_qos_queues += pf->hw.xdp_queues; + + if (if_up) + otx2_open(pf->netdev); +@@ -2763,7 +2763,7 @@ static int otx2_probe(struct pci_dev *pdev, const struct pci_device_id *id) + hw->pdev = pdev; + hw->rx_queues = qcount; + hw->tx_queues = qcount; +- hw->tot_tx_queues = qcount; ++ hw->non_qos_queues = qcount; + hw->max_queues = qcount; + hw->rbuf_len = OTX2_DEFAULT_RBUF_LEN; + /* Use CQE of 128 byte descriptor size by default */ +diff --git a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_vf.c b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_vf.c +index f8f0c01f62a14..ad90f8f2aad1f 100644 +--- a/drivers/net/ethernet/marvell/octeontx2/nic/otx2_vf.c ++++ b/drivers/net/ethernet/marvell/octeontx2/nic/otx2_vf.c +@@ -566,7 +566,7 @@ static int otx2vf_probe(struct pci_dev *pdev, const struct pci_device_id *id) + hw->rx_queues = qcount; + hw->tx_queues = qcount; + hw->max_queues = qcount; +- hw->tot_tx_queues = qcount; ++ hw->non_qos_queues = qcount; + hw->rbuf_len = OTX2_DEFAULT_RBUF_LEN; + /* Use CQE of 128 byte descriptor size by default */ + hw->xqe_size = 128; +-- +2.42.0 + diff --git a/queue-6.1/pwm-brcmstb-utilize-appropriate-clock-apis-in-suspen.patch b/queue-6.1/pwm-brcmstb-utilize-appropriate-clock-apis-in-suspen.patch new file mode 100644 index 00000000000..1d397d70b07 --- /dev/null +++ b/queue-6.1/pwm-brcmstb-utilize-appropriate-clock-apis-in-suspen.patch @@ -0,0 +1,51 @@ +From 4034c635c55102451912c2907d82937d7443bef6 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Wed, 4 Oct 2023 10:54:14 -0700 +Subject: pwm: brcmstb: Utilize appropriate clock APIs in suspend/resume +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 
8bit + +From: Florian Fainelli + +[ Upstream commit e9bc4411548aaa738905d37851a0146c16b3bb21 ] + +The suspend/resume functions currently utilize +clk_disable()/clk_enable() respectively which may be no-ops with certain +clock providers such as SCMI. Fix this to use clk_disable_unprepare() +and clk_prepare_enable() respectively as we should. + +Fixes: 3a9f5957020f ("pwm: Add Broadcom BCM7038 PWM controller support") +Signed-off-by: Florian Fainelli +Acked-by: Uwe Kleine-König +Signed-off-by: Thierry Reding +Signed-off-by: Sasha Levin +--- + drivers/pwm/pwm-brcmstb.c | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +diff --git a/drivers/pwm/pwm-brcmstb.c b/drivers/pwm/pwm-brcmstb.c +index 3db3f96edf78d..6afd34d651c77 100644 +--- a/drivers/pwm/pwm-brcmstb.c ++++ b/drivers/pwm/pwm-brcmstb.c +@@ -290,7 +290,7 @@ static int brcmstb_pwm_suspend(struct device *dev) + { + struct brcmstb_pwm *p = dev_get_drvdata(dev); + +- clk_disable(p->clk); ++ clk_disable_unprepare(p->clk); + + return 0; + } +@@ -299,7 +299,7 @@ static int brcmstb_pwm_resume(struct device *dev) + { + struct brcmstb_pwm *p = dev_get_drvdata(dev); + +- clk_enable(p->clk); ++ clk_prepare_enable(p->clk); + + return 0; + } +-- +2.42.0 + diff --git a/queue-6.1/pwm-sti-reduce-number-of-allocations-and-drop-usage-.patch b/queue-6.1/pwm-sti-reduce-number-of-allocations-and-drop-usage-.patch new file mode 100644 index 00000000000..ab1ae5f76a8 --- /dev/null +++ b/queue-6.1/pwm-sti-reduce-number-of-allocations-and-drop-usage-.patch @@ -0,0 +1,115 @@ +From f21ab36aee4e32227e5dc76c74ef752c2193b133 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Wed, 5 Jul 2023 10:06:48 +0200 +Subject: pwm: sti: Reduce number of allocations and drop usage of chip_data +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +From: Uwe Kleine-König + +[ Upstream commit 2d6812b41e0d832919d72c72ebddf361df53ba1b ] + +Instead of using one allocation per capture channel, use a single one. Also +store it in driver data instead of chip data. + +This has several advantages: + + - driver data isn't cleared when pwm_put() is called + - Reduces memory fragmentation + +Also register the pwm chip only after the per capture channel data is +initialized as the capture callback relies on this initialization and it +might be called even before pwmchip_add() returns. + +It would be still better to have struct sti_pwm_compat_data and the +per-channel data struct sti_cpt_ddata in a single memory chunk, but that's +not easily possible because the number of capture channels isn't known yet +when the driver data struct is allocated. 
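[ Editor's illustration, not part of the patch: a minimal standalone C sketch of the ordering pattern described above - allocate all per-channel state in a single block and initialize it before the chip is published to code that may use it immediately. All names here (cpt_channel, chip, register_chip, probe) are hypothetical stand-ins, not the driver's actual API. ]

  #include <stdio.h>
  #include <stdlib.h>

  /* Hypothetical per-capture-channel state, in the spirit of sti_cpt_ddata. */
  struct cpt_channel {
          int gpio;
          unsigned int last_period;
  };

  struct chip {
          unsigned int num_channels;
          struct cpt_channel *channels;   /* one allocation for all channels */
  };

  static int register_chip(struct chip *c)
  {
          /* Stand-in for pwmchip_add(): callbacks may fire as soon as this
           * returns, so per-channel state must already be initialized. */
          printf("chip registered with %u channels\n", c->num_channels);
          return 0;
  }

  static int probe(struct chip *c, unsigned int num_channels)
  {
          unsigned int i;

          c->num_channels = num_channels;
          /* One allocation for the whole array instead of one per channel. */
          c->channels = calloc(num_channels, sizeof(*c->channels));
          if (!c->channels)
                  return -1;

          /* Initialize every channel before the chip becomes visible. */
          for (i = 0; i < num_channels; i++)
                  c->channels[i].gpio = -1;

          /* Only now publish the chip. */
          return register_chip(c);
  }

  int main(void)
  {
          struct chip c;

          if (probe(&c, 4))
                  return 1;
          free(c.channels);
          return 0;
  }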
+ +Fixes: e926b12c611c ("pwm: Clear chip_data in pwm_put()") +Reported-by: George Stark +Fixes: c97267ae831d ("pwm: sti: Add PWM capture callback") +Link: https://lore.kernel.org/r/20230705080650.2353391-7-u.kleine-koenig@pengutronix.de +Signed-off-by: Uwe Kleine-König +Signed-off-by: Thierry Reding +Signed-off-by: Sasha Levin +--- + drivers/pwm/pwm-sti.c | 29 ++++++++++++++--------------- + 1 file changed, 14 insertions(+), 15 deletions(-) + +diff --git a/drivers/pwm/pwm-sti.c b/drivers/pwm/pwm-sti.c +index 44b1f93256b36..652fdb8dc7bfa 100644 +--- a/drivers/pwm/pwm-sti.c ++++ b/drivers/pwm/pwm-sti.c +@@ -79,6 +79,7 @@ struct sti_pwm_compat_data { + unsigned int cpt_num_devs; + unsigned int max_pwm_cnt; + unsigned int max_prescale; ++ struct sti_cpt_ddata *ddata; + }; + + struct sti_pwm_chip { +@@ -314,7 +315,7 @@ static int sti_pwm_capture(struct pwm_chip *chip, struct pwm_device *pwm, + { + struct sti_pwm_chip *pc = to_sti_pwmchip(chip); + struct sti_pwm_compat_data *cdata = pc->cdata; +- struct sti_cpt_ddata *ddata = pwm_get_chip_data(pwm); ++ struct sti_cpt_ddata *ddata = &cdata->ddata[pwm->hwpwm]; + struct device *dev = pc->dev; + unsigned int effective_ticks; + unsigned long long high, low; +@@ -440,7 +441,7 @@ static irqreturn_t sti_pwm_interrupt(int irq, void *data) + while (cpt_int_stat) { + devicenum = ffs(cpt_int_stat) - 1; + +- ddata = pwm_get_chip_data(&pc->chip.pwms[devicenum]); ++ ddata = &pc->cdata->ddata[devicenum]; + + /* + * Capture input: +@@ -638,30 +639,28 @@ static int sti_pwm_probe(struct platform_device *pdev) + dev_err(dev, "failed to prepare clock\n"); + return ret; + } ++ ++ cdata->ddata = devm_kzalloc(dev, cdata->cpt_num_devs * sizeof(*cdata->ddata), GFP_KERNEL); ++ if (!cdata->ddata) ++ return -ENOMEM; + } + + pc->chip.dev = dev; + pc->chip.ops = &sti_pwm_ops; + pc->chip.npwm = pc->cdata->pwm_num_devs; + +- ret = pwmchip_add(&pc->chip); +- if (ret < 0) { +- clk_unprepare(pc->pwm_clk); +- clk_unprepare(pc->cpt_clk); +- return ret; +- } +- + for (i = 0; i < cdata->cpt_num_devs; i++) { +- struct sti_cpt_ddata *ddata; +- +- ddata = devm_kzalloc(dev, sizeof(*ddata), GFP_KERNEL); +- if (!ddata) +- return -ENOMEM; ++ struct sti_cpt_ddata *ddata = &cdata->ddata[i]; + + init_waitqueue_head(&ddata->wait); + mutex_init(&ddata->lock); ++ } + +- pwm_set_chip_data(&pc->chip.pwms[i], ddata); ++ ret = pwmchip_add(&pc->chip); ++ if (ret < 0) { ++ clk_unprepare(pc->pwm_clk); ++ clk_unprepare(pc->cpt_clk); ++ return ret; + } + + platform_set_drvdata(pdev, pc); +-- +2.42.0 + diff --git a/queue-6.1/r8169-respect-userspace-disabling-iff_multicast.patch b/queue-6.1/r8169-respect-userspace-disabling-iff_multicast.patch new file mode 100644 index 00000000000..64cec14a252 --- /dev/null +++ b/queue-6.1/r8169-respect-userspace-disabling-iff_multicast.patch @@ -0,0 +1,42 @@ +From 43453f1f795cd8c5e12ff2302a4d1e1c177f1f55 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Sun, 5 Nov 2023 23:43:36 +0100 +Subject: r8169: respect userspace disabling IFF_MULTICAST + +From: Heiner Kallweit + +[ Upstream commit 8999ce4cfc87e61b4143ec2e7b93d8e92e11fa7f ] + +So far we ignore the setting of IFF_MULTICAST. Fix this and clear bit +AcceptMulticast if IFF_MULTICAST isn't set. + +Note: Based on the implementations I've seen it doesn't seem to be 100% clear +what a driver is supposed to do if IFF_ALLMULTI is set but IFF_MULTICAST +is not. This patch is based on the understanding that IFF_MULTICAST has +precedence. 
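[ Editor's illustration, not part of the patch: a small self-contained C sketch of the precedence the note describes - promiscuous mode first, then a cleared IFF_MULTICAST (drop all multicast), then IFF_ALLMULTI or an oversized list, then the per-address filter. The flag names echo the kernel's, but the function and constants are hypothetical. ]

  #include <stdio.h>

  #define IFF_PROMISC    0x1
  #define IFF_ALLMULTI   0x2
  #define IFF_MULTICAST  0x4

  #define ACCEPT_ALL_PHYS   0x1
  #define ACCEPT_MULTICAST  0x2
  #define ACCEPT_MY_PHYS    0x4
  #define ACCEPT_BROADCAST  0x8

  #define MC_FILTER_LIMIT 32

  /* Illustrative only: derive receive-mode bits from interface flags. */
  static unsigned int rx_mode_from_flags(unsigned int flags, int mc_count)
  {
          unsigned int rx_mode = ACCEPT_BROADCAST | ACCEPT_MY_PHYS | ACCEPT_MULTICAST;

          if (flags & IFF_PROMISC) {
                  rx_mode |= ACCEPT_ALL_PHYS;       /* accept everything */
          } else if (!(flags & IFF_MULTICAST)) {
                  rx_mode &= ~ACCEPT_MULTICAST;     /* userspace disabled multicast */
          } else if ((flags & IFF_ALLMULTI) || mc_count > MC_FILTER_LIMIT) {
                  /* accept all multicast: leave ACCEPT_MULTICAST set, no filtering */
          } else {
                  /* program the multicast hash filter for mc_count addresses */
          }

          return rx_mode;
  }

  int main(void)
  {
          printf("allmulti without IFF_MULTICAST: %#x\n",
                 rx_mode_from_flags(IFF_ALLMULTI, 0));
          printf("multicast enabled:              %#x\n",
                 rx_mode_from_flags(IFF_MULTICAST, 3));
          return 0;
  }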
+ +Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") +Signed-off-by: Heiner Kallweit +Link: https://lore.kernel.org/r/4a57ba02-d52d-4369-9f14-3565e6c1f7dc@gmail.com +Signed-off-by: Jakub Kicinski +Signed-off-by: Sasha Levin +--- + drivers/net/ethernet/realtek/r8169_main.c | 2 ++ + 1 file changed, 2 insertions(+) + +diff --git a/drivers/net/ethernet/realtek/r8169_main.c b/drivers/net/ethernet/realtek/r8169_main.c +index c56d3538889b6..d14706265d9cb 100644 +--- a/drivers/net/ethernet/realtek/r8169_main.c ++++ b/drivers/net/ethernet/realtek/r8169_main.c +@@ -2512,6 +2512,8 @@ static void rtl_set_rx_mode(struct net_device *dev) + + if (dev->flags & IFF_PROMISC) { + rx_mode |= AcceptAllPhys; ++ } else if (!(dev->flags & IFF_MULTICAST)) { ++ rx_mode &= ~AcceptMulticast; + } else if (netdev_mc_count(dev) > MC_FILTER_LIMIT || + dev->flags & IFF_ALLMULTI || + tp->mac_version == RTL_GIGA_MAC_VER_35 || +-- +2.42.0 + diff --git a/queue-6.1/risc-v-don-t-fail-in-riscv_of_parent_hartid-for-disa.patch b/queue-6.1/risc-v-don-t-fail-in-riscv_of_parent_hartid-for-disa.patch new file mode 100644 index 00000000000..0e9d021b00e --- /dev/null +++ b/queue-6.1/risc-v-don-t-fail-in-riscv_of_parent_hartid-for-disa.patch @@ -0,0 +1,56 @@ +From fe1e50b0024a7bd3424894e00fc66fd5271b2ec1 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Fri, 27 Oct 2023 21:12:53 +0530 +Subject: RISC-V: Don't fail in riscv_of_parent_hartid() for disabled HARTs + +From: Anup Patel + +[ Upstream commit c4676f8dc1e12e68d6511f9ed89707fdad4c962c ] + +The riscv_of_processor_hartid() used by riscv_of_parent_hartid() fails +for HARTs disabled in the DT. This results in the following warning +thrown by the RISC-V INTC driver for the E-core on SiFive boards: + +[ 0.000000] riscv-intc: unable to find hart id for /cpus/cpu@0/interrupt-controller + +The riscv_of_parent_hartid() is only expected to read the hartid +from the DT so we directly call of_get_cpu_hwid() instead of calling +riscv_of_processor_hartid(). 
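[ Editor's illustration, not part of the patch: a toy C sketch of the approach described above - walk the parent chain and read the hart ID straight from the node, treating an all-ones value as "no ID", instead of going through a helper that also rejects disabled CPUs. struct node, parent_hartid() and INVALID_HWID are hypothetical stand-ins for DT nodes and of_get_cpu_hwid(). ]

  #include <stdio.h>
  #include <string.h>

  #define INVALID_HWID (~0UL)

  struct node {
          const char *compatible;
          unsigned long hwid;          /* INVALID_HWID when no "reg" property */
          struct node *parent;
  };

  static int parent_hartid(struct node *n, unsigned long *hartid)
  {
          for (; n; n = n->parent) {
                  if (n->compatible && !strcmp(n->compatible, "riscv")) {
                          *hartid = n->hwid;
                          if (*hartid == INVALID_HWID)
                                  return -1;   /* CPU node without hart ID */
                          return 0;            /* works even for disabled CPUs */
                  }
          }
          return -1;
  }

  int main(void)
  {
          struct node cpu  = { .compatible = "riscv", .hwid = 2, .parent = NULL };
          struct node intc = { .compatible = "riscv,cpu-intc",
                               .hwid = INVALID_HWID, .parent = &cpu };
          unsigned long id;

          if (!parent_hartid(&intc, &id))
                  printf("hartid %lu\n", id);
          return 0;
  }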
+ +Fixes: ad635e723e17 ("riscv: cpu: Add 64bit hartid support on RV64") +Signed-off-by: Anup Patel +Reviewed-by: Atish Patra +Link: https://lore.kernel.org/r/20231027154254.355853-2-apatel@ventanamicro.com +Signed-off-by: Palmer Dabbelt +Signed-off-by: Sasha Levin +--- + arch/riscv/kernel/cpu.c | 11 ++++++----- + 1 file changed, 6 insertions(+), 5 deletions(-) + +diff --git a/arch/riscv/kernel/cpu.c b/arch/riscv/kernel/cpu.c +index 852ecccd8920f..0f76181dc634d 100644 +--- a/arch/riscv/kernel/cpu.c ++++ b/arch/riscv/kernel/cpu.c +@@ -57,13 +57,14 @@ int riscv_of_processor_hartid(struct device_node *node, unsigned long *hart) + */ + int riscv_of_parent_hartid(struct device_node *node, unsigned long *hartid) + { +- int rc; +- + for (; node; node = node->parent) { + if (of_device_is_compatible(node, "riscv")) { +- rc = riscv_of_processor_hartid(node, hartid); +- if (!rc) +- return 0; ++ *hartid = (unsigned long)of_get_cpu_hwid(node, 0); ++ if (*hartid == ~0UL) { ++ pr_warn("Found CPU without hart ID\n"); ++ return -ENODEV; ++ } ++ return 0; + } + } + +-- +2.42.0 + diff --git a/queue-6.1/selftests-pmtu.sh-fix-result-checking.patch b/queue-6.1/selftests-pmtu.sh-fix-result-checking.patch new file mode 100644 index 00000000000..bca9c5eec34 --- /dev/null +++ b/queue-6.1/selftests-pmtu.sh-fix-result-checking.patch @@ -0,0 +1,41 @@ +From 17a251fbf65b4aaa37dadd8958676d8a677fde8a Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Tue, 31 Oct 2023 11:47:32 +0800 +Subject: selftests: pmtu.sh: fix result checking + +From: Hangbin Liu + +[ Upstream commit 63e201916b27260218e528a2f8758be47f99bbf4 ] + +In the PMTU test, when all previous tests are skipped and the new test +passes, the exit code is set to 0. However, the current check mistakenly +treats this as an assignment, causing the check to pass every time. + +Consequently, regardless of how many tests have failed, if the latest test +passes, the PMTU test will report a pass. + +Fixes: 2a9d3716b810 ("selftests: pmtu.sh: improve the test result processing") +Signed-off-by: Hangbin Liu +Acked-by: Po-Hsu Lin +Signed-off-by: David S. 
Miller +Signed-off-by: Sasha Levin +--- + tools/testing/selftests/net/pmtu.sh | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/tools/testing/selftests/net/pmtu.sh b/tools/testing/selftests/net/pmtu.sh +index dfe3d287f01d2..0d705fdcf3b76 100755 +--- a/tools/testing/selftests/net/pmtu.sh ++++ b/tools/testing/selftests/net/pmtu.sh +@@ -2013,7 +2013,7 @@ run_test() { + case $ret in + 0) + all_skipped=false +- [ $exitcode=$ksft_skip ] && exitcode=0 ++ [ $exitcode -eq $ksft_skip ] && exitcode=0 + ;; + $ksft_skip) + [ $all_skipped = true ] && exitcode=$ksft_skip +-- +2.42.0 + diff --git a/queue-6.1/series b/queue-6.1/series index 36c9103aa8a..fd0c2c18325 100644 --- a/queue-6.1/series +++ b/queue-6.1/series @@ -329,3 +329,39 @@ media-cadence-csi2rx-unregister-v4l2-async-notifier.patch media-dvb-usb-v2-af9035-fix-missing-unlock.patch media-cec-meson-always-include-meson-sub-directory-i.patch regmap-prevent-noinc-writes-from-clobbering-cache.patch +pwm-sti-reduce-number-of-allocations-and-drop-usage-.patch +pwm-brcmstb-utilize-appropriate-clock-apis-in-suspen.patch +input-synaptics-rmi4-fix-use-after-free-in-rmi_unreg.patch +watchdog-ixp4xx-make-sure-restart-always-works.patch +llc-verify-mac-len-before-reading-mac-header.patch +hsr-prevent-use-after-free-in-prp_create_tagged_fram.patch +tipc-change-nla_policy-for-bearer-related-names-to-n.patch +bpf-check-map-usercnt-after-timer-timer-is-assigned.patch +inet-shrink-struct-flowi_common.patch +octeontx2-pf-fix-error-codes.patch +octeontx2-pf-fix-holes-in-error-code.patch +net-page_pool-add-missing-free_percpu-when-page_pool.patch +dccp-call-security_inet_conn_request-after-setting-i.patch +dccp-tcp-call-security_inet_conn_request-after-setti.patch +net-r8169-disable-multicast-filter-for-rtl8168h-and-.patch +fix-termination-state-for-idr_for_each_entry_ul.patch +net-stmmac-xgmac-enable-support-for-multiple-flexibl.patch +selftests-pmtu.sh-fix-result-checking.patch +octeontx2-pf-rename-tot_tx_queues-to-non_qos_queues.patch +octeontx2-pf-qos-send-queues-management.patch +octeontx2-pf-free-pending-and-dropped-sqes.patch +net-smc-fix-dangling-sock-under-state-smc_appfinclos.patch +net-smc-allow-cdc-msg-send-rather-than-drop-it-with-.patch +net-smc-put-sk-reference-if-close-work-was-canceled.patch +nvme-fix-error-handling-for-io_uring-nvme-passthroug.patch +tg3-power-down-device-only-on-system_power_off.patch +nbd-fix-uaf-in-nbd_open.patch +blk-core-use-pr_warn_ratelimited-in-bio_check_ro.patch +virtio-vsock-replace-virtio_vsock_pkt-with-sk_buff.patch +vsock-virtio-remove-socket-from-connected-bound-list.patch +r8169-respect-userspace-disabling-iff_multicast.patch +i2c-iproc-handle-invalid-slave-state.patch +netfilter-xt_recent-fix-increase-ipv6-literal-buffer.patch +netfilter-nft_redir-use-struct-nf_nat_range2-through.patch +netfilter-nat-fix-ipv6-nat-redirect-with-mapped-and-.patch +risc-v-don-t-fail-in-riscv_of_parent_hartid-for-disa.patch diff --git a/queue-6.1/tg3-power-down-device-only-on-system_power_off.patch b/queue-6.1/tg3-power-down-device-only-on-system_power_off.patch new file mode 100644 index 00000000000..f6369cab16e --- /dev/null +++ b/queue-6.1/tg3-power-down-device-only-on-system_power_off.patch @@ -0,0 +1,46 @@ +From bb2266f47afc23ff00d5cf74034978d21d28dfd0 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Fri, 3 Nov 2023 13:50:29 +0200 +Subject: tg3: power down device only on SYSTEM_POWER_OFF + +From: George Shuklin + +[ Upstream commit 9fc3bc7643341dc5be7d269f3d3dbe441d8d7ac3 ] + +Dell R650xs servers hangs on 
reboot if tg3 driver calls +tg3_power_down. + +This happens only if network adapters (BCM5720 for R650xs) were +initialized using SNP (e.g. by booting ipxe.efi). + +The actual problem is on Dell side, but this fix allows servers +to come back alive after reboot. + +Signed-off-by: George Shuklin +Fixes: 2ca1c94ce0b6 ("tg3: Disable tg3 device on system reboot to avoid triggering AER") +Reviewed-by: Pavan Chebbi +Reviewed-by: Michael Chan +Link: https://lore.kernel.org/r/20231103115029.83273-1-george.shuklin@gmail.com +Signed-off-by: Jakub Kicinski +Signed-off-by: Sasha Levin +--- + drivers/net/ethernet/broadcom/tg3.c | 3 ++- + 1 file changed, 2 insertions(+), 1 deletion(-) + +diff --git a/drivers/net/ethernet/broadcom/tg3.c b/drivers/net/ethernet/broadcom/tg3.c +index 9609041016776..85570e40c8e9b 100644 +--- a/drivers/net/ethernet/broadcom/tg3.c ++++ b/drivers/net/ethernet/broadcom/tg3.c +@@ -18086,7 +18086,8 @@ static void tg3_shutdown(struct pci_dev *pdev) + if (netif_running(dev)) + dev_close(dev); + +- tg3_power_down(tp); ++ if (system_state == SYSTEM_POWER_OFF) ++ tg3_power_down(tp); + + rtnl_unlock(); + +-- +2.42.0 + diff --git a/queue-6.1/tipc-change-nla_policy-for-bearer-related-names-to-n.patch b/queue-6.1/tipc-change-nla_policy-for-bearer-related-names-to-n.patch new file mode 100644 index 00000000000..db0295a9c70 --- /dev/null +++ b/queue-6.1/tipc-change-nla_policy-for-bearer-related-names-to-n.patch @@ -0,0 +1,111 @@ +From 8159bf4cc90607234a192c9a6170a78a50801d0d Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Mon, 30 Oct 2023 16:55:40 +0900 +Subject: tipc: Change nla_policy for bearer-related names to NLA_NUL_STRING + +From: Shigeru Yoshida + +[ Upstream commit 19b3f72a41a8751e26bffc093bb7e1cef29ad579 ] + +syzbot reported the following uninit-value access issue [1]: + +===================================================== +BUG: KMSAN: uninit-value in strlen lib/string.c:418 [inline] +BUG: KMSAN: uninit-value in strstr+0xb8/0x2f0 lib/string.c:756 + strlen lib/string.c:418 [inline] + strstr+0xb8/0x2f0 lib/string.c:756 + tipc_nl_node_reset_link_stats+0x3ea/0xb50 net/tipc/node.c:2595 + genl_family_rcv_msg_doit net/netlink/genetlink.c:971 [inline] + genl_family_rcv_msg net/netlink/genetlink.c:1051 [inline] + genl_rcv_msg+0x11ec/0x1290 net/netlink/genetlink.c:1066 + netlink_rcv_skb+0x371/0x650 net/netlink/af_netlink.c:2545 + genl_rcv+0x40/0x60 net/netlink/genetlink.c:1075 + netlink_unicast_kernel net/netlink/af_netlink.c:1342 [inline] + netlink_unicast+0xf47/0x1250 net/netlink/af_netlink.c:1368 + netlink_sendmsg+0x1238/0x13d0 net/netlink/af_netlink.c:1910 + sock_sendmsg_nosec net/socket.c:730 [inline] + sock_sendmsg net/socket.c:753 [inline] + ____sys_sendmsg+0x9c2/0xd60 net/socket.c:2541 + ___sys_sendmsg+0x28d/0x3c0 net/socket.c:2595 + __sys_sendmsg net/socket.c:2624 [inline] + __do_sys_sendmsg net/socket.c:2633 [inline] + __se_sys_sendmsg net/socket.c:2631 [inline] + __x64_sys_sendmsg+0x307/0x490 net/socket.c:2631 + do_syscall_x64 arch/x86/entry/common.c:50 [inline] + do_syscall_64+0x41/0xc0 arch/x86/entry/common.c:80 + entry_SYSCALL_64_after_hwframe+0x63/0xcd + +Uninit was created at: + slab_post_alloc_hook+0x12f/0xb70 mm/slab.h:767 + slab_alloc_node mm/slub.c:3478 [inline] + kmem_cache_alloc_node+0x577/0xa80 mm/slub.c:3523 + kmalloc_reserve+0x13d/0x4a0 net/core/skbuff.c:559 + __alloc_skb+0x318/0x740 net/core/skbuff.c:650 + alloc_skb include/linux/skbuff.h:1286 [inline] + netlink_alloc_large_skb net/netlink/af_netlink.c:1214 [inline] + netlink_sendmsg+0xb34/0x13d0 
net/netlink/af_netlink.c:1885 + sock_sendmsg_nosec net/socket.c:730 [inline] + sock_sendmsg net/socket.c:753 [inline] + ____sys_sendmsg+0x9c2/0xd60 net/socket.c:2541 + ___sys_sendmsg+0x28d/0x3c0 net/socket.c:2595 + __sys_sendmsg net/socket.c:2624 [inline] + __do_sys_sendmsg net/socket.c:2633 [inline] + __se_sys_sendmsg net/socket.c:2631 [inline] + __x64_sys_sendmsg+0x307/0x490 net/socket.c:2631 + do_syscall_x64 arch/x86/entry/common.c:50 [inline] + do_syscall_64+0x41/0xc0 arch/x86/entry/common.c:80 + entry_SYSCALL_64_after_hwframe+0x63/0xcd + +TIPC bearer-related names including link names must be null-terminated +strings. If a link name which is not null-terminated is passed through +netlink, strstr() and similar functions can cause buffer overrun. This +causes the above issue. + +This patch changes the nla_policy for bearer-related names from NLA_STRING +to NLA_NUL_STRING. This resolves the issue by ensuring that only +null-terminated strings are accepted as bearer-related names. + +syzbot reported similar uninit-value issue related to bearer names [2]. The +root cause of this issue is that a non-null-terminated bearer name was +passed. This patch also resolved this issue. + +Fixes: 7be57fc69184 ("tipc: add link get/dump to new netlink api") +Fixes: 0655f6a8635b ("tipc: add bearer disable/enable to new netlink api") +Reported-and-tested-by: syzbot+5138ca807af9d2b42574@syzkaller.appspotmail.com +Closes: https://syzkaller.appspot.com/bug?extid=5138ca807af9d2b42574 [1] +Reported-and-tested-by: syzbot+9425c47dccbcb4c17d51@syzkaller.appspotmail.com +Closes: https://syzkaller.appspot.com/bug?extid=9425c47dccbcb4c17d51 [2] +Signed-off-by: Shigeru Yoshida +Reviewed-by: Jiri Pirko +Link: https://lore.kernel.org/r/20231030075540.3784537-1-syoshida@redhat.com +Signed-off-by: Jakub Kicinski +Signed-off-by: Sasha Levin +--- + net/tipc/netlink.c | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +diff --git a/net/tipc/netlink.c b/net/tipc/netlink.c +index e8fd257c0e688..1a9a5bdaccf4f 100644 +--- a/net/tipc/netlink.c ++++ b/net/tipc/netlink.c +@@ -88,7 +88,7 @@ const struct nla_policy tipc_nl_net_policy[TIPC_NLA_NET_MAX + 1] = { + + const struct nla_policy tipc_nl_link_policy[TIPC_NLA_LINK_MAX + 1] = { + [TIPC_NLA_LINK_UNSPEC] = { .type = NLA_UNSPEC }, +- [TIPC_NLA_LINK_NAME] = { .type = NLA_STRING, ++ [TIPC_NLA_LINK_NAME] = { .type = NLA_NUL_STRING, + .len = TIPC_MAX_LINK_NAME }, + [TIPC_NLA_LINK_MTU] = { .type = NLA_U32 }, + [TIPC_NLA_LINK_BROADCAST] = { .type = NLA_FLAG }, +@@ -125,7 +125,7 @@ const struct nla_policy tipc_nl_prop_policy[TIPC_NLA_PROP_MAX + 1] = { + + const struct nla_policy tipc_nl_bearer_policy[TIPC_NLA_BEARER_MAX + 1] = { + [TIPC_NLA_BEARER_UNSPEC] = { .type = NLA_UNSPEC }, +- [TIPC_NLA_BEARER_NAME] = { .type = NLA_STRING, ++ [TIPC_NLA_BEARER_NAME] = { .type = NLA_NUL_STRING, + .len = TIPC_MAX_BEARER_NAME }, + [TIPC_NLA_BEARER_PROP] = { .type = NLA_NESTED }, + [TIPC_NLA_BEARER_DOMAIN] = { .type = NLA_U32 } +-- +2.42.0 + diff --git a/queue-6.1/virtio-vsock-replace-virtio_vsock_pkt-with-sk_buff.patch b/queue-6.1/virtio-vsock-replace-virtio_vsock_pkt-with-sk_buff.patch new file mode 100644 index 00000000000..82faff26c09 --- /dev/null +++ b/queue-6.1/virtio-vsock-replace-virtio_vsock_pkt-with-sk_buff.patch @@ -0,0 +1,1983 @@ +From dc5f3dc5e6910cd026685601ab84ffd77ceafc09 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Fri, 13 Jan 2023 22:21:37 +0000 +Subject: virtio/vsock: replace virtio_vsock_pkt with sk_buff + +From: Bobby Eshleman + +[ Upstream commit 
71dc9ec9ac7d3eee785cdc986c3daeb821381e20 ] + +This commit changes virtio/vsock to use sk_buff instead of +virtio_vsock_pkt. Beyond better conforming to other net code, using +sk_buff allows vsock to use sk_buff-dependent features in the future +(such as sockmap) and improves throughput. + +This patch introduces the following performance changes: + +Tool: Uperf +Env: Phys Host + L1 Guest +Payload: 64k +Threads: 16 +Test Runs: 10 +Type: SOCK_STREAM +Before: commit b7bfaa761d760 ("Linux 6.2-rc3") + +Before +------ +g2h: 16.77Gb/s +h2g: 10.56Gb/s + +After +----- +g2h: 21.04Gb/s +h2g: 10.76Gb/s + +Signed-off-by: Bobby Eshleman +Reviewed-by: Stefano Garzarella +Signed-off-by: David S. Miller +Stable-dep-of: 3a5cc90a4d17 ("vsock/virtio: remove socket from connected/bound list on shutdown") +Signed-off-by: Sasha Levin +--- + drivers/vhost/vsock.c | 214 +++++------- + include/linux/virtio_vsock.h | 129 ++++++-- + net/vmw_vsock/virtio_transport.c | 149 +++------ + net/vmw_vsock/virtio_transport_common.c | 422 +++++++++++++----------- + net/vmw_vsock/vsock_loopback.c | 51 +-- + 5 files changed, 498 insertions(+), 467 deletions(-) + +diff --git a/drivers/vhost/vsock.c b/drivers/vhost/vsock.c +index a2b3743723639..1f3b89c885cca 100644 +--- a/drivers/vhost/vsock.c ++++ b/drivers/vhost/vsock.c +@@ -51,8 +51,7 @@ struct vhost_vsock { + struct hlist_node hash; + + struct vhost_work send_pkt_work; +- spinlock_t send_pkt_list_lock; +- struct list_head send_pkt_list; /* host->guest pending packets */ ++ struct sk_buff_head send_pkt_queue; /* host->guest pending packets */ + + atomic_t queued_replies; + +@@ -108,40 +107,31 @@ vhost_transport_do_send_pkt(struct vhost_vsock *vsock, + vhost_disable_notify(&vsock->dev, vq); + + do { +- struct virtio_vsock_pkt *pkt; ++ struct virtio_vsock_hdr *hdr; ++ size_t iov_len, payload_len; + struct iov_iter iov_iter; ++ u32 flags_to_restore = 0; ++ struct sk_buff *skb; + unsigned out, in; + size_t nbytes; +- size_t iov_len, payload_len; + int head; +- u32 flags_to_restore = 0; + +- spin_lock_bh(&vsock->send_pkt_list_lock); +- if (list_empty(&vsock->send_pkt_list)) { +- spin_unlock_bh(&vsock->send_pkt_list_lock); ++ skb = virtio_vsock_skb_dequeue(&vsock->send_pkt_queue); ++ ++ if (!skb) { + vhost_enable_notify(&vsock->dev, vq); + break; + } + +- pkt = list_first_entry(&vsock->send_pkt_list, +- struct virtio_vsock_pkt, list); +- list_del_init(&pkt->list); +- spin_unlock_bh(&vsock->send_pkt_list_lock); +- + head = vhost_get_vq_desc(vq, vq->iov, ARRAY_SIZE(vq->iov), + &out, &in, NULL, NULL); + if (head < 0) { +- spin_lock_bh(&vsock->send_pkt_list_lock); +- list_add(&pkt->list, &vsock->send_pkt_list); +- spin_unlock_bh(&vsock->send_pkt_list_lock); ++ virtio_vsock_skb_queue_head(&vsock->send_pkt_queue, skb); + break; + } + + if (head == vq->num) { +- spin_lock_bh(&vsock->send_pkt_list_lock); +- list_add(&pkt->list, &vsock->send_pkt_list); +- spin_unlock_bh(&vsock->send_pkt_list_lock); +- ++ virtio_vsock_skb_queue_head(&vsock->send_pkt_queue, skb); + /* We cannot finish yet if more buffers snuck in while + * re-enabling notify. 
+ */ +@@ -153,26 +143,27 @@ vhost_transport_do_send_pkt(struct vhost_vsock *vsock, + } + + if (out) { +- virtio_transport_free_pkt(pkt); ++ kfree_skb(skb); + vq_err(vq, "Expected 0 output buffers, got %u\n", out); + break; + } + + iov_len = iov_length(&vq->iov[out], in); +- if (iov_len < sizeof(pkt->hdr)) { +- virtio_transport_free_pkt(pkt); ++ if (iov_len < sizeof(*hdr)) { ++ kfree_skb(skb); + vq_err(vq, "Buffer len [%zu] too small\n", iov_len); + break; + } + + iov_iter_init(&iov_iter, ITER_DEST, &vq->iov[out], in, iov_len); +- payload_len = pkt->len - pkt->off; ++ payload_len = skb->len; ++ hdr = virtio_vsock_hdr(skb); + + /* If the packet is greater than the space available in the + * buffer, we split it using multiple buffers. + */ +- if (payload_len > iov_len - sizeof(pkt->hdr)) { +- payload_len = iov_len - sizeof(pkt->hdr); ++ if (payload_len > iov_len - sizeof(*hdr)) { ++ payload_len = iov_len - sizeof(*hdr); + + /* As we are copying pieces of large packet's buffer to + * small rx buffers, headers of packets in rx queue are +@@ -185,31 +176,30 @@ vhost_transport_do_send_pkt(struct vhost_vsock *vsock, + * bits set. After initialized header will be copied to + * rx buffer, these required bits will be restored. + */ +- if (le32_to_cpu(pkt->hdr.flags) & VIRTIO_VSOCK_SEQ_EOM) { +- pkt->hdr.flags &= ~cpu_to_le32(VIRTIO_VSOCK_SEQ_EOM); ++ if (le32_to_cpu(hdr->flags) & VIRTIO_VSOCK_SEQ_EOM) { ++ hdr->flags &= ~cpu_to_le32(VIRTIO_VSOCK_SEQ_EOM); + flags_to_restore |= VIRTIO_VSOCK_SEQ_EOM; + +- if (le32_to_cpu(pkt->hdr.flags) & VIRTIO_VSOCK_SEQ_EOR) { +- pkt->hdr.flags &= ~cpu_to_le32(VIRTIO_VSOCK_SEQ_EOR); ++ if (le32_to_cpu(hdr->flags) & VIRTIO_VSOCK_SEQ_EOR) { ++ hdr->flags &= ~cpu_to_le32(VIRTIO_VSOCK_SEQ_EOR); + flags_to_restore |= VIRTIO_VSOCK_SEQ_EOR; + } + } + } + + /* Set the correct length in the header */ +- pkt->hdr.len = cpu_to_le32(payload_len); ++ hdr->len = cpu_to_le32(payload_len); + +- nbytes = copy_to_iter(&pkt->hdr, sizeof(pkt->hdr), &iov_iter); +- if (nbytes != sizeof(pkt->hdr)) { +- virtio_transport_free_pkt(pkt); ++ nbytes = copy_to_iter(hdr, sizeof(*hdr), &iov_iter); ++ if (nbytes != sizeof(*hdr)) { ++ kfree_skb(skb); + vq_err(vq, "Faulted on copying pkt hdr\n"); + break; + } + +- nbytes = copy_to_iter(pkt->buf + pkt->off, payload_len, +- &iov_iter); ++ nbytes = copy_to_iter(skb->data, payload_len, &iov_iter); + if (nbytes != payload_len) { +- virtio_transport_free_pkt(pkt); ++ kfree_skb(skb); + vq_err(vq, "Faulted on copying pkt buf\n"); + break; + } +@@ -217,31 +207,28 @@ vhost_transport_do_send_pkt(struct vhost_vsock *vsock, + /* Deliver to monitoring devices all packets that we + * will transmit. + */ +- virtio_transport_deliver_tap_pkt(pkt); ++ virtio_transport_deliver_tap_pkt(skb); + +- vhost_add_used(vq, head, sizeof(pkt->hdr) + payload_len); ++ vhost_add_used(vq, head, sizeof(*hdr) + payload_len); + added = true; + +- pkt->off += payload_len; ++ skb_pull(skb, payload_len); + total_len += payload_len; + + /* If we didn't send all the payload we can requeue the packet + * to send it with the next available buffer. + */ +- if (pkt->off < pkt->len) { +- pkt->hdr.flags |= cpu_to_le32(flags_to_restore); ++ if (skb->len > 0) { ++ hdr->flags |= cpu_to_le32(flags_to_restore); + +- /* We are queueing the same virtio_vsock_pkt to handle ++ /* We are queueing the same skb to handle + * the remaining bytes, and we want to deliver it + * to monitoring devices in the next iteration. 
+ */ +- pkt->tap_delivered = false; +- +- spin_lock_bh(&vsock->send_pkt_list_lock); +- list_add(&pkt->list, &vsock->send_pkt_list); +- spin_unlock_bh(&vsock->send_pkt_list_lock); ++ virtio_vsock_skb_clear_tap_delivered(skb); ++ virtio_vsock_skb_queue_head(&vsock->send_pkt_queue, skb); + } else { +- if (pkt->reply) { ++ if (virtio_vsock_skb_reply(skb)) { + int val; + + val = atomic_dec_return(&vsock->queued_replies); +@@ -253,7 +240,7 @@ vhost_transport_do_send_pkt(struct vhost_vsock *vsock, + restart_tx = true; + } + +- virtio_transport_free_pkt(pkt); ++ consume_skb(skb); + } + } while(likely(!vhost_exceeds_weight(vq, ++pkts, total_len))); + if (added) +@@ -278,28 +265,26 @@ static void vhost_transport_send_pkt_work(struct vhost_work *work) + } + + static int +-vhost_transport_send_pkt(struct virtio_vsock_pkt *pkt) ++vhost_transport_send_pkt(struct sk_buff *skb) + { ++ struct virtio_vsock_hdr *hdr = virtio_vsock_hdr(skb); + struct vhost_vsock *vsock; +- int len = pkt->len; ++ int len = skb->len; + + rcu_read_lock(); + + /* Find the vhost_vsock according to guest context id */ +- vsock = vhost_vsock_get(le64_to_cpu(pkt->hdr.dst_cid)); ++ vsock = vhost_vsock_get(le64_to_cpu(hdr->dst_cid)); + if (!vsock) { + rcu_read_unlock(); +- virtio_transport_free_pkt(pkt); ++ kfree_skb(skb); + return -ENODEV; + } + +- if (pkt->reply) ++ if (virtio_vsock_skb_reply(skb)) + atomic_inc(&vsock->queued_replies); + +- spin_lock_bh(&vsock->send_pkt_list_lock); +- list_add_tail(&pkt->list, &vsock->send_pkt_list); +- spin_unlock_bh(&vsock->send_pkt_list_lock); +- ++ virtio_vsock_skb_queue_tail(&vsock->send_pkt_queue, skb); + vhost_work_queue(&vsock->dev, &vsock->send_pkt_work); + + rcu_read_unlock(); +@@ -310,10 +295,8 @@ static int + vhost_transport_cancel_pkt(struct vsock_sock *vsk) + { + struct vhost_vsock *vsock; +- struct virtio_vsock_pkt *pkt, *n; + int cnt = 0; + int ret = -ENODEV; +- LIST_HEAD(freeme); + + rcu_read_lock(); + +@@ -322,20 +305,7 @@ vhost_transport_cancel_pkt(struct vsock_sock *vsk) + if (!vsock) + goto out; + +- spin_lock_bh(&vsock->send_pkt_list_lock); +- list_for_each_entry_safe(pkt, n, &vsock->send_pkt_list, list) { +- if (pkt->vsk != vsk) +- continue; +- list_move(&pkt->list, &freeme); +- } +- spin_unlock_bh(&vsock->send_pkt_list_lock); +- +- list_for_each_entry_safe(pkt, n, &freeme, list) { +- if (pkt->reply) +- cnt++; +- list_del(&pkt->list); +- virtio_transport_free_pkt(pkt); +- } ++ cnt = virtio_transport_purge_skbs(vsk, &vsock->send_pkt_queue); + + if (cnt) { + struct vhost_virtqueue *tx_vq = &vsock->vqs[VSOCK_VQ_TX]; +@@ -352,12 +322,14 @@ vhost_transport_cancel_pkt(struct vsock_sock *vsk) + return ret; + } + +-static struct virtio_vsock_pkt * +-vhost_vsock_alloc_pkt(struct vhost_virtqueue *vq, ++static struct sk_buff * ++vhost_vsock_alloc_skb(struct vhost_virtqueue *vq, + unsigned int out, unsigned int in) + { +- struct virtio_vsock_pkt *pkt; ++ struct virtio_vsock_hdr *hdr; + struct iov_iter iov_iter; ++ struct sk_buff *skb; ++ size_t payload_len; + size_t nbytes; + size_t len; + +@@ -366,50 +338,48 @@ vhost_vsock_alloc_pkt(struct vhost_virtqueue *vq, + return NULL; + } + +- pkt = kzalloc(sizeof(*pkt), GFP_KERNEL); +- if (!pkt) ++ len = iov_length(vq->iov, out); ++ ++ /* len contains both payload and hdr */ ++ skb = virtio_vsock_alloc_skb(len, GFP_KERNEL); ++ if (!skb) + return NULL; + +- len = iov_length(vq->iov, out); + iov_iter_init(&iov_iter, ITER_SOURCE, vq->iov, out, len); + +- nbytes = copy_from_iter(&pkt->hdr, sizeof(pkt->hdr), &iov_iter); +- if (nbytes != 
sizeof(pkt->hdr)) { ++ hdr = virtio_vsock_hdr(skb); ++ nbytes = copy_from_iter(hdr, sizeof(*hdr), &iov_iter); ++ if (nbytes != sizeof(*hdr)) { + vq_err(vq, "Expected %zu bytes for pkt->hdr, got %zu bytes\n", +- sizeof(pkt->hdr), nbytes); +- kfree(pkt); ++ sizeof(*hdr), nbytes); ++ kfree_skb(skb); + return NULL; + } + +- pkt->len = le32_to_cpu(pkt->hdr.len); ++ payload_len = le32_to_cpu(hdr->len); + + /* No payload */ +- if (!pkt->len) +- return pkt; ++ if (!payload_len) ++ return skb; + +- /* The pkt is too big */ +- if (pkt->len > VIRTIO_VSOCK_MAX_PKT_BUF_SIZE) { +- kfree(pkt); ++ /* The pkt is too big or the length in the header is invalid */ ++ if (payload_len > VIRTIO_VSOCK_MAX_PKT_BUF_SIZE || ++ payload_len + sizeof(*hdr) > len) { ++ kfree_skb(skb); + return NULL; + } + +- pkt->buf = kvmalloc(pkt->len, GFP_KERNEL); +- if (!pkt->buf) { +- kfree(pkt); +- return NULL; +- } ++ virtio_vsock_skb_rx_put(skb); + +- pkt->buf_len = pkt->len; +- +- nbytes = copy_from_iter(pkt->buf, pkt->len, &iov_iter); +- if (nbytes != pkt->len) { +- vq_err(vq, "Expected %u byte payload, got %zu bytes\n", +- pkt->len, nbytes); +- virtio_transport_free_pkt(pkt); ++ nbytes = copy_from_iter(skb->data, payload_len, &iov_iter); ++ if (nbytes != payload_len) { ++ vq_err(vq, "Expected %zu byte payload, got %zu bytes\n", ++ payload_len, nbytes); ++ kfree_skb(skb); + return NULL; + } + +- return pkt; ++ return skb; + } + + /* Is there space left for replies to rx packets? */ +@@ -496,9 +466,9 @@ static void vhost_vsock_handle_tx_kick(struct vhost_work *work) + poll.work); + struct vhost_vsock *vsock = container_of(vq->dev, struct vhost_vsock, + dev); +- struct virtio_vsock_pkt *pkt; + int head, pkts = 0, total_len = 0; + unsigned int out, in; ++ struct sk_buff *skb; + bool added = false; + + mutex_lock(&vq->mutex); +@@ -511,6 +481,8 @@ static void vhost_vsock_handle_tx_kick(struct vhost_work *work) + + vhost_disable_notify(&vsock->dev, vq); + do { ++ struct virtio_vsock_hdr *hdr; ++ + if (!vhost_vsock_more_replies(vsock)) { + /* Stop tx until the device processes already + * pending replies. 
Leave tx virtqueue +@@ -532,24 +504,26 @@ static void vhost_vsock_handle_tx_kick(struct vhost_work *work) + break; + } + +- pkt = vhost_vsock_alloc_pkt(vq, out, in); +- if (!pkt) { ++ skb = vhost_vsock_alloc_skb(vq, out, in); ++ if (!skb) { + vq_err(vq, "Faulted on pkt\n"); + continue; + } + +- total_len += sizeof(pkt->hdr) + pkt->len; ++ total_len += sizeof(*hdr) + skb->len; + + /* Deliver to monitoring devices all received packets */ +- virtio_transport_deliver_tap_pkt(pkt); ++ virtio_transport_deliver_tap_pkt(skb); ++ ++ hdr = virtio_vsock_hdr(skb); + + /* Only accept correctly addressed packets */ +- if (le64_to_cpu(pkt->hdr.src_cid) == vsock->guest_cid && +- le64_to_cpu(pkt->hdr.dst_cid) == ++ if (le64_to_cpu(hdr->src_cid) == vsock->guest_cid && ++ le64_to_cpu(hdr->dst_cid) == + vhost_transport_get_local_cid()) +- virtio_transport_recv_pkt(&vhost_transport, pkt); ++ virtio_transport_recv_pkt(&vhost_transport, skb); + else +- virtio_transport_free_pkt(pkt); ++ kfree_skb(skb); + + vhost_add_used(vq, head, 0); + added = true; +@@ -693,8 +667,7 @@ static int vhost_vsock_dev_open(struct inode *inode, struct file *file) + VHOST_VSOCK_WEIGHT, true, NULL); + + file->private_data = vsock; +- spin_lock_init(&vsock->send_pkt_list_lock); +- INIT_LIST_HEAD(&vsock->send_pkt_list); ++ skb_queue_head_init(&vsock->send_pkt_queue); + vhost_work_init(&vsock->send_pkt_work, vhost_transport_send_pkt_work); + return 0; + +@@ -760,16 +733,7 @@ static int vhost_vsock_dev_release(struct inode *inode, struct file *file) + vhost_vsock_flush(vsock); + vhost_dev_stop(&vsock->dev); + +- spin_lock_bh(&vsock->send_pkt_list_lock); +- while (!list_empty(&vsock->send_pkt_list)) { +- struct virtio_vsock_pkt *pkt; +- +- pkt = list_first_entry(&vsock->send_pkt_list, +- struct virtio_vsock_pkt, list); +- list_del_init(&pkt->list); +- virtio_transport_free_pkt(pkt); +- } +- spin_unlock_bh(&vsock->send_pkt_list_lock); ++ virtio_vsock_skb_queue_purge(&vsock->send_pkt_queue); + + vhost_dev_cleanup(&vsock->dev); + kfree(vsock->dev.vqs); +diff --git a/include/linux/virtio_vsock.h b/include/linux/virtio_vsock.h +index 35d7eedb5e8e4..3f9c166113063 100644 +--- a/include/linux/virtio_vsock.h ++++ b/include/linux/virtio_vsock.h +@@ -7,6 +7,109 @@ + #include + #include + ++#define VIRTIO_VSOCK_SKB_HEADROOM (sizeof(struct virtio_vsock_hdr)) ++ ++struct virtio_vsock_skb_cb { ++ bool reply; ++ bool tap_delivered; ++}; ++ ++#define VIRTIO_VSOCK_SKB_CB(skb) ((struct virtio_vsock_skb_cb *)((skb)->cb)) ++ ++static inline struct virtio_vsock_hdr *virtio_vsock_hdr(struct sk_buff *skb) ++{ ++ return (struct virtio_vsock_hdr *)skb->head; ++} ++ ++static inline bool virtio_vsock_skb_reply(struct sk_buff *skb) ++{ ++ return VIRTIO_VSOCK_SKB_CB(skb)->reply; ++} ++ ++static inline void virtio_vsock_skb_set_reply(struct sk_buff *skb) ++{ ++ VIRTIO_VSOCK_SKB_CB(skb)->reply = true; ++} ++ ++static inline bool virtio_vsock_skb_tap_delivered(struct sk_buff *skb) ++{ ++ return VIRTIO_VSOCK_SKB_CB(skb)->tap_delivered; ++} ++ ++static inline void virtio_vsock_skb_set_tap_delivered(struct sk_buff *skb) ++{ ++ VIRTIO_VSOCK_SKB_CB(skb)->tap_delivered = true; ++} ++ ++static inline void virtio_vsock_skb_clear_tap_delivered(struct sk_buff *skb) ++{ ++ VIRTIO_VSOCK_SKB_CB(skb)->tap_delivered = false; ++} ++ ++static inline void virtio_vsock_skb_rx_put(struct sk_buff *skb) ++{ ++ u32 len; ++ ++ len = le32_to_cpu(virtio_vsock_hdr(skb)->len); ++ ++ if (len > 0) ++ skb_put(skb, len); ++} ++ ++static inline struct sk_buff *virtio_vsock_alloc_skb(unsigned int size, 
gfp_t mask) ++{ ++ struct sk_buff *skb; ++ ++ if (size < VIRTIO_VSOCK_SKB_HEADROOM) ++ return NULL; ++ ++ skb = alloc_skb(size, mask); ++ if (!skb) ++ return NULL; ++ ++ skb_reserve(skb, VIRTIO_VSOCK_SKB_HEADROOM); ++ return skb; ++} ++ ++static inline void ++virtio_vsock_skb_queue_head(struct sk_buff_head *list, struct sk_buff *skb) ++{ ++ spin_lock_bh(&list->lock); ++ __skb_queue_head(list, skb); ++ spin_unlock_bh(&list->lock); ++} ++ ++static inline void ++virtio_vsock_skb_queue_tail(struct sk_buff_head *list, struct sk_buff *skb) ++{ ++ spin_lock_bh(&list->lock); ++ __skb_queue_tail(list, skb); ++ spin_unlock_bh(&list->lock); ++} ++ ++static inline struct sk_buff *virtio_vsock_skb_dequeue(struct sk_buff_head *list) ++{ ++ struct sk_buff *skb; ++ ++ spin_lock_bh(&list->lock); ++ skb = __skb_dequeue(list); ++ spin_unlock_bh(&list->lock); ++ ++ return skb; ++} ++ ++static inline void virtio_vsock_skb_queue_purge(struct sk_buff_head *list) ++{ ++ spin_lock_bh(&list->lock); ++ __skb_queue_purge(list); ++ spin_unlock_bh(&list->lock); ++} ++ ++static inline size_t virtio_vsock_skb_len(struct sk_buff *skb) ++{ ++ return (size_t)(skb_end_pointer(skb) - skb->head); ++} ++ + #define VIRTIO_VSOCK_DEFAULT_RX_BUF_SIZE (1024 * 4) + #define VIRTIO_VSOCK_MAX_BUF_SIZE 0xFFFFFFFFUL + #define VIRTIO_VSOCK_MAX_PKT_BUF_SIZE (1024 * 64) +@@ -35,23 +138,10 @@ struct virtio_vsock_sock { + u32 last_fwd_cnt; + u32 rx_bytes; + u32 buf_alloc; +- struct list_head rx_queue; ++ struct sk_buff_head rx_queue; + u32 msg_count; + }; + +-struct virtio_vsock_pkt { +- struct virtio_vsock_hdr hdr; +- struct list_head list; +- /* socket refcnt not held, only use for cancellation */ +- struct vsock_sock *vsk; +- void *buf; +- u32 buf_len; +- u32 len; +- u32 off; +- bool reply; +- bool tap_delivered; +-}; +- + struct virtio_vsock_pkt_info { + u32 remote_cid, remote_port; + struct vsock_sock *vsk; +@@ -68,7 +158,7 @@ struct virtio_transport { + struct vsock_transport transport; + + /* Takes ownership of the packet */ +- int (*send_pkt)(struct virtio_vsock_pkt *pkt); ++ int (*send_pkt)(struct sk_buff *skb); + }; + + ssize_t +@@ -149,11 +239,10 @@ virtio_transport_dgram_enqueue(struct vsock_sock *vsk, + void virtio_transport_destruct(struct vsock_sock *vsk); + + void virtio_transport_recv_pkt(struct virtio_transport *t, +- struct virtio_vsock_pkt *pkt); +-void virtio_transport_free_pkt(struct virtio_vsock_pkt *pkt); +-void virtio_transport_inc_tx_pkt(struct virtio_vsock_sock *vvs, struct virtio_vsock_pkt *pkt); ++ struct sk_buff *skb); ++void virtio_transport_inc_tx_pkt(struct virtio_vsock_sock *vvs, struct sk_buff *skb); + u32 virtio_transport_get_credit(struct virtio_vsock_sock *vvs, u32 wanted); + void virtio_transport_put_credit(struct virtio_vsock_sock *vvs, u32 credit); +-void virtio_transport_deliver_tap_pkt(struct virtio_vsock_pkt *pkt); +- ++void virtio_transport_deliver_tap_pkt(struct sk_buff *skb); ++int virtio_transport_purge_skbs(void *vsk, struct sk_buff_head *list); + #endif /* _LINUX_VIRTIO_VSOCK_H */ +diff --git a/net/vmw_vsock/virtio_transport.c b/net/vmw_vsock/virtio_transport.c +index 460e7fbb42da3..16575ea836590 100644 +--- a/net/vmw_vsock/virtio_transport.c ++++ b/net/vmw_vsock/virtio_transport.c +@@ -42,8 +42,7 @@ struct virtio_vsock { + bool tx_run; + + struct work_struct send_pkt_work; +- spinlock_t send_pkt_list_lock; +- struct list_head send_pkt_list; ++ struct sk_buff_head send_pkt_queue; + + atomic_t queued_replies; + +@@ -101,41 +100,31 @@ virtio_transport_send_pkt_work(struct work_struct *work) + vq 
= vsock->vqs[VSOCK_VQ_TX]; + + for (;;) { +- struct virtio_vsock_pkt *pkt; + struct scatterlist hdr, buf, *sgs[2]; + int ret, in_sg = 0, out_sg = 0; ++ struct sk_buff *skb; + bool reply; + +- spin_lock_bh(&vsock->send_pkt_list_lock); +- if (list_empty(&vsock->send_pkt_list)) { +- spin_unlock_bh(&vsock->send_pkt_list_lock); ++ skb = virtio_vsock_skb_dequeue(&vsock->send_pkt_queue); ++ if (!skb) + break; +- } +- +- pkt = list_first_entry(&vsock->send_pkt_list, +- struct virtio_vsock_pkt, list); +- list_del_init(&pkt->list); +- spin_unlock_bh(&vsock->send_pkt_list_lock); + +- virtio_transport_deliver_tap_pkt(pkt); ++ virtio_transport_deliver_tap_pkt(skb); ++ reply = virtio_vsock_skb_reply(skb); + +- reply = pkt->reply; +- +- sg_init_one(&hdr, &pkt->hdr, sizeof(pkt->hdr)); ++ sg_init_one(&hdr, virtio_vsock_hdr(skb), sizeof(*virtio_vsock_hdr(skb))); + sgs[out_sg++] = &hdr; +- if (pkt->buf) { +- sg_init_one(&buf, pkt->buf, pkt->len); ++ if (skb->len > 0) { ++ sg_init_one(&buf, skb->data, skb->len); + sgs[out_sg++] = &buf; + } + +- ret = virtqueue_add_sgs(vq, sgs, out_sg, in_sg, pkt, GFP_KERNEL); ++ ret = virtqueue_add_sgs(vq, sgs, out_sg, in_sg, skb, GFP_KERNEL); + /* Usually this means that there is no more space available in + * the vq + */ + if (ret < 0) { +- spin_lock_bh(&vsock->send_pkt_list_lock); +- list_add(&pkt->list, &vsock->send_pkt_list); +- spin_unlock_bh(&vsock->send_pkt_list_lock); ++ virtio_vsock_skb_queue_head(&vsock->send_pkt_queue, skb); + break; + } + +@@ -164,32 +153,32 @@ virtio_transport_send_pkt_work(struct work_struct *work) + } + + static int +-virtio_transport_send_pkt(struct virtio_vsock_pkt *pkt) ++virtio_transport_send_pkt(struct sk_buff *skb) + { ++ struct virtio_vsock_hdr *hdr; + struct virtio_vsock *vsock; +- int len = pkt->len; ++ int len = skb->len; ++ ++ hdr = virtio_vsock_hdr(skb); + + rcu_read_lock(); + vsock = rcu_dereference(the_virtio_vsock); + if (!vsock) { +- virtio_transport_free_pkt(pkt); ++ kfree_skb(skb); + len = -ENODEV; + goto out_rcu; + } + +- if (le64_to_cpu(pkt->hdr.dst_cid) == vsock->guest_cid) { +- virtio_transport_free_pkt(pkt); ++ if (le64_to_cpu(hdr->dst_cid) == vsock->guest_cid) { ++ kfree_skb(skb); + len = -ENODEV; + goto out_rcu; + } + +- if (pkt->reply) ++ if (virtio_vsock_skb_reply(skb)) + atomic_inc(&vsock->queued_replies); + +- spin_lock_bh(&vsock->send_pkt_list_lock); +- list_add_tail(&pkt->list, &vsock->send_pkt_list); +- spin_unlock_bh(&vsock->send_pkt_list_lock); +- ++ virtio_vsock_skb_queue_tail(&vsock->send_pkt_queue, skb); + queue_work(virtio_vsock_workqueue, &vsock->send_pkt_work); + + out_rcu: +@@ -201,9 +190,7 @@ static int + virtio_transport_cancel_pkt(struct vsock_sock *vsk) + { + struct virtio_vsock *vsock; +- struct virtio_vsock_pkt *pkt, *n; + int cnt = 0, ret; +- LIST_HEAD(freeme); + + rcu_read_lock(); + vsock = rcu_dereference(the_virtio_vsock); +@@ -212,20 +199,7 @@ virtio_transport_cancel_pkt(struct vsock_sock *vsk) + goto out_rcu; + } + +- spin_lock_bh(&vsock->send_pkt_list_lock); +- list_for_each_entry_safe(pkt, n, &vsock->send_pkt_list, list) { +- if (pkt->vsk != vsk) +- continue; +- list_move(&pkt->list, &freeme); +- } +- spin_unlock_bh(&vsock->send_pkt_list_lock); +- +- list_for_each_entry_safe(pkt, n, &freeme, list) { +- if (pkt->reply) +- cnt++; +- list_del(&pkt->list); +- virtio_transport_free_pkt(pkt); +- } ++ cnt = virtio_transport_purge_skbs(vsk, &vsock->send_pkt_queue); + + if (cnt) { + struct virtqueue *rx_vq = vsock->vqs[VSOCK_VQ_RX]; +@@ -246,38 +220,28 @@ virtio_transport_cancel_pkt(struct 
vsock_sock *vsk) + + static void virtio_vsock_rx_fill(struct virtio_vsock *vsock) + { +- int buf_len = VIRTIO_VSOCK_DEFAULT_RX_BUF_SIZE; +- struct virtio_vsock_pkt *pkt; +- struct scatterlist hdr, buf, *sgs[2]; ++ int total_len = VIRTIO_VSOCK_DEFAULT_RX_BUF_SIZE + VIRTIO_VSOCK_SKB_HEADROOM; ++ struct scatterlist pkt, *p; + struct virtqueue *vq; ++ struct sk_buff *skb; + int ret; + + vq = vsock->vqs[VSOCK_VQ_RX]; + + do { +- pkt = kzalloc(sizeof(*pkt), GFP_KERNEL); +- if (!pkt) ++ skb = virtio_vsock_alloc_skb(total_len, GFP_KERNEL); ++ if (!skb) + break; + +- pkt->buf = kmalloc(buf_len, GFP_KERNEL); +- if (!pkt->buf) { +- virtio_transport_free_pkt(pkt); ++ memset(skb->head, 0, VIRTIO_VSOCK_SKB_HEADROOM); ++ sg_init_one(&pkt, virtio_vsock_hdr(skb), total_len); ++ p = &pkt; ++ ret = virtqueue_add_sgs(vq, &p, 0, 1, skb, GFP_KERNEL); ++ if (ret < 0) { ++ kfree_skb(skb); + break; + } + +- pkt->buf_len = buf_len; +- pkt->len = buf_len; +- +- sg_init_one(&hdr, &pkt->hdr, sizeof(pkt->hdr)); +- sgs[0] = &hdr; +- +- sg_init_one(&buf, pkt->buf, buf_len); +- sgs[1] = &buf; +- ret = virtqueue_add_sgs(vq, sgs, 0, 2, pkt, GFP_KERNEL); +- if (ret) { +- virtio_transport_free_pkt(pkt); +- break; +- } + vsock->rx_buf_nr++; + } while (vq->num_free); + if (vsock->rx_buf_nr > vsock->rx_buf_max_nr) +@@ -299,12 +263,12 @@ static void virtio_transport_tx_work(struct work_struct *work) + goto out; + + do { +- struct virtio_vsock_pkt *pkt; ++ struct sk_buff *skb; + unsigned int len; + + virtqueue_disable_cb(vq); +- while ((pkt = virtqueue_get_buf(vq, &len)) != NULL) { +- virtio_transport_free_pkt(pkt); ++ while ((skb = virtqueue_get_buf(vq, &len)) != NULL) { ++ consume_skb(skb); + added = true; + } + } while (!virtqueue_enable_cb(vq)); +@@ -529,7 +493,7 @@ static void virtio_transport_rx_work(struct work_struct *work) + do { + virtqueue_disable_cb(vq); + for (;;) { +- struct virtio_vsock_pkt *pkt; ++ struct sk_buff *skb; + unsigned int len; + + if (!virtio_transport_more_replies(vsock)) { +@@ -540,23 +504,22 @@ static void virtio_transport_rx_work(struct work_struct *work) + goto out; + } + +- pkt = virtqueue_get_buf(vq, &len); +- if (!pkt) { ++ skb = virtqueue_get_buf(vq, &len); ++ if (!skb) + break; +- } + + vsock->rx_buf_nr--; + + /* Drop short/long packets */ +- if (unlikely(len < sizeof(pkt->hdr) || +- len > sizeof(pkt->hdr) + pkt->len)) { +- virtio_transport_free_pkt(pkt); ++ if (unlikely(len < sizeof(struct virtio_vsock_hdr) || ++ len > virtio_vsock_skb_len(skb))) { ++ kfree_skb(skb); + continue; + } + +- pkt->len = len - sizeof(pkt->hdr); +- virtio_transport_deliver_tap_pkt(pkt); +- virtio_transport_recv_pkt(&virtio_transport, pkt); ++ virtio_vsock_skb_rx_put(skb); ++ virtio_transport_deliver_tap_pkt(skb); ++ virtio_transport_recv_pkt(&virtio_transport, skb); + } + } while (!virtqueue_enable_cb(vq)); + +@@ -624,7 +587,7 @@ static void virtio_vsock_vqs_start(struct virtio_vsock *vsock) + static void virtio_vsock_vqs_del(struct virtio_vsock *vsock) + { + struct virtio_device *vdev = vsock->vdev; +- struct virtio_vsock_pkt *pkt; ++ struct sk_buff *skb; + + /* Reset all connected sockets when the VQs disappear */ + vsock_for_each_connected_socket(&virtio_transport.transport, +@@ -651,23 +614,16 @@ static void virtio_vsock_vqs_del(struct virtio_vsock *vsock) + virtio_reset_device(vdev); + + mutex_lock(&vsock->rx_lock); +- while ((pkt = virtqueue_detach_unused_buf(vsock->vqs[VSOCK_VQ_RX]))) +- virtio_transport_free_pkt(pkt); ++ while ((skb = virtqueue_detach_unused_buf(vsock->vqs[VSOCK_VQ_RX]))) ++ kfree_skb(skb); 
+ mutex_unlock(&vsock->rx_lock); + + mutex_lock(&vsock->tx_lock); +- while ((pkt = virtqueue_detach_unused_buf(vsock->vqs[VSOCK_VQ_TX]))) +- virtio_transport_free_pkt(pkt); ++ while ((skb = virtqueue_detach_unused_buf(vsock->vqs[VSOCK_VQ_TX]))) ++ kfree_skb(skb); + mutex_unlock(&vsock->tx_lock); + +- spin_lock_bh(&vsock->send_pkt_list_lock); +- while (!list_empty(&vsock->send_pkt_list)) { +- pkt = list_first_entry(&vsock->send_pkt_list, +- struct virtio_vsock_pkt, list); +- list_del(&pkt->list); +- virtio_transport_free_pkt(pkt); +- } +- spin_unlock_bh(&vsock->send_pkt_list_lock); ++ virtio_vsock_skb_queue_purge(&vsock->send_pkt_queue); + + /* Delete virtqueues and flush outstanding callbacks if any */ + vdev->config->del_vqs(vdev); +@@ -704,8 +660,7 @@ static int virtio_vsock_probe(struct virtio_device *vdev) + mutex_init(&vsock->tx_lock); + mutex_init(&vsock->rx_lock); + mutex_init(&vsock->event_lock); +- spin_lock_init(&vsock->send_pkt_list_lock); +- INIT_LIST_HEAD(&vsock->send_pkt_list); ++ skb_queue_head_init(&vsock->send_pkt_queue); + INIT_WORK(&vsock->rx_work, virtio_transport_rx_work); + INIT_WORK(&vsock->tx_work, virtio_transport_tx_work); + INIT_WORK(&vsock->event_work, virtio_transport_event_work); +diff --git a/net/vmw_vsock/virtio_transport_common.c b/net/vmw_vsock/virtio_transport_common.c +index a9980e9b93040..a1581c77cf84a 100644 +--- a/net/vmw_vsock/virtio_transport_common.c ++++ b/net/vmw_vsock/virtio_transport_common.c +@@ -37,53 +37,56 @@ virtio_transport_get_ops(struct vsock_sock *vsk) + return container_of(t, struct virtio_transport, transport); + } + +-static struct virtio_vsock_pkt * +-virtio_transport_alloc_pkt(struct virtio_vsock_pkt_info *info, ++/* Returns a new packet on success, otherwise returns NULL. ++ * ++ * If NULL is returned, errp is set to a negative errno. 
++ */ ++static struct sk_buff * ++virtio_transport_alloc_skb(struct virtio_vsock_pkt_info *info, + size_t len, + u32 src_cid, + u32 src_port, + u32 dst_cid, + u32 dst_port) + { +- struct virtio_vsock_pkt *pkt; ++ const size_t skb_len = VIRTIO_VSOCK_SKB_HEADROOM + len; ++ struct virtio_vsock_hdr *hdr; ++ struct sk_buff *skb; ++ void *payload; + int err; + +- pkt = kzalloc(sizeof(*pkt), GFP_KERNEL); +- if (!pkt) ++ skb = virtio_vsock_alloc_skb(skb_len, GFP_KERNEL); ++ if (!skb) + return NULL; + +- pkt->hdr.type = cpu_to_le16(info->type); +- pkt->hdr.op = cpu_to_le16(info->op); +- pkt->hdr.src_cid = cpu_to_le64(src_cid); +- pkt->hdr.dst_cid = cpu_to_le64(dst_cid); +- pkt->hdr.src_port = cpu_to_le32(src_port); +- pkt->hdr.dst_port = cpu_to_le32(dst_port); +- pkt->hdr.flags = cpu_to_le32(info->flags); +- pkt->len = len; +- pkt->hdr.len = cpu_to_le32(len); +- pkt->reply = info->reply; +- pkt->vsk = info->vsk; ++ hdr = virtio_vsock_hdr(skb); ++ hdr->type = cpu_to_le16(info->type); ++ hdr->op = cpu_to_le16(info->op); ++ hdr->src_cid = cpu_to_le64(src_cid); ++ hdr->dst_cid = cpu_to_le64(dst_cid); ++ hdr->src_port = cpu_to_le32(src_port); ++ hdr->dst_port = cpu_to_le32(dst_port); ++ hdr->flags = cpu_to_le32(info->flags); ++ hdr->len = cpu_to_le32(len); + + if (info->msg && len > 0) { +- pkt->buf = kmalloc(len, GFP_KERNEL); +- if (!pkt->buf) +- goto out_pkt; +- +- pkt->buf_len = len; +- +- err = memcpy_from_msg(pkt->buf, info->msg, len); ++ payload = skb_put(skb, len); ++ err = memcpy_from_msg(payload, info->msg, len); + if (err) + goto out; + + if (msg_data_left(info->msg) == 0 && + info->type == VIRTIO_VSOCK_TYPE_SEQPACKET) { +- pkt->hdr.flags |= cpu_to_le32(VIRTIO_VSOCK_SEQ_EOM); ++ hdr->flags |= cpu_to_le32(VIRTIO_VSOCK_SEQ_EOM); + + if (info->msg->msg_flags & MSG_EOR) +- pkt->hdr.flags |= cpu_to_le32(VIRTIO_VSOCK_SEQ_EOR); ++ hdr->flags |= cpu_to_le32(VIRTIO_VSOCK_SEQ_EOR); + } + } + ++ if (info->reply) ++ virtio_vsock_skb_set_reply(skb); ++ + trace_virtio_transport_alloc_pkt(src_cid, src_port, + dst_cid, dst_port, + len, +@@ -91,19 +94,18 @@ virtio_transport_alloc_pkt(struct virtio_vsock_pkt_info *info, + info->op, + info->flags); + +- return pkt; ++ return skb; + + out: +- kfree(pkt->buf); +-out_pkt: +- kfree(pkt); ++ kfree_skb(skb); + return NULL; + } + + /* Packet capture */ + static struct sk_buff *virtio_transport_build_skb(void *opaque) + { +- struct virtio_vsock_pkt *pkt = opaque; ++ struct virtio_vsock_hdr *pkt_hdr; ++ struct sk_buff *pkt = opaque; + struct af_vsockmon_hdr *hdr; + struct sk_buff *skb; + size_t payload_len; +@@ -113,10 +115,11 @@ static struct sk_buff *virtio_transport_build_skb(void *opaque) + * the payload length from the header and the buffer pointer taking + * care of the offset in the original packet. 
+ */ +- payload_len = le32_to_cpu(pkt->hdr.len); +- payload_buf = pkt->buf + pkt->off; ++ pkt_hdr = virtio_vsock_hdr(pkt); ++ payload_len = pkt->len; ++ payload_buf = pkt->data; + +- skb = alloc_skb(sizeof(*hdr) + sizeof(pkt->hdr) + payload_len, ++ skb = alloc_skb(sizeof(*hdr) + sizeof(*pkt_hdr) + payload_len, + GFP_ATOMIC); + if (!skb) + return NULL; +@@ -124,16 +127,16 @@ static struct sk_buff *virtio_transport_build_skb(void *opaque) + hdr = skb_put(skb, sizeof(*hdr)); + + /* pkt->hdr is little-endian so no need to byteswap here */ +- hdr->src_cid = pkt->hdr.src_cid; +- hdr->src_port = pkt->hdr.src_port; +- hdr->dst_cid = pkt->hdr.dst_cid; +- hdr->dst_port = pkt->hdr.dst_port; ++ hdr->src_cid = pkt_hdr->src_cid; ++ hdr->src_port = pkt_hdr->src_port; ++ hdr->dst_cid = pkt_hdr->dst_cid; ++ hdr->dst_port = pkt_hdr->dst_port; + + hdr->transport = cpu_to_le16(AF_VSOCK_TRANSPORT_VIRTIO); +- hdr->len = cpu_to_le16(sizeof(pkt->hdr)); ++ hdr->len = cpu_to_le16(sizeof(*pkt_hdr)); + memset(hdr->reserved, 0, sizeof(hdr->reserved)); + +- switch (le16_to_cpu(pkt->hdr.op)) { ++ switch (le16_to_cpu(pkt_hdr->op)) { + case VIRTIO_VSOCK_OP_REQUEST: + case VIRTIO_VSOCK_OP_RESPONSE: + hdr->op = cpu_to_le16(AF_VSOCK_OP_CONNECT); +@@ -154,7 +157,7 @@ static struct sk_buff *virtio_transport_build_skb(void *opaque) + break; + } + +- skb_put_data(skb, &pkt->hdr, sizeof(pkt->hdr)); ++ skb_put_data(skb, pkt_hdr, sizeof(*pkt_hdr)); + + if (payload_len) { + skb_put_data(skb, payload_buf, payload_len); +@@ -163,13 +166,13 @@ static struct sk_buff *virtio_transport_build_skb(void *opaque) + return skb; + } + +-void virtio_transport_deliver_tap_pkt(struct virtio_vsock_pkt *pkt) ++void virtio_transport_deliver_tap_pkt(struct sk_buff *skb) + { +- if (pkt->tap_delivered) ++ if (virtio_vsock_skb_tap_delivered(skb)) + return; + +- vsock_deliver_tap(virtio_transport_build_skb, pkt); +- pkt->tap_delivered = true; ++ vsock_deliver_tap(virtio_transport_build_skb, skb); ++ virtio_vsock_skb_set_tap_delivered(skb); + } + EXPORT_SYMBOL_GPL(virtio_transport_deliver_tap_pkt); + +@@ -192,8 +195,8 @@ static int virtio_transport_send_pkt_info(struct vsock_sock *vsk, + u32 src_cid, src_port, dst_cid, dst_port; + const struct virtio_transport *t_ops; + struct virtio_vsock_sock *vvs; +- struct virtio_vsock_pkt *pkt; + u32 pkt_len = info->pkt_len; ++ struct sk_buff *skb; + + info->type = virtio_transport_get_type(sk_vsock(vsk)); + +@@ -224,42 +227,47 @@ static int virtio_transport_send_pkt_info(struct vsock_sock *vsk, + if (pkt_len == 0 && info->op == VIRTIO_VSOCK_OP_RW) + return pkt_len; + +- pkt = virtio_transport_alloc_pkt(info, pkt_len, ++ skb = virtio_transport_alloc_skb(info, pkt_len, + src_cid, src_port, + dst_cid, dst_port); +- if (!pkt) { ++ if (!skb) { + virtio_transport_put_credit(vvs, pkt_len); + return -ENOMEM; + } + +- virtio_transport_inc_tx_pkt(vvs, pkt); ++ virtio_transport_inc_tx_pkt(vvs, skb); + +- return t_ops->send_pkt(pkt); ++ return t_ops->send_pkt(skb); + } + + static bool virtio_transport_inc_rx_pkt(struct virtio_vsock_sock *vvs, +- struct virtio_vsock_pkt *pkt) ++ struct sk_buff *skb) + { +- if (vvs->rx_bytes + pkt->len > vvs->buf_alloc) ++ if (vvs->rx_bytes + skb->len > vvs->buf_alloc) + return false; + +- vvs->rx_bytes += pkt->len; ++ vvs->rx_bytes += skb->len; + return true; + } + + static void virtio_transport_dec_rx_pkt(struct virtio_vsock_sock *vvs, +- struct virtio_vsock_pkt *pkt) ++ struct sk_buff *skb) + { +- vvs->rx_bytes -= pkt->len; +- vvs->fwd_cnt += pkt->len; ++ int len; ++ ++ len = skb_headroom(skb) 
- sizeof(struct virtio_vsock_hdr) - skb->len; ++ vvs->rx_bytes -= len; ++ vvs->fwd_cnt += len; + } + +-void virtio_transport_inc_tx_pkt(struct virtio_vsock_sock *vvs, struct virtio_vsock_pkt *pkt) ++void virtio_transport_inc_tx_pkt(struct virtio_vsock_sock *vvs, struct sk_buff *skb) + { ++ struct virtio_vsock_hdr *hdr = virtio_vsock_hdr(skb); ++ + spin_lock_bh(&vvs->rx_lock); + vvs->last_fwd_cnt = vvs->fwd_cnt; +- pkt->hdr.fwd_cnt = cpu_to_le32(vvs->fwd_cnt); +- pkt->hdr.buf_alloc = cpu_to_le32(vvs->buf_alloc); ++ hdr->fwd_cnt = cpu_to_le32(vvs->fwd_cnt); ++ hdr->buf_alloc = cpu_to_le32(vvs->buf_alloc); + spin_unlock_bh(&vvs->rx_lock); + } + EXPORT_SYMBOL_GPL(virtio_transport_inc_tx_pkt); +@@ -303,29 +311,29 @@ virtio_transport_stream_do_peek(struct vsock_sock *vsk, + size_t len) + { + struct virtio_vsock_sock *vvs = vsk->trans; +- struct virtio_vsock_pkt *pkt; + size_t bytes, total = 0, off; ++ struct sk_buff *skb, *tmp; + int err = -EFAULT; + + spin_lock_bh(&vvs->rx_lock); + +- list_for_each_entry(pkt, &vvs->rx_queue, list) { +- off = pkt->off; ++ skb_queue_walk_safe(&vvs->rx_queue, skb, tmp) { ++ off = 0; + + if (total == len) + break; + +- while (total < len && off < pkt->len) { ++ while (total < len && off < skb->len) { + bytes = len - total; +- if (bytes > pkt->len - off) +- bytes = pkt->len - off; ++ if (bytes > skb->len - off) ++ bytes = skb->len - off; + + /* sk_lock is held by caller so no one else can dequeue. + * Unlock rx_lock since memcpy_to_msg() may sleep. + */ + spin_unlock_bh(&vvs->rx_lock); + +- err = memcpy_to_msg(msg, pkt->buf + off, bytes); ++ err = memcpy_to_msg(msg, skb->data + off, bytes); + if (err) + goto out; + +@@ -352,37 +360,38 @@ virtio_transport_stream_do_dequeue(struct vsock_sock *vsk, + size_t len) + { + struct virtio_vsock_sock *vvs = vsk->trans; +- struct virtio_vsock_pkt *pkt; + size_t bytes, total = 0; +- u32 free_space; ++ struct sk_buff *skb; + int err = -EFAULT; ++ u32 free_space; + + spin_lock_bh(&vvs->rx_lock); +- while (total < len && !list_empty(&vvs->rx_queue)) { +- pkt = list_first_entry(&vvs->rx_queue, +- struct virtio_vsock_pkt, list); ++ while (total < len && !skb_queue_empty(&vvs->rx_queue)) { ++ skb = __skb_dequeue(&vvs->rx_queue); + + bytes = len - total; +- if (bytes > pkt->len - pkt->off) +- bytes = pkt->len - pkt->off; ++ if (bytes > skb->len) ++ bytes = skb->len; + + /* sk_lock is held by caller so no one else can dequeue. + * Unlock rx_lock since memcpy_to_msg() may sleep. 
+ */ + spin_unlock_bh(&vvs->rx_lock); + +- err = memcpy_to_msg(msg, pkt->buf + pkt->off, bytes); ++ err = memcpy_to_msg(msg, skb->data, bytes); + if (err) + goto out; + + spin_lock_bh(&vvs->rx_lock); + + total += bytes; +- pkt->off += bytes; +- if (pkt->off == pkt->len) { +- virtio_transport_dec_rx_pkt(vvs, pkt); +- list_del(&pkt->list); +- virtio_transport_free_pkt(pkt); ++ skb_pull(skb, bytes); ++ ++ if (skb->len == 0) { ++ virtio_transport_dec_rx_pkt(vvs, skb); ++ consume_skb(skb); ++ } else { ++ __skb_queue_head(&vvs->rx_queue, skb); + } + } + +@@ -414,10 +423,10 @@ static int virtio_transport_seqpacket_do_dequeue(struct vsock_sock *vsk, + int flags) + { + struct virtio_vsock_sock *vvs = vsk->trans; +- struct virtio_vsock_pkt *pkt; + int dequeued_len = 0; + size_t user_buf_len = msg_data_left(msg); + bool msg_ready = false; ++ struct sk_buff *skb; + + spin_lock_bh(&vvs->rx_lock); + +@@ -427,13 +436,18 @@ static int virtio_transport_seqpacket_do_dequeue(struct vsock_sock *vsk, + } + + while (!msg_ready) { +- pkt = list_first_entry(&vvs->rx_queue, struct virtio_vsock_pkt, list); ++ struct virtio_vsock_hdr *hdr; ++ ++ skb = __skb_dequeue(&vvs->rx_queue); ++ if (!skb) ++ break; ++ hdr = virtio_vsock_hdr(skb); + + if (dequeued_len >= 0) { + size_t pkt_len; + size_t bytes_to_copy; + +- pkt_len = (size_t)le32_to_cpu(pkt->hdr.len); ++ pkt_len = (size_t)le32_to_cpu(hdr->len); + bytes_to_copy = min(user_buf_len, pkt_len); + + if (bytes_to_copy) { +@@ -444,7 +458,7 @@ static int virtio_transport_seqpacket_do_dequeue(struct vsock_sock *vsk, + */ + spin_unlock_bh(&vvs->rx_lock); + +- err = memcpy_to_msg(msg, pkt->buf, bytes_to_copy); ++ err = memcpy_to_msg(msg, skb->data, bytes_to_copy); + if (err) { + /* Copy of message failed. Rest of + * fragments will be freed without copy. 
+@@ -452,6 +466,7 @@ static int virtio_transport_seqpacket_do_dequeue(struct vsock_sock *vsk, + dequeued_len = err; + } else { + user_buf_len -= bytes_to_copy; ++ skb_pull(skb, bytes_to_copy); + } + + spin_lock_bh(&vvs->rx_lock); +@@ -461,17 +476,16 @@ static int virtio_transport_seqpacket_do_dequeue(struct vsock_sock *vsk, + dequeued_len += pkt_len; + } + +- if (le32_to_cpu(pkt->hdr.flags) & VIRTIO_VSOCK_SEQ_EOM) { ++ if (le32_to_cpu(hdr->flags) & VIRTIO_VSOCK_SEQ_EOM) { + msg_ready = true; + vvs->msg_count--; + +- if (le32_to_cpu(pkt->hdr.flags) & VIRTIO_VSOCK_SEQ_EOR) ++ if (le32_to_cpu(hdr->flags) & VIRTIO_VSOCK_SEQ_EOR) + msg->msg_flags |= MSG_EOR; + } + +- virtio_transport_dec_rx_pkt(vvs, pkt); +- list_del(&pkt->list); +- virtio_transport_free_pkt(pkt); ++ virtio_transport_dec_rx_pkt(vvs, skb); ++ kfree_skb(skb); + } + + spin_unlock_bh(&vvs->rx_lock); +@@ -609,7 +623,7 @@ int virtio_transport_do_socket_init(struct vsock_sock *vsk, + + spin_lock_init(&vvs->rx_lock); + spin_lock_init(&vvs->tx_lock); +- INIT_LIST_HEAD(&vvs->rx_queue); ++ skb_queue_head_init(&vvs->rx_queue); + + return 0; + } +@@ -806,16 +820,16 @@ void virtio_transport_destruct(struct vsock_sock *vsk) + EXPORT_SYMBOL_GPL(virtio_transport_destruct); + + static int virtio_transport_reset(struct vsock_sock *vsk, +- struct virtio_vsock_pkt *pkt) ++ struct sk_buff *skb) + { + struct virtio_vsock_pkt_info info = { + .op = VIRTIO_VSOCK_OP_RST, +- .reply = !!pkt, ++ .reply = !!skb, + .vsk = vsk, + }; + + /* Send RST only if the original pkt is not a RST pkt */ +- if (pkt && le16_to_cpu(pkt->hdr.op) == VIRTIO_VSOCK_OP_RST) ++ if (skb && le16_to_cpu(virtio_vsock_hdr(skb)->op) == VIRTIO_VSOCK_OP_RST) + return 0; + + return virtio_transport_send_pkt_info(vsk, &info); +@@ -825,29 +839,30 @@ static int virtio_transport_reset(struct vsock_sock *vsk, + * attempt was made to connect to a socket that does not exist. + */ + static int virtio_transport_reset_no_sock(const struct virtio_transport *t, +- struct virtio_vsock_pkt *pkt) ++ struct sk_buff *skb) + { +- struct virtio_vsock_pkt *reply; ++ struct virtio_vsock_hdr *hdr = virtio_vsock_hdr(skb); + struct virtio_vsock_pkt_info info = { + .op = VIRTIO_VSOCK_OP_RST, +- .type = le16_to_cpu(pkt->hdr.type), ++ .type = le16_to_cpu(hdr->type), + .reply = true, + }; ++ struct sk_buff *reply; + + /* Send RST only if the original pkt is not a RST pkt */ +- if (le16_to_cpu(pkt->hdr.op) == VIRTIO_VSOCK_OP_RST) ++ if (le16_to_cpu(hdr->op) == VIRTIO_VSOCK_OP_RST) + return 0; + +- reply = virtio_transport_alloc_pkt(&info, 0, +- le64_to_cpu(pkt->hdr.dst_cid), +- le32_to_cpu(pkt->hdr.dst_port), +- le64_to_cpu(pkt->hdr.src_cid), +- le32_to_cpu(pkt->hdr.src_port)); ++ reply = virtio_transport_alloc_skb(&info, 0, ++ le64_to_cpu(hdr->dst_cid), ++ le32_to_cpu(hdr->dst_port), ++ le64_to_cpu(hdr->src_cid), ++ le32_to_cpu(hdr->src_port)); + if (!reply) + return -ENOMEM; + + if (!t) { +- virtio_transport_free_pkt(reply); ++ kfree_skb(reply); + return -ENOTCONN; + } + +@@ -858,16 +873,11 @@ static int virtio_transport_reset_no_sock(const struct virtio_transport *t, + static void virtio_transport_remove_sock(struct vsock_sock *vsk) + { + struct virtio_vsock_sock *vvs = vsk->trans; +- struct virtio_vsock_pkt *pkt, *tmp; + + /* We don't need to take rx_lock, as the socket is closing and we are + * removing it. 
+ */ +- list_for_each_entry_safe(pkt, tmp, &vvs->rx_queue, list) { +- list_del(&pkt->list); +- virtio_transport_free_pkt(pkt); +- } +- ++ __skb_queue_purge(&vvs->rx_queue); + vsock_remove_sock(vsk); + } + +@@ -981,13 +991,14 @@ EXPORT_SYMBOL_GPL(virtio_transport_release); + + static int + virtio_transport_recv_connecting(struct sock *sk, +- struct virtio_vsock_pkt *pkt) ++ struct sk_buff *skb) + { ++ struct virtio_vsock_hdr *hdr = virtio_vsock_hdr(skb); + struct vsock_sock *vsk = vsock_sk(sk); +- int err; + int skerr; ++ int err; + +- switch (le16_to_cpu(pkt->hdr.op)) { ++ switch (le16_to_cpu(hdr->op)) { + case VIRTIO_VSOCK_OP_RESPONSE: + sk->sk_state = TCP_ESTABLISHED; + sk->sk_socket->state = SS_CONNECTED; +@@ -1008,7 +1019,7 @@ virtio_transport_recv_connecting(struct sock *sk, + return 0; + + destroy: +- virtio_transport_reset(vsk, pkt); ++ virtio_transport_reset(vsk, skb); + sk->sk_state = TCP_CLOSE; + sk->sk_err = skerr; + sk_error_report(sk); +@@ -1017,34 +1028,37 @@ virtio_transport_recv_connecting(struct sock *sk, + + static void + virtio_transport_recv_enqueue(struct vsock_sock *vsk, +- struct virtio_vsock_pkt *pkt) ++ struct sk_buff *skb) + { + struct virtio_vsock_sock *vvs = vsk->trans; + bool can_enqueue, free_pkt = false; ++ struct virtio_vsock_hdr *hdr; ++ u32 len; + +- pkt->len = le32_to_cpu(pkt->hdr.len); +- pkt->off = 0; ++ hdr = virtio_vsock_hdr(skb); ++ len = le32_to_cpu(hdr->len); + + spin_lock_bh(&vvs->rx_lock); + +- can_enqueue = virtio_transport_inc_rx_pkt(vvs, pkt); ++ can_enqueue = virtio_transport_inc_rx_pkt(vvs, skb); + if (!can_enqueue) { + free_pkt = true; + goto out; + } + +- if (le32_to_cpu(pkt->hdr.flags) & VIRTIO_VSOCK_SEQ_EOM) ++ if (le32_to_cpu(hdr->flags) & VIRTIO_VSOCK_SEQ_EOM) + vvs->msg_count++; + + /* Try to copy small packets into the buffer of last packet queued, + * to avoid wasting memory queueing the entire buffer with a small + * payload. + */ +- if (pkt->len <= GOOD_COPY_LEN && !list_empty(&vvs->rx_queue)) { +- struct virtio_vsock_pkt *last_pkt; ++ if (len <= GOOD_COPY_LEN && !skb_queue_empty(&vvs->rx_queue)) { ++ struct virtio_vsock_hdr *last_hdr; ++ struct sk_buff *last_skb; + +- last_pkt = list_last_entry(&vvs->rx_queue, +- struct virtio_vsock_pkt, list); ++ last_skb = skb_peek_tail(&vvs->rx_queue); ++ last_hdr = virtio_vsock_hdr(last_skb); + + /* If there is space in the last packet queued, we copy the + * new packet in its buffer. We avoid this if the last packet +@@ -1052,35 +1066,35 @@ virtio_transport_recv_enqueue(struct vsock_sock *vsk, + * delimiter of SEQPACKET message, so 'pkt' is the first packet + * of a new message. 
+ */ +- if ((pkt->len <= last_pkt->buf_len - last_pkt->len) && +- !(le32_to_cpu(last_pkt->hdr.flags) & VIRTIO_VSOCK_SEQ_EOM)) { +- memcpy(last_pkt->buf + last_pkt->len, pkt->buf, +- pkt->len); +- last_pkt->len += pkt->len; ++ if (skb->len < skb_tailroom(last_skb) && ++ !(le32_to_cpu(last_hdr->flags) & VIRTIO_VSOCK_SEQ_EOM)) { ++ memcpy(skb_put(last_skb, skb->len), skb->data, skb->len); + free_pkt = true; +- last_pkt->hdr.flags |= pkt->hdr.flags; ++ last_hdr->flags |= hdr->flags; ++ last_hdr->len = cpu_to_le32(last_skb->len); + goto out; + } + } + +- list_add_tail(&pkt->list, &vvs->rx_queue); ++ __skb_queue_tail(&vvs->rx_queue, skb); + + out: + spin_unlock_bh(&vvs->rx_lock); + if (free_pkt) +- virtio_transport_free_pkt(pkt); ++ kfree_skb(skb); + } + + static int + virtio_transport_recv_connected(struct sock *sk, +- struct virtio_vsock_pkt *pkt) ++ struct sk_buff *skb) + { ++ struct virtio_vsock_hdr *hdr = virtio_vsock_hdr(skb); + struct vsock_sock *vsk = vsock_sk(sk); + int err = 0; + +- switch (le16_to_cpu(pkt->hdr.op)) { ++ switch (le16_to_cpu(hdr->op)) { + case VIRTIO_VSOCK_OP_RW: +- virtio_transport_recv_enqueue(vsk, pkt); ++ virtio_transport_recv_enqueue(vsk, skb); + vsock_data_ready(sk); + return err; + case VIRTIO_VSOCK_OP_CREDIT_REQUEST: +@@ -1090,18 +1104,17 @@ virtio_transport_recv_connected(struct sock *sk, + sk->sk_write_space(sk); + break; + case VIRTIO_VSOCK_OP_SHUTDOWN: +- if (le32_to_cpu(pkt->hdr.flags) & VIRTIO_VSOCK_SHUTDOWN_RCV) ++ if (le32_to_cpu(hdr->flags) & VIRTIO_VSOCK_SHUTDOWN_RCV) + vsk->peer_shutdown |= RCV_SHUTDOWN; +- if (le32_to_cpu(pkt->hdr.flags) & VIRTIO_VSOCK_SHUTDOWN_SEND) ++ if (le32_to_cpu(hdr->flags) & VIRTIO_VSOCK_SHUTDOWN_SEND) + vsk->peer_shutdown |= SEND_SHUTDOWN; + if (vsk->peer_shutdown == SHUTDOWN_MASK && + vsock_stream_has_data(vsk) <= 0 && + !sock_flag(sk, SOCK_DONE)) { + (void)virtio_transport_reset(vsk, NULL); +- + virtio_transport_do_close(vsk, true); + } +- if (le32_to_cpu(pkt->hdr.flags)) ++ if (le32_to_cpu(virtio_vsock_hdr(skb)->flags)) + sk->sk_state_change(sk); + break; + case VIRTIO_VSOCK_OP_RST: +@@ -1112,28 +1125,30 @@ virtio_transport_recv_connected(struct sock *sk, + break; + } + +- virtio_transport_free_pkt(pkt); ++ kfree_skb(skb); + return err; + } + + static void + virtio_transport_recv_disconnecting(struct sock *sk, +- struct virtio_vsock_pkt *pkt) ++ struct sk_buff *skb) + { ++ struct virtio_vsock_hdr *hdr = virtio_vsock_hdr(skb); + struct vsock_sock *vsk = vsock_sk(sk); + +- if (le16_to_cpu(pkt->hdr.op) == VIRTIO_VSOCK_OP_RST) ++ if (le16_to_cpu(hdr->op) == VIRTIO_VSOCK_OP_RST) + virtio_transport_do_close(vsk, true); + } + + static int + virtio_transport_send_response(struct vsock_sock *vsk, +- struct virtio_vsock_pkt *pkt) ++ struct sk_buff *skb) + { ++ struct virtio_vsock_hdr *hdr = virtio_vsock_hdr(skb); + struct virtio_vsock_pkt_info info = { + .op = VIRTIO_VSOCK_OP_RESPONSE, +- .remote_cid = le64_to_cpu(pkt->hdr.src_cid), +- .remote_port = le32_to_cpu(pkt->hdr.src_port), ++ .remote_cid = le64_to_cpu(hdr->src_cid), ++ .remote_port = le32_to_cpu(hdr->src_port), + .reply = true, + .vsk = vsk, + }; +@@ -1142,8 +1157,9 @@ virtio_transport_send_response(struct vsock_sock *vsk, + } + + static bool virtio_transport_space_update(struct sock *sk, +- struct virtio_vsock_pkt *pkt) ++ struct sk_buff *skb) + { ++ struct virtio_vsock_hdr *hdr = virtio_vsock_hdr(skb); + struct vsock_sock *vsk = vsock_sk(sk); + struct virtio_vsock_sock *vvs = vsk->trans; + bool space_available; +@@ -1158,8 +1174,8 @@ static bool 
virtio_transport_space_update(struct sock *sk, + + /* buf_alloc and fwd_cnt is always included in the hdr */ + spin_lock_bh(&vvs->tx_lock); +- vvs->peer_buf_alloc = le32_to_cpu(pkt->hdr.buf_alloc); +- vvs->peer_fwd_cnt = le32_to_cpu(pkt->hdr.fwd_cnt); ++ vvs->peer_buf_alloc = le32_to_cpu(hdr->buf_alloc); ++ vvs->peer_fwd_cnt = le32_to_cpu(hdr->fwd_cnt); + space_available = virtio_transport_has_space(vsk); + spin_unlock_bh(&vvs->tx_lock); + return space_available; +@@ -1167,27 +1183,28 @@ static bool virtio_transport_space_update(struct sock *sk, + + /* Handle server socket */ + static int +-virtio_transport_recv_listen(struct sock *sk, struct virtio_vsock_pkt *pkt, ++virtio_transport_recv_listen(struct sock *sk, struct sk_buff *skb, + struct virtio_transport *t) + { ++ struct virtio_vsock_hdr *hdr = virtio_vsock_hdr(skb); + struct vsock_sock *vsk = vsock_sk(sk); + struct vsock_sock *vchild; + struct sock *child; + int ret; + +- if (le16_to_cpu(pkt->hdr.op) != VIRTIO_VSOCK_OP_REQUEST) { +- virtio_transport_reset_no_sock(t, pkt); ++ if (le16_to_cpu(hdr->op) != VIRTIO_VSOCK_OP_REQUEST) { ++ virtio_transport_reset_no_sock(t, skb); + return -EINVAL; + } + + if (sk_acceptq_is_full(sk)) { +- virtio_transport_reset_no_sock(t, pkt); ++ virtio_transport_reset_no_sock(t, skb); + return -ENOMEM; + } + + child = vsock_create_connected(sk); + if (!child) { +- virtio_transport_reset_no_sock(t, pkt); ++ virtio_transport_reset_no_sock(t, skb); + return -ENOMEM; + } + +@@ -1198,10 +1215,10 @@ virtio_transport_recv_listen(struct sock *sk, struct virtio_vsock_pkt *pkt, + child->sk_state = TCP_ESTABLISHED; + + vchild = vsock_sk(child); +- vsock_addr_init(&vchild->local_addr, le64_to_cpu(pkt->hdr.dst_cid), +- le32_to_cpu(pkt->hdr.dst_port)); +- vsock_addr_init(&vchild->remote_addr, le64_to_cpu(pkt->hdr.src_cid), +- le32_to_cpu(pkt->hdr.src_port)); ++ vsock_addr_init(&vchild->local_addr, le64_to_cpu(hdr->dst_cid), ++ le32_to_cpu(hdr->dst_port)); ++ vsock_addr_init(&vchild->remote_addr, le64_to_cpu(hdr->src_cid), ++ le32_to_cpu(hdr->src_port)); + + ret = vsock_assign_transport(vchild, vsk); + /* Transport assigned (looking at remote_addr) must be the same +@@ -1209,17 +1226,17 @@ virtio_transport_recv_listen(struct sock *sk, struct virtio_vsock_pkt *pkt, + */ + if (ret || vchild->transport != &t->transport) { + release_sock(child); +- virtio_transport_reset_no_sock(t, pkt); ++ virtio_transport_reset_no_sock(t, skb); + sock_put(child); + return ret; + } + +- if (virtio_transport_space_update(child, pkt)) ++ if (virtio_transport_space_update(child, skb)) + child->sk_write_space(child); + + vsock_insert_connected(vchild); + vsock_enqueue_accept(sk, child); +- virtio_transport_send_response(vchild, pkt); ++ virtio_transport_send_response(vchild, skb); + + release_sock(child); + +@@ -1237,29 +1254,30 @@ static bool virtio_transport_valid_type(u16 type) + * lock. 
+ */ + void virtio_transport_recv_pkt(struct virtio_transport *t, +- struct virtio_vsock_pkt *pkt) ++ struct sk_buff *skb) + { ++ struct virtio_vsock_hdr *hdr = virtio_vsock_hdr(skb); + struct sockaddr_vm src, dst; + struct vsock_sock *vsk; + struct sock *sk; + bool space_available; + +- vsock_addr_init(&src, le64_to_cpu(pkt->hdr.src_cid), +- le32_to_cpu(pkt->hdr.src_port)); +- vsock_addr_init(&dst, le64_to_cpu(pkt->hdr.dst_cid), +- le32_to_cpu(pkt->hdr.dst_port)); ++ vsock_addr_init(&src, le64_to_cpu(hdr->src_cid), ++ le32_to_cpu(hdr->src_port)); ++ vsock_addr_init(&dst, le64_to_cpu(hdr->dst_cid), ++ le32_to_cpu(hdr->dst_port)); + + trace_virtio_transport_recv_pkt(src.svm_cid, src.svm_port, + dst.svm_cid, dst.svm_port, +- le32_to_cpu(pkt->hdr.len), +- le16_to_cpu(pkt->hdr.type), +- le16_to_cpu(pkt->hdr.op), +- le32_to_cpu(pkt->hdr.flags), +- le32_to_cpu(pkt->hdr.buf_alloc), +- le32_to_cpu(pkt->hdr.fwd_cnt)); +- +- if (!virtio_transport_valid_type(le16_to_cpu(pkt->hdr.type))) { +- (void)virtio_transport_reset_no_sock(t, pkt); ++ le32_to_cpu(hdr->len), ++ le16_to_cpu(hdr->type), ++ le16_to_cpu(hdr->op), ++ le32_to_cpu(hdr->flags), ++ le32_to_cpu(hdr->buf_alloc), ++ le32_to_cpu(hdr->fwd_cnt)); ++ ++ if (!virtio_transport_valid_type(le16_to_cpu(hdr->type))) { ++ (void)virtio_transport_reset_no_sock(t, skb); + goto free_pkt; + } + +@@ -1270,13 +1288,13 @@ void virtio_transport_recv_pkt(struct virtio_transport *t, + if (!sk) { + sk = vsock_find_bound_socket(&dst); + if (!sk) { +- (void)virtio_transport_reset_no_sock(t, pkt); ++ (void)virtio_transport_reset_no_sock(t, skb); + goto free_pkt; + } + } + +- if (virtio_transport_get_type(sk) != le16_to_cpu(pkt->hdr.type)) { +- (void)virtio_transport_reset_no_sock(t, pkt); ++ if (virtio_transport_get_type(sk) != le16_to_cpu(hdr->type)) { ++ (void)virtio_transport_reset_no_sock(t, skb); + sock_put(sk); + goto free_pkt; + } +@@ -1287,13 +1305,13 @@ void virtio_transport_recv_pkt(struct virtio_transport *t, + + /* Check if sk has been closed before lock_sock */ + if (sock_flag(sk, SOCK_DONE)) { +- (void)virtio_transport_reset_no_sock(t, pkt); ++ (void)virtio_transport_reset_no_sock(t, skb); + release_sock(sk); + sock_put(sk); + goto free_pkt; + } + +- space_available = virtio_transport_space_update(sk, pkt); ++ space_available = virtio_transport_space_update(sk, skb); + + /* Update CID in case it has changed after a transport reset event */ + if (vsk->local_addr.svm_cid != VMADDR_CID_ANY) +@@ -1304,23 +1322,23 @@ void virtio_transport_recv_pkt(struct virtio_transport *t, + + switch (sk->sk_state) { + case TCP_LISTEN: +- virtio_transport_recv_listen(sk, pkt, t); +- virtio_transport_free_pkt(pkt); ++ virtio_transport_recv_listen(sk, skb, t); ++ kfree_skb(skb); + break; + case TCP_SYN_SENT: +- virtio_transport_recv_connecting(sk, pkt); +- virtio_transport_free_pkt(pkt); ++ virtio_transport_recv_connecting(sk, skb); ++ kfree_skb(skb); + break; + case TCP_ESTABLISHED: +- virtio_transport_recv_connected(sk, pkt); ++ virtio_transport_recv_connected(sk, skb); + break; + case TCP_CLOSING: +- virtio_transport_recv_disconnecting(sk, pkt); +- virtio_transport_free_pkt(pkt); ++ virtio_transport_recv_disconnecting(sk, skb); ++ kfree_skb(skb); + break; + default: +- (void)virtio_transport_reset_no_sock(t, pkt); +- virtio_transport_free_pkt(pkt); ++ (void)virtio_transport_reset_no_sock(t, skb); ++ kfree_skb(skb); + break; + } + +@@ -1333,16 +1351,42 @@ void virtio_transport_recv_pkt(struct virtio_transport *t, + return; + + free_pkt: +- virtio_transport_free_pkt(pkt); 
++ kfree_skb(skb); + } + EXPORT_SYMBOL_GPL(virtio_transport_recv_pkt); + +-void virtio_transport_free_pkt(struct virtio_vsock_pkt *pkt) ++/* Remove skbs found in a queue that have a vsk that matches. ++ * ++ * Each skb is freed. ++ * ++ * Returns the count of skbs that were reply packets. ++ */ ++int virtio_transport_purge_skbs(void *vsk, struct sk_buff_head *queue) + { +- kvfree(pkt->buf); +- kfree(pkt); ++ struct sk_buff_head freeme; ++ struct sk_buff *skb, *tmp; ++ int cnt = 0; ++ ++ skb_queue_head_init(&freeme); ++ ++ spin_lock_bh(&queue->lock); ++ skb_queue_walk_safe(queue, skb, tmp) { ++ if (vsock_sk(skb->sk) != vsk) ++ continue; ++ ++ __skb_unlink(skb, queue); ++ __skb_queue_tail(&freeme, skb); ++ ++ if (virtio_vsock_skb_reply(skb)) ++ cnt++; ++ } ++ spin_unlock_bh(&queue->lock); ++ ++ __skb_queue_purge(&freeme); ++ ++ return cnt; + } +-EXPORT_SYMBOL_GPL(virtio_transport_free_pkt); ++EXPORT_SYMBOL_GPL(virtio_transport_purge_skbs); + + MODULE_LICENSE("GPL v2"); + MODULE_AUTHOR("Asias He"); +diff --git a/net/vmw_vsock/vsock_loopback.c b/net/vmw_vsock/vsock_loopback.c +index 169a8cf65b390..671e03240fc52 100644 +--- a/net/vmw_vsock/vsock_loopback.c ++++ b/net/vmw_vsock/vsock_loopback.c +@@ -16,7 +16,7 @@ struct vsock_loopback { + struct workqueue_struct *workqueue; + + spinlock_t pkt_list_lock; /* protects pkt_list */ +- struct list_head pkt_list; ++ struct sk_buff_head pkt_queue; + struct work_struct pkt_work; + }; + +@@ -27,13 +27,13 @@ static u32 vsock_loopback_get_local_cid(void) + return VMADDR_CID_LOCAL; + } + +-static int vsock_loopback_send_pkt(struct virtio_vsock_pkt *pkt) ++static int vsock_loopback_send_pkt(struct sk_buff *skb) + { + struct vsock_loopback *vsock = &the_vsock_loopback; +- int len = pkt->len; ++ int len = skb->len; + + spin_lock_bh(&vsock->pkt_list_lock); +- list_add_tail(&pkt->list, &vsock->pkt_list); ++ skb_queue_tail(&vsock->pkt_queue, skb); + spin_unlock_bh(&vsock->pkt_list_lock); + + queue_work(vsock->workqueue, &vsock->pkt_work); +@@ -44,21 +44,8 @@ static int vsock_loopback_send_pkt(struct virtio_vsock_pkt *pkt) + static int vsock_loopback_cancel_pkt(struct vsock_sock *vsk) + { + struct vsock_loopback *vsock = &the_vsock_loopback; +- struct virtio_vsock_pkt *pkt, *n; +- LIST_HEAD(freeme); + +- spin_lock_bh(&vsock->pkt_list_lock); +- list_for_each_entry_safe(pkt, n, &vsock->pkt_list, list) { +- if (pkt->vsk != vsk) +- continue; +- list_move(&pkt->list, &freeme); +- } +- spin_unlock_bh(&vsock->pkt_list_lock); +- +- list_for_each_entry_safe(pkt, n, &freeme, list) { +- list_del(&pkt->list); +- virtio_transport_free_pkt(pkt); +- } ++ virtio_transport_purge_skbs(vsk, &vsock->pkt_queue); + + return 0; + } +@@ -121,20 +108,18 @@ static void vsock_loopback_work(struct work_struct *work) + { + struct vsock_loopback *vsock = + container_of(work, struct vsock_loopback, pkt_work); +- LIST_HEAD(pkts); ++ struct sk_buff_head pkts; ++ struct sk_buff *skb; ++ ++ skb_queue_head_init(&pkts); + + spin_lock_bh(&vsock->pkt_list_lock); +- list_splice_init(&vsock->pkt_list, &pkts); ++ skb_queue_splice_init(&vsock->pkt_queue, &pkts); + spin_unlock_bh(&vsock->pkt_list_lock); + +- while (!list_empty(&pkts)) { +- struct virtio_vsock_pkt *pkt; +- +- pkt = list_first_entry(&pkts, struct virtio_vsock_pkt, list); +- list_del_init(&pkt->list); +- +- virtio_transport_deliver_tap_pkt(pkt); +- virtio_transport_recv_pkt(&loopback_transport, pkt); ++ while ((skb = __skb_dequeue(&pkts))) { ++ virtio_transport_deliver_tap_pkt(skb); ++ virtio_transport_recv_pkt(&loopback_transport, skb); + } + 
} + +@@ -148,7 +133,7 @@ static int __init vsock_loopback_init(void) + return -ENOMEM; + + spin_lock_init(&vsock->pkt_list_lock); +- INIT_LIST_HEAD(&vsock->pkt_list); ++ skb_queue_head_init(&vsock->pkt_queue); + INIT_WORK(&vsock->pkt_work, vsock_loopback_work); + + ret = vsock_core_register(&loopback_transport.transport, +@@ -166,19 +151,13 @@ static int __init vsock_loopback_init(void) + static void __exit vsock_loopback_exit(void) + { + struct vsock_loopback *vsock = &the_vsock_loopback; +- struct virtio_vsock_pkt *pkt; + + vsock_core_unregister(&loopback_transport.transport); + + flush_work(&vsock->pkt_work); + + spin_lock_bh(&vsock->pkt_list_lock); +- while (!list_empty(&vsock->pkt_list)) { +- pkt = list_first_entry(&vsock->pkt_list, +- struct virtio_vsock_pkt, list); +- list_del(&pkt->list); +- virtio_transport_free_pkt(pkt); +- } ++ virtio_vsock_skb_queue_purge(&vsock->pkt_queue); + spin_unlock_bh(&vsock->pkt_list_lock); + + destroy_workqueue(vsock->workqueue); +-- +2.42.0 + diff --git a/queue-6.1/vsock-virtio-remove-socket-from-connected-bound-list.patch b/queue-6.1/vsock-virtio-remove-socket-from-connected-bound-list.patch new file mode 100644 index 00000000000..012b78e0d13 --- /dev/null +++ b/queue-6.1/vsock-virtio-remove-socket-from-connected-bound-list.patch @@ -0,0 +1,75 @@ +From 1ee503bd1f558d4497498a5f553aa2e0961bfa18 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Fri, 3 Nov 2023 18:55:48 +0100 +Subject: vsock/virtio: remove socket from connected/bound list on shutdown + +From: Filippo Storniolo + +[ Upstream commit 3a5cc90a4d1756072619fe511d07621bdef7f120 ] + +If the same remote peer, using the same port, tries to connect +to a server on a listening port more than once, the server will +reject the connection, causing a "connection reset by peer" +error on the remote peer. This is due to the presence of a +dangling socket from a previous connection in both the connected +and bound socket lists. +The inconsistency of the above lists only occurs when the remote +peer disconnects and the server remains active. + +This bug does not occur when the server socket is closed: +virtio_transport_release() will eventually schedule a call to +virtio_transport_do_close() and the latter will remove the socket +from the bound and connected socket lists and clear the sk_buff. + +However, virtio_transport_do_close() will only perform the above +actions if it has been scheduled, and this will not happen +if the server is processing the shutdown message from a remote peer. + +To fix this, introduce a call to vsock_remove_sock() +when the server is handling a client disconnect. +This is to remove the socket from the bound and connected socket +lists without clearing the sk_buff. + +Fixes: 06a8fc78367d ("VSOCK: Introduce virtio_vsock_common.ko") +Reported-by: Daan De Meyer +Tested-by: Daan De Meyer +Co-developed-by: Luigi Leonardi +Signed-off-by: Luigi Leonardi +Signed-off-by: Filippo Storniolo +Reviewed-by: Stefano Garzarella +Signed-off-by: David S. 
Miller +Signed-off-by: Sasha Levin +--- + net/vmw_vsock/virtio_transport_common.c | 16 +++++++++++----- + 1 file changed, 11 insertions(+), 5 deletions(-) + +diff --git a/net/vmw_vsock/virtio_transport_common.c b/net/vmw_vsock/virtio_transport_common.c +index a1581c77cf84a..73e5093928325 100644 +--- a/net/vmw_vsock/virtio_transport_common.c ++++ b/net/vmw_vsock/virtio_transport_common.c +@@ -1108,11 +1108,17 @@ virtio_transport_recv_connected(struct sock *sk, + vsk->peer_shutdown |= RCV_SHUTDOWN; + if (le32_to_cpu(hdr->flags) & VIRTIO_VSOCK_SHUTDOWN_SEND) + vsk->peer_shutdown |= SEND_SHUTDOWN; +- if (vsk->peer_shutdown == SHUTDOWN_MASK && +- vsock_stream_has_data(vsk) <= 0 && +- !sock_flag(sk, SOCK_DONE)) { +- (void)virtio_transport_reset(vsk, NULL); +- virtio_transport_do_close(vsk, true); ++ if (vsk->peer_shutdown == SHUTDOWN_MASK) { ++ if (vsock_stream_has_data(vsk) <= 0 && !sock_flag(sk, SOCK_DONE)) { ++ (void)virtio_transport_reset(vsk, NULL); ++ virtio_transport_do_close(vsk, true); ++ } ++ /* Remove this socket anyway because the remote peer sent ++ * the shutdown. This way a new connection will succeed ++ * if the remote peer uses the same source port, ++ * even if the old socket is still unreleased, but now disconnected. ++ */ ++ vsock_remove_sock(vsk); + } + if (le32_to_cpu(virtio_vsock_hdr(skb)->flags)) + sk->sk_state_change(sk); +-- +2.42.0 + diff --git a/queue-6.1/watchdog-ixp4xx-make-sure-restart-always-works.patch b/queue-6.1/watchdog-ixp4xx-make-sure-restart-always-works.patch new file mode 100644 index 00000000000..20d7e627cc3 --- /dev/null +++ b/queue-6.1/watchdog-ixp4xx-make-sure-restart-always-works.patch @@ -0,0 +1,88 @@ +From 9156bf060d4bf888e7dae1dfad5025795047337f Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Tue, 26 Sep 2023 11:13:44 +0200 +Subject: watchdog: ixp4xx: Make sure restart always works + +From: Linus Walleij + +[ Upstream commit b4075ecfe348a44209534c75ad72392c63a489a6 ] + +The IXP4xx watchdog in early "A0" silicon is unreliable and +cannot be registered, however for some systems such as the +USRobotics USR8200 the watchdog is the only restart option, +so implement a "dummy" watchdog that can only support restart +in this case. + +Fixes: 1aea522809e6 ("watchdog: ixp4xx: Implement restart") +Signed-off-by: Linus Walleij +Reviewed-by: Guenter Roeck +Link: https://lore.kernel.org/r/20230926-ixp4xx-wdt-restart-v2-1-15cf4639b423@linaro.org +Signed-off-by: Guenter Roeck +Signed-off-by: Wim Van Sebroeck +Signed-off-by: Sasha Levin +--- + drivers/watchdog/ixp4xx_wdt.c | 28 +++++++++++++++++++++++++--- + 1 file changed, 25 insertions(+), 3 deletions(-) + +diff --git a/drivers/watchdog/ixp4xx_wdt.c b/drivers/watchdog/ixp4xx_wdt.c +index 281a48d9889fc..0fc91e9c4a773 100644 +--- a/drivers/watchdog/ixp4xx_wdt.c ++++ b/drivers/watchdog/ixp4xx_wdt.c +@@ -105,6 +105,25 @@ static const struct watchdog_ops ixp4xx_wdt_ops = { + .owner = THIS_MODULE, + }; + ++/* ++ * The A0 version of the IXP422 had a bug in the watchdog making ++ * is useless, but we still need to use it to restart the system ++ * as it is the only way, so in this special case we register a ++ * "dummy" watchdog that doesn't really work, but will support ++ * the restart operation. 
++ */ ++static int ixp4xx_wdt_dummy(struct watchdog_device *wdd) ++{ ++ return 0; ++} ++ ++static const struct watchdog_ops ixp4xx_wdt_restart_only_ops = { ++ .start = ixp4xx_wdt_dummy, ++ .stop = ixp4xx_wdt_dummy, ++ .restart = ixp4xx_wdt_restart, ++ .owner = THIS_MODULE, ++}; ++ + static const struct watchdog_info ixp4xx_wdt_info = { + .options = WDIOF_KEEPALIVEPING + | WDIOF_MAGICCLOSE +@@ -120,14 +139,17 @@ static void ixp4xx_clock_action(void *d) + + static int ixp4xx_wdt_probe(struct platform_device *pdev) + { ++ static const struct watchdog_ops *iwdt_ops; + struct device *dev = &pdev->dev; + struct ixp4xx_wdt *iwdt; + struct clk *clk; + int ret; + + if (!(read_cpuid_id() & 0xf) && !cpu_is_ixp46x()) { +- dev_err(dev, "Rev. A0 IXP42x CPU detected - watchdog disabled\n"); +- return -ENODEV; ++ dev_info(dev, "Rev. A0 IXP42x CPU detected - only restart supported\n"); ++ iwdt_ops = &ixp4xx_wdt_restart_only_ops; ++ } else { ++ iwdt_ops = &ixp4xx_wdt_ops; + } + + iwdt = devm_kzalloc(dev, sizeof(*iwdt), GFP_KERNEL); +@@ -153,7 +175,7 @@ static int ixp4xx_wdt_probe(struct platform_device *pdev) + iwdt->rate = IXP4XX_TIMER_FREQ; + + iwdt->wdd.info = &ixp4xx_wdt_info; +- iwdt->wdd.ops = &ixp4xx_wdt_ops; ++ iwdt->wdd.ops = iwdt_ops; + iwdt->wdd.min_timeout = 1; + iwdt->wdd.max_timeout = U32_MAX / iwdt->rate; + iwdt->wdd.parent = dev; +-- +2.42.0 +