--- /dev/null
+From d4c784e91102c0fe5bfb9431f7842247c55a3d6d Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Wed, 18 Oct 2023 08:42:18 +0100
+Subject: afs: Add comments on abort handling
+
+From: David Howells <dhowells@redhat.com>
+
+[ Upstream commit fe245c8fcdac339e6b42076c828a6bede3a5e948 ]
+
+Add some comments on AFS abort code handling in the rotation algorithm and
+adjust the errors produced to match.
+
+Reported-by: Jeffrey E Altman <jaltman@auristor.com>
+Signed-off-by: David Howells <dhowells@redhat.com>
+Reviewed-by: Jeffrey Altman <jaltman@auristor.com>
+cc: Marc Dionne <marc.dionne@auristor.com>
+cc: linux-afs@lists.infradead.org
+Stable-dep-of: 17ba6f0bd14f ("afs: Fix error handling with lookup via FS.InlineBulkStatus")
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ fs/afs/rotate.c | 101 ++++++++++++++++++++++++++++++++++++++++++------
+ 1 file changed, 90 insertions(+), 11 deletions(-)
+
+diff --git a/fs/afs/rotate.c b/fs/afs/rotate.c
+index a840c3588ebb..a3d127953ac6 100644
+--- a/fs/afs/rotate.c
++++ b/fs/afs/rotate.c
+@@ -13,6 +13,7 @@
+ #include <linux/sched/signal.h>
+ #include "internal.h"
+ #include "afs_fs.h"
++#include "protocol_uae.h"
+
+ /*
+ * Begin iteration through a server list, starting with the vnode's last used
+@@ -143,6 +144,11 @@ bool afs_select_fileserver(struct afs_operation *op)
+ case -ECONNABORTED:
+ /* The far side rejected the operation on some grounds. This
+ * might involve the server being busy or the volume having been moved.
++ *
++ * Note that various V* errors should not be sent to a cache manager
++ * by a fileserver as they should be translated to more modern UAE*
++ * errors instead. IBM AFS and OpenAFS fileservers, however, do leak
++ * these abort codes.
+ */
+ switch (op->ac.abort_code) {
+ case VNOVOL:
+@@ -150,6 +156,11 @@ bool afs_select_fileserver(struct afs_operation *op)
+ * - May indicate that the VL is wrong - retry once and compare
+ * the results.
+ * - May indicate that the fileserver couldn't attach to the vol.
++ * - The volume might have been temporarily removed so that it can
++ * be replaced by a volume restore. "vos" might have ended one
++ * transaction and has yet to create the next.
++ * - The volume might not be blessed or might not be in-service
++ * (administrative action).
+ */
+ if (op->flags & AFS_OPERATION_VNOVOL) {
+ op->error = -EREMOTEIO;
+@@ -183,16 +194,56 @@ bool afs_select_fileserver(struct afs_operation *op)
+ _leave(" = t [vnovol]");
+ return true;
+
+- case VSALVAGE: /* TODO: Should this return an error or iterate? */
+ case VVOLEXISTS:
+- case VNOSERVICE:
+ case VONLINE:
+- case VDISKFULL:
+- case VOVERQUOTA:
+- op->error = afs_abort_to_error(op->ac.abort_code);
++ /* These should not be returned from the fileserver. */
++ pr_warn("Fileserver returned unexpected abort %d\n",
++ op->ac.abort_code);
++ op->error = -EREMOTEIO;
++ goto next_server;
++
++ case VNOSERVICE:
++ /* Prior to AFS 3.2 VNOSERVICE was returned from the fileserver
++ * if the volume was neither in-service nor administratively
++ * blessed. All usage was replaced by VNOVOL because AFS 3.1 and
++ * earlier cache managers did not handle VNOSERVICE and assumed
++ * it was the client OS's errno 105.
++ *
++ * Starting with OpenAFS 1.4.8 VNOSERVICE was repurposed as the
++ * fileserver idle dead time error which was sent in place of
++ * RX_CALL_TIMEOUT (-3). The error was intended to be sent if the
++ * fileserver took too long to send a reply to the client.
++ * RX_CALL_TIMEOUT would have caused the cache manager to mark the
++ * server down whereas VNOSERVICE since AFS 3.2 would cause the
++ * cache manager to temporarily (up to 15 minutes) mark the volume
++ * instance as unusable.
++ *
++ * The idle dead logic resulted in cache inconsistency since a
++ * state changing call that the cache manager assumed was dead
++ * could still be processed to completion by the fileserver. This
++ * logic was removed in OpenAFS 1.8.0 and VNOSERVICE is no longer
++ * returned. However, many 1.4.8 through 1.6.24 fileservers are
++ * still in existence.
++ *
++ * AuriStorFS fileservers have never returned VNOSERVICE.
++ *
++ * VNOSERVICE should be treated as an alias for RX_CALL_TIMEOUT.
++ */
++ case RX_CALL_TIMEOUT:
++ op->error = -ETIMEDOUT;
+ goto next_server;
+
++ case VSALVAGING: /* This error should not be leaked to cache managers
++ * but is from OpenAFS demand attach fileservers.
++ * It should be treated as an alias for VOFFLINE.
++ */
++ case VSALVAGE: /* VSALVAGE should be treated as a synonym of VOFFLINE */
+ case VOFFLINE:
++ /* The volume is in use by the volserver or another volume utility
++ * for an operation that might alter the contents. The volume is
++ * expected to come back but it might take a long time (could be
++ * days).
++ */
+ if (!test_and_set_bit(AFS_VOLUME_OFFLINE, &op->volume->flags)) {
+ afs_busy(op->volume, op->ac.abort_code);
+ clear_bit(AFS_VOLUME_BUSY, &op->volume->flags);
+@@ -207,11 +258,20 @@ bool afs_select_fileserver(struct afs_operation *op)
+ }
+ goto busy;
+
+- case VSALVAGING:
+- case VRESTARTING:
++ case VRESTARTING: /* The fileserver is either shutting down or starting up. */
+ case VBUSY:
+- /* Retry after going round all the servers unless we
+- * have a file lock we need to maintain.
++ /* The volume is in use by the volserver or another volume
++ * utility for an operation that is not expected to alter the
++ * contents of the volume. VBUSY does not need to be returned
++ * for a ROVOL or BACKVOL bound to an ITBusy volserver
++ * transaction. The fileserver is permitted to continue serving
++ * content from ROVOLs and BACKVOLs during an ITBusy transaction
++ * because the content will not change. However, many fileserver
++ * releases do return VBUSY for ROVOL and BACKVOL instances under
++ * many circumstances.
++ *
++ * Retry after going round all the servers unless we have a file
++ * lock we need to maintain.
+ */
+ if (op->flags & AFS_OPERATION_NO_VSLEEP) {
+ op->error = -EBUSY;
+@@ -226,7 +286,7 @@ bool afs_select_fileserver(struct afs_operation *op)
+ if (!afs_sleep_and_retry(op))
+ goto failed;
+
+- /* Retry with same server & address */
++ /* Retry with same server & address */
+ _leave(" = t [vbusy]");
+ return true;
+ }
+@@ -270,10 +330,29 @@ bool afs_select_fileserver(struct afs_operation *op)
+
+ goto restart_from_beginning;
+
++ case VDISKFULL:
++ case UAENOSPC:
++ /* The partition is full. Only applies to RWVOLs.
++ * Translate locally and return ENOSPC.
++ * No replicas to failover to.
++ */
++ op->error = -ENOSPC;
++ goto failed_but_online;
++
++ case VOVERQUOTA:
++ case UAEDQUOT:
++ /* Volume is full. Only applies to RWVOLs.
++ * Translate locally and return EDQUOT.
++ * No replicas to failover to.
++ */
++ op->error = -EDQUOT;
++ goto failed_but_online;
++
+ default:
++ op->error = afs_abort_to_error(op->ac.abort_code);
++ failed_but_online:
+ clear_bit(AFS_VOLUME_OFFLINE, &op->volume->flags);
+ clear_bit(AFS_VOLUME_BUSY, &op->volume->flags);
+- op->error = afs_abort_to_error(op->ac.abort_code);
+ goto failed;
+ }
+
+--
+2.43.0
+
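The comment block added by this patch amounts to a decision table: each abort
code either rotates the operation to another fileserver, sleeps and retries the
volume, or is translated to an errno and fails the operation. Below is a
minimal stand-alone C sketch of that table, not the kernel's implementation;
the EX_* constants and the afs_next_action enum are invented stand-ins (the
real definitions live under fs/afs).

/*
 * Minimal sketch of the decision table described by the comments above.
 * The EX_* values and enum afs_next_action are illustrative only.
 */
#include <errno.h>

enum afs_next_action {
	AFS_NEXT_SERVER,	/* rotate to another fileserver */
	AFS_SLEEP_RETRY,	/* transient: wait, then retry */
	AFS_FAIL,		/* record errno and stop */
};

enum {	/* hypothetical abort-code values, for illustration */
	EX_VSALVAGE = 101, EX_VNOSERVICE = 105, EX_VOFFLINE = 106,
	EX_VDISKFULL = 108, EX_VOVERQUOTA = 109, EX_VBUSY = 110,
	EX_VSALVAGING = 113, EX_VRESTARTING = 114,
};

static enum afs_next_action classify_abort(int abort_code, int *errp)
{
	switch (abort_code) {
	case EX_VBUSY:		/* non-content-changing volserver op */
	case EX_VRESTARTING:	/* fileserver stopping or starting */
	case EX_VSALVAGING:	/* treated as an alias of VOFFLINE */
	case EX_VSALVAGE:	/* treated as an alias of VOFFLINE */
	case EX_VOFFLINE:	/* content-changing volserver op */
		*errp = -EBUSY;
		return AFS_SLEEP_RETRY;
	case EX_VNOSERVICE:	/* alias of RX_CALL_TIMEOUT since 1.4.8 */
		*errp = -ETIMEDOUT;
		return AFS_NEXT_SERVER;
	case EX_VDISKFULL:	/* partition full; no replica helps */
		*errp = -ENOSPC;
		return AFS_FAIL;
	case EX_VOVERQUOTA:	/* volume quota hit; no replica helps */
		*errp = -EDQUOT;
		return AFS_FAIL;
	default:		/* unexpected abort from the server */
		*errp = -EREMOTEIO;
		return AFS_FAIL;
	}
}
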
--- /dev/null
+From 2885f7375cc37abe94fcd4895fa663b6b24e4904 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Thu, 26 Oct 2023 09:54:07 +0100
+Subject: afs: Don't put afs_call in afs_wait_for_call_to_complete()
+
+From: David Howells <dhowells@redhat.com>
+
+[ Upstream commit 6f2ff7e89bd05677f4c08fccafcf625ca3e09c1c ]
+
+Don't put the afs_call struct in afs_wait_for_call_to_complete() but rather
+have the caller do it. This will allow the caller to fish stuff out of the
+afs_call struct rather than the afs_addr_cursor struct, thereby allowing a
+subsequent patch to subsume it.
+
+Signed-off-by: David Howells <dhowells@redhat.com>
+cc: Marc Dionne <marc.dionne@auristor.com>
+cc: linux-afs@lists.infradead.org
+Stable-dep-of: 17ba6f0bd14f ("afs: Fix error handling with lookup via FS.InlineBulkStatus")
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ fs/afs/fs_operation.c | 7 +++--
+ fs/afs/fsclient.c | 5 ++-
+ fs/afs/internal.h | 2 +-
+ fs/afs/rxrpc.c | 73 ++++++++++++++++---------------------------
+ fs/afs/vlclient.c | 64 ++++++++++++++++++++++---------------
+ 5 files changed, 75 insertions(+), 76 deletions(-)
+
+diff --git a/fs/afs/fs_operation.c b/fs/afs/fs_operation.c
+index bfb9a7634bd9..1c22d6e77846 100644
+--- a/fs/afs/fs_operation.c
++++ b/fs/afs/fs_operation.c
+@@ -191,8 +191,11 @@ void afs_wait_for_operation(struct afs_operation *op)
+ else
+ op->ac.error = -ENOTSUPP;
+
+- if (op->call)
+- op->error = afs_wait_for_call_to_complete(op->call, &op->ac);
++ if (op->call) {
++ afs_wait_for_call_to_complete(op->call, &op->ac);
++ op->error = op->ac.error;
++ afs_put_call(op->call);
++ }
+ }
+
+ switch (op->error) {
+diff --git a/fs/afs/fsclient.c b/fs/afs/fsclient.c
+index 6821ce0f9d63..020073387111 100644
+--- a/fs/afs/fsclient.c
++++ b/fs/afs/fsclient.c
+@@ -1612,6 +1612,7 @@ int afs_fs_give_up_all_callbacks(struct afs_net *net,
+ {
+ struct afs_call *call;
+ __be32 *bp;
++ int ret;
+
+ _enter("");
+
+@@ -1627,7 +1628,9 @@ int afs_fs_give_up_all_callbacks(struct afs_net *net,
+
+ call->server = afs_use_server(server, afs_server_trace_give_up_cb);
+ afs_make_call(ac, call, GFP_NOFS);
+- return afs_wait_for_call_to_complete(call, ac);
++ afs_wait_for_call_to_complete(call, ac);
++ afs_put_call(call);
++ return ret;
+ }
+
+ /*
+diff --git a/fs/afs/internal.h b/fs/afs/internal.h
+index 1a306df267b0..45c4526b56be 100644
+--- a/fs/afs/internal.h
++++ b/fs/afs/internal.h
+@@ -1291,7 +1291,7 @@ extern void __net_exit afs_close_socket(struct afs_net *);
+ extern void afs_charge_preallocation(struct work_struct *);
+ extern void afs_put_call(struct afs_call *);
+ extern void afs_make_call(struct afs_addr_cursor *, struct afs_call *, gfp_t);
+-extern long afs_wait_for_call_to_complete(struct afs_call *, struct afs_addr_cursor *);
++void afs_wait_for_call_to_complete(struct afs_call *call, struct afs_addr_cursor *ac);
+ extern struct afs_call *afs_alloc_flat_call(struct afs_net *,
+ const struct afs_call_type *,
+ size_t, size_t);
+diff --git a/fs/afs/rxrpc.c b/fs/afs/rxrpc.c
+index 2603db03b7ff..dad8efadbc44 100644
+--- a/fs/afs/rxrpc.c
++++ b/fs/afs/rxrpc.c
+@@ -575,48 +575,44 @@ static void afs_deliver_to_call(struct afs_call *call)
+ /*
+ * Wait synchronously for a call to complete and clean up the call struct.
+ */
+-long afs_wait_for_call_to_complete(struct afs_call *call,
+- struct afs_addr_cursor *ac)
++void afs_wait_for_call_to_complete(struct afs_call *call, struct afs_addr_cursor *ac)
+ {
+- long ret;
+ bool rxrpc_complete = false;
+
+- DECLARE_WAITQUEUE(myself, current);
+-
+ _enter("");
+
+- ret = call->error;
+- if (ret < 0)
+- goto out;
++ if (!afs_check_call_state(call, AFS_CALL_COMPLETE)) {
++ DECLARE_WAITQUEUE(myself, current);
++
++ add_wait_queue(&call->waitq, &myself);
++ for (;;) {
++ set_current_state(TASK_UNINTERRUPTIBLE);
++
++ /* deliver any messages that are in the queue */
++ if (!afs_check_call_state(call, AFS_CALL_COMPLETE) &&
++ call->need_attention) {
++ call->need_attention = false;
++ __set_current_state(TASK_RUNNING);
++ afs_deliver_to_call(call);
++ continue;
++ }
+
+- add_wait_queue(&call->waitq, &myself);
+- for (;;) {
+- set_current_state(TASK_UNINTERRUPTIBLE);
+-
+- /* deliver any messages that are in the queue */
+- if (!afs_check_call_state(call, AFS_CALL_COMPLETE) &&
+- call->need_attention) {
+- call->need_attention = false;
+- __set_current_state(TASK_RUNNING);
+- afs_deliver_to_call(call);
+- continue;
+- }
++ if (afs_check_call_state(call, AFS_CALL_COMPLETE))
++ break;
+
+- if (afs_check_call_state(call, AFS_CALL_COMPLETE))
+- break;
++ if (!rxrpc_kernel_check_life(call->net->socket, call->rxcall)) {
++ /* rxrpc terminated the call. */
++ rxrpc_complete = true;
++ break;
++ }
+
+- if (!rxrpc_kernel_check_life(call->net->socket, call->rxcall)) {
+- /* rxrpc terminated the call. */
+- rxrpc_complete = true;
+- break;
++ schedule();
+ }
+
+- schedule();
++ remove_wait_queue(&call->waitq, &myself);
++ __set_current_state(TASK_RUNNING);
+ }
+
+- remove_wait_queue(&call->waitq, &myself);
+- __set_current_state(TASK_RUNNING);
+-
+ if (!afs_check_call_state(call, AFS_CALL_COMPLETE)) {
+ if (rxrpc_complete) {
+ afs_set_call_complete(call, call->error, call->abort_code);
+@@ -635,23 +631,8 @@ long afs_wait_for_call_to_complete(struct afs_call *call,
+ ac->error = call->error;
+ spin_unlock_bh(&call->state_lock);
+
+- ret = ac->error;
+- switch (ret) {
+- case 0:
+- ret = call->ret0;
+- call->ret0 = 0;
+-
+- fallthrough;
+- case -ECONNABORTED:
++ if (call->error == 0 || call->error == -ECONNABORTED)
+ ac->responded = true;
+- break;
+- }
+-
+-out:
+- _debug("call complete");
+- afs_put_call(call);
+- _leave(" = %p", (void *)ret);
+- return ret;
+ }
+
+ /*
+diff --git a/fs/afs/vlclient.c b/fs/afs/vlclient.c
+index 41e7932d75c6..650534892a20 100644
+--- a/fs/afs/vlclient.c
++++ b/fs/afs/vlclient.c
+@@ -106,12 +106,6 @@ static int afs_deliver_vl_get_entry_by_name_u(struct afs_call *call)
+ return 0;
+ }
+
+-static void afs_destroy_vl_get_entry_by_name_u(struct afs_call *call)
+-{
+- kfree(call->ret_vldb);
+- afs_flat_call_destructor(call);
+-}
+-
+ /*
+ * VL.GetEntryByNameU operation type.
+ */
+@@ -119,7 +113,7 @@ static const struct afs_call_type afs_RXVLGetEntryByNameU = {
+ .name = "VL.GetEntryByNameU",
+ .op = afs_VL_GetEntryByNameU,
+ .deliver = afs_deliver_vl_get_entry_by_name_u,
+- .destructor = afs_destroy_vl_get_entry_by_name_u,
++ .destructor = afs_flat_call_destructor,
+ };
+
+ /*
+@@ -166,7 +160,13 @@ struct afs_vldb_entry *afs_vl_get_entry_by_name_u(struct afs_vl_cursor *vc,
+
+ trace_afs_make_vl_call(call);
+ afs_make_call(&vc->ac, call, GFP_KERNEL);
+- return (struct afs_vldb_entry *)afs_wait_for_call_to_complete(call, &vc->ac);
++ afs_wait_for_call_to_complete(call, &vc->ac);
++ afs_put_call(call);
++ if (vc->ac.error) {
++ kfree(entry);
++ return ERR_PTR(vc->ac.error);
++ }
++ return entry;
+ }
+
+ /*
+@@ -249,12 +249,6 @@ static int afs_deliver_vl_get_addrs_u(struct afs_call *call)
+ return 0;
+ }
+
+-static void afs_vl_get_addrs_u_destructor(struct afs_call *call)
+-{
+- afs_put_addrlist(call->ret_alist);
+- return afs_flat_call_destructor(call);
+-}
+-
+ /*
+ * VL.GetAddrsU operation type.
+ */
+@@ -262,7 +256,7 @@ static const struct afs_call_type afs_RXVLGetAddrsU = {
+ .name = "VL.GetAddrsU",
+ .op = afs_VL_GetAddrsU,
+ .deliver = afs_deliver_vl_get_addrs_u,
+- .destructor = afs_vl_get_addrs_u_destructor,
++ .destructor = afs_flat_call_destructor,
+ };
+
+ /*
+@@ -273,6 +267,7 @@ struct afs_addr_list *afs_vl_get_addrs_u(struct afs_vl_cursor *vc,
+ const uuid_t *uuid)
+ {
+ struct afs_ListAddrByAttributes__xdr *r;
++ struct afs_addr_list *alist;
+ const struct afs_uuid *u = (const struct afs_uuid *)uuid;
+ struct afs_call *call;
+ struct afs_net *net = vc->cell->net;
+@@ -309,7 +304,14 @@ struct afs_addr_list *afs_vl_get_addrs_u(struct afs_vl_cursor *vc,
+
+ trace_afs_make_vl_call(call);
+ afs_make_call(&vc->ac, call, GFP_KERNEL);
+- return (struct afs_addr_list *)afs_wait_for_call_to_complete(call, &vc->ac);
++ afs_wait_for_call_to_complete(call, &vc->ac);
++ alist = call->ret_alist;
++ afs_put_call(call);
++ if (vc->ac.error) {
++ afs_put_addrlist(alist);
++ return ERR_PTR(vc->ac.error);
++ }
++ return alist;
+ }
+
+ /*
+@@ -618,7 +620,7 @@ static const struct afs_call_type afs_YFSVLGetEndpoints = {
+ .name = "YFSVL.GetEndpoints",
+ .op = afs_YFSVL_GetEndpoints,
+ .deliver = afs_deliver_yfsvl_get_endpoints,
+- .destructor = afs_vl_get_addrs_u_destructor,
++ .destructor = afs_flat_call_destructor,
+ };
+
+ /*
+@@ -628,6 +630,7 @@ static const struct afs_call_type afs_YFSVLGetEndpoints = {
+ struct afs_addr_list *afs_yfsvl_get_endpoints(struct afs_vl_cursor *vc,
+ const uuid_t *uuid)
+ {
++ struct afs_addr_list *alist;
+ struct afs_call *call;
+ struct afs_net *net = vc->cell->net;
+ __be32 *bp;
+@@ -652,7 +655,14 @@ struct afs_addr_list *afs_yfsvl_get_endpoints(struct afs_vl_cursor *vc,
+
+ trace_afs_make_vl_call(call);
+ afs_make_call(&vc->ac, call, GFP_KERNEL);
+- return (struct afs_addr_list *)afs_wait_for_call_to_complete(call, &vc->ac);
++ afs_wait_for_call_to_complete(call, &vc->ac);
++ alist = call->ret_alist;
++ afs_put_call(call);
++ if (vc->ac.error) {
++ afs_put_addrlist(alist);
++ return ERR_PTR(vc->ac.error);
++ }
++ return alist;
+ }
+
+ /*
+@@ -717,12 +727,6 @@ static int afs_deliver_yfsvl_get_cell_name(struct afs_call *call)
+ return 0;
+ }
+
+-static void afs_destroy_yfsvl_get_cell_name(struct afs_call *call)
+-{
+- kfree(call->ret_str);
+- afs_flat_call_destructor(call);
+-}
+-
+ /*
+ * VL.GetCapabilities operation type
+ */
+@@ -730,7 +734,7 @@ static const struct afs_call_type afs_YFSVLGetCellName = {
+ .name = "YFSVL.GetCellName",
+ .op = afs_YFSVL_GetCellName,
+ .deliver = afs_deliver_yfsvl_get_cell_name,
+- .destructor = afs_destroy_yfsvl_get_cell_name,
++ .destructor = afs_flat_call_destructor,
+ };
+
+ /*
+@@ -745,6 +749,7 @@ char *afs_yfsvl_get_cell_name(struct afs_vl_cursor *vc)
+ struct afs_call *call;
+ struct afs_net *net = vc->cell->net;
+ __be32 *bp;
++ char *cellname;
+
+ _enter("");
+
+@@ -763,5 +768,12 @@ char *afs_yfsvl_get_cell_name(struct afs_vl_cursor *vc)
+ /* Can't take a ref on server */
+ trace_afs_make_vl_call(call);
+ afs_make_call(&vc->ac, call, GFP_KERNEL);
+- return (char *)afs_wait_for_call_to_complete(call, &vc->ac);
++ afs_wait_for_call_to_complete(call, &vc->ac);
++ cellname = call->ret_str;
++ afs_put_call(call);
++ if (vc->ac.error) {
++ kfree(cellname);
++ return ERR_PTR(vc->ac.error);
++ }
++ return cellname;
+ }
+--
+2.43.0
+
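The refactor above changes reference ownership: afs_wait_for_call_to_complete()
used to multiplex its result into a long and drop the final reference, whereas
now the caller waits, fishes the results out of the afs_call, and then puts the
call itself. A compilable stand-alone sketch of that ownership pattern follows;
every name here (fake_call, put_call, issue_and_wait) is invented, not the
kernel's API.

/*
 * Sketch of the ownership change: the wait helper no longer consumes
 * the caller's reference, so results can still be read out of the
 * request structure after waiting.  Invented types throughout.
 */
#include <stdio.h>
#include <stdlib.h>

struct fake_call {
	int refcount;
	int error;		/* final call status */
	char *ret_str;		/* reply payload, read out by the caller */
};

static void put_call(struct fake_call *call)
{
	if (--call->refcount == 0)
		free(call);
}

static void wait_for_call(struct fake_call *call)
{
	/* Stands in for blocking until the reply is final. */
	call->error = 0;
	call->ret_str = "reply";
}

static char *issue_and_wait(struct fake_call *call)
{
	char *result;

	wait_for_call(call);	/* no longer consumes the reference */
	result = call->error ? NULL : call->ret_str;
	put_call(call);		/* the caller drops its own reference */
	return result;
}

int main(void)
{
	struct fake_call *call = calloc(1, sizeof(*call));

	if (!call)
		return 1;
	call->refcount = 1;
	printf("%s\n", issue_and_wait(call));
	return 0;
}
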
--- /dev/null
+From 118984ee79c3e4b1b67ee3211d94d7d945c76fec Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Tue, 2 Jan 2024 14:02:37 +0000
+Subject: afs: Fix error handling with lookup via FS.InlineBulkStatus
+
+From: David Howells <dhowells@redhat.com>
+
+[ Upstream commit 17ba6f0bd14fe3ac606aac6bebe5e69bdaad8ba1 ]
+
+When afs does a lookup, it tries to use FS.InlineBulkStatus to preemptively
+look up a bunch of files in the parent directory and cache this locally, on
+the basis that we might want to look at them too (for example if someone
+does an ls on a directory, they may want to then stat every file
+listed).
+
+FS.InlineBulkStatus can be considered a compound op with the normal abort
+code applying to the compound as a whole. Each status fetch within the
+compound is then given its own individual abort code - but, assuming no
+error prevents the bulk fetch from returning, the compound result will
+be 0, even if all the constituent status fetches failed.
+
+At the conclusion of afs_do_lookup(), we should use the abort code from the
+appropriate status to determine the error to return, if any - but instead
+it is assumed that we were successful if the op as a whole succeeded and we
+return an incompletely initialised inode, resulting in ENOENT, no matter
+the actual reason. In the particular instance reported, a vnode with no
+permission granted to be accessed is being given a UAEACCES abort code
+which should be reported as EACCES, but is instead being reported as
+ENOENT.
+
+Fix this by abandoning the inode (which will be cleaned up with the op) if
+file[1] has an abort code indicated and turn that abort code into an error
+instead.
+
+Whilst we're at it, add a tracepoint so that the abort codes of the
+individual subrequests of FS.InlineBulkStatus can be logged. At the moment
+only the container abort code can be logged.
+
+Fixes: e49c7b2f6de7 ("afs: Build an abstraction around an "operation" concept")
+Reported-by: Jeffrey Altman <jaltman@auristor.com>
+Signed-off-by: David Howells <dhowells@redhat.com>
+Reviewed-by: Marc Dionne <marc.dionne@auristor.com>
+cc: linux-afs@lists.infradead.org
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ fs/afs/dir.c | 12 +++++++++---
+ include/trace/events/afs.h | 25 +++++++++++++++++++++++++
+ 2 files changed, 34 insertions(+), 3 deletions(-)
+
+diff --git a/fs/afs/dir.c b/fs/afs/dir.c
+index 75896a677b96..9140780be5a4 100644
+--- a/fs/afs/dir.c
++++ b/fs/afs/dir.c
+@@ -716,6 +716,8 @@ static void afs_do_lookup_success(struct afs_operation *op)
+ break;
+ }
+
++ if (vp->scb.status.abort_code)
++ trace_afs_bulkstat_error(op, &vp->fid, i, vp->scb.status.abort_code);
+ if (!vp->scb.have_status && !vp->scb.have_error)
+ continue;
+
+@@ -905,12 +907,16 @@ static struct inode *afs_do_lookup(struct inode *dir, struct dentry *dentry,
+ afs_begin_vnode_operation(op);
+ afs_wait_for_operation(op);
+ }
+- inode = ERR_PTR(afs_op_error(op));
+
+ out_op:
+ if (!afs_op_error(op)) {
+- inode = &op->file[1].vnode->netfs.inode;
+- op->file[1].vnode = NULL;
++ if (op->file[1].scb.status.abort_code) {
++ afs_op_accumulate_error(op, -ECONNABORTED,
++ op->file[1].scb.status.abort_code);
++ } else {
++ inode = &op->file[1].vnode->netfs.inode;
++ op->file[1].vnode = NULL;
++ }
+ }
+
+ if (op->file[0].scb.have_status)
+diff --git a/include/trace/events/afs.h b/include/trace/events/afs.h
+index e9d412d19dbb..caec276515dc 100644
+--- a/include/trace/events/afs.h
++++ b/include/trace/events/afs.h
+@@ -1216,6 +1216,31 @@ TRACE_EVENT(afs_file_error,
+ __print_symbolic(__entry->where, afs_file_errors))
+ );
+
++TRACE_EVENT(afs_bulkstat_error,
++ TP_PROTO(struct afs_operation *op, struct afs_fid *fid, unsigned int index, s32 abort),
++
++ TP_ARGS(op, fid, index, abort),
++
++ TP_STRUCT__entry(
++ __field_struct(struct afs_fid, fid)
++ __field(unsigned int, op)
++ __field(unsigned int, index)
++ __field(s32, abort)
++ ),
++
++ TP_fast_assign(
++ __entry->op = op->debug_id;
++ __entry->fid = *fid;
++ __entry->index = index;
++ __entry->abort = abort;
++ ),
++
++ TP_printk("OP=%08x[%02x] %llx:%llx:%x a=%d",
++ __entry->op, __entry->index,
++ __entry->fid.vid, __entry->fid.vnode, __entry->fid.unique,
++ __entry->abort)
++ );
++
+ TRACE_EVENT(afs_cm_no_server,
+ TP_PROTO(struct afs_call *call, struct sockaddr_rxrpc *srx),
+
+--
+2.43.0
+
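The fix above hinges on one rule: the FS.InlineBulkStatus container can
complete with abort code 0 while the entry actually being looked up carries its
own abort code, and that per-entry code must be surfaced instead of handing
back a half-initialised inode. A stand-alone sketch of the check follows, with
invented types; the kernel records the (-ECONNABORTED, abort_code) pair via
afs_op_accumulate_error() and translates it to an errno later.

/*
 * Sketch of the per-entry check: the container RPC returned 0, but the
 * target entry may still have aborted individually.  The structures
 * are invented stand-ins for the kernel's status-cache types.
 */
#include <errno.h>
#include <stddef.h>

struct status_entry {
	int abort_code;		/* per-entry abort, 0 on success */
	void *inode;		/* valid only when abort_code == 0 */
};

struct op_result {
	int error;		/* e.g. -ECONNABORTED */
	int abort_code;		/* remote abort to translate later */
};

static void *bulk_lookup_result(struct status_entry *entries, size_t target,
				struct op_result *res)
{
	struct status_entry *e = &entries[target];

	if (e->abort_code) {
		/* Record the abort instead of trusting e->inode. */
		res->error = -ECONNABORTED;
		res->abort_code = e->abort_code;
		return NULL;
	}
	res->error = 0;
	return e->inode;
}
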
--- /dev/null
+From f7c86f260e13437b71984ae0f0cd27554335461d Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Thu, 30 Nov 2023 12:56:14 +0100
+Subject: afs: fix the usage of read_seqbegin_or_lock() in afs_find_server*()
+
+From: Oleg Nesterov <oleg@redhat.com>
+
+[ Upstream commit 1702e0654ca9a7bcd7c7619c8a5004db58945b71 ]
+
+David Howells says:
+
+ (5) afs_find_server().
+
+ There could be a lot of servers in the list and each server can have
+ multiple addresses, so I think this would be better with an exclusive
+ second pass.
+
+ The server list isn't likely to change all that often, but when it does
+ change, there's a good chance several servers are going to be
+ added/removed one after the other. Further, this is only going to be
+ used for incoming cache management/callback requests from the server,
+ which hopefully aren't going to happen too often - but it is remotely
+ drivable.
+
+ (6) afs_find_server_by_uuid().
+
+ Similarly to (5), there could be a lot of servers to search through, but
+ they are in a tree not a flat list, so it should be faster to process.
+ Again, it's not likely to change that often and, again, when it does
+ change it's likely to involve multiple changes. This can be driven
+ remotely by an incoming cache management request but is mostly going to
+ be driven by setting up or reconfiguring a volume's server list -
+ something that also isn't likely to happen often.
+
+Make the "seq" counter odd on the 2nd pass, otherwise read_seqbegin_or_lock()
+never takes the lock.
+
+Signed-off-by: Oleg Nesterov <oleg@redhat.com>
+Signed-off-by: David Howells <dhowells@redhat.com>
+cc: Marc Dionne <marc.dionne@auristor.com>
+cc: linux-afs@lists.infradead.org
+Link: https://lore.kernel.org/r/20231130115614.GA21581@redhat.com/
+Stable-dep-of: 17ba6f0bd14f ("afs: Fix error handling with lookup via FS.InlineBulkStatus")
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ fs/afs/server.c | 7 ++++---
+ 1 file changed, 4 insertions(+), 3 deletions(-)
+
+diff --git a/fs/afs/server.c b/fs/afs/server.c
+index b5237206eac3..0bd2f5ba6900 100644
+--- a/fs/afs/server.c
++++ b/fs/afs/server.c
+@@ -27,7 +27,7 @@ struct afs_server *afs_find_server(struct afs_net *net,
+ const struct afs_addr_list *alist;
+ struct afs_server *server = NULL;
+ unsigned int i;
+- int seq = 0, diff;
++ int seq = 1, diff;
+
+ rcu_read_lock();
+
+@@ -35,6 +35,7 @@ struct afs_server *afs_find_server(struct afs_net *net,
+ if (server)
+ afs_unuse_server_notime(net, server, afs_server_trace_put_find_rsq);
+ server = NULL;
++ seq++; /* 2 on the 1st/lockless path, otherwise odd */
+ read_seqbegin_or_lock(&net->fs_addr_lock, &seq);
+
+ if (srx->transport.family == AF_INET6) {
+@@ -90,7 +91,7 @@ struct afs_server *afs_find_server_by_uuid(struct afs_net *net, const uuid_t *uu
+ {
+ struct afs_server *server = NULL;
+ struct rb_node *p;
+- int diff, seq = 0;
++ int diff, seq = 1;
+
+ _enter("%pU", uuid);
+
+@@ -102,7 +103,7 @@ struct afs_server *afs_find_server_by_uuid(struct afs_net *net, const uuid_t *uu
+ if (server)
+ afs_unuse_server(net, server, afs_server_trace_put_uuid_rsq);
+ server = NULL;
+-
++ seq++; /* 2 on the 1st/lockless path, otherwise odd */
+ read_seqbegin_or_lock(&net->fs_lock, &seq);
+
+ p = net->fs_servers.rb_node;
+--
+2.43.0
+
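The underlying pattern is the seqlock lockless-then-locked retry loop:
read_seqbegin_or_lock() only takes the spinlock when the sequence number passed
in is odd, so the counter must be stepped each time round, exactly as the two
seq++ lines above do. A kernel-style sketch of the corrected loop follows;
struct thing and thing_lock are invented, and this builds only in kernel
context.

/*
 * Kernel-style sketch of the corrected retry loop.  The first pass runs
 * locklessly (seq becomes even); if a concurrent write is detected, the
 * retry pass takes the lock (seq becomes odd), which is what
 * read_seqbegin_or_lock() requires.
 */
#include <linux/seqlock.h>
#include <linux/list.h>

struct thing {
	int key;
	struct list_head link;
};

static DEFINE_SEQLOCK(thing_lock);
static LIST_HEAD(things);

static struct thing *find_thing(int key)
{
	struct thing *t, *found;
	int seq = 1;

	do {
		found = NULL;
		seq++;	/* 2 on the 1st/lockless pass, odd on retries */
		read_seqbegin_or_lock(&thing_lock, &seq);
		list_for_each_entry(t, &things, link) {
			if (t->key == key) {
				found = t;
				break;
			}
		}
	} while (need_seqretry(&thing_lock, seq));
	done_seqretry(&thing_lock, seq);

	return found;
}
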
--- /dev/null
+From e9106b5b7e80e3ab85a87fbea7ea3ecdc53673cd Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Fri, 20 Oct 2023 16:00:18 +0100
+Subject: afs: Handle the VIO and UAEIO aborts explicitly
+
+From: David Howells <dhowells@redhat.com>
+
+[ Upstream commit eb8eae65f0c713bcef84b082aa919f72c3d83268 ]
+
+When processing the result of a call, handle the VIO and UAEIO abort
+specifically rather than leaving it to a default case. Rather than
+erroring out unconditionally, see if there's another server if the volume
+has more than one server available, otherwise return -EREMOTEIO.
+
+Signed-off-by: David Howells <dhowells@redhat.com>
+cc: Marc Dionne <marc.dionne@auristor.com>
+cc: linux-afs@lists.infradead.org
+Stable-dep-of: 17ba6f0bd14f ("afs: Fix error handling with lookup via FS.InlineBulkStatus")
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ fs/afs/rotate.c | 7 +++++++
+ 1 file changed, 7 insertions(+)
+
+diff --git a/fs/afs/rotate.c b/fs/afs/rotate.c
+index 59aed7a6dd11..a108cd55bb4e 100644
+--- a/fs/afs/rotate.c
++++ b/fs/afs/rotate.c
+@@ -330,6 +330,13 @@ bool afs_select_fileserver(struct afs_operation *op)
+
+ goto restart_from_beginning;
+
++ case UAEIO:
++ case VIO:
++ op->error = -EREMOTEIO;
++ if (op->volume->type != AFSVL_RWVOL)
++ goto next_server;
++ goto failed;
++
+ case VDISKFULL:
+ case UAENOSPC:
+ /* The partition is full. Only applies to RWVOLs.
+--
+2.43.0
+
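The rule encoded above is small: a hard I/O abort is terminal for a read/write
volume, which has exactly one instance, but replicated read-only and backup
volumes may still be served by another fileserver. A tiny illustrative sketch,
with an invented vol_type enum standing in for the kernel's check of
op->volume->type against AFSVL_RWVOL:

/*
 * Sketch of the VIO/UAEIO rule: only replicated volume types are worth
 * a rotation; a RW volume's I/O abort is final.  Illustrative only.
 */
#include <errno.h>
#include <stdbool.h>

enum vol_type { VOL_RW, VOL_RO, VOL_BACKUP };

/* true: rotate to the next server; false: fail the operation. */
static bool io_abort_should_rotate(enum vol_type type, int *errp)
{
	*errp = -EREMOTEIO;	/* recorded in either case */
	return type != VOL_RW;	/* only replicated volumes can fail over */
}
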
--- /dev/null
+From b8adfd03eeab12713e571102691a0551705137fc Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Mon, 8 Jan 2024 17:22:36 +0000
+Subject: afs: Hide silly-rename files from userspace
+
+From: David Howells <dhowells@redhat.com>
+
+[ Upstream commit 57e9d49c54528c49b8bffe6d99d782ea051ea534 ]
+
+There appears to be a race between silly-rename files being created/removed
+and various userspace tools iterating over the contents of a directory,
+leading to such errors as:
+
+ find: './kernel/.tmp_cpio_dir/include/dt-bindings/reset/.__afs2080': No such file or directory
+ tar: ./include/linux/greybus/.__afs3C95: File removed before we read it
+
+when building a kernel.
+
+Fix afs_readdir() so that it doesn't return .__afsXXXX silly-rename files
+to userspace. This doesn't stop them being looked up directly by name as
+we need to be able to look them up from within the kernel as part of the
+silly-rename algorithm.
+
+Fixes: 79ddbfa500b3 ("afs: Implement sillyrename for unlink and rename")
+Signed-off-by: David Howells <dhowells@redhat.com>
+cc: Marc Dionne <marc.dionne@auristor.com>
+cc: linux-afs@lists.infradead.org
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ fs/afs/dir.c | 8 ++++++++
+ 1 file changed, 8 insertions(+)
+
+diff --git a/fs/afs/dir.c b/fs/afs/dir.c
+index 5219182e52e1..2df2e9ee130d 100644
+--- a/fs/afs/dir.c
++++ b/fs/afs/dir.c
+@@ -474,6 +474,14 @@ static int afs_dir_iterate_block(struct afs_vnode *dvnode,
+ continue;
+ }
+
++ /* Don't expose silly rename entries to userspace. */
++ if (nlen > 6 &&
++ dire->u.name[0] == '.' &&
++ ctx->actor != afs_lookup_filldir &&
++ ctx->actor != afs_lookup_one_filldir &&
++ memcmp(dire->u.name, ".__afs", 6) == 0)
++ continue;
++
+ /* found the next entry */
+ if (!dir_emit(ctx, dire->u.name, nlen,
+ ntohl(dire->u.vnode),
+--
+2.43.0
+
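The filter reduces to a name predicate applied only on the enumeration path;
the two lookup filldirs are exempted so silly-renamed files can still be
resolved by exact name during cleanup. A stand-alone sketch of the predicate,
assuming only the ".__afs" prefix convention described above:

/*
 * Sketch of the readdir filter: names of the form ".__afsXXXX" are
 * silly-rename placeholders and are skipped during enumeration, while
 * a direct lookup by exact name must still succeed.
 */
#include <stdbool.h>
#include <string.h>

static bool is_silly_rename_name(const char *name, size_t nlen)
{
	/* ".__afs" plus at least one more character, e.g. ".__afs2080" */
	return nlen > 6 && memcmp(name, ".__afs", 6) == 0;
}
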
--- /dev/null
+From d2f27e70f3691aa4364cbb1f807ed0811f3a3ab0 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Wed, 25 Oct 2023 17:53:33 +0100
+Subject: afs: Simplify error handling
+
+From: David Howells <dhowells@redhat.com>
+
+[ Upstream commit aa453becce5d1ae1b94b7fc22f47d7b05d22b14e ]
+
+Simplify error handling a bit by moving it from the afs_addr_cursor struct
+to the afs_operation and afs_vl_cursor structs and using the error
+prioritisation function for accumulating errors from multiple sources (AFS
+tries to rotate between multiple fileservers, some of which may be
+inaccessible or in some state of offlinedness).
+
+Signed-off-by: David Howells <dhowells@redhat.com>
+cc: Marc Dionne <marc.dionne@auristor.com>
+cc: linux-afs@lists.infradead.org
+Stable-dep-of: 17ba6f0bd14f ("afs: Fix error handling with lookup via FS.InlineBulkStatus")
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ fs/afs/addr_list.c | 8 ++--
+ fs/afs/dir.c | 14 ++++---
+ fs/afs/dir_silly.c | 2 +-
+ fs/afs/file.c | 3 --
+ fs/afs/fs_operation.c | 24 +++++------
+ fs/afs/fsclient.c | 1 +
+ fs/afs/internal.h | 44 +++++++++++++++------
+ fs/afs/misc.c | 10 ++++-
+ fs/afs/rotate.c | 58 ++++++++++++++-------------
+ fs/afs/rxrpc.c | 17 ++++----
+ fs/afs/server.c | 1 -
+ fs/afs/vl_alias.c | 2 +-
+ fs/afs/vl_probe.c | 7 ++--
+ fs/afs/vl_rotate.c | 92 +++++++++++++++++++++----------------------
+ fs/afs/vlclient.c | 34 ++++++++++------
+ 15 files changed, 174 insertions(+), 143 deletions(-)
+
+diff --git a/fs/afs/addr_list.c b/fs/afs/addr_list.c
+index 519821f5aedc..f4837c3b8ae2 100644
+--- a/fs/afs/addr_list.c
++++ b/fs/afs/addr_list.c
+@@ -386,26 +386,24 @@ bool afs_iterate_addresses(struct afs_addr_cursor *ac)
+ selected:
+ ac->index = index;
+ set_bit(index, &ac->tried);
+- ac->responded = false;
++ ac->call_responded = false;
+ return true;
+ }
+
+ /*
+ * Release an address list cursor.
+ */
+-int afs_end_cursor(struct afs_addr_cursor *ac)
++void afs_end_cursor(struct afs_addr_cursor *ac)
+ {
+ struct afs_addr_list *alist;
+
+ alist = ac->alist;
+ if (alist) {
+- if (ac->responded &&
++ if (ac->call_responded &&
+ ac->index != alist->preferred &&
+ test_bit(ac->alist->preferred, &ac->tried))
+ WRITE_ONCE(alist->preferred, ac->index);
+ afs_put_addrlist(alist);
+ ac->alist = NULL;
+ }
+-
+- return ac->error;
+ }
+diff --git a/fs/afs/dir.c b/fs/afs/dir.c
+index 15763418a938..75896a677b96 100644
+--- a/fs/afs/dir.c
++++ b/fs/afs/dir.c
+@@ -701,8 +701,9 @@ static void afs_do_lookup_success(struct afs_operation *op)
+ vp = &op->file[0];
+ abort_code = vp->scb.status.abort_code;
+ if (abort_code != 0) {
+- op->ac.abort_code = abort_code;
+- op->error = afs_abort_to_error(abort_code);
++ op->call_abort_code = abort_code;
++ afs_op_set_error(op, afs_abort_to_error(abort_code));
++ op->cumul_error.abort_code = abort_code;
+ }
+ break;
+
+@@ -854,13 +855,14 @@ static struct inode *afs_do_lookup(struct inode *dir, struct dentry *dentry,
+ _debug("nr_files %u", op->nr_files);
+
+ /* Need space for examining all the selected files */
+- op->error = -ENOMEM;
+ if (op->nr_files > 2) {
+ op->more_files = kvcalloc(op->nr_files - 2,
+ sizeof(struct afs_vnode_param),
+ GFP_KERNEL);
+- if (!op->more_files)
++ if (!op->more_files) {
++ afs_op_nomem(op);
+ goto out_op;
++ }
+
+ for (i = 2; i < op->nr_files; i++) {
+ vp = &op->more_files[i - 2];
+@@ -1263,7 +1265,7 @@ void afs_check_for_remote_deletion(struct afs_operation *op)
+ {
+ struct afs_vnode *vnode = op->file[0].vnode;
+
+- switch (op->ac.abort_code) {
++ switch (afs_op_abort_code(op)) {
+ case VNOVNODE:
+ set_bit(AFS_VNODE_DELETED, &vnode->flags);
+ afs_break_callback(vnode, afs_cb_break_for_deleted);
+@@ -1288,7 +1290,7 @@ static void afs_vnode_new_inode(struct afs_operation *op)
+ /* ENOMEM or EINTR at a really inconvenient time - just abandon
+ * the new directory on the server.
+ */
+- op->error = PTR_ERR(inode);
++ afs_op_accumulate_error(op, PTR_ERR(inode), 0);
+ return;
+ }
+
+diff --git a/fs/afs/dir_silly.c b/fs/afs/dir_silly.c
+index bb5807e87fa4..a1e581946b93 100644
+--- a/fs/afs/dir_silly.c
++++ b/fs/afs/dir_silly.c
+@@ -218,7 +218,7 @@ static int afs_do_silly_unlink(struct afs_vnode *dvnode, struct afs_vnode *vnode
+ /* If there was a conflict with a third party, check the status of the
+ * unlinked vnode.
+ */
+- if (op->error == 0 && (op->flags & AFS_OPERATION_DIR_CONFLICT)) {
++ if (op->cumul_error.error == 0 && (op->flags & AFS_OPERATION_DIR_CONFLICT)) {
+ op->file[1].update_ctime = false;
+ op->fetch_status.which = 1;
+ op->ops = &afs_fetch_status_operation;
+diff --git a/fs/afs/file.c b/fs/afs/file.c
+index 0c81c39c32f5..8f9b42427569 100644
+--- a/fs/afs/file.c
++++ b/fs/afs/file.c
+@@ -245,10 +245,7 @@ static void afs_fetch_data_notify(struct afs_operation *op)
+ struct netfs_io_subrequest *subreq = req->subreq;
+ int error = afs_op_error(op);
+
+- if (error == -ECONNABORTED)
+- error = afs_abort_to_error(op->ac.abort_code);
+ req->error = error;
+-
+ if (subreq) {
+ __set_bit(NETFS_SREQ_CLEAR_TAIL, &subreq->flags);
+ netfs_subreq_terminated(subreq, error ?: req->actual_len, false);
+diff --git a/fs/afs/fs_operation.c b/fs/afs/fs_operation.c
+index 1c22d6e77846..cebe4fad8192 100644
+--- a/fs/afs/fs_operation.c
++++ b/fs/afs/fs_operation.c
+@@ -169,9 +169,6 @@ static void afs_end_vnode_operation(struct afs_operation *op)
+ }
+
+ afs_drop_io_locks(op);
+-
+- if (op->error == -ECONNABORTED)
+- op->error = afs_abort_to_error(op->ac.abort_code);
+ }
+
+ /*
+@@ -182,6 +179,8 @@ void afs_wait_for_operation(struct afs_operation *op)
+ _enter("");
+
+ while (afs_select_fileserver(op)) {
++ op->call_error = 0;
++ op->call_abort_code = 0;
+ op->cb_s_break = op->server->cb_s_break;
+ if (test_bit(AFS_SERVER_FL_IS_YFS, &op->server->flags) &&
+ op->ops->issue_yfs_rpc)
+@@ -189,28 +188,29 @@ void afs_wait_for_operation(struct afs_operation *op)
+ else if (op->ops->issue_afs_rpc)
+ op->ops->issue_afs_rpc(op);
+ else
+- op->ac.error = -ENOTSUPP;
++ op->call_error = -ENOTSUPP;
+
+ if (op->call) {
+ afs_wait_for_call_to_complete(op->call, &op->ac);
+- op->error = op->ac.error;
++ op->call_abort_code = op->call->abort_code;
++ op->call_error = op->call->error;
++ op->call_responded = op->call->responded;
++ op->ac.call_responded = true;
++ WRITE_ONCE(op->ac.alist->addrs[op->ac.index].last_error,
++ op->call_error);
+ afs_put_call(op->call);
+ }
+ }
+
+- switch (op->error) {
+- case 0:
++ if (!afs_op_error(op)) {
+ _debug("success");
+ op->ops->success(op);
+- break;
+- case -ECONNABORTED:
++ } else if (op->cumul_error.aborted) {
+ if (op->ops->aborted)
+ op->ops->aborted(op);
+- fallthrough;
+- default:
++ } else {
+ if (op->ops->failed)
+ op->ops->failed(op);
+- break;
+ }
+
+ afs_end_vnode_operation(op);
+diff --git a/fs/afs/fsclient.c b/fs/afs/fsclient.c
+index 020073387111..2a56dea22519 100644
+--- a/fs/afs/fsclient.c
++++ b/fs/afs/fsclient.c
+@@ -1629,6 +1629,7 @@ int afs_fs_give_up_all_callbacks(struct afs_net *net,
+ call->server = afs_use_server(server, afs_server_trace_give_up_cb);
+ afs_make_call(ac, call, GFP_NOFS);
+ afs_wait_for_call_to_complete(call, ac);
++ ret = call->error;
+ afs_put_call(call);
+ return ret;
+ }
+diff --git a/fs/afs/internal.h b/fs/afs/internal.h
+index 45c4526b56be..5f6db0ac06ac 100644
+--- a/fs/afs/internal.h
++++ b/fs/afs/internal.h
+@@ -75,6 +75,7 @@ enum afs_call_state {
+ struct afs_address {
+ struct rxrpc_peer *peer;
+ u16 service_id;
++ short last_error; /* Last error from this address */
+ };
+
+ /*
+@@ -121,7 +122,6 @@ struct afs_call {
+ };
+ void *buffer; /* reply receive buffer */
+ union {
+- long ret0; /* Value to reply with instead of 0 */
+ struct afs_addr_list *ret_alist;
+ struct afs_vldb_entry *ret_vldb;
+ char *ret_str;
+@@ -145,6 +145,7 @@ struct afs_call {
+ bool upgrade; /* T to request service upgrade */
+ bool intr; /* T if interruptible */
+ bool unmarshalling_error; /* T if an unmarshalling error occurred */
++ bool responded; /* Got a response from the call (may be abort) */
+ u16 service_id; /* Actual service ID (after upgrade) */
+ unsigned int debug_id; /* Trace ID */
+ u32 operation_ID; /* operation ID for an incoming call */
+@@ -719,8 +720,10 @@ struct afs_permits {
+ * Error prioritisation and accumulation.
+ */
+ struct afs_error {
+- short error; /* Accumulated error */
++ s32 abort_code; /* Cumulative abort code */
++ short error; /* Cumulative error */
+ bool responded; /* T if server responded */
++ bool aborted; /* T if ->error is from an abort */
+ };
+
+ /*
+@@ -730,10 +733,8 @@ struct afs_addr_cursor {
+ struct afs_addr_list *alist; /* Current address list (pins ref) */
+ unsigned long tried; /* Tried addresses */
+ signed char index; /* Current address */
+- bool responded; /* T if the current address responded */
+ unsigned short nr_iterations; /* Number of address iterations */
+- short error;
+- u32 abort_code;
++ bool call_responded;
+ };
+
+ /*
+@@ -746,13 +747,16 @@ struct afs_vl_cursor {
+ struct afs_vlserver *server; /* Server on which this resides */
+ struct key *key; /* Key for the server */
+ unsigned long untried; /* Bitmask of untried servers */
++ struct afs_error cumul_error; /* Cumulative error */
++ s32 call_abort_code;
+ short index; /* Current server */
+- short error;
++ short call_error; /* Error from single call */
+ unsigned short flags;
+ #define AFS_VL_CURSOR_STOP 0x0001 /* Set to cease iteration */
+ #define AFS_VL_CURSOR_RETRY 0x0002 /* Set to do a retry */
+ #define AFS_VL_CURSOR_RETRIED 0x0004 /* Set if started a retry */
+- unsigned short nr_iterations; /* Number of server iterations */
++ short nr_iterations; /* Number of server iterations */
++ bool call_responded; /* T if the current address responded */
+ };
+
+ /*
+@@ -803,8 +807,10 @@ struct afs_operation {
+ struct dentry *dentry_2; /* Second dentry to be altered */
+ struct timespec64 mtime; /* Modification time to record */
+ struct timespec64 ctime; /* Change time to set */
++ struct afs_error cumul_error; /* Cumulative error */
+ short nr_files; /* Number of entries in file[], more_files */
+- short error;
++ short call_error; /* Error from single call */
++ s32 call_abort_code; /* Abort code from single call */
+ unsigned int debug_id;
+
+ unsigned int cb_v_break; /* Volume break counter before op */
+@@ -860,6 +866,8 @@ struct afs_operation {
+ unsigned long untried; /* Bitmask of untried servers */
+ short index; /* Current server */
+ short nr_iterations; /* Number of server iterations */
++ bool call_responded; /* T if the current address responded */
++
+
+ unsigned int flags;
+ #define AFS_OPERATION_STOP 0x0001 /* Set to cease iteration */
+@@ -976,7 +984,7 @@ bool afs_addr_list_same(const struct afs_addr_list *a,
+ const struct afs_addr_list *b);
+ extern struct afs_vlserver_list *afs_dns_query(struct afs_cell *, time64_t *);
+ extern bool afs_iterate_addresses(struct afs_addr_cursor *);
+-extern int afs_end_cursor(struct afs_addr_cursor *);
++extern void afs_end_cursor(struct afs_addr_cursor *ac);
+
+ extern int afs_merge_fs_addr4(struct afs_net *net, struct afs_addr_list *addr,
+ __be32 xdr, u16 port);
+@@ -1235,17 +1243,27 @@ extern void afs_prioritise_error(struct afs_error *, int, u32);
+
+ static inline void afs_op_nomem(struct afs_operation *op)
+ {
+- op->error = -ENOMEM;
++ op->cumul_error.error = -ENOMEM;
+ }
+
+ static inline int afs_op_error(const struct afs_operation *op)
+ {
+- return op->error;
++ return op->cumul_error.error;
++}
++
++static inline s32 afs_op_abort_code(const struct afs_operation *op)
++{
++ return op->cumul_error.abort_code;
+ }
+
+ static inline int afs_op_set_error(struct afs_operation *op, int error)
+ {
+- return op->error = error;
++ return op->cumul_error.error = error;
++}
++
++static inline void afs_op_accumulate_error(struct afs_operation *op, int error, s32 abort_code)
++{
++ afs_prioritise_error(&op->cumul_error, error, abort_code);
+ }
+
+ /*
+@@ -1619,7 +1637,7 @@ static inline void afs_update_dentry_version(struct afs_operation *op,
+ struct afs_vnode_param *dir_vp,
+ struct dentry *dentry)
+ {
+- if (!op->error)
++ if (!op->cumul_error.error)
+ dentry->d_fsdata =
+ (void *)(unsigned long)dir_vp->scb.status.data_version;
+ }
+diff --git a/fs/afs/misc.c b/fs/afs/misc.c
+index 805328ca5428..b8180bf2281f 100644
+--- a/fs/afs/misc.c
++++ b/fs/afs/misc.c
+@@ -116,6 +116,8 @@ void afs_prioritise_error(struct afs_error *e, int error, u32 abort_code)
+ {
+ switch (error) {
+ case 0:
++ e->aborted = false;
++ e->error = 0;
+ return;
+ default:
+ if (e->error == -ETIMEDOUT ||
+@@ -161,12 +163,16 @@ void afs_prioritise_error(struct afs_error *e, int error, u32 abort_code)
+ if (e->responded)
+ return;
+ e->error = error;
++ e->aborted = false;
+ return;
+
+ case -ECONNABORTED:
+- error = afs_abort_to_error(abort_code);
+- fallthrough;
++ e->error = afs_abort_to_error(abort_code);
++ e->aborted = true;
++ e->responded = true;
++ return;
+ case -ENETRESET: /* Responded, but we seem to have changed address */
++ e->aborted = false;
+ e->responded = true;
+ e->error = error;
+ return;
+diff --git a/fs/afs/rotate.c b/fs/afs/rotate.c
+index d64c1d90faed..68c88e3a0916 100644
+--- a/fs/afs/rotate.c
++++ b/fs/afs/rotate.c
+@@ -112,9 +112,9 @@ bool afs_select_fileserver(struct afs_operation *op)
+ struct afs_addr_list *alist;
+ struct afs_server *server;
+ struct afs_vnode *vnode = op->file[0].vnode;
+- struct afs_error e;
+ unsigned int rtt;
+- int error = op->ac.error, i;
++ s32 abort_code = op->call_abort_code;
++ int error = op->call_error, i;
+
+ op->nr_iterations++;
+
+@@ -122,7 +122,7 @@ bool afs_select_fileserver(struct afs_operation *op)
+ op->debug_id, op->nr_iterations, op->volume->vid,
+ op->untried, op->index,
+ op->ac.tried, op->ac.index,
+- error, op->ac.abort_code);
++ error, abort_code);
+
+ if (op->flags & AFS_OPERATION_STOP) {
+ _leave(" = f [stopped]");
+@@ -133,8 +133,10 @@ bool afs_select_fileserver(struct afs_operation *op)
+ goto start;
+
+ /* Evaluate the result of the previous operation, if there was one. */
+- switch (error) {
++ switch (op->call_error) {
+ case 0:
++ op->cumul_error.responded = true;
++ fallthrough;
+ default:
+ /* Success or local failure. Stop. */
+ afs_op_set_error(op, error);
+@@ -151,7 +153,8 @@ bool afs_select_fileserver(struct afs_operation *op)
+ * errors instead. IBM AFS and OpenAFS fileservers, however, do leak
+ * these abort codes.
+ */
+- switch (op->ac.abort_code) {
++ op->cumul_error.responded = true;
++ switch (abort_code) {
+ case VNOVOL:
+ /* This fileserver doesn't know about the volume.
+ * - May indicate that the VL is wrong - retry once and compare
+@@ -164,7 +167,7 @@ bool afs_select_fileserver(struct afs_operation *op)
+ * (administrative action).
+ */
+ if (op->flags & AFS_OPERATION_VNOVOL) {
+- op->error = -EREMOTEIO;
++ afs_op_accumulate_error(op, -EREMOTEIO, abort_code);
+ goto next_server;
+ }
+
+@@ -188,7 +191,7 @@ bool afs_select_fileserver(struct afs_operation *op)
+ * it's the fileserver having trouble.
+ */
+ if (rcu_access_pointer(op->volume->servers) == op->server_list) {
+- op->error = -EREMOTEIO;
++ afs_op_accumulate_error(op, -EREMOTEIO, abort_code);
+ goto next_server;
+ }
+
+@@ -201,8 +204,8 @@ bool afs_select_fileserver(struct afs_operation *op)
+ case VONLINE:
+ /* These should not be returned from the fileserver. */
+ pr_warn("Fileserver returned unexpected abort %d\n",
+- op->ac.abort_code);
+- op->error = -EREMOTEIO;
++ abort_code);
++ afs_op_accumulate_error(op, -EREMOTEIO, abort_code);
+ goto next_server;
+
+ case VNOSERVICE:
+@@ -233,7 +236,7 @@ bool afs_select_fileserver(struct afs_operation *op)
+ * VNOSERVICE should be treated as an alias for RX_CALL_TIMEOUT.
+ */
+ case RX_CALL_TIMEOUT:
+- op->error = -ETIMEDOUT;
++ afs_op_accumulate_error(op, -ETIMEDOUT, abort_code);
+ goto next_server;
+
+ case VSALVAGING: /* This error should not be leaked to cache managers
+@@ -248,7 +251,7 @@ bool afs_select_fileserver(struct afs_operation *op)
+ * days).
+ */
+ if (!test_and_set_bit(AFS_VOLUME_OFFLINE, &op->volume->flags)) {
+- afs_busy(op->volume, op->ac.abort_code);
++ afs_busy(op->volume, abort_code);
+ clear_bit(AFS_VOLUME_BUSY, &op->volume->flags);
+ }
+ if (op->flags & AFS_OPERATION_NO_VSLEEP) {
+@@ -281,7 +284,7 @@ bool afs_select_fileserver(struct afs_operation *op)
+ goto failed;
+ }
+ if (!test_and_set_bit(AFS_VOLUME_BUSY, &op->volume->flags)) {
+- afs_busy(op->volume, op->ac.abort_code);
++ afs_busy(op->volume, abort_code);
+ clear_bit(AFS_VOLUME_OFFLINE, &op->volume->flags);
+ }
+ busy:
+@@ -329,7 +332,7 @@ bool afs_select_fileserver(struct afs_operation *op)
+ * TODO: Retry a few times with sleeps.
+ */
+ if (rcu_access_pointer(op->volume->servers) == op->server_list) {
+- op->error = -ENOMEDIUM;
++ afs_op_accumulate_error(op, -ENOMEDIUM, abort_code);
+ goto failed;
+ }
+
+@@ -337,7 +340,7 @@ bool afs_select_fileserver(struct afs_operation *op)
+
+ case UAEIO:
+ case VIO:
+- op->error = -EREMOTEIO;
++ afs_op_accumulate_error(op, -EREMOTEIO, abort_code);
+ if (op->volume->type != AFSVL_RWVOL)
+ goto next_server;
+ goto failed;
+@@ -361,7 +364,7 @@ bool afs_select_fileserver(struct afs_operation *op)
+ goto failed_but_online;
+
+ default:
+- op->error = afs_abort_to_error(op->ac.abort_code);
++ afs_op_accumulate_error(op, error, abort_code);
+ failed_but_online:
+ clear_bit(AFS_VOLUME_OFFLINE, &op->volume->flags);
+ clear_bit(AFS_VOLUME_BUSY, &op->volume->flags);
+@@ -380,7 +383,7 @@ bool afs_select_fileserver(struct afs_operation *op)
+ case -EHOSTDOWN:
+ case -ECONNREFUSED:
+ _debug("no conn");
+- op->error = error;
++ afs_op_accumulate_error(op, error, 0);
+ goto iterate_address;
+
+ case -ENETRESET:
+@@ -506,6 +509,7 @@ bool afs_select_fileserver(struct afs_operation *op)
+ op->index, op->ac.index, op->ac.alist->nr_addrs,
+ rxrpc_kernel_remote_addr(op->ac.alist->addrs[op->ac.index].peer));
+
++ op->call_responded = false;
+ _leave(" = t");
+ return true;
+
+@@ -543,17 +547,14 @@ bool afs_select_fileserver(struct afs_operation *op)
+ if (op->flags & AFS_OPERATION_VBUSY)
+ goto restart_from_beginning;
+
+- e.error = -EDESTADDRREQ;
+- e.responded = false;
+ for (i = 0; i < op->server_list->nr_servers; i++) {
+ struct afs_server *s = op->server_list->servers[i].server;
+
+- afs_prioritise_error(&e, READ_ONCE(s->probe.error),
+- s->probe.abort_code);
++ error = READ_ONCE(s->probe.error);
++ if (error < 0)
++ afs_op_accumulate_error(op, error, s->probe.abort_code);
+ }
+
+- error = e.error;
+- op->error = error;
+ failed:
+ op->flags |= AFS_OPERATION_STOP;
+ afs_end_cursor(&op->ac);
+@@ -576,11 +577,13 @@ void afs_dump_edestaddrreq(const struct afs_operation *op)
+ rcu_read_lock();
+
+ pr_notice("EDESTADDR occurred\n");
+- pr_notice("FC: cbb=%x cbb2=%x fl=%x err=%hd\n",
++ pr_notice("OP: cbb=%x cbb2=%x fl=%x err=%hd\n",
+ op->file[0].cb_break_before,
+- op->file[1].cb_break_before, op->flags, op->error);
+- pr_notice("FC: ut=%lx ix=%d ni=%u\n",
++ op->file[1].cb_break_before, op->flags, op->cumul_error.error);
++ pr_notice("OP: ut=%lx ix=%d ni=%u\n",
+ op->untried, op->index, op->nr_iterations);
++ pr_notice("OP: call er=%d ac=%d r=%u\n",
++ op->call_error, op->call_abort_code, op->call_responded);
+
+ if (op->server_list) {
+ const struct afs_server_list *sl = op->server_list;
+@@ -605,8 +608,7 @@ void afs_dump_edestaddrreq(const struct afs_operation *op)
+ }
+ }
+
+- pr_notice("AC: t=%lx ax=%u ac=%d er=%d r=%u ni=%u\n",
+- op->ac.tried, op->ac.index, op->ac.abort_code, op->ac.error,
+- op->ac.responded, op->ac.nr_iterations);
++ pr_notice("AC: t=%lx ax=%u ni=%u\n",
++ op->ac.tried, op->ac.index, op->ac.nr_iterations);
+ rcu_read_unlock();
+ }
+diff --git a/fs/afs/rxrpc.c b/fs/afs/rxrpc.c
+index dad8efadbc44..0b3e2f20b0e0 100644
+--- a/fs/afs/rxrpc.c
++++ b/fs/afs/rxrpc.c
+@@ -408,8 +408,7 @@ void afs_make_call(struct afs_addr_cursor *ac, struct afs_call *call, gfp_t gfp)
+ rxrpc_kernel_recv_data(call->net->socket, rxcall,
+ &msg.msg_iter, &len, false,
+ &call->abort_code, &call->service_id);
+- ac->abort_code = call->abort_code;
+- ac->responded = true;
++ call->responded = true;
+ }
+ call->error = ret;
+ trace_afs_call_done(call);
+@@ -429,7 +428,7 @@ void afs_make_call(struct afs_addr_cursor *ac, struct afs_call *call, gfp_t gfp)
+ afs_set_call_complete(call, ret, 0);
+ }
+
+- ac->error = ret;
++ call->error = ret;
+ call->state = AFS_CALL_COMPLETE;
+ _leave(" = %d", ret);
+ }
+@@ -510,6 +509,7 @@ static void afs_deliver_to_call(struct afs_call *call)
+ ret = -EBADMSG;
+ switch (ret) {
+ case 0:
++ call->responded = true;
+ afs_queue_call_work(call);
+ if (state == AFS_CALL_CL_PROC_REPLY) {
+ if (call->op)
+@@ -524,9 +524,11 @@ static void afs_deliver_to_call(struct afs_call *call)
+ goto out;
+ case -ECONNABORTED:
+ ASSERTCMP(state, ==, AFS_CALL_COMPLETE);
++ call->responded = true;
+ afs_log_error(call, call->abort_code);
+ goto done;
+ case -ENOTSUPP:
++ call->responded = true;
+ abort_code = RXGEN_OPCODE;
+ rxrpc_kernel_abort_call(call->net->socket, call->rxcall,
+ abort_code, ret,
+@@ -573,7 +575,7 @@ static void afs_deliver_to_call(struct afs_call *call)
+ }
+
+ /*
+- * Wait synchronously for a call to complete and clean up the call struct.
++ * Wait synchronously for a call to complete.
+ */
+ void afs_wait_for_call_to_complete(struct afs_call *call, struct afs_addr_cursor *ac)
+ {
+@@ -626,13 +628,8 @@ void afs_wait_for_call_to_complete(struct afs_call *call, struct afs_addr_cursor
+ }
+ }
+
+- spin_lock_bh(&call->state_lock);
+- ac->abort_code = call->abort_code;
+- ac->error = call->error;
+- spin_unlock_bh(&call->state_lock);
+-
+ if (call->error == 0 || call->error == -ECONNABORTED)
+- ac->responded = true;
++ call->responded = true;
+ }
+
+ /*
+diff --git a/fs/afs/server.c b/fs/afs/server.c
+index 2826e6eced71..f7791ef13618 100644
+--- a/fs/afs/server.c
++++ b/fs/afs/server.c
+@@ -437,7 +437,6 @@ static void afs_give_up_callbacks(struct afs_net *net, struct afs_server *server
+ struct afs_addr_cursor ac = {
+ .alist = alist,
+ .index = alist->preferred,
+- .error = 0,
+ };
+
+ afs_fs_give_up_all_callbacks(net, server, &ac, NULL);
+diff --git a/fs/afs/vl_alias.c b/fs/afs/vl_alias.c
+index 6fdf9f1bedc0..89cadd9a69e1 100644
+--- a/fs/afs/vl_alias.c
++++ b/fs/afs/vl_alias.c
+@@ -236,7 +236,7 @@ static char *afs_vl_get_cell_name(struct afs_cell *cell, struct key *key)
+
+ while (afs_select_vlserver(&vc)) {
+ if (!test_bit(AFS_VLSERVER_FL_IS_YFS, &vc.server->flags)) {
+- vc.ac.error = -EOPNOTSUPP;
++ vc.call_error = -EOPNOTSUPP;
+ skipped = true;
+ continue;
+ }
+diff --git a/fs/afs/vl_probe.c b/fs/afs/vl_probe.c
+index 9551aef07cee..2f8a13c2bf0c 100644
+--- a/fs/afs/vl_probe.c
++++ b/fs/afs/vl_probe.c
+@@ -169,10 +169,11 @@ static bool afs_do_probe_vlserver(struct afs_net *net,
+ call = afs_vl_get_capabilities(net, &ac, key, server,
+ server_index);
+ if (!IS_ERR(call)) {
++ afs_prioritise_error(_e, call->error, call->abort_code);
+ afs_put_call(call);
+ in_progress = true;
+ } else {
+- afs_prioritise_error(_e, PTR_ERR(call), ac.abort_code);
++ afs_prioritise_error(_e, PTR_ERR(call), 0);
+ afs_done_one_vl_probe(server, false);
+ }
+ }
+@@ -187,12 +188,10 @@ int afs_send_vl_probes(struct afs_net *net, struct key *key,
+ struct afs_vlserver_list *vllist)
+ {
+ struct afs_vlserver *server;
+- struct afs_error e;
++ struct afs_error e = {};
+ bool in_progress = false;
+ int i;
+
+- e.error = 0;
+- e.responded = false;
+ for (i = 0; i < vllist->nr_servers; i++) {
+ server = vllist->servers[i].server;
+ if (test_bit(AFS_VLSERVER_FL_PROBED, &server->flags))
+diff --git a/fs/afs/vl_rotate.c b/fs/afs/vl_rotate.c
+index f8f255c966ae..e2dc54082a05 100644
+--- a/fs/afs/vl_rotate.c
++++ b/fs/afs/vl_rotate.c
+@@ -20,11 +20,11 @@ bool afs_begin_vlserver_operation(struct afs_vl_cursor *vc, struct afs_cell *cel
+ memset(vc, 0, sizeof(*vc));
+ vc->cell = cell;
+ vc->key = key;
+- vc->error = -EDESTADDRREQ;
+- vc->ac.error = SHRT_MAX;
++ vc->cumul_error.error = -EDESTADDRREQ;
++ vc->nr_iterations = -1;
+
+ if (signal_pending(current)) {
+- vc->error = -EINTR;
++ vc->cumul_error.error = -EINTR;
+ vc->flags |= AFS_VL_CURSOR_STOP;
+ return false;
+ }
+@@ -52,7 +52,7 @@ static bool afs_start_vl_iteration(struct afs_vl_cursor *vc)
+ &cell->dns_lookup_count,
+ smp_load_acquire(&cell->dns_lookup_count)
+ != dns_lookup_count) < 0) {
+- vc->error = -ERESTARTSYS;
++ vc->cumul_error.error = -ERESTARTSYS;
+ return false;
+ }
+ }
+@@ -60,12 +60,12 @@ static bool afs_start_vl_iteration(struct afs_vl_cursor *vc)
+ /* Status load is ordered after lookup counter load */
+ if (cell->dns_status == DNS_LOOKUP_GOT_NOT_FOUND) {
+ pr_warn("No record of cell %s\n", cell->name);
+- vc->error = -ENOENT;
++ vc->cumul_error.error = -ENOENT;
+ return false;
+ }
+
+ if (cell->dns_source == DNS_RECORD_UNAVAILABLE) {
+- vc->error = -EDESTADDRREQ;
++ vc->cumul_error.error = -EDESTADDRREQ;
+ return false;
+ }
+ }
+@@ -91,52 +91,52 @@ bool afs_select_vlserver(struct afs_vl_cursor *vc)
+ {
+ struct afs_addr_list *alist;
+ struct afs_vlserver *vlserver;
+- struct afs_error e;
+ unsigned int rtt;
+- int error = vc->ac.error, i;
++ s32 abort_code = vc->call_abort_code;
++ int error = vc->call_error, i;
++
++ vc->nr_iterations++;
+
+ _enter("%lx[%d],%lx[%d],%d,%d",
+ vc->untried, vc->index,
+ vc->ac.tried, vc->ac.index,
+- error, vc->ac.abort_code);
++ error, abort_code);
+
+ if (vc->flags & AFS_VL_CURSOR_STOP) {
+ _leave(" = f [stopped]");
+ return false;
+ }
+
+- vc->nr_iterations++;
++ if (vc->nr_iterations == 0)
++ goto start;
+
+ /* Evaluate the result of the previous operation, if there was one. */
+ switch (error) {
+- case SHRT_MAX:
+- goto start;
+-
+ default:
+ case 0:
+ /* Success or local failure. Stop. */
+- vc->error = error;
++ vc->cumul_error.error = error;
+ vc->flags |= AFS_VL_CURSOR_STOP;
+- _leave(" = f [okay/local %d]", vc->ac.error);
++ _leave(" = f [okay/local %d]", vc->cumul_error.error);
+ return false;
+
+ case -ECONNABORTED:
+ /* The far side rejected the operation on some grounds. This
+ * might involve the server being busy or the volume having been moved.
+ */
+- switch (vc->ac.abort_code) {
++ switch (abort_code) {
+ case AFSVL_IO:
+ case AFSVL_BADVOLOPER:
+ case AFSVL_NOMEM:
+ /* The server went weird. */
+- vc->error = -EREMOTEIO;
++ afs_prioritise_error(&vc->cumul_error, -EREMOTEIO, abort_code);
+ //write_lock(&vc->cell->vl_servers_lock);
+ //vc->server_list->weird_mask |= 1 << vc->index;
+ //write_unlock(&vc->cell->vl_servers_lock);
+ goto next_server;
+
+ default:
+- vc->error = afs_abort_to_error(vc->ac.abort_code);
++ afs_prioritise_error(&vc->cumul_error, error, abort_code);
+ goto failed;
+ }
+
+@@ -149,12 +149,12 @@ bool afs_select_vlserver(struct afs_vl_cursor *vc)
+ case -ETIMEDOUT:
+ case -ETIME:
+ _debug("no conn %d", error);
+- vc->error = error;
++ afs_prioritise_error(&vc->cumul_error, error, 0);
+ goto iterate_address;
+
+ case -ECONNRESET:
+ _debug("call reset");
+- vc->error = error;
++ afs_prioritise_error(&vc->cumul_error, error, 0);
+ vc->flags |= AFS_VL_CURSOR_RETRY;
+ goto next_server;
+
+@@ -178,15 +178,19 @@ bool afs_select_vlserver(struct afs_vl_cursor *vc)
+ goto failed;
+
+ error = afs_send_vl_probes(vc->cell->net, vc->key, vc->server_list);
+- if (error < 0)
+- goto failed_set_error;
++ if (error < 0) {
++ afs_prioritise_error(&vc->cumul_error, error, 0);
++ goto failed;
++ }
+
+ pick_server:
+ _debug("pick [%lx]", vc->untried);
+
+ error = afs_wait_for_vl_probes(vc->server_list, vc->untried);
+- if (error < 0)
+- goto failed_set_error;
++ if (error < 0) {
++ afs_prioritise_error(&vc->cumul_error, error, 0);
++ goto failed;
++ }
+
+ /* Pick the untried server with the lowest RTT. */
+ vc->index = vc->server_list->preferred;
+@@ -249,6 +253,7 @@ bool afs_select_vlserver(struct afs_vl_cursor *vc)
+
+ _debug("VL address %d/%d", vc->ac.index, vc->ac.alist->nr_addrs);
+
++ vc->call_responded = false;
+ _leave(" = t %pISpc", rxrpc_kernel_remote_addr(vc->ac.alist->addrs[vc->ac.index].peer));
+ return true;
+
+@@ -264,25 +269,19 @@ bool afs_select_vlserver(struct afs_vl_cursor *vc)
+ if (vc->flags & AFS_VL_CURSOR_RETRY)
+ goto restart_from_beginning;
+
+- e.error = -EDESTADDRREQ;
+- e.responded = false;
+ for (i = 0; i < vc->server_list->nr_servers; i++) {
+ struct afs_vlserver *s = vc->server_list->servers[i].server;
+
+ if (test_bit(AFS_VLSERVER_FL_RESPONDING, &s->flags))
+- e.responded = true;
+- afs_prioritise_error(&e, READ_ONCE(s->probe.error),
++ vc->cumul_error.responded = true;
++ afs_prioritise_error(&vc->cumul_error, READ_ONCE(s->probe.error),
+ s->probe.abort_code);
+ }
+
+- error = e.error;
+-
+-failed_set_error:
+- vc->error = error;
+ failed:
+ vc->flags |= AFS_VL_CURSOR_STOP;
+ afs_end_cursor(&vc->ac);
+- _leave(" = f [failed %d]", vc->error);
++ _leave(" = f [failed %d]", vc->cumul_error.error);
+ return false;
+ }
+
+@@ -305,7 +304,10 @@ static void afs_vl_dump_edestaddrreq(const struct afs_vl_cursor *vc)
+ pr_notice("DNS: src=%u st=%u lc=%x\n",
+ cell->dns_source, cell->dns_status, cell->dns_lookup_count);
+ pr_notice("VC: ut=%lx ix=%u ni=%hu fl=%hx err=%hd\n",
+- vc->untried, vc->index, vc->nr_iterations, vc->flags, vc->error);
++ vc->untried, vc->index, vc->nr_iterations, vc->flags,
++ vc->cumul_error.error);
++ pr_notice("VC: call er=%d ac=%d r=%u\n",
++ vc->call_error, vc->call_abort_code, vc->call_responded);
+
+ if (vc->server_list) {
+ const struct afs_vlserver_list *sl = vc->server_list;
+@@ -329,9 +331,8 @@ static void afs_vl_dump_edestaddrreq(const struct afs_vl_cursor *vc)
+ }
+ }
+
+- pr_notice("AC: t=%lx ax=%u ac=%d er=%d r=%u ni=%u\n",
+- vc->ac.tried, vc->ac.index, vc->ac.abort_code, vc->ac.error,
+- vc->ac.responded, vc->ac.nr_iterations);
++ pr_notice("AC: t=%lx ax=%u ni=%u\n",
++ vc->ac.tried, vc->ac.index, vc->ac.nr_iterations);
+ rcu_read_unlock();
+ }
+
+@@ -342,17 +343,16 @@ int afs_end_vlserver_operation(struct afs_vl_cursor *vc)
+ {
+ struct afs_net *net = vc->cell->net;
+
+- if (vc->error == -EDESTADDRREQ ||
+- vc->error == -EADDRNOTAVAIL ||
+- vc->error == -ENETUNREACH ||
+- vc->error == -EHOSTUNREACH)
++ switch (vc->cumul_error.error) {
++ case -EDESTADDRREQ:
++ case -EADDRNOTAVAIL:
++ case -ENETUNREACH:
++ case -EHOSTUNREACH:
+ afs_vl_dump_edestaddrreq(vc);
++ break;
++ }
+
+ afs_end_cursor(&vc->ac);
+ afs_put_vlserverlist(net, vc->server_list);
+-
+- if (vc->error == -ECONNABORTED)
+- vc->error = afs_abort_to_error(vc->ac.abort_code);
+-
+- return vc->error;
++ return vc->cumul_error.error;
+ }
+diff --git a/fs/afs/vlclient.c b/fs/afs/vlclient.c
+index 650534892a20..db7e94584e87 100644
+--- a/fs/afs/vlclient.c
++++ b/fs/afs/vlclient.c
+@@ -161,10 +161,13 @@ struct afs_vldb_entry *afs_vl_get_entry_by_name_u(struct afs_vl_cursor *vc,
+ trace_afs_make_vl_call(call);
+ afs_make_call(&vc->ac, call, GFP_KERNEL);
+ afs_wait_for_call_to_complete(call, &vc->ac);
++ vc->call_abort_code = call->abort_code;
++ vc->call_error = call->error;
++ vc->call_responded = call->responded;
+ afs_put_call(call);
+- if (vc->ac.error) {
++ if (vc->call_error) {
+ kfree(entry);
+- return ERR_PTR(vc->ac.error);
++ return ERR_PTR(vc->call_error);
+ }
+ return entry;
+ }
+@@ -305,11 +308,14 @@ struct afs_addr_list *afs_vl_get_addrs_u(struct afs_vl_cursor *vc,
+ trace_afs_make_vl_call(call);
+ afs_make_call(&vc->ac, call, GFP_KERNEL);
+ afs_wait_for_call_to_complete(call, &vc->ac);
+- alist = call->ret_alist;
++ vc->call_abort_code = call->abort_code;
++ vc->call_error = call->error;
++ vc->call_responded = call->responded;
++ alist = call->ret_alist;
+ afs_put_call(call);
+- if (vc->ac.error) {
++ if (vc->call_error) {
+ afs_put_addrlist(alist);
+- return ERR_PTR(vc->ac.error);
++ return ERR_PTR(vc->call_error);
+ }
+ return alist;
+ }
+@@ -656,11 +662,14 @@ struct afs_addr_list *afs_yfsvl_get_endpoints(struct afs_vl_cursor *vc,
+ trace_afs_make_vl_call(call);
+ afs_make_call(&vc->ac, call, GFP_KERNEL);
+ afs_wait_for_call_to_complete(call, &vc->ac);
+- alist = call->ret_alist;
++ vc->call_abort_code = call->abort_code;
++ vc->call_error = call->error;
++ vc->call_responded = call->responded;
++ alist = call->ret_alist;
+ afs_put_call(call);
+- if (vc->ac.error) {
++ if (vc->call_error) {
+ afs_put_addrlist(alist);
+- return ERR_PTR(vc->ac.error);
++ return ERR_PTR(vc->call_error);
+ }
+ return alist;
+ }
+@@ -769,11 +778,14 @@ char *afs_yfsvl_get_cell_name(struct afs_vl_cursor *vc)
+ trace_afs_make_vl_call(call);
+ afs_make_call(&vc->ac, call, GFP_KERNEL);
+ afs_wait_for_call_to_complete(call, &vc->ac);
+- cellname = call->ret_str;
++ vc->call_abort_code = call->abort_code;
++ vc->call_error = call->error;
++ vc->call_responded = call->responded;
++ cellname = call->ret_str;
+ afs_put_call(call);
+- if (vc->ac.error) {
++ if (vc->call_error) {
+ kfree(cellname);
+- return ERR_PTR(vc->ac.error);
++ return ERR_PTR(vc->call_error);
+ }
+ return cellname;
+ }
+--
+2.43.0
+
--- /dev/null
+From 8001a9917176e5da09c7619530be845008e837ed Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Wed, 18 Oct 2023 15:38:14 +0100
+Subject: afs: Turn the afs_addr_list address array into an array of structs
+
+From: David Howells <dhowells@redhat.com>
+
+[ Upstream commit 07f3502b33a260f873e35708d2fa693eb52225cb ]
+
+Turn the afs_addr_list address array into an array of structs, thereby
+allowing per-address (such as RTT) info to be added.
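+
+A condensed sketch of the change (the real struct carries more fields):
+
+  struct afs_addr_list {
+          ...
+          struct {
+                  struct sockaddr_rxrpc srx;
+                  /* room for per-address info, e.g. RTT */
+          } addrs[];
+  };
+
+Accesses then change from alist->addrs[i] to alist->addrs[i].srx.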
+
+Signed-off-by: David Howells <dhowells@redhat.com>
+cc: Marc Dionne <marc.dionne@auristor.com>
+cc: linux-afs@lists.infradead.org
+Stable-dep-of: 17ba6f0bd14f ("afs: Fix error handling with lookup via FS.InlineBulkStatus")
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ fs/afs/addr_list.c | 10 +++++-----
+ fs/afs/fs_probe.c | 6 +++---
+ fs/afs/internal.h | 6 +++++-
+ fs/afs/proc.c | 4 ++--
+ fs/afs/rotate.c | 2 +-
+ fs/afs/rxrpc.c | 4 ++--
+ fs/afs/server.c | 4 ++--
+ fs/afs/vl_alias.c | 4 ++--
+ fs/afs/vl_probe.c | 6 +++---
+ fs/afs/vl_rotate.c | 2 +-
+ 10 files changed, 26 insertions(+), 22 deletions(-)
+
+diff --git a/fs/afs/addr_list.c b/fs/afs/addr_list.c
+index de1ae0bead3b..ac05a59e9d46 100644
+--- a/fs/afs/addr_list.c
++++ b/fs/afs/addr_list.c
+@@ -45,7 +45,7 @@ struct afs_addr_list *afs_alloc_addrlist(unsigned int nr,
+ alist->max_addrs = nr;
+
+ for (i = 0; i < nr; i++) {
+- struct sockaddr_rxrpc *srx = &alist->addrs[i];
++ struct sockaddr_rxrpc *srx = &alist->addrs[i].srx;
+ srx->srx_family = AF_RXRPC;
+ srx->srx_service = service;
+ srx->transport_type = SOCK_DGRAM;
+@@ -281,7 +281,7 @@ void afs_merge_fs_addr4(struct afs_addr_list *alist, __be32 xdr, u16 port)
+ return;
+
+ for (i = 0; i < alist->nr_ipv4; i++) {
+- struct sockaddr_in *a = &alist->addrs[i].transport.sin;
++ struct sockaddr_in *a = &alist->addrs[i].srx.transport.sin;
+ u32 a_addr = ntohl(a->sin_addr.s_addr);
+ u16 a_port = ntohs(a->sin_port);
+
+@@ -298,7 +298,7 @@ void afs_merge_fs_addr4(struct afs_addr_list *alist, __be32 xdr, u16 port)
+ alist->addrs + i,
+ sizeof(alist->addrs[0]) * (alist->nr_addrs - i));
+
+- srx = &alist->addrs[i];
++ srx = &alist->addrs[i].srx;
+ srx->srx_family = AF_RXRPC;
+ srx->transport_type = SOCK_DGRAM;
+ srx->transport_len = sizeof(srx->transport.sin);
+@@ -321,7 +321,7 @@ void afs_merge_fs_addr6(struct afs_addr_list *alist, __be32 *xdr, u16 port)
+ return;
+
+ for (i = alist->nr_ipv4; i < alist->nr_addrs; i++) {
+- struct sockaddr_in6 *a = &alist->addrs[i].transport.sin6;
++ struct sockaddr_in6 *a = &alist->addrs[i].srx.transport.sin6;
+ u16 a_port = ntohs(a->sin6_port);
+
+ diff = memcmp(xdr, &a->sin6_addr, 16);
+@@ -338,7 +338,7 @@ void afs_merge_fs_addr6(struct afs_addr_list *alist, __be32 *xdr, u16 port)
+ alist->addrs + i,
+ sizeof(alist->addrs[0]) * (alist->nr_addrs - i));
+
+- srx = &alist->addrs[i];
++ srx = &alist->addrs[i].srx;
+ srx->srx_family = AF_RXRPC;
+ srx->transport_type = SOCK_DGRAM;
+ srx->transport_len = sizeof(srx->transport.sin6);
+diff --git a/fs/afs/fs_probe.c b/fs/afs/fs_probe.c
+index daaf3810cc92..3dd24842f277 100644
+--- a/fs/afs/fs_probe.c
++++ b/fs/afs/fs_probe.c
+@@ -153,12 +153,12 @@ void afs_fileserver_probe_result(struct afs_call *call)
+ if (call->service_id == YFS_FS_SERVICE) {
+ server->probe.is_yfs = true;
+ set_bit(AFS_SERVER_FL_IS_YFS, &server->flags);
+- alist->addrs[index].srx_service = call->service_id;
++ alist->addrs[index].srx.srx_service = call->service_id;
+ } else {
+ server->probe.not_yfs = true;
+ if (!server->probe.is_yfs) {
+ clear_bit(AFS_SERVER_FL_IS_YFS, &server->flags);
+- alist->addrs[index].srx_service = call->service_id;
++ alist->addrs[index].srx.srx_service = call->service_id;
+ }
+ cap0 = ntohl(call->tmp);
+ if (cap0 & AFS3_VICED_CAPABILITY_64BITFILES)
+@@ -182,7 +182,7 @@ void afs_fileserver_probe_result(struct afs_call *call)
+ spin_unlock(&server->probe_lock);
+
+ _debug("probe %pU [%u] %pISpc rtt=%u ret=%d",
+- &server->uuid, index, &alist->addrs[index].transport,
++ &server->uuid, index, &alist->addrs[index].srx.transport,
+ rtt_us, ret);
+
+ return afs_done_one_fs_probe(call->net, server);
+diff --git a/fs/afs/internal.h b/fs/afs/internal.h
+index 7385d62c8cf5..e2adb314ab6a 100644
+--- a/fs/afs/internal.h
++++ b/fs/afs/internal.h
+@@ -87,7 +87,9 @@ struct afs_addr_list {
+ enum dns_lookup_status status:8;
+ unsigned long failed; /* Mask of addrs that failed locally/ICMP */
+ unsigned long responded; /* Mask of addrs that responded */
+- struct sockaddr_rxrpc addrs[] __counted_by(max_addrs);
++ struct {
++ struct sockaddr_rxrpc srx;
++ } addrs[] __counted_by(max_addrs);
+ #define AFS_MAX_ADDRESSES ((unsigned int)(sizeof(unsigned long) * 8))
+ };
+
+@@ -969,6 +971,8 @@ extern void afs_put_addrlist(struct afs_addr_list *);
+ extern struct afs_vlserver_list *afs_parse_text_addrs(struct afs_net *,
+ const char *, size_t, char,
+ unsigned short, unsigned short);
++bool afs_addr_list_same(const struct afs_addr_list *a,
++ const struct afs_addr_list *b);
+ extern struct afs_vlserver_list *afs_dns_query(struct afs_cell *, time64_t *);
+ extern bool afs_iterate_addresses(struct afs_addr_cursor *);
+ extern int afs_end_cursor(struct afs_addr_cursor *);
+diff --git a/fs/afs/proc.c b/fs/afs/proc.c
+index 2a0c83d71565..ab9cd986cfd9 100644
+--- a/fs/afs/proc.c
++++ b/fs/afs/proc.c
+@@ -307,7 +307,7 @@ static int afs_proc_cell_vlservers_show(struct seq_file *m, void *v)
+ for (i = 0; i < alist->nr_addrs; i++)
+ seq_printf(m, " %c %pISpc\n",
+ alist->preferred == i ? '>' : '-',
+- &alist->addrs[i].transport);
++ &alist->addrs[i].srx.transport);
+ }
+ seq_printf(m, " info: fl=%lx rtt=%d\n", vlserver->flags, vlserver->rtt);
+ seq_printf(m, " probe: fl=%x e=%d ac=%d out=%d\n",
+@@ -399,7 +399,7 @@ static int afs_proc_servers_show(struct seq_file *m, void *v)
+ alist->version, alist->responded, alist->failed);
+ for (i = 0; i < alist->nr_addrs; i++)
+ seq_printf(m, " [%x] %pISpc%s\n",
+- i, &alist->addrs[i].transport,
++ i, &alist->addrs[i].srx.transport,
+ alist->preferred == i ? "*" : "");
+ return 0;
+ }
+diff --git a/fs/afs/rotate.c b/fs/afs/rotate.c
+index a3d127953ac6..46081e5da6f5 100644
+--- a/fs/afs/rotate.c
++++ b/fs/afs/rotate.c
+@@ -488,7 +488,7 @@ bool afs_select_fileserver(struct afs_operation *op)
+
+ _debug("address [%u] %u/%u %pISp",
+ op->index, op->ac.index, op->ac.alist->nr_addrs,
+- &op->ac.alist->addrs[op->ac.index].transport);
++ &op->ac.alist->addrs[op->ac.index].srx.transport);
+
+ _leave(" = t");
+ return true;
+diff --git a/fs/afs/rxrpc.c b/fs/afs/rxrpc.c
+index d642d06a453b..181317126e43 100644
+--- a/fs/afs/rxrpc.c
++++ b/fs/afs/rxrpc.c
+@@ -296,7 +296,7 @@ static void afs_notify_end_request_tx(struct sock *sock,
+ */
+ void afs_make_call(struct afs_addr_cursor *ac, struct afs_call *call, gfp_t gfp)
+ {
+- struct sockaddr_rxrpc *srx = &ac->alist->addrs[ac->index];
++ struct sockaddr_rxrpc *srx = &ac->alist->addrs[ac->index].srx;
+ struct rxrpc_call *rxcall;
+ struct msghdr msg;
+ struct kvec iov[1];
+@@ -461,7 +461,7 @@ static void afs_log_error(struct afs_call *call, s32 remote_abort)
+ max = m + 1;
+ pr_notice("kAFS: Peer reported %s failure on %s [%pISp]\n",
+ msg, call->type->name,
+- &call->alist->addrs[call->addr_ix].transport);
++ &call->alist->addrs[call->addr_ix].srx.transport);
+ }
+ }
+
+diff --git a/fs/afs/server.c b/fs/afs/server.c
+index 0bd2f5ba6900..b8e2d211d4a1 100644
+--- a/fs/afs/server.c
++++ b/fs/afs/server.c
+@@ -43,7 +43,7 @@ struct afs_server *afs_find_server(struct afs_net *net,
+ hlist_for_each_entry_rcu(server, &net->fs_addresses6, addr6_link) {
+ alist = rcu_dereference(server->addresses);
+ for (i = alist->nr_ipv4; i < alist->nr_addrs; i++) {
+- b = &alist->addrs[i].transport.sin6;
++ b = &alist->addrs[i].srx.transport.sin6;
+ diff = ((u16 __force)a->sin6_port -
+ (u16 __force)b->sin6_port);
+ if (diff == 0)
+@@ -59,7 +59,7 @@ struct afs_server *afs_find_server(struct afs_net *net,
+ hlist_for_each_entry_rcu(server, &net->fs_addresses4, addr4_link) {
+ alist = rcu_dereference(server->addresses);
+ for (i = 0; i < alist->nr_ipv4; i++) {
+- b = &alist->addrs[i].transport.sin;
++ b = &alist->addrs[i].srx.transport.sin;
+ diff = ((u16 __force)a->sin_port -
+ (u16 __force)b->sin_port);
+ if (diff == 0)
+diff --git a/fs/afs/vl_alias.c b/fs/afs/vl_alias.c
+index f04a80e4f5c3..d3c0df70a1a5 100644
+--- a/fs/afs/vl_alias.c
++++ b/fs/afs/vl_alias.c
+@@ -94,8 +94,8 @@ static int afs_compare_fs_alists(const struct afs_server *server_a,
+ lb = rcu_dereference(server_b->addresses);
+
+ while (a < la->nr_addrs && b < lb->nr_addrs) {
+- const struct sockaddr_rxrpc *srx_a = &la->addrs[a];
+- const struct sockaddr_rxrpc *srx_b = &lb->addrs[b];
++ const struct sockaddr_rxrpc *srx_a = &la->addrs[a].srx;
++ const struct sockaddr_rxrpc *srx_b = &lb->addrs[b].srx;
+ int diff = afs_compare_addrs(srx_a, srx_b);
+
+ if (diff < 0) {
+diff --git a/fs/afs/vl_probe.c b/fs/afs/vl_probe.c
+index 58452b86e672..bdd9372e3fb2 100644
+--- a/fs/afs/vl_probe.c
++++ b/fs/afs/vl_probe.c
+@@ -106,12 +106,12 @@ void afs_vlserver_probe_result(struct afs_call *call)
+ if (call->service_id == YFS_VL_SERVICE) {
+ server->probe.flags |= AFS_VLSERVER_PROBE_IS_YFS;
+ set_bit(AFS_VLSERVER_FL_IS_YFS, &server->flags);
+- alist->addrs[index].srx_service = call->service_id;
++ alist->addrs[index].srx.srx_service = call->service_id;
+ } else {
+ server->probe.flags |= AFS_VLSERVER_PROBE_NOT_YFS;
+ if (!(server->probe.flags & AFS_VLSERVER_PROBE_IS_YFS)) {
+ clear_bit(AFS_VLSERVER_FL_IS_YFS, &server->flags);
+- alist->addrs[index].srx_service = call->service_id;
++ alist->addrs[index].srx.srx_service = call->service_id;
+ }
+ }
+
+@@ -131,7 +131,7 @@ void afs_vlserver_probe_result(struct afs_call *call)
+ spin_unlock(&server->probe_lock);
+
+ _debug("probe [%u][%u] %pISpc rtt=%u ret=%d",
+- server_index, index, &alist->addrs[index].transport, rtt_us, ret);
++ server_index, index, &alist->addrs[index].srx.transport, rtt_us, ret);
+
+ afs_done_one_vl_probe(server, have_result);
+ }
+diff --git a/fs/afs/vl_rotate.c b/fs/afs/vl_rotate.c
+index eb415ce56360..e52b9d4c8a0a 100644
+--- a/fs/afs/vl_rotate.c
++++ b/fs/afs/vl_rotate.c
+@@ -249,7 +249,7 @@ bool afs_select_vlserver(struct afs_vl_cursor *vc)
+
+ _debug("VL address %d/%d", vc->ac.index, vc->ac.alist->nr_addrs);
+
+- _leave(" = t %pISpc", &vc->ac.alist->addrs[vc->ac.index].transport);
++ _leave(" = t %pISpc", &vc->ac.alist->addrs[vc->ac.index].srx.transport);
+ return true;
+
+ next_server:
+--
+2.43.0
+
--- /dev/null
+From c0d15d3cc0ef5a5944b6b80309a57145d4500e26 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Fri, 20 Oct 2023 16:04:52 +0100
+Subject: afs: Use op->nr_iterations=-1 to indicate to begin fileserver
+ iteration
+
+From: David Howells <dhowells@redhat.com>
+
+[ Upstream commit 075171fd22be33acf4ab354814bfa6de1c3412ce ]
+
+Set op->nr_iterations to -1 to indicate that we need to begin fileserver
+iteration rather than setting error to SHRT_MAX. This makes it easier to
+eliminate the address cursor.
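+
+Condensed, the flow in afs_select_fileserver() becomes:
+
+  op->nr_iterations = -1;      /* set at operation allocation time */
+  ...
+  op->nr_iterations++;         /* on entry to afs_select_fileserver() */
+  if (op->nr_iterations == 0)
+          goto start;          /* first call: begin the iteration */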
+
+Signed-off-by: David Howells <dhowells@redhat.com>
+cc: Marc Dionne <marc.dionne@auristor.com>
+cc: linux-afs@lists.infradead.org
+Stable-dep-of: 17ba6f0bd14f ("afs: Fix error handling with lookup via FS.InlineBulkStatus")
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ fs/afs/fs_operation.c | 2 +-
+ fs/afs/internal.h | 2 +-
+ fs/afs/rotate.c | 11 ++++++-----
+ 3 files changed, 8 insertions(+), 7 deletions(-)
+
+diff --git a/fs/afs/fs_operation.c b/fs/afs/fs_operation.c
+index 7a3803ce3a22..3e31fae9a149 100644
+--- a/fs/afs/fs_operation.c
++++ b/fs/afs/fs_operation.c
+@@ -41,7 +41,7 @@ struct afs_operation *afs_alloc_operation(struct key *key, struct afs_volume *vo
+ op->cb_v_break = volume->cb_v_break;
+ op->debug_id = atomic_inc_return(&afs_operation_debug_counter);
+ op->error = -EDESTADDRREQ;
+- op->ac.error = SHRT_MAX;
++ op->nr_iterations = -1;
+
+ _leave(" = [op=%08x]", op->debug_id);
+ return op;
+diff --git a/fs/afs/internal.h b/fs/afs/internal.h
+index ec08b4a7e499..88381935bd66 100644
+--- a/fs/afs/internal.h
++++ b/fs/afs/internal.h
+@@ -859,7 +859,7 @@ struct afs_operation {
+ struct afs_call *call;
+ unsigned long untried; /* Bitmask of untried servers */
+ short index; /* Current server */
+- unsigned short nr_iterations; /* Number of server iterations */
++ short nr_iterations; /* Number of server iterations */
+
+ unsigned int flags;
+ #define AFS_OPERATION_STOP 0x0001 /* Set to cease iteration */
+diff --git a/fs/afs/rotate.c b/fs/afs/rotate.c
+index a108cd55bb4e..4084e023ff43 100644
+--- a/fs/afs/rotate.c
++++ b/fs/afs/rotate.c
+@@ -116,7 +116,10 @@ bool afs_select_fileserver(struct afs_operation *op)
+ unsigned int rtt;
+ int error = op->ac.error, i;
+
+- _enter("%lx[%d],%lx[%d],%d,%d",
++ op->nr_iterations++;
++
++ _enter("OP=%x+%x,%llx,%lx[%d],%lx[%d],%d,%d",
++ op->debug_id, op->nr_iterations, op->volume->vid,
+ op->untried, op->index,
+ op->ac.tried, op->ac.index,
+ error, op->ac.abort_code);
+@@ -126,13 +129,11 @@ bool afs_select_fileserver(struct afs_operation *op)
+ return false;
+ }
+
+- op->nr_iterations++;
++ if (op->nr_iterations == 0)
++ goto start;
+
+ /* Evaluate the result of the previous operation, if there was one. */
+ switch (error) {
+- case SHRT_MAX:
+- goto start;
+-
+ case 0:
+ default:
+ /* Success or local failure. Stop. */
+--
+2.43.0
+
--- /dev/null
+From 2dc8dd6e4d297a768f1c515165a1918554b25b85 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Thu, 26 Oct 2023 09:43:23 +0100
+Subject: afs: Wrap most op->error accesses with inline funcs
+
+From: David Howells <dhowells@redhat.com>
+
+[ Upstream commit 2de5599f63babb416e09b1a6be429a47910dd47c ]
+
+Wrap most op->error accesses with inline funcs, which will make it easier
+for a subsequent patch to replace op->error with something else. Two
+functions are added to this end:
+
+ (1) afs_op_error() - Get the error code.
+
+ (2) afs_op_set_error() - Set the error code.
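+
+Call sites then change along these lines:
+
+  if (afs_op_error(op) == -ENOTSUPP)    /* was: op->error == -ENOTSUPP */
+          ...;
+  afs_op_set_error(op, -ERESTARTSYS);   /* was: op->error = -ERESTARTSYS */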
+
+Signed-off-by: David Howells <dhowells@redhat.com>
+cc: Marc Dionne <marc.dionne@auristor.com>
+cc: linux-afs@lists.infradead.org
+Stable-dep-of: 17ba6f0bd14f ("afs: Fix error handling with lookup via FS.InlineBulkStatus")
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ fs/afs/dir.c | 38 +++++++++++++++---------------
+ fs/afs/file.c | 4 ++--
+ fs/afs/fs_operation.c | 21 ++++++++++-------
+ fs/afs/fsclient.c | 2 +-
+ fs/afs/inode.c | 2 +-
+ fs/afs/internal.h | 20 ++++++++++++----
+ fs/afs/rotate.c | 55 ++++++++++++++++++++++++-------------------
+ fs/afs/server.c | 6 ++---
+ fs/afs/write.c | 6 ++---
+ 9 files changed, 87 insertions(+), 67 deletions(-)
+
+diff --git a/fs/afs/dir.c b/fs/afs/dir.c
+index 2df2e9ee130d..15763418a938 100644
+--- a/fs/afs/dir.c
++++ b/fs/afs/dir.c
+@@ -886,14 +886,14 @@ static struct inode *afs_do_lookup(struct inode *dir, struct dentry *dentry,
+ * lookups contained therein are stored in the reply without aborting
+ * the whole operation.
+ */
+- op->error = -ENOTSUPP;
++ afs_op_set_error(op, -ENOTSUPP);
+ if (!cookie->one_only) {
+ op->ops = &afs_inline_bulk_status_operation;
+ afs_begin_vnode_operation(op);
+ afs_wait_for_operation(op);
+ }
+
+- if (op->error == -ENOTSUPP) {
++ if (afs_op_error(op) == -ENOTSUPP) {
+ /* We could try FS.BulkStatus next, but this aborts the entire
+ * op if any of the lookups fails - so, for the moment, revert
+ * to FS.FetchStatus for op->file[1].
+@@ -903,10 +903,10 @@ static struct inode *afs_do_lookup(struct inode *dir, struct dentry *dentry,
+ afs_begin_vnode_operation(op);
+ afs_wait_for_operation(op);
+ }
+- inode = ERR_PTR(op->error);
++ inode = ERR_PTR(afs_op_error(op));
+
+ out_op:
+- if (op->error == 0) {
++ if (!afs_op_error(op)) {
+ inode = &op->file[1].vnode->netfs.inode;
+ op->file[1].vnode = NULL;
+ }
+@@ -1281,7 +1281,7 @@ static void afs_vnode_new_inode(struct afs_operation *op)
+
+ _enter("");
+
+- ASSERTCMP(op->error, ==, 0);
++ ASSERTCMP(afs_op_error(op), ==, 0);
+
+ inode = afs_iget(op, vp);
+ if (IS_ERR(inode)) {
+@@ -1294,7 +1294,7 @@ static void afs_vnode_new_inode(struct afs_operation *op)
+
+ vnode = AFS_FS_I(inode);
+ set_bit(AFS_VNODE_NEW_CONTENT, &vnode->flags);
+- if (!op->error)
++ if (!afs_op_error(op))
+ afs_cache_permit(vnode, op->key, vnode->cb_break, &vp->scb);
+ d_instantiate(op->dentry, inode);
+ }
+@@ -1328,7 +1328,7 @@ static void afs_create_put(struct afs_operation *op)
+ {
+ _enter("op=%08x", op->debug_id);
+
+- if (op->error)
++ if (afs_op_error(op))
+ d_drop(op->dentry);
+ }
+
+@@ -1488,7 +1488,7 @@ static void afs_dir_remove_link(struct afs_operation *op)
+ struct dentry *dentry = op->dentry;
+ int ret;
+
+- if (op->error != 0 ||
++ if (afs_op_error(op) ||
+ (op->file[1].scb.have_status && op->file[1].scb.have_error))
+ return;
+ if (d_really_is_positive(dentry))
+@@ -1512,10 +1512,10 @@ static void afs_dir_remove_link(struct afs_operation *op)
+
+ ret = afs_validate(vnode, op->key);
+ if (ret != -ESTALE)
+- op->error = ret;
++ afs_op_set_error(op, ret);
+ }
+
+- _debug("nlink %d [val %d]", vnode->netfs.inode.i_nlink, op->error);
++ _debug("nlink %d [val %d]", vnode->netfs.inode.i_nlink, afs_op_error(op));
+ }
+
+ static void afs_unlink_success(struct afs_operation *op)
+@@ -1546,7 +1546,7 @@ static void afs_unlink_edit_dir(struct afs_operation *op)
+ static void afs_unlink_put(struct afs_operation *op)
+ {
+ _enter("op=%08x", op->debug_id);
+- if (op->unlink.need_rehash && op->error < 0 && op->error != -ENOENT)
++ if (op->unlink.need_rehash && afs_op_error(op) < 0 && afs_op_error(op) != -ENOENT)
+ d_rehash(op->dentry);
+ }
+
+@@ -1587,7 +1587,7 @@ static int afs_unlink(struct inode *dir, struct dentry *dentry)
+ /* Try to make sure we have a callback promise on the victim. */
+ ret = afs_validate(vnode, op->key);
+ if (ret < 0) {
+- op->error = ret;
++ afs_op_set_error(op, ret);
+ goto error;
+ }
+
+@@ -1596,7 +1596,7 @@ static int afs_unlink(struct inode *dir, struct dentry *dentry)
+ spin_unlock(&dentry->d_lock);
+ /* Start asynchronous writeout of the inode */
+ write_inode_now(d_inode(dentry), 0);
+- op->error = afs_sillyrename(dvnode, vnode, dentry, op->key);
++ afs_op_set_error(op, afs_sillyrename(dvnode, vnode, dentry, op->key));
+ goto error;
+ }
+ if (!d_unhashed(dentry)) {
+@@ -1617,7 +1617,7 @@ static int afs_unlink(struct inode *dir, struct dentry *dentry)
+ /* If there was a conflict with a third party, check the status of the
+ * unlinked vnode.
+ */
+- if (op->error == 0 && (op->flags & AFS_OPERATION_DIR_CONFLICT)) {
++ if (afs_op_error(op) == 0 && (op->flags & AFS_OPERATION_DIR_CONFLICT)) {
+ op->file[1].update_ctime = false;
+ op->fetch_status.which = 1;
+ op->ops = &afs_fetch_status_operation;
+@@ -1699,7 +1699,7 @@ static void afs_link_success(struct afs_operation *op)
+ static void afs_link_put(struct afs_operation *op)
+ {
+ _enter("op=%08x", op->debug_id);
+- if (op->error)
++ if (afs_op_error(op))
+ d_drop(op->dentry);
+ }
+
+@@ -1897,7 +1897,7 @@ static void afs_rename_put(struct afs_operation *op)
+ if (op->rename.rehash)
+ d_rehash(op->rename.rehash);
+ dput(op->rename.tmp);
+- if (op->error)
++ if (afs_op_error(op))
+ d_rehash(op->dentry);
+ }
+
+@@ -1942,7 +1942,7 @@ static int afs_rename(struct mnt_idmap *idmap, struct inode *old_dir,
+ return PTR_ERR(op);
+
+ ret = afs_validate(vnode, op->key);
+- op->error = ret;
++ afs_op_set_error(op, ret);
+ if (ret < 0)
+ goto error;
+
+@@ -1979,7 +1979,7 @@ static int afs_rename(struct mnt_idmap *idmap, struct inode *old_dir,
+ op->rename.tmp = d_alloc(new_dentry->d_parent,
+ &new_dentry->d_name);
+ if (!op->rename.tmp) {
+- op->error = -ENOMEM;
++ afs_op_nomem(op);
+ goto error;
+ }
+
+@@ -1987,7 +1987,7 @@ static int afs_rename(struct mnt_idmap *idmap, struct inode *old_dir,
+ AFS_FS_I(d_inode(new_dentry)),
+ new_dentry, op->key);
+ if (ret) {
+- op->error = ret;
++ afs_op_set_error(op, ret);
+ goto error;
+ }
+
+diff --git a/fs/afs/file.c b/fs/afs/file.c
+index d37dd201752b..0c81c39c32f5 100644
+--- a/fs/afs/file.c
++++ b/fs/afs/file.c
+@@ -243,7 +243,7 @@ static void afs_fetch_data_notify(struct afs_operation *op)
+ {
+ struct afs_read *req = op->fetch.req;
+ struct netfs_io_subrequest *subreq = req->subreq;
+- int error = op->error;
++ int error = afs_op_error(op);
+
+ if (error == -ECONNABORTED)
+ error = afs_abort_to_error(op->ac.abort_code);
+@@ -271,7 +271,7 @@ static void afs_fetch_data_success(struct afs_operation *op)
+
+ static void afs_fetch_data_put(struct afs_operation *op)
+ {
+- op->fetch.req->error = op->error;
++ op->fetch.req->error = afs_op_error(op);
+ afs_put_read(op->fetch.req);
+ }
+
+diff --git a/fs/afs/fs_operation.c b/fs/afs/fs_operation.c
+index 3e31fae9a149..bfb9a7634bd9 100644
+--- a/fs/afs/fs_operation.c
++++ b/fs/afs/fs_operation.c
+@@ -40,8 +40,8 @@ struct afs_operation *afs_alloc_operation(struct key *key, struct afs_volume *vo
+ op->net = volume->cell->net;
+ op->cb_v_break = volume->cb_v_break;
+ op->debug_id = atomic_inc_return(&afs_operation_debug_counter);
+- op->error = -EDESTADDRREQ;
+ op->nr_iterations = -1;
++ afs_op_set_error(op, -EDESTADDRREQ);
+
+ _leave(" = [op=%08x]", op->debug_id);
+ return op;
+@@ -71,7 +71,7 @@ static bool afs_get_io_locks(struct afs_operation *op)
+ swap(vnode, vnode2);
+
+ if (mutex_lock_interruptible(&vnode->io_lock) < 0) {
+- op->error = -ERESTARTSYS;
++ afs_op_set_error(op, -ERESTARTSYS);
+ op->flags |= AFS_OPERATION_STOP;
+ _leave(" = f [I 0]");
+ return false;
+@@ -80,7 +80,7 @@ static bool afs_get_io_locks(struct afs_operation *op)
+
+ if (vnode2) {
+ if (mutex_lock_interruptible_nested(&vnode2->io_lock, 1) < 0) {
+- op->error = -ERESTARTSYS;
++ afs_op_set_error(op, -ERESTARTSYS);
+ op->flags |= AFS_OPERATION_STOP;
+ mutex_unlock(&vnode->io_lock);
+ op->flags &= ~AFS_OPERATION_LOCK_0;
+@@ -159,11 +159,14 @@ static void afs_end_vnode_operation(struct afs_operation *op)
+ {
+ _enter("");
+
+- if (op->error == -EDESTADDRREQ ||
+- op->error == -EADDRNOTAVAIL ||
+- op->error == -ENETUNREACH ||
+- op->error == -EHOSTUNREACH)
++ switch (afs_op_error(op)) {
++ case -EDESTADDRREQ:
++ case -EADDRNOTAVAIL:
++ case -ENETUNREACH:
++ case -EHOSTUNREACH:
+ afs_dump_edestaddrreq(op);
++ break;
++ }
+
+ afs_drop_io_locks(op);
+
+@@ -209,7 +212,7 @@ void afs_wait_for_operation(struct afs_operation *op)
+
+ afs_end_vnode_operation(op);
+
+- if (op->error == 0 && op->ops->edit_dir) {
++ if (!afs_op_error(op) && op->ops->edit_dir) {
+ _debug("edit_dir");
+ op->ops->edit_dir(op);
+ }
+@@ -221,7 +224,7 @@ void afs_wait_for_operation(struct afs_operation *op)
+ */
+ int afs_put_operation(struct afs_operation *op)
+ {
+- int i, ret = op->error;
++ int i, ret = afs_op_error(op);
+
+ _enter("op=%08x,%d", op->debug_id, ret);
+
+diff --git a/fs/afs/fsclient.c b/fs/afs/fsclient.c
+index 7d37f63ef0f0..6821ce0f9d63 100644
+--- a/fs/afs/fsclient.c
++++ b/fs/afs/fsclient.c
+@@ -1899,7 +1899,7 @@ void afs_fs_inline_bulk_status(struct afs_operation *op)
+ int i;
+
+ if (test_bit(AFS_SERVER_FL_NO_IBULK, &op->server->flags)) {
+- op->error = -ENOTSUPP;
++ afs_op_set_error(op, -ENOTSUPP);
+ return;
+ }
+
+diff --git a/fs/afs/inode.c b/fs/afs/inode.c
+index 78efc9719349..d6eed332507f 100644
+--- a/fs/afs/inode.c
++++ b/fs/afs/inode.c
+@@ -331,7 +331,7 @@ static void afs_fetch_status_success(struct afs_operation *op)
+
+ if (vnode->netfs.inode.i_state & I_NEW) {
+ ret = afs_inode_init_from_status(op, vp, vnode);
+- op->error = ret;
++ afs_op_set_error(op, ret);
+ if (ret == 0)
+ afs_cache_permit(vnode, op->key, vp->cb_break_before, &vp->scb);
+ } else {
+diff --git a/fs/afs/internal.h b/fs/afs/internal.h
+index 88381935bd66..1a306df267b0 100644
+--- a/fs/afs/internal.h
++++ b/fs/afs/internal.h
+@@ -1140,11 +1140,6 @@ extern bool afs_begin_vnode_operation(struct afs_operation *);
+ extern void afs_wait_for_operation(struct afs_operation *);
+ extern int afs_do_sync_operation(struct afs_operation *);
+
+-static inline void afs_op_nomem(struct afs_operation *op)
+-{
+- op->error = -ENOMEM;
+-}
+-
+ static inline void afs_op_set_vnode(struct afs_operation *op, unsigned int n,
+ struct afs_vnode *vnode)
+ {
+@@ -1238,6 +1233,21 @@ static inline void __afs_stat(atomic_t *s)
+ extern int afs_abort_to_error(u32);
+ extern void afs_prioritise_error(struct afs_error *, int, u32);
+
++static inline void afs_op_nomem(struct afs_operation *op)
++{
++ op->error = -ENOMEM;
++}
++
++static inline int afs_op_error(const struct afs_operation *op)
++{
++ return op->error;
++}
++
++static inline int afs_op_set_error(struct afs_operation *op, int error)
++{
++ return op->error = error;
++}
++
+ /*
+ * mntpt.c
+ */
+diff --git a/fs/afs/rotate.c b/fs/afs/rotate.c
+index 4084e023ff43..d64c1d90faed 100644
+--- a/fs/afs/rotate.c
++++ b/fs/afs/rotate.c
+@@ -51,7 +51,7 @@ static bool afs_start_fs_iteration(struct afs_operation *op,
+ * and have to return an error.
+ */
+ if (op->flags & AFS_OPERATION_CUR_ONLY) {
+- op->error = -ESTALE;
++ afs_op_set_error(op, -ESTALE);
+ return false;
+ }
+
+@@ -93,7 +93,7 @@ static bool afs_sleep_and_retry(struct afs_operation *op)
+ if (!(op->flags & AFS_OPERATION_UNINTR)) {
+ msleep_interruptible(1000);
+ if (signal_pending(current)) {
+- op->error = -ERESTARTSYS;
++ afs_op_set_error(op, -ERESTARTSYS);
+ return false;
+ }
+ } else {
+@@ -137,7 +137,7 @@ bool afs_select_fileserver(struct afs_operation *op)
+ case 0:
+ default:
+ /* Success or local failure. Stop. */
+- op->error = error;
++ afs_op_set_error(op, error);
+ op->flags |= AFS_OPERATION_STOP;
+ _leave(" = f [okay/local %d]", error);
+ return false;
+@@ -174,11 +174,13 @@ bool afs_select_fileserver(struct afs_operation *op)
+
+ set_bit(AFS_VOLUME_NEEDS_UPDATE, &op->volume->flags);
+ error = afs_check_volume_status(op->volume, op);
+- if (error < 0)
+- goto failed_set_error;
++ if (error < 0) {
++ afs_op_set_error(op, error);
++ goto failed;
++ }
+
+ if (test_bit(AFS_VOLUME_DELETED, &op->volume->flags)) {
+- op->error = -ENOMEDIUM;
++ afs_op_set_error(op, -ENOMEDIUM);
+ goto failed;
+ }
+
+@@ -250,11 +252,11 @@ bool afs_select_fileserver(struct afs_operation *op)
+ clear_bit(AFS_VOLUME_BUSY, &op->volume->flags);
+ }
+ if (op->flags & AFS_OPERATION_NO_VSLEEP) {
+- op->error = -EADV;
++ afs_op_set_error(op, -EADV);
+ goto failed;
+ }
+ if (op->flags & AFS_OPERATION_CUR_ONLY) {
+- op->error = -ESTALE;
++ afs_op_set_error(op, -ESTALE);
+ goto failed;
+ }
+ goto busy;
+@@ -275,7 +277,7 @@ bool afs_select_fileserver(struct afs_operation *op)
+ * lock we need to maintain.
+ */
+ if (op->flags & AFS_OPERATION_NO_VSLEEP) {
+- op->error = -EBUSY;
++ afs_op_set_error(op, -EBUSY);
+ goto failed;
+ }
+ if (!test_and_set_bit(AFS_VOLUME_BUSY, &op->volume->flags)) {
+@@ -304,7 +306,7 @@ bool afs_select_fileserver(struct afs_operation *op)
+ * honour, just in case someone sets up a loop.
+ */
+ if (op->flags & AFS_OPERATION_VMOVED) {
+- op->error = -EREMOTEIO;
++ afs_op_set_error(op, -EREMOTEIO);
+ goto failed;
+ }
+ op->flags |= AFS_OPERATION_VMOVED;
+@@ -312,8 +314,10 @@ bool afs_select_fileserver(struct afs_operation *op)
+ set_bit(AFS_VOLUME_WAIT, &op->volume->flags);
+ set_bit(AFS_VOLUME_NEEDS_UPDATE, &op->volume->flags);
+ error = afs_check_volume_status(op->volume, op);
+- if (error < 0)
+- goto failed_set_error;
++ if (error < 0) {
++ afs_op_set_error(op, error);
++ goto failed;
++ }
+
+ /* If the server list didn't change, then the VLDB is
+ * out of sync with the fileservers. This is hopefully
+@@ -344,7 +348,7 @@ bool afs_select_fileserver(struct afs_operation *op)
+ * Translate locally and return ENOSPC.
+ * No replicas to failover to.
+ */
+- op->error = -ENOSPC;
++ afs_op_set_error(op, -ENOSPC);
+ goto failed_but_online;
+
+ case VOVERQUOTA:
+@@ -353,7 +357,7 @@ bool afs_select_fileserver(struct afs_operation *op)
+ * Translate locally and return EDQUOT.
+ * No replicas to failover to.
+ */
+- op->error = -EDQUOT;
++ afs_op_set_error(op, -EDQUOT);
+ goto failed_but_online;
+
+ default:
+@@ -366,7 +370,7 @@ bool afs_select_fileserver(struct afs_operation *op)
+
+ case -ETIMEDOUT:
+ case -ETIME:
+- if (op->error != -EDESTADDRREQ)
++ if (afs_op_error(op) != -EDESTADDRREQ)
+ goto iterate_address;
+ fallthrough;
+ case -ERFKILL:
+@@ -385,7 +389,7 @@ bool afs_select_fileserver(struct afs_operation *op)
+ fallthrough;
+ case -ECONNRESET:
+ _debug("call reset");
+- op->error = error;
++ afs_op_set_error(op, error);
+ goto failed;
+ }
+
+@@ -401,8 +405,10 @@ bool afs_select_fileserver(struct afs_operation *op)
+ * volume may have moved or even have been deleted.
+ */
+ error = afs_check_volume_status(op->volume, op);
+- if (error < 0)
+- goto failed_set_error;
++ if (error < 0) {
++ afs_op_set_error(op, error);
++ goto failed;
++ }
+
+ if (!afs_start_fs_iteration(op, vnode))
+ goto failed;
+@@ -413,8 +419,10 @@ bool afs_select_fileserver(struct afs_operation *op)
+ _debug("pick [%lx]", op->untried);
+
+ error = afs_wait_for_fs_probes(op->server_list, op->untried);
+- if (error < 0)
+- goto failed_set_error;
++ if (error < 0) {
++ afs_op_set_error(op, error);
++ goto failed;
++ }
+
+ /* Pick the untried server with the lowest RTT. If we have outstanding
+ * callbacks, we stick with the server we're already using if we can.
+@@ -515,7 +523,8 @@ bool afs_select_fileserver(struct afs_operation *op)
+ op->flags &= ~AFS_OPERATION_RETRY_SERVER;
+ goto retry_server;
+ case -ERESTARTSYS:
+- goto failed_set_error;
++ afs_op_set_error(op, error);
++ goto failed;
+ case -ETIME:
+ case -EDESTADDRREQ:
+ goto next_server;
+@@ -544,13 +553,11 @@ bool afs_select_fileserver(struct afs_operation *op)
+ }
+
+ error = e.error;
+-
+-failed_set_error:
+ op->error = error;
+ failed:
+ op->flags |= AFS_OPERATION_STOP;
+ afs_end_cursor(&op->ac);
+- _leave(" = f [failed %d]", op->error);
++ _leave(" = f [failed %d]", afs_op_error(op));
+ return false;
+ }
+
+diff --git a/fs/afs/server.c b/fs/afs/server.c
+index 5b5fa94005c9..2826e6eced71 100644
+--- a/fs/afs/server.c
++++ b/fs/afs/server.c
+@@ -629,8 +629,8 @@ static noinline bool afs_update_server_record(struct afs_operation *op,
+ _leave(" = t [intr]");
+ return true;
+ }
+- op->error = PTR_ERR(alist);
+- _leave(" = f [%d]", op->error);
++ afs_op_set_error(op, PTR_ERR(alist));
++ _leave(" = f [%d]", afs_op_error(op));
+ return false;
+ }
+
+@@ -684,7 +684,7 @@ bool afs_check_server_record(struct afs_operation *op, struct afs_server *server
+ (op->flags & AFS_OPERATION_UNINTR) ?
+ TASK_UNINTERRUPTIBLE : TASK_INTERRUPTIBLE);
+ if (ret == -ERESTARTSYS) {
+- op->error = ret;
++ afs_op_set_error(op, ret);
+ _leave(" = f [intr]");
+ return false;
+ }
+diff --git a/fs/afs/write.c b/fs/afs/write.c
+index 4a168781936b..9f90d8970ce9 100644
+--- a/fs/afs/write.c
++++ b/fs/afs/write.c
+@@ -366,7 +366,7 @@ static void afs_store_data_success(struct afs_operation *op)
+
+ op->ctime = op->file[0].scb.status.mtime_client;
+ afs_vnode_commit_status(op, &op->file[0]);
+- if (op->error == 0) {
++ if (!afs_op_error(op)) {
+ if (!op->store.laundering)
+ afs_pages_written_back(vnode, op->store.pos, op->store.size);
+ afs_stat_v(vnode, n_stores);
+@@ -428,7 +428,7 @@ static int afs_store_data(struct afs_vnode *vnode, struct iov_iter *iter, loff_t
+
+ afs_wait_for_operation(op);
+
+- switch (op->error) {
++ switch (afs_op_error(op)) {
+ case -EACCES:
+ case -EPERM:
+ case -ENOKEY:
+@@ -447,7 +447,7 @@ static int afs_store_data(struct afs_vnode *vnode, struct iov_iter *iter, loff_t
+ }
+
+ afs_put_wb_key(wbk);
+- _leave(" = %d", op->error);
++ _leave(" = %d", afs_op_error(op));
+ return afs_put_operation(op);
+ }
+
+--
+2.43.0
+
--- /dev/null
+From 171db765ad289c8587da5dec137d4deb2f99c402 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Wed, 17 Jan 2024 15:45:14 -0800
+Subject: bnxt_en: Prevent kernel warning when running offline self test
+
+From: Michael Chan <michael.chan@broadcom.com>
+
+[ Upstream commit c20f482129a582455f02eb9a6dcb2a4215274599 ]
+
+We call bnxt_half_open_nic() to setup the chip partially to run
+loopback tests. The rings and buffers are initialized normally
+so that we can transmit and receive packets in loopback mode.
+That means page pool buffers are allocated for the aggregation ring
+just like the normal case. NAPI is not needed because we are just
+polling for the loopback packets.
+
+When we're done with the loopback tests, we call bnxt_half_close_nic()
+to clean up. When freeing the page pools, we hit a WARN_ON()
+in page_pool_unlink_napi() because the NAPI state linked to the
+page pool is uninitialized.
+
+The simplest way to avoid this warning is just to initialize the
+NAPIs during half open and delete the NAPIs during half close.
+Trying to skip the page pool initialization or skip linking of
+NAPI during half open will be more complicated.
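+
+In outline, condensed from the diff below:
+
+  int bnxt_half_open_nic(struct bnxt *bp)
+  {
+          ...
+          bnxt_init_napi(bp);          /* link NAPIs before ring init */
+          set_bit(BNXT_STATE_HALF_OPEN, &bp->state);
+          rc = bnxt_init_nic(bp, true);
+          if (rc) {
+                  clear_bit(BNXT_STATE_HALF_OPEN, &bp->state);
+                  bnxt_del_napi(bp);   /* unwind on failure */
+                  ...
+          }
+          ...
+  }
+
+  void bnxt_half_close_nic(struct bnxt *bp)
+  {
+          bnxt_hwrm_resource_free(bp, false, true);
+          bnxt_del_napi(bp);           /* delete NAPIs before freeing pools */
+          bnxt_free_skbs(bp);
+          bnxt_free_mem(bp, true);     /* page_pool_destroy() happens here */
+          clear_bit(BNXT_STATE_HALF_OPEN, &bp->state);
+  }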
+
+This fix avoids this warning:
+
+WARNING: CPU: 4 PID: 46967 at net/core/page_pool.c:946 page_pool_unlink_napi+0x1f/0x30
+CPU: 4 PID: 46967 Comm: ethtool Tainted: G S W 6.7.0-rc5+ #22
+Hardware name: Dell Inc. PowerEdge R750/06V45N, BIOS 1.3.8 08/31/2021
+RIP: 0010:page_pool_unlink_napi+0x1f/0x30
+Code: 90 90 90 90 90 90 90 90 90 90 90 0f 1f 44 00 00 48 8b 47 18 48 85 c0 74 1b 48 8b 50 10 83 e2 01 74 08 8b 40 34 83 f8 ff 74 02 <0f> 0b 48 c7 47 18 00 00 00 00 c3 cc cc cc cc 66 90 90 90 90 90 90
+RSP: 0018:ffa000003d0dfbe8 EFLAGS: 00010246
+RAX: ff110003607ce640 RBX: ff110010baf5d000 RCX: 0000000000000008
+RDX: 0000000000000000 RSI: ff110001e5e522c0 RDI: ff110010baf5d000
+RBP: ff11000145539b40 R08: 0000000000000001 R09: ffffffffc063f641
+R10: ff110001361eddb8 R11: 000000000040000f R12: 0000000000000001
+R13: 000000000000001c R14: ff1100014553a080 R15: 0000000000003fc0
+FS: 00007f9301c4f740(0000) GS:ff1100103fd00000(0000) knlGS:0000000000000000
+CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
+CR2: 00007f91344fa8f0 CR3: 00000003527cc005 CR4: 0000000000771ef0
+DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
+DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
+PKRU: 55555554
+Call Trace:
+ <TASK>
+ ? __warn+0x81/0x140
+ ? page_pool_unlink_napi+0x1f/0x30
+ ? report_bug+0x102/0x200
+ ? handle_bug+0x44/0x70
+ ? exc_invalid_op+0x13/0x60
+ ? asm_exc_invalid_op+0x16/0x20
+ ? bnxt_free_ring.isra.123+0xb1/0xd0 [bnxt_en]
+ ? page_pool_unlink_napi+0x1f/0x30
+ page_pool_destroy+0x3e/0x150
+ bnxt_free_mem+0x441/0x5e0 [bnxt_en]
+ bnxt_half_close_nic+0x2a/0x40 [bnxt_en]
+ bnxt_self_test+0x21d/0x450 [bnxt_en]
+ __dev_ethtool+0xeda/0x2e30
+ ? native_queued_spin_lock_slowpath+0x17f/0x2b0
+ ? __link_object+0xa1/0x160
+ ? _raw_spin_unlock_irqrestore+0x23/0x40
+ ? __create_object+0x5f/0x90
+ ? __kmem_cache_alloc_node+0x317/0x3c0
+ ? dev_ethtool+0x59/0x170
+ dev_ethtool+0xa7/0x170
+ dev_ioctl+0xc3/0x530
+ sock_do_ioctl+0xa8/0xf0
+ sock_ioctl+0x270/0x310
+ __x64_sys_ioctl+0x8c/0xc0
+ do_syscall_64+0x3e/0xf0
+ entry_SYSCALL_64_after_hwframe+0x6e/0x76
+
+Fixes: 294e39e0d034 ("bnxt: hook NAPIs to page pools")
+Reviewed-by: Andy Gospodarek <andrew.gospodarek@broadcom.com>
+Reviewed-by: Ajit Khaparde <ajit.khaparde@broadcom.com>
+Signed-off-by: Michael Chan <michael.chan@broadcom.com>
+Link: https://lore.kernel.org/r/20240117234515.226944-5-michael.chan@broadcom.com
+Signed-off-by: Jakub Kicinski <kuba@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/net/ethernet/broadcom/bnxt/bnxt.c | 3 +++
+ 1 file changed, 3 insertions(+)
+
+diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
+index 1019b4dc7bed..22c8bfb5ed9d 100644
+--- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c
++++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
+@@ -10627,10 +10627,12 @@ int bnxt_half_open_nic(struct bnxt *bp)
+ netdev_err(bp->dev, "bnxt_alloc_mem err: %x\n", rc);
+ goto half_open_err;
+ }
++ bnxt_init_napi(bp);
+ set_bit(BNXT_STATE_HALF_OPEN, &bp->state);
+ rc = bnxt_init_nic(bp, true);
+ if (rc) {
+ clear_bit(BNXT_STATE_HALF_OPEN, &bp->state);
++ bnxt_del_napi(bp);
+ netdev_err(bp->dev, "bnxt_init_nic err: %x\n", rc);
+ goto half_open_err;
+ }
+@@ -10649,6 +10651,7 @@ int bnxt_half_open_nic(struct bnxt *bp)
+ void bnxt_half_close_nic(struct bnxt *bp)
+ {
+ bnxt_hwrm_resource_free(bp, false, true);
++ bnxt_del_napi(bp);
+ bnxt_free_skbs(bp);
+ bnxt_free_mem(bp, true);
+ clear_bit(BNXT_STATE_HALF_OPEN, &bp->state);
+--
+2.43.0
+
--- /dev/null
+From c6cfa8547d19c5c8f5f9a9fe22bd0b1064af03a7 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Wed, 17 Jan 2024 15:45:11 -0800
+Subject: bnxt_en: Wait for FLR to complete during probe
+
+From: Michael Chan <michael.chan@broadcom.com>
+
+[ Upstream commit 3c1069fa42872f95cf3c6fedf80723d391e12d57 ]
+
+The first message to firmware may fail if the device is undergoing FLR.
+The driver has some recovery logic for this failure scenario but we must
+wait 100 msec for FLR to complete before proceeding. Otherwise the
+recovery will always fail.
+
+Fixes: ba02629ff6cb ("bnxt_en: log firmware status on firmware init failure")
+Reviewed-by: Damodharam Ammepalli <damodharam.ammepalli@broadcom.com>
+Signed-off-by: Michael Chan <michael.chan@broadcom.com>
+Link: https://lore.kernel.org/r/20240117234515.226944-2-michael.chan@broadcom.com
+Signed-off-by: Jakub Kicinski <kuba@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/net/ethernet/broadcom/bnxt/bnxt.c | 5 +++++
+ 1 file changed, 5 insertions(+)
+
+diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
+index e1f1e646cf48..1019b4dc7bed 100644
+--- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c
++++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
+@@ -12298,6 +12298,11 @@ static int bnxt_fw_init_one_p1(struct bnxt *bp)
+
+ bp->fw_cap = 0;
+ rc = bnxt_hwrm_ver_get(bp);
++ /* FW may be unresponsive after FLR. FLR must complete within 100 msec
++ * so wait before continuing with recovery.
++ */
++ if (rc)
++ msleep(100);
+ bnxt_try_map_fw_health_reg(bp);
+ if (rc) {
+ rc = bnxt_try_recover_fw(bp);
+--
+2.43.0
+
--- /dev/null
+From 7893c91364a20cba4d0c74f3b3455ab5e3175dec Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Wed, 17 Jan 2024 11:02:25 +1030
+Subject: btrfs: scrub: avoid use-after-free when chunk length is not 64K
+ aligned
+
+From: Qu Wenruo <wqu@suse.com>
+
+[ Upstream commit f546c4282673497a06ecb6190b50ae7f6c85b02f ]
+
+[BUG]
+There is a bug report that, on an ext4-converted btrfs, scrub leads to
+various problems, including:
+
+- "unable to find chunk map" errors
+ BTRFS info (device vdb): scrub: started on devid 1
+ BTRFS critical (device vdb): unable to find chunk map for logical 2214744064 length 4096
+ BTRFS critical (device vdb): unable to find chunk map for logical 2214744064 length 45056
+
+  This would lead to unrepairable errors.
+
+- Use-after-free KASAN reports:
+ ==================================================================
+ BUG: KASAN: slab-use-after-free in __blk_rq_map_sg+0x18f/0x7c0
+ Read of size 8 at addr ffff8881013c9040 by task btrfs/909
+ CPU: 0 PID: 909 Comm: btrfs Not tainted 6.7.0-x64v3-dbg #11 c50636e9419a8354555555245df535e380563b2b
+ Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 2023.11-2 12/24/2023
+ Call Trace:
+ <TASK>
+ dump_stack_lvl+0x43/0x60
+ print_report+0xcf/0x640
+ kasan_report+0xa6/0xd0
+ __blk_rq_map_sg+0x18f/0x7c0
+ virtblk_prep_rq.isra.0+0x215/0x6a0 [virtio_blk 19a65eeee9ae6fcf02edfad39bb9ddee07dcdaff]
+ virtio_queue_rqs+0xc4/0x310 [virtio_blk 19a65eeee9ae6fcf02edfad39bb9ddee07dcdaff]
+ blk_mq_flush_plug_list.part.0+0x780/0x860
+ __blk_flush_plug+0x1ba/0x220
+ blk_finish_plug+0x3b/0x60
+ submit_initial_group_read+0x10a/0x290 [btrfs e57987a360bed82fe8756dcd3e0de5406ccfe965]
+ flush_scrub_stripes+0x38e/0x430 [btrfs e57987a360bed82fe8756dcd3e0de5406ccfe965]
+ scrub_stripe+0x82a/0xae0 [btrfs e57987a360bed82fe8756dcd3e0de5406ccfe965]
+ scrub_chunk+0x178/0x200 [btrfs e57987a360bed82fe8756dcd3e0de5406ccfe965]
+ scrub_enumerate_chunks+0x4bc/0xa30 [btrfs e57987a360bed82fe8756dcd3e0de5406ccfe965]
+ btrfs_scrub_dev+0x398/0x810 [btrfs e57987a360bed82fe8756dcd3e0de5406ccfe965]
+ btrfs_ioctl+0x4b9/0x3020 [btrfs e57987a360bed82fe8756dcd3e0de5406ccfe965]
+ __x64_sys_ioctl+0xbd/0x100
+ do_syscall_64+0x5d/0xe0
+ entry_SYSCALL_64_after_hwframe+0x63/0x6b
+ RIP: 0033:0x7f47e5e0952b
+
+- Crash, mostly due to above use-after-free
+
+[CAUSE]
+The converted fs has the following data chunk layout:
+
+ item 2 key (FIRST_CHUNK_TREE CHUNK_ITEM 2214658048) itemoff 16025 itemsize 80
+ length 86016 owner 2 stripe_len 65536 type DATA|single
+
+For the above logical bytenr 2214744064, it's at the chunk end
+(2214658048 + 86016 = 2214744064).
+
+This means btrfs_submit_bio() would split the bio, and trigger the endio
+function for each of the two halves.
+
+However, scrub_submit_initial_read() expects the endio function to be
+called only once.
+This means the first endio call would already have freed the bbio::bio,
+leaving the bvecs freed, so the second endio call leads to a
+use-after-free.
+
+[FIX]
+- Make sure scrub_read_endio() only updates bits in its range
+  Since we may read less than 64K at the end of the chunk, we should not
+  touch the bits beyond the chunk boundary.
+
+- Make sure scrub_submit_initial_read() only reads the chunk range
+  This is done by calculating the real number of sectors we need to
+  read, and adding them to the bio sector by sector.
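+
+  Condensed from the diff below, the two calculations are:
+
+    /* scrub_read_endio(): only touch the bits this bio covered */
+    sector_nr = calc_sector_number(stripe, bio_first_bvec_all(&bbio->bio));
+    bio_for_each_bvec_all(bvec, &bbio->bio, i)
+            bio_size += bvec->bv_len;
+    num_sectors = bio_size >> stripe->bg->fs_info->sectorsize_bits;
+
+    /* scrub_submit_initial_read(): stop at the chunk boundary */
+    nr_sectors = min(BTRFS_STRIPE_LEN, stripe->bg->start +
+                     stripe->bg->length - stripe->logical) >>
+                 fs_info->sectorsize_bits;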
+
+Thankfully the scrub read repair path won't need extra fixes:
+
+- scrub_stripe_submit_repair_read()
+  With the above fixes, we won't update error bits for ranges beyond the
+  chunk, thus scrub_stripe_submit_repair_read() should never submit any
+  read beyond the chunk.
+
+Reported-by: Rongrong <i@rong.moe>
+Fixes: e02ee89baa66 ("btrfs: scrub: switch scrub_simple_mirror() to scrub_stripe infrastructure")
+Tested-by: Rongrong <i@rong.moe>
+Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
+Signed-off-by: Qu Wenruo <wqu@suse.com>
+Signed-off-by: David Sterba <dsterba@suse.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ fs/btrfs/scrub.c | 29 ++++++++++++++++++++++-------
+ 1 file changed, 22 insertions(+), 7 deletions(-)
+
+diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
+index f62a408671cb..443d2519f0a9 100644
+--- a/fs/btrfs/scrub.c
++++ b/fs/btrfs/scrub.c
+@@ -1099,12 +1099,22 @@ static void scrub_stripe_read_repair_worker(struct work_struct *work)
+ static void scrub_read_endio(struct btrfs_bio *bbio)
+ {
+ struct scrub_stripe *stripe = bbio->private;
++ struct bio_vec *bvec;
++ int sector_nr = calc_sector_number(stripe, bio_first_bvec_all(&bbio->bio));
++ int num_sectors;
++ u32 bio_size = 0;
++ int i;
++
++ ASSERT(sector_nr < stripe->nr_sectors);
++ bio_for_each_bvec_all(bvec, &bbio->bio, i)
++ bio_size += bvec->bv_len;
++ num_sectors = bio_size >> stripe->bg->fs_info->sectorsize_bits;
+
+ if (bbio->bio.bi_status) {
+- bitmap_set(&stripe->io_error_bitmap, 0, stripe->nr_sectors);
+- bitmap_set(&stripe->error_bitmap, 0, stripe->nr_sectors);
++ bitmap_set(&stripe->io_error_bitmap, sector_nr, num_sectors);
++ bitmap_set(&stripe->error_bitmap, sector_nr, num_sectors);
+ } else {
+- bitmap_clear(&stripe->io_error_bitmap, 0, stripe->nr_sectors);
++ bitmap_clear(&stripe->io_error_bitmap, sector_nr, num_sectors);
+ }
+ bio_put(&bbio->bio);
+ if (atomic_dec_and_test(&stripe->pending_io)) {
+@@ -1705,6 +1715,9 @@ static void scrub_submit_initial_read(struct scrub_ctx *sctx,
+ {
+ struct btrfs_fs_info *fs_info = sctx->fs_info;
+ struct btrfs_bio *bbio;
++ unsigned int nr_sectors = min(BTRFS_STRIPE_LEN, stripe->bg->start +
++ stripe->bg->length - stripe->logical) >>
++ fs_info->sectorsize_bits;
+ int mirror = stripe->mirror_num;
+
+ ASSERT(stripe->bg);
+@@ -1719,14 +1732,16 @@ static void scrub_submit_initial_read(struct scrub_ctx *sctx,
+ bbio = btrfs_bio_alloc(SCRUB_STRIPE_PAGES, REQ_OP_READ, fs_info,
+ scrub_read_endio, stripe);
+
+- /* Read the whole stripe. */
+ bbio->bio.bi_iter.bi_sector = stripe->logical >> SECTOR_SHIFT;
+- for (int i = 0; i < BTRFS_STRIPE_LEN >> PAGE_SHIFT; i++) {
++ /* Read the whole range inside the chunk boundary. */
++ for (unsigned int cur = 0; cur < nr_sectors; cur++) {
++ struct page *page = scrub_stripe_get_page(stripe, cur);
++ unsigned int pgoff = scrub_stripe_get_page_offset(stripe, cur);
+ int ret;
+
+- ret = bio_add_page(&bbio->bio, stripe->pages[i], PAGE_SIZE, 0);
++ ret = bio_add_page(&bbio->bio, page, fs_info->sectorsize, pgoff);
+ /* We should have allocated enough bio vectors. */
+- ASSERT(ret == PAGE_SIZE);
++ ASSERT(ret == fs_info->sectorsize);
+ }
+ atomic_inc(&stripe->pending_io);
+
+--
+2.43.0
+
--- /dev/null
+From 1a1ebca1fa42f6ee08f20960c224d7176929bbc5 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Fri, 19 Jan 2024 14:43:01 +0100
+Subject: dpll: fix broken error path in dpll_pin_alloc(..)
+
+From: Arkadiusz Kubalewski <arkadiusz.kubalewski@intel.com>
+
+[ Upstream commit b6a11a7fc4d6337f7ea720b9287d1b9749c4eae0 ]
+
+If the pin type is not one of the expected values, or allocating memory
+for the pin properties fails, the unwind error path must not destroy the
+pin's xarrays, which have not yet been initialized.
+Add a new goto label and use it to fix the broken error path.
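+
+The resulting unwind ladder, as a sketch:
+
+  ret = xa_alloc(&dpll_pin_xa, &pin->id, pin, xa_limit_16b, GFP_KERNEL);
+  if (ret)
+          goto err_xa_alloc;
+  return pin;
+err_xa_alloc:
+  xa_destroy(&pin->dpll_refs);     /* only reached after xa_init_flags() */
+  xa_destroy(&pin->parent_refs);
+err_pin_prop:
+  kfree(pin);                      /* xarrays not yet initialized here */
+  return ERR_PTR(ret);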
+
+Reviewed-by: Jiri Pirko <jiri@nvidia.com>
+Signed-off-by: Arkadiusz Kubalewski <arkadiusz.kubalewski@intel.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Stable-dep-of: 830ead5fb0c5 ("dpll: fix pin dump crash for rebound module")
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/dpll/dpll_core.c | 7 ++++---
+ 1 file changed, 4 insertions(+), 3 deletions(-)
+
+diff --git a/drivers/dpll/dpll_core.c b/drivers/dpll/dpll_core.c
+index 3568149b9562..36f5c0eaf604 100644
+--- a/drivers/dpll/dpll_core.c
++++ b/drivers/dpll/dpll_core.c
+@@ -440,7 +440,7 @@ dpll_pin_alloc(u64 clock_id, u32 pin_idx, struct module *module,
+ if (WARN_ON(prop->type < DPLL_PIN_TYPE_MUX ||
+ prop->type > DPLL_PIN_TYPE_MAX)) {
+ ret = -EINVAL;
+- goto err;
++ goto err_pin_prop;
+ }
+ pin->prop = prop;
+ refcount_set(&pin->refcount, 1);
+@@ -448,11 +448,12 @@ dpll_pin_alloc(u64 clock_id, u32 pin_idx, struct module *module,
+ xa_init_flags(&pin->parent_refs, XA_FLAGS_ALLOC);
+ ret = xa_alloc(&dpll_pin_xa, &pin->id, pin, xa_limit_16b, GFP_KERNEL);
+ if (ret)
+- goto err;
++ goto err_xa_alloc;
+ return pin;
+-err:
++err_xa_alloc:
+ xa_destroy(&pin->dpll_refs);
+ xa_destroy(&pin->parent_refs);
++err_pin_prop:
+ kfree(pin);
+ return ERR_PTR(ret);
+ }
+--
+2.43.0
+
--- /dev/null
+From c92ec4869e0b3bdd99cf0a23d92cc463e45348dd Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Fri, 19 Jan 2024 14:43:02 +0100
+Subject: dpll: fix pin dump crash for rebound module
+
+From: Arkadiusz Kubalewski <arkadiusz.kubalewski@intel.com>
+
+[ Upstream commit 830ead5fb0c5855ce4d70ba2ed4a673b5f1e7d9b ]
+
+When a kernel module is unbound but the pin resources were not entirely
+freed (another kernel module instance of the same PCI device kept a
+reference to that pin), and the kernel module is bound again, the pin
+properties are not updated (the properties are only assigned when memory
+for the pin is allocated), so the prop pointer still points into the
+memory of the kernel module that was unloaded on unbind.
+
+If the pin dump is invoked in this state, the result is a kernel crash.
+Prevent the crash by storing persistent pin properties in the dpll
+subsystem: copy their content from the kernel module when the pin is
+allocated, instead of referencing the kernel module's memory.
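+
+Sketch of the copy now done at pin allocation:
+
+  ret = dpll_pin_prop_dup(prop, &pin->prop);  /* deep-copies the labels and
+                                               * supported-frequency ranges
+                                               */
+  if (ret)
+          goto err_pin_prop;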
+
+Fixes: 9431063ad323 ("dpll: core: Add DPLL framework base functions")
+Fixes: 9d71b54b65b1 ("dpll: netlink: Add DPLL framework base functions")
+Reviewed-by: Jan Glaza <jan.glaza@intel.com>
+Reviewed-by: Przemek Kitszel <przemyslaw.kitszel@intel.com>
+Signed-off-by: Arkadiusz Kubalewski <arkadiusz.kubalewski@intel.com>
+Reviewed-by: Jiri Pirko <jiri@nvidia.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/dpll/dpll_core.c | 55 +++++++++++++++++++++++++++++++++++--
+ drivers/dpll/dpll_core.h | 4 +--
+ drivers/dpll/dpll_netlink.c | 28 +++++++++----------
+ 3 files changed, 69 insertions(+), 18 deletions(-)
+
+diff --git a/drivers/dpll/dpll_core.c b/drivers/dpll/dpll_core.c
+index 36f5c0eaf604..5e3b9b5679f9 100644
+--- a/drivers/dpll/dpll_core.c
++++ b/drivers/dpll/dpll_core.c
+@@ -424,6 +424,53 @@ void dpll_device_unregister(struct dpll_device *dpll,
+ }
+ EXPORT_SYMBOL_GPL(dpll_device_unregister);
+
++static void dpll_pin_prop_free(struct dpll_pin_properties *prop)
++{
++ kfree(prop->package_label);
++ kfree(prop->panel_label);
++ kfree(prop->board_label);
++ kfree(prop->freq_supported);
++}
++
++static int dpll_pin_prop_dup(const struct dpll_pin_properties *src,
++ struct dpll_pin_properties *dst)
++{
++ memcpy(dst, src, sizeof(*dst));
++ if (src->freq_supported && src->freq_supported_num) {
++ size_t freq_size = src->freq_supported_num *
++ sizeof(*src->freq_supported);
++ dst->freq_supported = kmemdup(src->freq_supported,
++ freq_size, GFP_KERNEL);
++		if (!dst->freq_supported)
++ return -ENOMEM;
++ }
++ if (src->board_label) {
++ dst->board_label = kstrdup(src->board_label, GFP_KERNEL);
++ if (!dst->board_label)
++ goto err_board_label;
++ }
++ if (src->panel_label) {
++ dst->panel_label = kstrdup(src->panel_label, GFP_KERNEL);
++ if (!dst->panel_label)
++ goto err_panel_label;
++ }
++ if (src->package_label) {
++ dst->package_label = kstrdup(src->package_label, GFP_KERNEL);
++ if (!dst->package_label)
++ goto err_package_label;
++ }
++
++ return 0;
++
++err_package_label:
++ kfree(dst->panel_label);
++err_panel_label:
++ kfree(dst->board_label);
++err_board_label:
++ kfree(dst->freq_supported);
++ return -ENOMEM;
++}
++
+ static struct dpll_pin *
+ dpll_pin_alloc(u64 clock_id, u32 pin_idx, struct module *module,
+ const struct dpll_pin_properties *prop)
+@@ -442,7 +489,9 @@ dpll_pin_alloc(u64 clock_id, u32 pin_idx, struct module *module,
+ ret = -EINVAL;
+ goto err_pin_prop;
+ }
+- pin->prop = prop;
++ ret = dpll_pin_prop_dup(prop, &pin->prop);
++ if (ret)
++ goto err_pin_prop;
+ refcount_set(&pin->refcount, 1);
+ xa_init_flags(&pin->dpll_refs, XA_FLAGS_ALLOC);
+ xa_init_flags(&pin->parent_refs, XA_FLAGS_ALLOC);
+@@ -453,6 +502,7 @@ dpll_pin_alloc(u64 clock_id, u32 pin_idx, struct module *module,
+ err_xa_alloc:
+ xa_destroy(&pin->dpll_refs);
+ xa_destroy(&pin->parent_refs);
++ dpll_pin_prop_free(&pin->prop);
+ err_pin_prop:
+ kfree(pin);
+ return ERR_PTR(ret);
+@@ -513,6 +563,7 @@ void dpll_pin_put(struct dpll_pin *pin)
+ xa_destroy(&pin->dpll_refs);
+ xa_destroy(&pin->parent_refs);
+ xa_erase(&dpll_pin_xa, pin->id);
++ dpll_pin_prop_free(&pin->prop);
+ kfree(pin);
+ }
+ mutex_unlock(&dpll_lock);
+@@ -635,7 +686,7 @@ int dpll_pin_on_pin_register(struct dpll_pin *parent, struct dpll_pin *pin,
+ unsigned long i, stop;
+ int ret;
+
+- if (WARN_ON(parent->prop->type != DPLL_PIN_TYPE_MUX))
++ if (WARN_ON(parent->prop.type != DPLL_PIN_TYPE_MUX))
+ return -EINVAL;
+
+ if (WARN_ON(!ops) ||
+diff --git a/drivers/dpll/dpll_core.h b/drivers/dpll/dpll_core.h
+index 5585873c5c1b..717f715015c7 100644
+--- a/drivers/dpll/dpll_core.h
++++ b/drivers/dpll/dpll_core.h
+@@ -44,7 +44,7 @@ struct dpll_device {
+ * @module: module of creator
+ * @dpll_refs: hold referencees to dplls pin was registered with
+ * @parent_refs: hold references to parent pins pin was registered with
+- * @prop: pointer to pin properties given by registerer
++ * @prop: pin properties copied from the registerer
+ * @rclk_dev_name: holds name of device when pin can recover clock from it
+ * @refcount: refcount
+ **/
+@@ -55,7 +55,7 @@ struct dpll_pin {
+ struct module *module;
+ struct xarray dpll_refs;
+ struct xarray parent_refs;
+- const struct dpll_pin_properties *prop;
++ struct dpll_pin_properties prop;
+ refcount_t refcount;
+ };
+
+diff --git a/drivers/dpll/dpll_netlink.c b/drivers/dpll/dpll_netlink.c
+index ce7cf736f020..4c64611d32ac 100644
+--- a/drivers/dpll/dpll_netlink.c
++++ b/drivers/dpll/dpll_netlink.c
+@@ -278,17 +278,17 @@ dpll_msg_add_pin_freq(struct sk_buff *msg, struct dpll_pin *pin,
+ if (nla_put_64bit(msg, DPLL_A_PIN_FREQUENCY, sizeof(freq), &freq,
+ DPLL_A_PIN_PAD))
+ return -EMSGSIZE;
+- for (fs = 0; fs < pin->prop->freq_supported_num; fs++) {
++ for (fs = 0; fs < pin->prop.freq_supported_num; fs++) {
+ nest = nla_nest_start(msg, DPLL_A_PIN_FREQUENCY_SUPPORTED);
+ if (!nest)
+ return -EMSGSIZE;
+- freq = pin->prop->freq_supported[fs].min;
++ freq = pin->prop.freq_supported[fs].min;
+ if (nla_put_64bit(msg, DPLL_A_PIN_FREQUENCY_MIN, sizeof(freq),
+ &freq, DPLL_A_PIN_PAD)) {
+ nla_nest_cancel(msg, nest);
+ return -EMSGSIZE;
+ }
+- freq = pin->prop->freq_supported[fs].max;
++ freq = pin->prop.freq_supported[fs].max;
+ if (nla_put_64bit(msg, DPLL_A_PIN_FREQUENCY_MAX, sizeof(freq),
+ &freq, DPLL_A_PIN_PAD)) {
+ nla_nest_cancel(msg, nest);
+@@ -304,9 +304,9 @@ static bool dpll_pin_is_freq_supported(struct dpll_pin *pin, u32 freq)
+ {
+ int fs;
+
+- for (fs = 0; fs < pin->prop->freq_supported_num; fs++)
+- if (freq >= pin->prop->freq_supported[fs].min &&
+- freq <= pin->prop->freq_supported[fs].max)
++ for (fs = 0; fs < pin->prop.freq_supported_num; fs++)
++ if (freq >= pin->prop.freq_supported[fs].min &&
++ freq <= pin->prop.freq_supported[fs].max)
+ return true;
+ return false;
+ }
+@@ -396,7 +396,7 @@ static int
+ dpll_cmd_pin_get_one(struct sk_buff *msg, struct dpll_pin *pin,
+ struct netlink_ext_ack *extack)
+ {
+- const struct dpll_pin_properties *prop = pin->prop;
++ const struct dpll_pin_properties *prop = &pin->prop;
+ struct dpll_pin_ref *ref;
+ int ret;
+
+@@ -689,7 +689,7 @@ dpll_pin_on_pin_state_set(struct dpll_pin *pin, u32 parent_idx,
+ int ret;
+
+ if (!(DPLL_PIN_CAPABILITIES_STATE_CAN_CHANGE &
+- pin->prop->capabilities)) {
++ pin->prop.capabilities)) {
+ NL_SET_ERR_MSG(extack, "state changing is not allowed");
+ return -EOPNOTSUPP;
+ }
+@@ -725,7 +725,7 @@ dpll_pin_state_set(struct dpll_device *dpll, struct dpll_pin *pin,
+ int ret;
+
+ if (!(DPLL_PIN_CAPABILITIES_STATE_CAN_CHANGE &
+- pin->prop->capabilities)) {
++ pin->prop.capabilities)) {
+ NL_SET_ERR_MSG(extack, "state changing is not allowed");
+ return -EOPNOTSUPP;
+ }
+@@ -752,7 +752,7 @@ dpll_pin_prio_set(struct dpll_device *dpll, struct dpll_pin *pin,
+ int ret;
+
+ if (!(DPLL_PIN_CAPABILITIES_PRIORITY_CAN_CHANGE &
+- pin->prop->capabilities)) {
++ pin->prop.capabilities)) {
+ NL_SET_ERR_MSG(extack, "prio changing is not allowed");
+ return -EOPNOTSUPP;
+ }
+@@ -780,7 +780,7 @@ dpll_pin_direction_set(struct dpll_pin *pin, struct dpll_device *dpll,
+ int ret;
+
+ if (!(DPLL_PIN_CAPABILITIES_DIRECTION_CAN_CHANGE &
+- pin->prop->capabilities)) {
++ pin->prop.capabilities)) {
+ NL_SET_ERR_MSG(extack, "direction changing is not allowed");
+ return -EOPNOTSUPP;
+ }
+@@ -810,8 +810,8 @@ dpll_pin_phase_adj_set(struct dpll_pin *pin, struct nlattr *phase_adj_attr,
+ int ret;
+
+ phase_adj = nla_get_s32(phase_adj_attr);
+- if (phase_adj > pin->prop->phase_range.max ||
+- phase_adj < pin->prop->phase_range.min) {
++ if (phase_adj > pin->prop.phase_range.max ||
++ phase_adj < pin->prop.phase_range.min) {
+ NL_SET_ERR_MSG_ATTR(extack, phase_adj_attr,
+ "phase adjust value not supported");
+ return -EINVAL;
+@@ -995,7 +995,7 @@ dpll_pin_find(u64 clock_id, struct nlattr *mod_name_attr,
+ unsigned long i;
+
+ xa_for_each_marked(&dpll_pin_xa, i, pin, DPLL_REGISTERED) {
+- prop = pin->prop;
++ prop = &pin->prop;
+ cid_match = clock_id ? pin->clock_id == clock_id : true;
+ mod_match = mod_name_attr && module_name(pin->module) ?
+ !nla_strcmp(mod_name_attr,
+--
+2.43.0
+
--- /dev/null
+From c414d49fa449a866c343aa87835bb4d65c568c92 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Fri, 19 Jan 2024 14:43:04 +0100
+Subject: dpll: fix register pin with unregistered parent pin
+
+From: Arkadiusz Kubalewski <arkadiusz.kubalewski@intel.com>
+
+[ Upstream commit 7dc5b18ff71bd6f948810ab8a08b6a6ff8b315c5 ]
+
+In case of multiple kernel module instances using the same dpll device:
+if only one of them registers the dpll device, then only that one can
+register directly connected pins with it. As long as an unregistered
+parent is responsible for determining whether a muxed pin can be
+registered with it or not, the drivers need to be loaded in a serialized
+order to work correctly - first the driver instance which registers the
+direct pins needs to be loaded, then the other instances can register
+muxed-type pins.
+
+Allow registration of a pin with a parent even if the parent was not
+yet registered, thus allowing an unserialized driver instance load
+order.
+Do not WARN_ON a notification for an unregistered pin, which can be
+invoked in the described case; instead just return an error.
+
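+As a rough sketch (not part of the patch), the unserialized order this
+enables looks as follows; the ops and priv pointers are hypothetical
+driver-specific values:
+
+  /* instance B may run before instance A has registered the parent
+   * with the dpll device; only the pin objects must already exist
+   */
+  err = dpll_pin_on_pin_register(parent_pin, mux_pin, &b_pin_ops, b_priv);
+
+  /* instance A, loaded earlier or later, registers the direct pin */
+  err = dpll_pin_register(dpll, parent_pin, &a_pin_ops, a_priv);
+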
+Fixes: 9431063ad323 ("dpll: core: Add DPLL framework base functions")
+Fixes: 9d71b54b65b1 ("dpll: netlink: Add DPLL framework base functions")
+Reviewed-by: Jan Glaza <jan.glaza@intel.com>
+Reviewed-by: Jiri Pirko <jiri@nvidia.com>
+Signed-off-by: Arkadiusz Kubalewski <arkadiusz.kubalewski@intel.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/dpll/dpll_core.c | 6 ------
+ 1 file changed, 6 deletions(-)
+
+diff --git a/drivers/dpll/dpll_core.c b/drivers/dpll/dpll_core.c
+index 5e3b9b5679f9..f8fbf0394288 100644
+--- a/drivers/dpll/dpll_core.c
++++ b/drivers/dpll/dpll_core.c
+@@ -28,8 +28,6 @@ static u32 dpll_xa_id;
+ WARN_ON_ONCE(!xa_get_mark(&dpll_device_xa, (d)->id, DPLL_REGISTERED))
+ #define ASSERT_DPLL_NOT_REGISTERED(d) \
+ WARN_ON_ONCE(xa_get_mark(&dpll_device_xa, (d)->id, DPLL_REGISTERED))
+-#define ASSERT_PIN_REGISTERED(p) \
+- WARN_ON_ONCE(!xa_get_mark(&dpll_pin_xa, (p)->id, DPLL_REGISTERED))
+
+ struct dpll_device_registration {
+ struct list_head list;
+@@ -614,8 +612,6 @@ dpll_pin_register(struct dpll_device *dpll, struct dpll_pin *pin,
+ WARN_ON(!ops->state_on_dpll_get) ||
+ WARN_ON(!ops->direction_get))
+ return -EINVAL;
+- if (ASSERT_DPLL_REGISTERED(dpll))
+- return -EINVAL;
+
+ mutex_lock(&dpll_lock);
+ if (WARN_ON(!(dpll->module == pin->module &&
+@@ -693,8 +689,6 @@ int dpll_pin_on_pin_register(struct dpll_pin *parent, struct dpll_pin *pin,
+ WARN_ON(!ops->state_on_pin_get) ||
+ WARN_ON(!ops->direction_get))
+ return -EINVAL;
+- if (ASSERT_PIN_REGISTERED(parent))
+- return -EINVAL;
+
+ mutex_lock(&dpll_lock);
+ ret = dpll_xa_ref_pin_add(&pin->parent_refs, parent, ops, priv);
+--
+2.43.0
+
--- /dev/null
+From be5f9be3b22c6869a4688312effac793c9550550 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Fri, 19 Jan 2024 14:43:03 +0100
+Subject: dpll: fix userspace availability of pins
+
+From: Arkadiusz Kubalewski <arkadiusz.kubalewski@intel.com>
+
+[ Upstream commit db2ec3c94667eaeecc6a74d96594fab6baf80fdc ]
+
+If a parent pin was unregistered but a child pin was not, userspace
+would see the "zombie" pins - the ones that were registered with
+a parent pin (dpll_pin_on_pin_register(..)).
+Technically those are not available, as no dpll device in the system is
+reachable through them. Do not dump those pins and prevent userspace
+from any interaction with them. Provide a unified function to determine
+whether a pin is available and use it before acting on or responding to
+user requests.
+
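+As an illustration (a hypothetical driver sequence, not taken from the
+patch), the "zombie" state arises like this:
+
+  /* child was attached to a parent pin */
+  dpll_pin_on_pin_register(parent_pin, mux_pin, &pin_ops, priv);
+  /* ... later the parent goes away, but the child does not */
+  dpll_pin_unregister(dpll, parent_pin, &pin_ops, priv);
+  /* mux_pin still carries the DPLL_REGISTERED mark, yet no registered
+   * parent pin or dpll device is reachable from it, so the new
+   * dpll_pin_available() reports it as unavailable
+   */
+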
+Fixes: 9d71b54b65b1 ("dpll: netlink: Add DPLL framework base functions")
+Reviewed-by: Jan Glaza <jan.glaza@intel.com>
+Reviewed-by: Jiri Pirko <jiri@nvidia.com>
+Signed-off-by: Arkadiusz Kubalewski <arkadiusz.kubalewski@intel.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/dpll/dpll_netlink.c | 29 +++++++++++++++++++++++++++--
+ 1 file changed, 27 insertions(+), 2 deletions(-)
+
+diff --git a/drivers/dpll/dpll_netlink.c b/drivers/dpll/dpll_netlink.c
+index 4c64611d32ac..7cc99d627942 100644
+--- a/drivers/dpll/dpll_netlink.c
++++ b/drivers/dpll/dpll_netlink.c
+@@ -525,6 +525,24 @@ __dpll_device_change_ntf(struct dpll_device *dpll)
+ return dpll_device_event_send(DPLL_CMD_DEVICE_CHANGE_NTF, dpll);
+ }
+
++static bool dpll_pin_available(struct dpll_pin *pin)
++{
++ struct dpll_pin_ref *par_ref;
++ unsigned long i;
++
++ if (!xa_get_mark(&dpll_pin_xa, pin->id, DPLL_REGISTERED))
++ return false;
++ xa_for_each(&pin->parent_refs, i, par_ref)
++ if (xa_get_mark(&dpll_pin_xa, par_ref->pin->id,
++ DPLL_REGISTERED))
++ return true;
++ xa_for_each(&pin->dpll_refs, i, par_ref)
++ if (xa_get_mark(&dpll_device_xa, par_ref->dpll->id,
++ DPLL_REGISTERED))
++ return true;
++ return false;
++}
++
+ /**
+ * dpll_device_change_ntf - notify that the dpll device has been changed
+ * @dpll: registered dpll pointer
+@@ -551,7 +569,7 @@ dpll_pin_event_send(enum dpll_cmd event, struct dpll_pin *pin)
+ int ret = -ENOMEM;
+ void *hdr;
+
+- if (WARN_ON(!xa_get_mark(&dpll_pin_xa, pin->id, DPLL_REGISTERED)))
++ if (!dpll_pin_available(pin))
+ return -ENODEV;
+
+ msg = genlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL);
+@@ -1102,6 +1120,10 @@ int dpll_nl_pin_id_get_doit(struct sk_buff *skb, struct genl_info *info)
+ }
+ pin = dpll_pin_find_from_nlattr(info);
+ if (!IS_ERR(pin)) {
++ if (!dpll_pin_available(pin)) {
++ nlmsg_free(msg);
++ return -ENODEV;
++ }
+ ret = dpll_msg_add_pin_handle(msg, pin);
+ if (ret) {
+ nlmsg_free(msg);
+@@ -1151,6 +1173,8 @@ int dpll_nl_pin_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb)
+
+ xa_for_each_marked_start(&dpll_pin_xa, i, pin, DPLL_REGISTERED,
+ ctx->idx) {
++ if (!dpll_pin_available(pin))
++ continue;
+ hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).portid,
+ cb->nlh->nlmsg_seq,
+ &dpll_nl_family, NLM_F_MULTI,
+@@ -1413,7 +1437,8 @@ int dpll_pin_pre_doit(const struct genl_split_ops *ops, struct sk_buff *skb,
+ }
+ info->user_ptr[0] = xa_load(&dpll_pin_xa,
+ nla_get_u32(info->attrs[DPLL_A_PIN_ID]));
+- if (!info->user_ptr[0]) {
++ if (!info->user_ptr[0] ||
++ !dpll_pin_available(info->user_ptr[0])) {
+ NL_SET_ERR_MSG(info->extack, "pin not found");
+ ret = -ENODEV;
+ goto unlock_dev;
+--
+2.43.0
+
--- /dev/null
+From 2b38a16abde53bfda995910e39ff9466933e189c Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Tue, 23 Jan 2024 01:24:42 +0800
+Subject: fjes: fix memleaks in fjes_hw_setup
+
+From: Zhipeng Lu <alexious@zju.edu.cn>
+
+[ Upstream commit f6cc4b6a3ae53df425771000e9c9540cce9b7bb1 ]
+
+fjes_hw_setup allocates several memory regions and delays their
+deallocation to fjes_hw_exit in fjes_probe, through the following call
+chain:
+
+fjes_probe
+ |-> fjes_hw_init
+ |-> fjes_hw_setup
+ |-> fjes_hw_exit
+
+However, when fjes_hw_setup fails, fjes_hw_exit won't be called and thus
+all the resources allocated in fjes_hw_setup will be leaked. In this
+patch, we free those resources in fjes_hw_setup's error paths and
+prevent such leaks.
+
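+The shape of the fix is the usual kernel goto-unwind ladder: each
+allocation that can fail jumps to a label that releases everything
+allocated before it, in reverse order. A minimal sketch of the pattern
+(struct and size names are illustrative only; kernel context assumed
+for kzalloc/kfree):
+
+  struct bufs { void *req; void *res; };
+
+  static int bufs_setup(struct bufs *b, size_t req_sz, size_t res_sz)
+  {
+          b->req = kzalloc(req_sz, GFP_KERNEL);
+          if (!b->req)
+                  return -ENOMEM;
+          b->res = kzalloc(res_sz, GFP_KERNEL);
+          if (!b->res)
+                  goto free_req;     /* undo the earlier allocation */
+          return 0;
+
+  free_req:
+          kfree(b->req);
+          b->req = NULL;
+          return -ENOMEM;
+  }
+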
+Fixes: 2fcbca687702 ("fjes: platform_driver's .probe and .remove routine")
+Signed-off-by: Zhipeng Lu <alexious@zju.edu.cn>
+Reviewed-by: Simon Horman <horms@kernel.org>
+Link: https://lore.kernel.org/r/20240122172445.3841883-1-alexious@zju.edu.cn
+Signed-off-by: Jakub Kicinski <kuba@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/net/fjes/fjes_hw.c | 37 ++++++++++++++++++++++++++++++-------
+ 1 file changed, 30 insertions(+), 7 deletions(-)
+
+diff --git a/drivers/net/fjes/fjes_hw.c b/drivers/net/fjes/fjes_hw.c
+index 704e949484d0..b9b5554ea862 100644
+--- a/drivers/net/fjes/fjes_hw.c
++++ b/drivers/net/fjes/fjes_hw.c
+@@ -221,21 +221,25 @@ static int fjes_hw_setup(struct fjes_hw *hw)
+
+ mem_size = FJES_DEV_REQ_BUF_SIZE(hw->max_epid);
+ hw->hw_info.req_buf = kzalloc(mem_size, GFP_KERNEL);
+- if (!(hw->hw_info.req_buf))
+- return -ENOMEM;
++ if (!(hw->hw_info.req_buf)) {
++ result = -ENOMEM;
++ goto free_ep_info;
++ }
+
+ hw->hw_info.req_buf_size = mem_size;
+
+ mem_size = FJES_DEV_RES_BUF_SIZE(hw->max_epid);
+ hw->hw_info.res_buf = kzalloc(mem_size, GFP_KERNEL);
+- if (!(hw->hw_info.res_buf))
+- return -ENOMEM;
++ if (!(hw->hw_info.res_buf)) {
++ result = -ENOMEM;
++ goto free_req_buf;
++ }
+
+ hw->hw_info.res_buf_size = mem_size;
+
+ result = fjes_hw_alloc_shared_status_region(hw);
+ if (result)
+- return result;
++ goto free_res_buf;
+
+ hw->hw_info.buffer_share_bit = 0;
+ hw->hw_info.buffer_unshare_reserve_bit = 0;
+@@ -246,11 +250,11 @@ static int fjes_hw_setup(struct fjes_hw *hw)
+
+ result = fjes_hw_alloc_epbuf(&buf_pair->tx);
+ if (result)
+- return result;
++ goto free_epbuf;
+
+ result = fjes_hw_alloc_epbuf(&buf_pair->rx);
+ if (result)
+- return result;
++ goto free_epbuf;
+
+ spin_lock_irqsave(&hw->rx_status_lock, flags);
+ fjes_hw_setup_epbuf(&buf_pair->tx, mac,
+@@ -273,6 +277,25 @@ static int fjes_hw_setup(struct fjes_hw *hw)
+ 	fjes_hw_init_command_registers(hw, &param);
+
+ return 0;
++
++free_epbuf:
++ for (epidx = 0; epidx < hw->max_epid ; epidx++) {
++ if (epidx == hw->my_epid)
++ continue;
++ fjes_hw_free_epbuf(&hw->ep_shm_info[epidx].tx);
++ fjes_hw_free_epbuf(&hw->ep_shm_info[epidx].rx);
++ }
++ fjes_hw_free_shared_status_region(hw);
++free_res_buf:
++ kfree(hw->hw_info.res_buf);
++ hw->hw_info.res_buf = NULL;
++free_req_buf:
++ kfree(hw->hw_info.req_buf);
++ hw->hw_info.req_buf = NULL;
++free_ep_info:
++ kfree(hw->ep_shm_info);
++ hw->ep_shm_info = NULL;
++ return result;
+ }
+
+ static void fjes_hw_cleanup(struct fjes_hw *hw)
+--
+2.43.0
+
--- /dev/null
+From 4186d2f90184f83aa949ca818e6eb18bc87b6253 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Wed, 24 Jan 2024 20:15:56 +0100
+Subject: i40e: handle multi-buffer packets that are shrunk by xdp prog
+
+From: Tirthendu Sarkar <tirthendu.sarkar@intel.com>
+
+[ Upstream commit 83014323c642b8faa2d64a5f303b41c019322478 ]
+
+XDP programs can shrink packets by calling the bpf_xdp_adjust_tail()
+helper function. For multi-buffer packets this may lead to reduction of
+frag count stored in skb_shared_info area of the xdp_buff struct. This
+results in issues with the current handling of XDP_PASS and XDP_DROP
+cases.
+
+For XDP_PASS, the skb is currently built using the frag count of the
+xdp_buff from before it was processed by the XDP prog, which results in
+an inconsistent skb when the frag count gets reduced by the XDP prog.
+To fix this, get the correct frag count while building the skb instead
+of using the pre-obtained frag count.
+
+For XDP_DROP, the current page recycling logic will not reuse the page
+but instead will adjust the pagecnt_bias so that the page can be freed.
+This again results in inconsistent behavior, as the page refcnt has
+already been changed by the helper while freeing the frag(s) as part of
+shrinking the packet. To fix this, only adjust pagecnt_bias for buffers
+that are still part of the packet after the XDP prog has run.
+
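+A hedged sketch of the XDP_PASS side of the fix: read the frag count
+from the shared info only after the program has run, since
+bpf_xdp_adjust_tail() may have changed it (names as in the diff below):
+
+  struct skb_shared_info *sinfo;
+  u32 nr_frags = 0;
+
+  if (unlikely(xdp_buff_has_frags(xdp))) {
+          sinfo = xdp_get_shared_info_from_buff(xdp);
+          nr_frags = sinfo->nr_frags;   /* post-XDP-prog value */
+  }
+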
+Fixes: e213ced19bef ("i40e: add support for XDP multi-buffer Rx")
+Reported-by: Maciej Fijalkowski <maciej.fijalkowski@intel.com>
+Signed-off-by: Tirthendu Sarkar <tirthendu.sarkar@intel.com>
+Link: https://lore.kernel.org/r/20240124191602.566724-6-maciej.fijalkowski@intel.com
+Signed-off-by: Alexei Starovoitov <ast@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/net/ethernet/intel/i40e/i40e_txrx.c | 40 ++++++++++++---------
+ 1 file changed, 23 insertions(+), 17 deletions(-)
+
+diff --git a/drivers/net/ethernet/intel/i40e/i40e_txrx.c b/drivers/net/ethernet/intel/i40e/i40e_txrx.c
+index dd410b15000f..35e1bb6fe5e1 100644
+--- a/drivers/net/ethernet/intel/i40e/i40e_txrx.c
++++ b/drivers/net/ethernet/intel/i40e/i40e_txrx.c
+@@ -2099,7 +2099,8 @@ static void i40e_put_rx_buffer(struct i40e_ring *rx_ring,
+ static void i40e_process_rx_buffs(struct i40e_ring *rx_ring, int xdp_res,
+ struct xdp_buff *xdp)
+ {
+- u32 next = rx_ring->next_to_clean;
++ u32 nr_frags = xdp_get_shared_info_from_buff(xdp)->nr_frags;
++ u32 next = rx_ring->next_to_clean, i = 0;
+ struct i40e_rx_buffer *rx_buffer;
+
+ xdp->flags = 0;
+@@ -2112,10 +2113,10 @@ static void i40e_process_rx_buffs(struct i40e_ring *rx_ring, int xdp_res,
+ if (!rx_buffer->page)
+ continue;
+
+- if (xdp_res == I40E_XDP_CONSUMED)
+- rx_buffer->pagecnt_bias++;
+- else
++ if (xdp_res != I40E_XDP_CONSUMED)
+ i40e_rx_buffer_flip(rx_buffer, xdp->frame_sz);
++ else if (i++ <= nr_frags)
++ rx_buffer->pagecnt_bias++;
+
+ /* EOP buffer will be put in i40e_clean_rx_irq() */
+ if (next == rx_ring->next_to_process)
+@@ -2129,20 +2130,20 @@ static void i40e_process_rx_buffs(struct i40e_ring *rx_ring, int xdp_res,
+ * i40e_construct_skb - Allocate skb and populate it
+ * @rx_ring: rx descriptor ring to transact packets on
+ * @xdp: xdp_buff pointing to the data
+- * @nr_frags: number of buffers for the packet
+ *
+ * This function allocates an skb. It then populates it with the page
+ * data from the current receive descriptor, taking care to set up the
+ * skb correctly.
+ */
+ static struct sk_buff *i40e_construct_skb(struct i40e_ring *rx_ring,
+- struct xdp_buff *xdp,
+- u32 nr_frags)
++ struct xdp_buff *xdp)
+ {
+ unsigned int size = xdp->data_end - xdp->data;
+ struct i40e_rx_buffer *rx_buffer;
++ struct skb_shared_info *sinfo;
+ unsigned int headlen;
+ struct sk_buff *skb;
++ u32 nr_frags = 0;
+
+ /* prefetch first cache line of first page */
+ net_prefetch(xdp->data);
+@@ -2180,6 +2181,10 @@ static struct sk_buff *i40e_construct_skb(struct i40e_ring *rx_ring,
+ memcpy(__skb_put(skb, headlen), xdp->data,
+ ALIGN(headlen, sizeof(long)));
+
++ if (unlikely(xdp_buff_has_frags(xdp))) {
++ sinfo = xdp_get_shared_info_from_buff(xdp);
++ nr_frags = sinfo->nr_frags;
++ }
+ rx_buffer = i40e_rx_bi(rx_ring, rx_ring->next_to_clean);
+ /* update all of the pointers */
+ size -= headlen;
+@@ -2199,9 +2204,8 @@ static struct sk_buff *i40e_construct_skb(struct i40e_ring *rx_ring,
+ }
+
+ if (unlikely(xdp_buff_has_frags(xdp))) {
+- struct skb_shared_info *sinfo, *skinfo = skb_shinfo(skb);
++ struct skb_shared_info *skinfo = skb_shinfo(skb);
+
+- sinfo = xdp_get_shared_info_from_buff(xdp);
+ memcpy(&skinfo->frags[skinfo->nr_frags], &sinfo->frags[0],
+ sizeof(skb_frag_t) * nr_frags);
+
+@@ -2224,17 +2228,17 @@ static struct sk_buff *i40e_construct_skb(struct i40e_ring *rx_ring,
+ * i40e_build_skb - Build skb around an existing buffer
+ * @rx_ring: Rx descriptor ring to transact packets on
+ * @xdp: xdp_buff pointing to the data
+- * @nr_frags: number of buffers for the packet
+ *
+ * This function builds an skb around an existing Rx buffer, taking care
+ * to set up the skb correctly and avoid any memcpy overhead.
+ */
+ static struct sk_buff *i40e_build_skb(struct i40e_ring *rx_ring,
+- struct xdp_buff *xdp,
+- u32 nr_frags)
++ struct xdp_buff *xdp)
+ {
+ unsigned int metasize = xdp->data - xdp->data_meta;
++ struct skb_shared_info *sinfo;
+ struct sk_buff *skb;
++ u32 nr_frags;
+
+ /* Prefetch first cache line of first page. If xdp->data_meta
+ * is unused, this points exactly as xdp->data, otherwise we
+@@ -2243,6 +2247,11 @@ static struct sk_buff *i40e_build_skb(struct i40e_ring *rx_ring,
+ */
+ net_prefetch(xdp->data_meta);
+
++ if (unlikely(xdp_buff_has_frags(xdp))) {
++ sinfo = xdp_get_shared_info_from_buff(xdp);
++ nr_frags = sinfo->nr_frags;
++ }
++
+ /* build an skb around the page buffer */
+ skb = napi_build_skb(xdp->data_hard_start, xdp->frame_sz);
+ if (unlikely(!skb))
+@@ -2255,9 +2264,6 @@ static struct sk_buff *i40e_build_skb(struct i40e_ring *rx_ring,
+ skb_metadata_set(skb, metasize);
+
+ if (unlikely(xdp_buff_has_frags(xdp))) {
+- struct skb_shared_info *sinfo;
+-
+- sinfo = xdp_get_shared_info_from_buff(xdp);
+ xdp_update_skb_shared_info(skb, nr_frags,
+ sinfo->xdp_frags_size,
+ nr_frags * xdp->frame_sz,
+@@ -2602,9 +2608,9 @@ static int i40e_clean_rx_irq(struct i40e_ring *rx_ring, int budget,
+ total_rx_bytes += size;
+ } else {
+ if (ring_uses_build_skb(rx_ring))
+- skb = i40e_build_skb(rx_ring, xdp, nfrags);
++ skb = i40e_build_skb(rx_ring, xdp);
+ else
+- skb = i40e_construct_skb(rx_ring, xdp, nfrags);
++ skb = i40e_construct_skb(rx_ring, xdp);
+
+ /* drop if we failed to retrieve a buffer */
+ if (!skb) {
+--
+2.43.0
+
--- /dev/null
+From 8a934672dca82b456b04642db14ad547d932075d Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Wed, 24 Jan 2024 20:16:01 +0100
+Subject: i40e: set xdp_rxq_info::frag_size
+
+From: Maciej Fijalkowski <maciej.fijalkowski@intel.com>
+
+[ Upstream commit a045d2f2d03d23e7db6772dd83e0ba2705dfad93 ]
+
+i40e supports XDP multi-buffer, so it is supposed to use
+__xdp_rxq_info_reg() instead of xdp_rxq_info_reg() and set the
+frag_size. It cannot simply be converted at the existing call site
+because rx_buf_len could be uninitialized, so let us register
+xdp_rxq_info within i40e_configure_rx_ring(), which happens to be
+called with an already initialized rx_buf_len value.
+
+Commit 5180ff1364bc ("i40e: use int for i40e_status") converted 'err' to
+int, so two variables to deal with return codes are not needed within
+i40e_configure_rx_ring(). Remove 'ret' and use 'err' to handle status
+from xdp_rxq_info registration.
+
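+For context, xdp_rxq_info_reg() is a thin wrapper that passes a
+frag_size of 0 (no room to grow multi-buffer frames), while the
+double-underscore variant lets the driver supply the real value;
+roughly:
+
+  /* frag_size implicitly 0: */
+  err = xdp_rxq_info_reg(&ring->xdp_rxq, ring->netdev,
+                         ring->queue_index,
+                         ring->q_vector->napi.napi_id);
+
+  /* frag_size set to the Rx buffer length: */
+  err = __xdp_rxq_info_reg(&ring->xdp_rxq, ring->netdev,
+                           ring->queue_index,
+                           ring->q_vector->napi.napi_id,
+                           ring->rx_buf_len);
+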
+Fixes: e213ced19bef ("i40e: add support for XDP multi-buffer Rx")
+Signed-off-by: Maciej Fijalkowski <maciej.fijalkowski@intel.com>
+Link: https://lore.kernel.org/r/20240124191602.566724-11-maciej.fijalkowski@intel.com
+Signed-off-by: Alexei Starovoitov <ast@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/net/ethernet/intel/i40e/i40e_main.c | 40 ++++++++++++---------
+ drivers/net/ethernet/intel/i40e/i40e_txrx.c | 9 -----
+ 2 files changed, 24 insertions(+), 25 deletions(-)
+
+diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c b/drivers/net/ethernet/intel/i40e/i40e_main.c
+index d5519af34657..f97a63812141 100644
+--- a/drivers/net/ethernet/intel/i40e/i40e_main.c
++++ b/drivers/net/ethernet/intel/i40e/i40e_main.c
+@@ -3588,40 +3588,48 @@ static int i40e_configure_rx_ring(struct i40e_ring *ring)
+ struct i40e_hmc_obj_rxq rx_ctx;
+ int err = 0;
+ bool ok;
+- int ret;
+
+ bitmap_zero(ring->state, __I40E_RING_STATE_NBITS);
+
+ /* clear the context structure first */
+ memset(&rx_ctx, 0, sizeof(rx_ctx));
+
+- if (ring->vsi->type == I40E_VSI_MAIN)
+- xdp_rxq_info_unreg_mem_model(&ring->xdp_rxq);
++ ring->rx_buf_len = vsi->rx_buf_len;
++
++ /* XDP RX-queue info only needed for RX rings exposed to XDP */
++ if (ring->vsi->type != I40E_VSI_MAIN)
++ goto skip;
++
++ if (!xdp_rxq_info_is_reg(&ring->xdp_rxq)) {
++ err = __xdp_rxq_info_reg(&ring->xdp_rxq, ring->netdev,
++ ring->queue_index,
++ ring->q_vector->napi.napi_id,
++ ring->rx_buf_len);
++ if (err)
++ return err;
++ }
+
+ ring->xsk_pool = i40e_xsk_pool(ring);
+ if (ring->xsk_pool) {
+- ring->rx_buf_len =
+- xsk_pool_get_rx_frame_size(ring->xsk_pool);
+- ret = xdp_rxq_info_reg_mem_model(&ring->xdp_rxq,
++ ring->rx_buf_len = xsk_pool_get_rx_frame_size(ring->xsk_pool);
++ err = xdp_rxq_info_reg_mem_model(&ring->xdp_rxq,
+ MEM_TYPE_XSK_BUFF_POOL,
+ NULL);
+- if (ret)
+- return ret;
++ if (err)
++ return err;
+ dev_info(&vsi->back->pdev->dev,
+ "Registered XDP mem model MEM_TYPE_XSK_BUFF_POOL on Rx ring %d\n",
+ ring->queue_index);
+
+ } else {
+- ring->rx_buf_len = vsi->rx_buf_len;
+- if (ring->vsi->type == I40E_VSI_MAIN) {
+- ret = xdp_rxq_info_reg_mem_model(&ring->xdp_rxq,
+- MEM_TYPE_PAGE_SHARED,
+- NULL);
+- if (ret)
+- return ret;
+- }
++ err = xdp_rxq_info_reg_mem_model(&ring->xdp_rxq,
++ MEM_TYPE_PAGE_SHARED,
++ NULL);
++ if (err)
++ return err;
+ }
+
++skip:
+ xdp_init_buff(&ring->xdp, i40e_rx_pg_size(ring) / 2, &ring->xdp_rxq);
+
+ rx_ctx.dbuff = DIV_ROUND_UP(ring->rx_buf_len,
+diff --git a/drivers/net/ethernet/intel/i40e/i40e_txrx.c b/drivers/net/ethernet/intel/i40e/i40e_txrx.c
+index 35e1bb6fe5e1..071ef309a3a4 100644
+--- a/drivers/net/ethernet/intel/i40e/i40e_txrx.c
++++ b/drivers/net/ethernet/intel/i40e/i40e_txrx.c
+@@ -1555,7 +1555,6 @@ void i40e_free_rx_resources(struct i40e_ring *rx_ring)
+ int i40e_setup_rx_descriptors(struct i40e_ring *rx_ring)
+ {
+ struct device *dev = rx_ring->dev;
+- int err;
+
+ u64_stats_init(&rx_ring->syncp);
+
+@@ -1576,14 +1575,6 @@ int i40e_setup_rx_descriptors(struct i40e_ring *rx_ring)
+ rx_ring->next_to_process = 0;
+ rx_ring->next_to_use = 0;
+
+- /* XDP RX-queue info only needed for RX rings exposed to XDP */
+- if (rx_ring->vsi->type == I40E_VSI_MAIN) {
+- err = xdp_rxq_info_reg(&rx_ring->xdp_rxq, rx_ring->netdev,
+- rx_ring->queue_index, rx_ring->q_vector->napi.napi_id);
+- if (err < 0)
+- return err;
+- }
+-
+ rx_ring->xdp_prog = rx_ring->vsi->xdp_prog;
+
+ rx_ring->rx_bi =
+--
+2.43.0
+
--- /dev/null
+From 27105d0dd212b950eacaae0e22bf6cccdf54c566 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Wed, 24 Jan 2024 20:16:02 +0100
+Subject: i40e: update xdp_rxq_info::frag_size for ZC enabled Rx queue
+
+From: Maciej Fijalkowski <maciej.fijalkowski@intel.com>
+
+[ Upstream commit 0cbb08707c932b3f004bc1a8ec6200ef572c1f5f ]
+
+Now that i40e driver correctly sets up frag_size in xdp_rxq_info, let us
+make it work for ZC multi-buffer as well. i40e_ring::rx_buf_len for ZC
+is being set via xsk_pool_get_rx_frame_size() and this needs to be
+propagated up to xdp_rxq_info.
+
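+In isolation, the sequence the patch adds looks like this (a sketch;
+the error handling around it is in the diff below):
+
+  xdp_rxq_info_unreg(&ring->xdp_rxq);
+  ring->rx_buf_len = xsk_pool_get_rx_frame_size(ring->xsk_pool);
+  err = __xdp_rxq_info_reg(&ring->xdp_rxq, ring->netdev,
+                           ring->queue_index,
+                           ring->q_vector->napi.napi_id,
+                           ring->rx_buf_len);
+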
+Fixes: 1c9ba9c14658 ("i40e: xsk: add RX multi-buffer support")
+Acked-by: Magnus Karlsson <magnus.karlsson@intel.com>
+Signed-off-by: Maciej Fijalkowski <maciej.fijalkowski@intel.com>
+Link: https://lore.kernel.org/r/20240124191602.566724-12-maciej.fijalkowski@intel.com
+Signed-off-by: Alexei Starovoitov <ast@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/net/ethernet/intel/i40e/i40e_main.c | 7 +++++++
+ 1 file changed, 7 insertions(+)
+
+diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c b/drivers/net/ethernet/intel/i40e/i40e_main.c
+index f97a63812141..2bd7b29fb251 100644
+--- a/drivers/net/ethernet/intel/i40e/i40e_main.c
++++ b/drivers/net/ethernet/intel/i40e/i40e_main.c
+@@ -3611,7 +3611,14 @@ static int i40e_configure_rx_ring(struct i40e_ring *ring)
+
+ ring->xsk_pool = i40e_xsk_pool(ring);
+ if (ring->xsk_pool) {
++ xdp_rxq_info_unreg(&ring->xdp_rxq);
+ ring->rx_buf_len = xsk_pool_get_rx_frame_size(ring->xsk_pool);
++ err = __xdp_rxq_info_reg(&ring->xdp_rxq, ring->netdev,
++ ring->queue_index,
++ ring->q_vector->napi.napi_id,
++ ring->rx_buf_len);
++ if (err)
++ return err;
+ err = xdp_rxq_info_reg_mem_model(&ring->xdp_rxq,
+ MEM_TYPE_XSK_BUFF_POOL,
+ NULL);
+--
+2.43.0
+
--- /dev/null
+From 6567d90f1860790c7a73d15c47a2d8cfa8de7aae Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Wed, 24 Jan 2024 20:15:57 +0100
+Subject: ice: remove redundant xdp_rxq_info registration
+
+From: Maciej Fijalkowski <maciej.fijalkowski@intel.com>
+
+[ Upstream commit 2ee788c06493d02ee85855414cca39825e768aaf ]
+
+xdp_rxq_info struct can be registered by drivers via two functions -
+xdp_rxq_info_reg() and __xdp_rxq_info_reg(). The latter one allows
+drivers that support XDP multi-buffer to set up xdp_rxq_info::frag_size
+which in turn will make it possible to grow the packet via
+bpf_xdp_adjust_tail() BPF helper.
+
+Currently, ice registers xdp_rxq_info in two spots:
+1) ice_setup_rx_ring() // via xdp_rxq_info_reg(), BUG
+2) ice_vsi_cfg_rxq() // via __xdp_rxq_info_reg(), OK
+
+The commit cited under the Fixes tag took care of setting up frag_size
+and updated the registration scheme in 2), but it did not help, as 1)
+is called before 2) and, as shown above, uses the old registration
+function. This means that 2) sees that xdp_rxq_info is already
+registered and never calls __xdp_rxq_info_reg(), which leaves us with
+xdp_rxq_info::frag_size being set to 0.
+
+To fix this misbehavior, simply remove xdp_rxq_info_reg() call from
+ice_setup_rx_ring().
+
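+A sketch of why the early registration masks the correct one (return
+value checks elided; field names as used elsewhere in the ice driver):
+
+  /* 1) ice_setup_rx_ring(), before this fix: frag_size ends up 0 */
+  xdp_rxq_info_reg(&rx_ring->xdp_rxq, rx_ring->netdev,
+                   rx_ring->q_index, rx_ring->q_vector->napi.napi_id);
+
+  /* 2) ice_vsi_cfg_rxq(): the guard then skips the proper variant */
+  if (!xdp_rxq_info_is_reg(&ring->xdp_rxq))
+          __xdp_rxq_info_reg(&ring->xdp_rxq, ring->netdev,
+                             ring->q_index,
+                             ring->q_vector->napi.napi_id,
+                             ring->rx_buf_len);
+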
+Fixes: 2fba7dc5157b ("ice: Add support for XDP multi-buffer on Rx side")
+Acked-by: Magnus Karlsson <magnus.karlsson@intel.com>
+Signed-off-by: Maciej Fijalkowski <maciej.fijalkowski@intel.com>
+Link: https://lore.kernel.org/r/20240124191602.566724-7-maciej.fijalkowski@intel.com
+Signed-off-by: Alexei Starovoitov <ast@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/net/ethernet/intel/ice/ice_txrx.c | 5 -----
+ 1 file changed, 5 deletions(-)
+
+diff --git a/drivers/net/ethernet/intel/ice/ice_txrx.c b/drivers/net/ethernet/intel/ice/ice_txrx.c
+index 6878448ba112..9170a3e8f088 100644
+--- a/drivers/net/ethernet/intel/ice/ice_txrx.c
++++ b/drivers/net/ethernet/intel/ice/ice_txrx.c
+@@ -513,11 +513,6 @@ int ice_setup_rx_ring(struct ice_rx_ring *rx_ring)
+ if (ice_is_xdp_ena_vsi(rx_ring->vsi))
+ WRITE_ONCE(rx_ring->xdp_prog, rx_ring->vsi->xdp_prog);
+
+- if (rx_ring->vsi->type == ICE_VSI_PF &&
+- !xdp_rxq_info_is_reg(&rx_ring->xdp_rxq))
+- if (xdp_rxq_info_reg(&rx_ring->xdp_rxq, rx_ring->netdev,
+- rx_ring->q_index, rx_ring->q_vector->napi.napi_id))
+- goto err;
+ return 0;
+
+ err:
+--
+2.43.0
+
--- /dev/null
+From ba8440c493d603f075c11edb241c244ce6a007fa Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Wed, 24 Jan 2024 20:15:59 +0100
+Subject: ice: update xdp_rxq_info::frag_size for ZC enabled Rx queue
+
+From: Maciej Fijalkowski <maciej.fijalkowski@intel.com>
+
+[ Upstream commit 3de38c87174225487fc93befeea7d380db80aef6 ]
+
+Now that ice driver correctly sets up frag_size in xdp_rxq_info, let us
+make it work for ZC multi-buffer as well. ice_rx_ring::rx_buf_len for ZC
+is being set via xsk_pool_get_rx_frame_size() and this needs to be
+propagated up to xdp_rxq_info.
+
+Use a bigger hammer: instead of unregistering only xdp_rxq_info's
+memory model, unregister it altogether and register it again, so that
+xdp_rxq_info carries the correct frag_size value.
+
+Fixes: 1bbc04de607b ("ice: xsk: add RX multi-buffer support")
+Signed-off-by: Maciej Fijalkowski <maciej.fijalkowski@intel.com>
+Link: https://lore.kernel.org/r/20240124191602.566724-9-maciej.fijalkowski@intel.com
+Signed-off-by: Alexei Starovoitov <ast@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/net/ethernet/intel/ice/ice_base.c | 37 ++++++++++++++---------
+ 1 file changed, 23 insertions(+), 14 deletions(-)
+
+diff --git a/drivers/net/ethernet/intel/ice/ice_base.c b/drivers/net/ethernet/intel/ice/ice_base.c
+index 7fa43827a3f0..4f3e65b47cdc 100644
+--- a/drivers/net/ethernet/intel/ice/ice_base.c
++++ b/drivers/net/ethernet/intel/ice/ice_base.c
+@@ -534,19 +534,27 @@ int ice_vsi_cfg_rxq(struct ice_rx_ring *ring)
+ ring->rx_buf_len = ring->vsi->rx_buf_len;
+
+ if (ring->vsi->type == ICE_VSI_PF) {
+- if (!xdp_rxq_info_is_reg(&ring->xdp_rxq))
+- /* coverity[check_return] */
+- __xdp_rxq_info_reg(&ring->xdp_rxq, ring->netdev,
+- ring->q_index,
+- ring->q_vector->napi.napi_id,
+- ring->vsi->rx_buf_len);
++ if (!xdp_rxq_info_is_reg(&ring->xdp_rxq)) {
++ err = __xdp_rxq_info_reg(&ring->xdp_rxq, ring->netdev,
++ ring->q_index,
++ ring->q_vector->napi.napi_id,
++ ring->rx_buf_len);
++ if (err)
++ return err;
++ }
+
+ ring->xsk_pool = ice_xsk_pool(ring);
+ if (ring->xsk_pool) {
+- xdp_rxq_info_unreg_mem_model(&ring->xdp_rxq);
++ xdp_rxq_info_unreg(&ring->xdp_rxq);
+
+ ring->rx_buf_len =
+ xsk_pool_get_rx_frame_size(ring->xsk_pool);
++ err = __xdp_rxq_info_reg(&ring->xdp_rxq, ring->netdev,
++ ring->q_index,
++ ring->q_vector->napi.napi_id,
++ ring->rx_buf_len);
++ if (err)
++ return err;
+ err = xdp_rxq_info_reg_mem_model(&ring->xdp_rxq,
+ MEM_TYPE_XSK_BUFF_POOL,
+ NULL);
+@@ -557,13 +565,14 @@ int ice_vsi_cfg_rxq(struct ice_rx_ring *ring)
+ dev_info(dev, "Registered XDP mem model MEM_TYPE_XSK_BUFF_POOL on Rx ring %d\n",
+ ring->q_index);
+ } else {
+- if (!xdp_rxq_info_is_reg(&ring->xdp_rxq))
+- /* coverity[check_return] */
+- __xdp_rxq_info_reg(&ring->xdp_rxq,
+- ring->netdev,
+- ring->q_index,
+- ring->q_vector->napi.napi_id,
+- ring->vsi->rx_buf_len);
++ if (!xdp_rxq_info_is_reg(&ring->xdp_rxq)) {
++ err = __xdp_rxq_info_reg(&ring->xdp_rxq, ring->netdev,
++ ring->q_index,
++ ring->q_vector->napi.napi_id,
++ ring->rx_buf_len);
++ if (err)
++ return err;
++ }
+
+ err = xdp_rxq_info_reg_mem_model(&ring->xdp_rxq,
+ MEM_TYPE_PAGE_SHARED,
+--
+2.43.0
+
--- /dev/null
+From 62da34a963fc7911cae3aa180d1e61801a97258d Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Wed, 24 Jan 2024 20:15:55 +0100
+Subject: ice: work on pre-XDP prog frag count
+
+From: Maciej Fijalkowski <maciej.fijalkowski@intel.com>
+
+[ Upstream commit ad2047cf5d9313200e308612aed516548873d124 ]
+
+Fix an OOM panic in XDP_DRV mode when a XDP program shrinks a
+multi-buffer packet by 4k bytes and then redirects it to an AF_XDP
+socket.
+
+Since support for handling multi-buffer frames was added to XDP, use of
+the bpf_xdp_adjust_tail() helper within an XDP program can free the
+page that a given fragment occupies and in turn decrease the fragment
+count within the skb_shared_info that is embedded in the xdp_buff
+struct. In the current ice driver codebase, this can become problematic
+when the page recycling logic decides not to reuse the page. In such a
+case, __page_frag_cache_drain() is used with an
+ice_rx_buf::pagecnt_bias that was not adjusted after the page's
+refcount was changed by the XDP prog, which in turn does not drain the
+refcount to 0, and the page is never freed.
+
+To address this, let us store the count of frags from before the XDP
+program was executed on the Rx ring struct. This will be compared with
+the current frag count from the skb_shared_info embedded in the
+xdp_buff. A smaller value in the latter indicates that the XDP prog
+freed frag(s). Then, for the given delta, decrement pagecnt_bias for
+the XDP_DROP verdict.
+
+While at it, let us also handle the EOP frag within
+ice_set_rx_bufs_act() to make our life easier, so that all of the
+adjustments that need to be applied against freed frags are performed
+in a single place.
+
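+A condensed sketch of the accounting idea (the helper name is
+hypothetical; the real adjustment lives in ice_set_rx_bufs_act() in
+the diff below):
+
+  u32 before = rx_ring->nr_frags;  /* saved before the prog ran */
+  u32 after = xdp_get_shared_info_from_buff(xdp)->nr_frags;
+
+  if (verdict == ICE_XDP_CONSUMED && after < before)
+          /* walk (before - after) buffers backwards from the EOP one
+           * and decrement pagecnt_bias on buffers whose pages the
+           * program already released
+           */
+          undo_bias_for_freed_frags(rx_ring, before - after);
+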
+Fixes: 2fba7dc5157b ("ice: Add support for XDP multi-buffer on Rx side")
+Acked-by: Magnus Karlsson <magnus.karlsson@intel.com>
+Signed-off-by: Maciej Fijalkowski <maciej.fijalkowski@intel.com>
+Link: https://lore.kernel.org/r/20240124191602.566724-5-maciej.fijalkowski@intel.com
+Signed-off-by: Alexei Starovoitov <ast@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/net/ethernet/intel/ice/ice_txrx.c | 14 ++++++---
+ drivers/net/ethernet/intel/ice/ice_txrx.h | 1 +
+ drivers/net/ethernet/intel/ice/ice_txrx_lib.h | 31 +++++++++++++------
+ 3 files changed, 32 insertions(+), 14 deletions(-)
+
+diff --git a/drivers/net/ethernet/intel/ice/ice_txrx.c b/drivers/net/ethernet/intel/ice/ice_txrx.c
+index 9e97ea863068..6878448ba112 100644
+--- a/drivers/net/ethernet/intel/ice/ice_txrx.c
++++ b/drivers/net/ethernet/intel/ice/ice_txrx.c
+@@ -600,9 +600,7 @@ ice_run_xdp(struct ice_rx_ring *rx_ring, struct xdp_buff *xdp,
+ ret = ICE_XDP_CONSUMED;
+ }
+ exit:
+- rx_buf->act = ret;
+- if (unlikely(xdp_buff_has_frags(xdp)))
+- ice_set_rx_bufs_act(xdp, rx_ring, ret);
++ ice_set_rx_bufs_act(xdp, rx_ring, ret);
+ }
+
+ /**
+@@ -890,14 +888,17 @@ ice_add_xdp_frag(struct ice_rx_ring *rx_ring, struct xdp_buff *xdp,
+ }
+
+ if (unlikely(sinfo->nr_frags == MAX_SKB_FRAGS)) {
+- if (unlikely(xdp_buff_has_frags(xdp)))
+- ice_set_rx_bufs_act(xdp, rx_ring, ICE_XDP_CONSUMED);
++ ice_set_rx_bufs_act(xdp, rx_ring, ICE_XDP_CONSUMED);
+ return -ENOMEM;
+ }
+
+ __skb_fill_page_desc_noacc(sinfo, sinfo->nr_frags++, rx_buf->page,
+ rx_buf->page_offset, size);
+ sinfo->xdp_frags_size += size;
++ /* remember frag count before XDP prog execution; bpf_xdp_adjust_tail()
++ * can pop off frags but driver has to handle it on its own
++ */
++ rx_ring->nr_frags = sinfo->nr_frags;
+
+ if (page_is_pfmemalloc(rx_buf->page))
+ xdp_buff_set_frag_pfmemalloc(xdp);
+@@ -1249,6 +1250,7 @@ int ice_clean_rx_irq(struct ice_rx_ring *rx_ring, int budget)
+
+ xdp->data = NULL;
+ rx_ring->first_desc = ntc;
++ rx_ring->nr_frags = 0;
+ continue;
+ construct_skb:
+ if (likely(ice_ring_uses_build_skb(rx_ring)))
+@@ -1264,10 +1266,12 @@ int ice_clean_rx_irq(struct ice_rx_ring *rx_ring, int budget)
+ ICE_XDP_CONSUMED);
+ xdp->data = NULL;
+ rx_ring->first_desc = ntc;
++ rx_ring->nr_frags = 0;
+ break;
+ }
+ xdp->data = NULL;
+ rx_ring->first_desc = ntc;
++ rx_ring->nr_frags = 0;
+
+ stat_err_bits = BIT(ICE_RX_FLEX_DESC_STATUS0_RXE_S);
+ if (unlikely(ice_test_staterr(rx_desc->wb.status_error0,
+diff --git a/drivers/net/ethernet/intel/ice/ice_txrx.h b/drivers/net/ethernet/intel/ice/ice_txrx.h
+index daf7b9dbb143..b28b9826bbcd 100644
+--- a/drivers/net/ethernet/intel/ice/ice_txrx.h
++++ b/drivers/net/ethernet/intel/ice/ice_txrx.h
+@@ -333,6 +333,7 @@ struct ice_rx_ring {
+ struct ice_channel *ch;
+ struct ice_tx_ring *xdp_ring;
+ struct xsk_buff_pool *xsk_pool;
++ u32 nr_frags;
+ dma_addr_t dma; /* physical address of ring */
+ u64 cached_phctime;
+ u16 rx_buf_len;
+diff --git a/drivers/net/ethernet/intel/ice/ice_txrx_lib.h b/drivers/net/ethernet/intel/ice/ice_txrx_lib.h
+index 115969ecdf7b..b0e56675f98b 100644
+--- a/drivers/net/ethernet/intel/ice/ice_txrx_lib.h
++++ b/drivers/net/ethernet/intel/ice/ice_txrx_lib.h
+@@ -12,26 +12,39 @@
+ * act: action to store onto Rx buffers related to XDP buffer parts
+ *
+ * Set action that should be taken before putting Rx buffer from first frag
+- * to one before last. Last one is handled by caller of this function as it
+- * is the EOP frag that is currently being processed. This function is
+- * supposed to be called only when XDP buffer contains frags.
++ * to the last.
+ */
+ static inline void
+ ice_set_rx_bufs_act(struct xdp_buff *xdp, const struct ice_rx_ring *rx_ring,
+ const unsigned int act)
+ {
+- const struct skb_shared_info *sinfo = xdp_get_shared_info_from_buff(xdp);
+- u32 first = rx_ring->first_desc;
+- u32 nr_frags = sinfo->nr_frags;
++ u32 sinfo_frags = xdp_get_shared_info_from_buff(xdp)->nr_frags;
++ u32 nr_frags = rx_ring->nr_frags + 1;
++ u32 idx = rx_ring->first_desc;
+ u32 cnt = rx_ring->count;
+ struct ice_rx_buf *buf;
+
+ for (int i = 0; i < nr_frags; i++) {
+- buf = &rx_ring->rx_buf[first];
++ buf = &rx_ring->rx_buf[idx];
+ buf->act = act;
+
+- if (++first == cnt)
+- first = 0;
++ if (++idx == cnt)
++ idx = 0;
++ }
++
++ /* adjust pagecnt_bias on frags freed by XDP prog */
++ if (sinfo_frags < rx_ring->nr_frags && act == ICE_XDP_CONSUMED) {
++ u32 delta = rx_ring->nr_frags - sinfo_frags;
++
++ while (delta) {
++ if (idx == 0)
++ idx = cnt - 1;
++ else
++ idx--;
++ buf = &rx_ring->rx_buf[idx];
++ buf->pagecnt_bias--;
++ delta--;
++ }
+ }
+ }
+
+--
+2.43.0
+
--- /dev/null
+From 5d5966086cb8bd78aab1a1b25b336edea51fe324 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Thu, 18 Jan 2024 21:50:40 +0100
+Subject: idpf: distinguish vports by the dev_port attribute
+
+From: Michal Schmidt <mschmidt@redhat.com>
+
+[ Upstream commit 359724fa3ab79fbe9f42c6263cddc2afae32eef3 ]
+
+idpf registers multiple netdevs (virtual ports) for one PCI function,
+but it does not provide a way for userspace to distinguish them with
+sysfs attributes. Per Documentation/ABI/testing/sysfs-class-net, it is
+a bug not to set dev_port for independent ports on the same PCI bus,
+device and function.
+
+Without dev_port set, systemd-udevd's default naming policy attempts
+to assign the same name ("ens2f0") to all four idpf netdevs on my test
+system and obviously fails, leaving three of them with the initial
+eth<N> name.
+
+With this patch, systemd-udevd is able to assign unique names to the
+netdevs (e.g. "ens2f0", "ens2f0d1", "ens2f0d2", "ens2f0d3").
+
+The Intel-provided out-of-tree idpf driver already sets dev_port. In
+this patch I chose to do it in the same place in the idpf_cfg_netdev
+function.
+
+Fixes: 0fe45467a104 ("idpf: add create vport and netdev configuration")
+Signed-off-by: Michal Schmidt <mschmidt@redhat.com>
+Reviewed-by: Jesse Brandeburg <jesse.brandeburg@intel.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/net/ethernet/intel/idpf/idpf_lib.c | 2 ++
+ 1 file changed, 2 insertions(+)
+
+diff --git a/drivers/net/ethernet/intel/idpf/idpf_lib.c b/drivers/net/ethernet/intel/idpf/idpf_lib.c
+index 19809b0ddcd9..0241e498cc20 100644
+--- a/drivers/net/ethernet/intel/idpf/idpf_lib.c
++++ b/drivers/net/ethernet/intel/idpf/idpf_lib.c
+@@ -783,6 +783,8 @@ static int idpf_cfg_netdev(struct idpf_vport *vport)
+ /* setup watchdog timeout value to be 5 second */
+ netdev->watchdog_timeo = 5 * HZ;
+
++ netdev->dev_port = idx;
++
+ /* configure default MTU size */
+ netdev->min_mtu = ETH_MIN_MTU;
+ netdev->max_mtu = vport->max_mtu;
+--
+2.43.0
+
--- /dev/null
+From 0166c869022f93a949db8088d9cffb95e3db16bf Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Wed, 24 Jan 2024 20:15:58 +0100
+Subject: intel: xsk: initialize skb_frag_t::bv_offset in ZC drivers
+
+From: Maciej Fijalkowski <maciej.fijalkowski@intel.com>
+
+[ Upstream commit 290779905d09d5fdf6caa4f58ddefc3f4db0c0a9 ]
+
+Ice and i40e ZC drivers currently set the offset of a frag within
+skb_shared_info to 0, which is incorrect. xdp_buffs that come from
+xsk_buff_pool always have 256 bytes of headroom, which needs to be
+taken into account to retrieve xdp_buff::data via skb_frag_address().
+Otherwise, bpf_xdp_frags_increase_tail() would start its job from
+xdp_buff::data_hard_start, which would result in overwriting existing
+payload.
+
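+A sketch of the address relation the fix restores (assuming the usual
+xsk layout, where the payload starts XDP_PACKET_HEADROOM bytes past
+the hard start):
+
+  struct page *pg = virt_to_page(xdp->data_hard_start);
+
+  /* offset 0 made skb_frag_address() point at data_hard_start;
+   * XDP_PACKET_HEADROOM (256) makes it point at the payload instead
+   */
+  __skb_fill_page_desc_noacc(sinfo, sinfo->nr_frags++, pg,
+                             XDP_PACKET_HEADROOM, size);
+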
+Fixes: 1c9ba9c14658 ("i40e: xsk: add RX multi-buffer support")
+Fixes: 1bbc04de607b ("ice: xsk: add RX multi-buffer support")
+Acked-by: Magnus Karlsson <magnus.karlsson@intel.com>
+Signed-off-by: Maciej Fijalkowski <maciej.fijalkowski@intel.com>
+Link: https://lore.kernel.org/r/20240124191602.566724-8-maciej.fijalkowski@intel.com
+Signed-off-by: Alexei Starovoitov <ast@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/net/ethernet/intel/i40e/i40e_xsk.c | 3 ++-
+ drivers/net/ethernet/intel/ice/ice_xsk.c | 3 ++-
+ 2 files changed, 4 insertions(+), 2 deletions(-)
+
+diff --git a/drivers/net/ethernet/intel/i40e/i40e_xsk.c b/drivers/net/ethernet/intel/i40e/i40e_xsk.c
+index fede0bb3e047..65f38a57b3df 100644
+--- a/drivers/net/ethernet/intel/i40e/i40e_xsk.c
++++ b/drivers/net/ethernet/intel/i40e/i40e_xsk.c
+@@ -414,7 +414,8 @@ i40e_add_xsk_frag(struct i40e_ring *rx_ring, struct xdp_buff *first,
+ }
+
+ __skb_fill_page_desc_noacc(sinfo, sinfo->nr_frags++,
+- virt_to_page(xdp->data_hard_start), 0, size);
++ virt_to_page(xdp->data_hard_start),
++ XDP_PACKET_HEADROOM, size);
+ sinfo->xdp_frags_size += size;
+ xsk_buff_add_frag(xdp);
+
+diff --git a/drivers/net/ethernet/intel/ice/ice_xsk.c b/drivers/net/ethernet/intel/ice/ice_xsk.c
+index 951f84bfdf2b..f3663b3f6390 100644
+--- a/drivers/net/ethernet/intel/ice/ice_xsk.c
++++ b/drivers/net/ethernet/intel/ice/ice_xsk.c
+@@ -820,7 +820,8 @@ ice_add_xsk_frag(struct ice_rx_ring *rx_ring, struct xdp_buff *first,
+ }
+
+ __skb_fill_page_desc_noacc(sinfo, sinfo->nr_frags++,
+- virt_to_page(xdp->data_hard_start), 0, size);
++ virt_to_page(xdp->data_hard_start),
++ XDP_PACKET_HEADROOM, size);
+ sinfo->xdp_frags_size += size;
+ xsk_buff_add_frag(xdp);
+
+--
+2.43.0
+
--- /dev/null
+From eae4daf6f79b22f71f6a35ab4233136066e16449 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Mon, 22 Jan 2024 18:20:01 +0800
+Subject: ipv6: init the accept_queue's spinlocks in inet6_create
+
+From: Zhengchao Shao <shaozhengchao@huawei.com>
+
+[ Upstream commit 435e202d645c197dcfd39d7372eb2a56529b6640 ]
+
+In commit 198bc90e0e73 ("tcp: make sure init the accept_queue's
+spinlocks once"), the spinlocks of the accept_queue are initialized only
+when a socket is created in the inet4 scenario. They are not initialized
+when a socket is created in the inet6 scenario. The kernel reports the
+following error:
+INFO: trying to register non-static key.
+The code is fine but needs lockdep annotation, or maybe
+you didn't initialize this object before use?
+turning off the locking correctness validator.
+Hardware name: Red Hat KVM, BIOS 0.5.1 01/01/2011
+Call Trace:
+<TASK>
+ dump_stack_lvl (lib/dump_stack.c:107)
+ register_lock_class (kernel/locking/lockdep.c:1289)
+ __lock_acquire (kernel/locking/lockdep.c:5015)
+ lock_acquire.part.0 (kernel/locking/lockdep.c:5756)
+ _raw_spin_lock_bh (kernel/locking/spinlock.c:178)
+ inet_csk_listen_stop (net/ipv4/inet_connection_sock.c:1386)
+ tcp_disconnect (net/ipv4/tcp.c:2981)
+ inet_shutdown (net/ipv4/af_inet.c:935)
+ __sys_shutdown (./include/linux/file.h:32 net/socket.c:2438)
+ __x64_sys_shutdown (net/socket.c:2445)
+ do_syscall_64 (arch/x86/entry/common.c:52)
+ entry_SYSCALL_64_after_hwframe (arch/x86/entry/entry_64.S:129)
+RIP: 0033:0x7f52ecd05a3d
+Code: 5b 41 5c c3 66 0f 1f 84 00 00 00 00 00 f3 0f 1e fa 48 89 f8 48 89 f7
+48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff
+ff 73 01 c3 48 8b 0d ab a3 0e 00 f7 d8 64 89 01 48
+RSP: 002b:00007f52ecf5dde8 EFLAGS: 00000293 ORIG_RAX: 0000000000000030
+RAX: ffffffffffffffda RBX: 00007f52ecf5e640 RCX: 00007f52ecd05a3d
+RDX: 00007f52ecc8b188 RSI: 0000000000000000 RDI: 0000000000000004
+RBP: 00007f52ecf5de20 R08: 00007ffdae45c69f R09: 0000000000000000
+R10: 0000000000000000 R11: 0000000000000293 R12: 00007f52ecf5e640
+R13: 0000000000000000 R14: 00007f52ecc8b060 R15: 00007ffdae45c6e0
+
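+For reference, the helper being wired up here (added by the commit in
+the Fixes tag) boils down to initializing the two accept-queue
+spinlocks; roughly:
+
+  static inline void inet_init_csk_locks(struct sock *sk)
+  {
+          struct inet_connection_sock *icsk = inet_csk(sk);
+
+          spin_lock_init(&icsk->icsk_accept_queue.rskq_lock);
+          spin_lock_init(&icsk->icsk_accept_queue.fastopenq.lock);
+  }
+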
+Fixes: 198bc90e0e73 ("tcp: make sure init the accept_queue's spinlocks once")
+Signed-off-by: Zhengchao Shao <shaozhengchao@huawei.com>
+Reviewed-by: Eric Dumazet <edumazet@google.com>
+Link: https://lore.kernel.org/r/20240122102001.2851701-1-shaozhengchao@huawei.com
+Signed-off-by: Paolo Abeni <pabeni@redhat.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ net/ipv6/af_inet6.c | 3 +++
+ 1 file changed, 3 insertions(+)
+
+diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c
+index 13a1833a4df5..959bfd9f6344 100644
+--- a/net/ipv6/af_inet6.c
++++ b/net/ipv6/af_inet6.c
+@@ -199,6 +199,9 @@ static int inet6_create(struct net *net, struct socket *sock, int protocol,
+ if (INET_PROTOSW_REUSE & answer_flags)
+ sk->sk_reuse = SK_CAN_REUSE;
+
++ if (INET_PROTOSW_ICSK & answer_flags)
++ inet_init_csk_locks(sk);
++
+ inet = inet_sk(sk);
+ inet_assign_bit(IS_ICSK, sk, INET_PROTOSW_ICSK & answer_flags);
+
+--
+2.43.0
+
--- /dev/null
+From 3c2fb71fcd92c98c689bdd8a2a2c278559759d4d Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Thu, 18 Jan 2024 17:55:15 -0800
+Subject: llc: Drop support for ETH_P_TR_802_2.
+
+From: Kuniyuki Iwashima <kuniyu@amazon.com>
+
+[ Upstream commit e3f9bed9bee261e3347131764e42aeedf1ffea61 ]
+
+syzbot reported an uninit-value bug below. [0]
+
+llc supports ETH_P_802_2 (0x0004) and used to support ETH_P_TR_802_2
+(0x0011), and syzbot abused the latter to trigger the bug.
+
+ write$tun(r0, &(0x7f0000000040)={@val={0x0, 0x11}, @val, @mpls={[], @llc={@snap={0xaa, 0x1, ')', "90e5dd"}}}}, 0x16)
+
+llc_conn_handler() initialises local variables {saddr,daddr}.mac
+based on skb in llc_pdu_decode_sa()/llc_pdu_decode_da() and passes
+them to __llc_lookup().
+
+However, the initialisation is done only when skb->protocol is
+htons(ETH_P_802_2); otherwise, __llc_lookup_established() and
+__llc_lookup_listener() will read garbage.
+
+The missing initialisation existed prior to commit 211ed865108e
+("net: delete all instances of special processing for token ring").
+
+It removed the part to kick out the token ring stuff but forgot to
+close the door allowing ETH_P_TR_802_2 packets to sneak into llc_rcv().
+
+Let's remove llc_tr_packet_type and complete the deprecation.
+
+[0]:
+BUG: KMSAN: uninit-value in __llc_lookup_established+0xe9d/0xf90
+ __llc_lookup_established+0xe9d/0xf90
+ __llc_lookup net/llc/llc_conn.c:611 [inline]
+ llc_conn_handler+0x4bd/0x1360 net/llc/llc_conn.c:791
+ llc_rcv+0xfbb/0x14a0 net/llc/llc_input.c:206
+ __netif_receive_skb_one_core net/core/dev.c:5527 [inline]
+ __netif_receive_skb+0x1a6/0x5a0 net/core/dev.c:5641
+ netif_receive_skb_internal net/core/dev.c:5727 [inline]
+ netif_receive_skb+0x58/0x660 net/core/dev.c:5786
+ tun_rx_batched+0x3ee/0x980 drivers/net/tun.c:1555
+ tun_get_user+0x53af/0x66d0 drivers/net/tun.c:2002
+ tun_chr_write_iter+0x3af/0x5d0 drivers/net/tun.c:2048
+ call_write_iter include/linux/fs.h:2020 [inline]
+ new_sync_write fs/read_write.c:491 [inline]
+ vfs_write+0x8ef/0x1490 fs/read_write.c:584
+ ksys_write+0x20f/0x4c0 fs/read_write.c:637
+ __do_sys_write fs/read_write.c:649 [inline]
+ __se_sys_write fs/read_write.c:646 [inline]
+ __x64_sys_write+0x93/0xd0 fs/read_write.c:646
+ do_syscall_x64 arch/x86/entry/common.c:51 [inline]
+ do_syscall_64+0x44/0x110 arch/x86/entry/common.c:82
+ entry_SYSCALL_64_after_hwframe+0x63/0x6b
+
+Local variable daddr created at:
+ llc_conn_handler+0x53/0x1360 net/llc/llc_conn.c:783
+ llc_rcv+0xfbb/0x14a0 net/llc/llc_input.c:206
+
+CPU: 1 PID: 5004 Comm: syz-executor994 Not tainted 6.6.0-syzkaller-14500-g1c41041124bd #0
+Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 10/09/2023
+
+Fixes: 211ed865108e ("net: delete all instances of special processing for token ring")
+Reported-by: syzbot+b5ad66046b913bc04c6f@syzkaller.appspotmail.com
+Closes: https://syzkaller.appspot.com/bug?extid=b5ad66046b913bc04c6f
+Signed-off-by: Kuniyuki Iwashima <kuniyu@amazon.com>
+Reviewed-by: Eric Dumazet <edumazet@google.com>
+Link: https://lore.kernel.org/r/20240119015515.61898-1-kuniyu@amazon.com
+Signed-off-by: Jakub Kicinski <kuba@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ include/net/llc_pdu.h | 6 ++----
+ net/llc/llc_core.c | 7 -------
+ 2 files changed, 2 insertions(+), 11 deletions(-)
+
+diff --git a/include/net/llc_pdu.h b/include/net/llc_pdu.h
+index 7e73f8e5e497..1d55ba7c45be 100644
+--- a/include/net/llc_pdu.h
++++ b/include/net/llc_pdu.h
+@@ -262,8 +262,7 @@ static inline void llc_pdu_header_init(struct sk_buff *skb, u8 type,
+ */
+ static inline void llc_pdu_decode_sa(struct sk_buff *skb, u8 *sa)
+ {
+- if (skb->protocol == htons(ETH_P_802_2))
+- memcpy(sa, eth_hdr(skb)->h_source, ETH_ALEN);
++ memcpy(sa, eth_hdr(skb)->h_source, ETH_ALEN);
+ }
+
+ /**
+@@ -275,8 +274,7 @@ static inline void llc_pdu_decode_sa(struct sk_buff *skb, u8 *sa)
+ */
+ static inline void llc_pdu_decode_da(struct sk_buff *skb, u8 *da)
+ {
+- if (skb->protocol == htons(ETH_P_802_2))
+- memcpy(da, eth_hdr(skb)->h_dest, ETH_ALEN);
++ memcpy(da, eth_hdr(skb)->h_dest, ETH_ALEN);
+ }
+
+ /**
+diff --git a/net/llc/llc_core.c b/net/llc/llc_core.c
+index 6e387aadffce..4f16d9c88350 100644
+--- a/net/llc/llc_core.c
++++ b/net/llc/llc_core.c
+@@ -135,22 +135,15 @@ static struct packet_type llc_packet_type __read_mostly = {
+ .func = llc_rcv,
+ };
+
+-static struct packet_type llc_tr_packet_type __read_mostly = {
+- .type = cpu_to_be16(ETH_P_TR_802_2),
+- .func = llc_rcv,
+-};
+-
+ static int __init llc_init(void)
+ {
+ dev_add_pack(&llc_packet_type);
+- dev_add_pack(&llc_tr_packet_type);
+ return 0;
+ }
+
+ static void __exit llc_exit(void)
+ {
+ dev_remove_pack(&llc_packet_type);
+- dev_remove_pack(&llc_tr_packet_type);
+ }
+
+ module_init(llc_init);
+--
+2.43.0
+
--- /dev/null
+From 4655ed34031dacfc8ec060c94def23c23f158ea5 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Thu, 18 Jan 2024 18:36:25 +0000
+Subject: llc: make llc_ui_sendmsg() more robust against bonding changes
+
+From: Eric Dumazet <edumazet@google.com>
+
+[ Upstream commit dad555c816a50c6a6a8a86be1f9177673918c647 ]
+
+syzbot was able to trick llc_ui_sendmsg() into allocating an skb with
+no headroom, but subsequently trying to push 14 bytes of Ethernet
+header [1].
+
+Like some others, llc_ui_sendmsg() releases the socket lock before
+calling sock_alloc_send_skb().
+Then it acquires the lock again, but does not redo all the sanity
+checks that were performed earlier.
+
+This fix:
+
+- Uses LL_RESERVED_SPACE() to reserve space.
+- Check all conditions again after socket lock is held again.
+- Do not account Ethernet header for mtu limitation.
+
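+A minimal sketch of the snapshot-and-recheck pattern the three points
+above amount to (simplified from the diff below):
+
+  struct net_device *dev = llc->dev;       /* snapshot under lock */
+  int hh_len = LL_RESERVED_SPACE(dev);
+
+  release_sock(sk);
+  skb = sock_alloc_send_skb(sk, hh_len + size, noblock, &rc);
+  lock_sock(sk);
+  if (!skb)
+          goto out;
+  /* bonding may have swapped the underlying device meanwhile */
+  if (llc->dev != dev || hh_len != LL_RESERVED_SPACE(dev))
+          goto out;
+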
+[1]
+
+skbuff: skb_under_panic: text:ffff800088baa334 len:1514 put:14 head:ffff0000c9c37000 data:ffff0000c9c36ff2 tail:0x5dc end:0x6c0 dev:bond0
+
+ kernel BUG at net/core/skbuff.c:193 !
+Internal error: Oops - BUG: 00000000f2000800 [#1] PREEMPT SMP
+Modules linked in:
+CPU: 0 PID: 6875 Comm: syz-executor.0 Not tainted 6.7.0-rc8-syzkaller-00101-g0802e17d9aca-dirty #0
+Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 11/17/2023
+pstate: 60400005 (nZCv daif +PAN -UAO -TCO -DIT -SSBS BTYPE=--)
+ pc : skb_panic net/core/skbuff.c:189 [inline]
+ pc : skb_under_panic+0x13c/0x140 net/core/skbuff.c:203
+ lr : skb_panic net/core/skbuff.c:189 [inline]
+ lr : skb_under_panic+0x13c/0x140 net/core/skbuff.c:203
+sp : ffff800096f97000
+x29: ffff800096f97010 x28: ffff80008cc8d668 x27: dfff800000000000
+x26: ffff0000cb970c90 x25: 00000000000005dc x24: ffff0000c9c36ff2
+x23: ffff0000c9c37000 x22: 00000000000005ea x21: 00000000000006c0
+x20: 000000000000000e x19: ffff800088baa334 x18: 1fffe000368261ce
+x17: ffff80008e4ed000 x16: ffff80008a8310f8 x15: 0000000000000001
+x14: 1ffff00012df2d58 x13: 0000000000000000 x12: 0000000000000000
+x11: 0000000000000001 x10: 0000000000ff0100 x9 : e28a51f1087e8400
+x8 : e28a51f1087e8400 x7 : ffff80008028f8d0 x6 : 0000000000000000
+x5 : 0000000000000001 x4 : 0000000000000001 x3 : ffff800082b78714
+x2 : 0000000000000001 x1 : 0000000100000000 x0 : 0000000000000089
+Call trace:
+ skb_panic net/core/skbuff.c:189 [inline]
+ skb_under_panic+0x13c/0x140 net/core/skbuff.c:203
+ skb_push+0xf0/0x108 net/core/skbuff.c:2451
+ eth_header+0x44/0x1f8 net/ethernet/eth.c:83
+ dev_hard_header include/linux/netdevice.h:3188 [inline]
+ llc_mac_hdr_init+0x110/0x17c net/llc/llc_output.c:33
+ llc_sap_action_send_xid_c+0x170/0x344 net/llc/llc_s_ac.c:85
+ llc_exec_sap_trans_actions net/llc/llc_sap.c:153 [inline]
+ llc_sap_next_state net/llc/llc_sap.c:182 [inline]
+ llc_sap_state_process+0x1ec/0x774 net/llc/llc_sap.c:209
+ llc_build_and_send_xid_pkt+0x12c/0x1c0 net/llc/llc_sap.c:270
+ llc_ui_sendmsg+0x7bc/0xb1c net/llc/af_llc.c:997
+ sock_sendmsg_nosec net/socket.c:730 [inline]
+ __sock_sendmsg net/socket.c:745 [inline]
+ sock_sendmsg+0x194/0x274 net/socket.c:767
+ splice_to_socket+0x7cc/0xd58 fs/splice.c:881
+ do_splice_from fs/splice.c:933 [inline]
+ direct_splice_actor+0xe4/0x1c0 fs/splice.c:1142
+ splice_direct_to_actor+0x2a0/0x7e4 fs/splice.c:1088
+ do_splice_direct+0x20c/0x348 fs/splice.c:1194
+ do_sendfile+0x4bc/0xc70 fs/read_write.c:1254
+ __do_sys_sendfile64 fs/read_write.c:1322 [inline]
+ __se_sys_sendfile64 fs/read_write.c:1308 [inline]
+ __arm64_sys_sendfile64+0x160/0x3b4 fs/read_write.c:1308
+ __invoke_syscall arch/arm64/kernel/syscall.c:37 [inline]
+ invoke_syscall+0x98/0x2b8 arch/arm64/kernel/syscall.c:51
+ el0_svc_common+0x130/0x23c arch/arm64/kernel/syscall.c:136
+ do_el0_svc+0x48/0x58 arch/arm64/kernel/syscall.c:155
+ el0_svc+0x54/0x158 arch/arm64/kernel/entry-common.c:678
+ el0t_64_sync_handler+0x84/0xfc arch/arm64/kernel/entry-common.c:696
+ el0t_64_sync+0x190/0x194 arch/arm64/kernel/entry.S:595
+Code: aa1803e6 aa1903e7 a90023f5 94792f6a (d4210000)
+
+Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2")
+Reported-and-tested-by: syzbot+2a7024e9502df538e8ef@syzkaller.appspotmail.com
+Signed-off-by: Eric Dumazet <edumazet@google.com>
+Reviewed-by: Kuniyuki Iwashima <kuniyu@amazon.com>
+Link: https://lore.kernel.org/r/20240118183625.4007013-1-edumazet@google.com
+Signed-off-by: Jakub Kicinski <kuba@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ net/llc/af_llc.c | 24 ++++++++++++++++--------
+ 1 file changed, 16 insertions(+), 8 deletions(-)
+
+diff --git a/net/llc/af_llc.c b/net/llc/af_llc.c
+index 9b06c380866b..20551cfb7da6 100644
+--- a/net/llc/af_llc.c
++++ b/net/llc/af_llc.c
+@@ -928,14 +928,15 @@ static int llc_ui_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,
+ */
+ static int llc_ui_sendmsg(struct socket *sock, struct msghdr *msg, size_t len)
+ {
++ DECLARE_SOCKADDR(struct sockaddr_llc *, addr, msg->msg_name);
+ struct sock *sk = sock->sk;
+ struct llc_sock *llc = llc_sk(sk);
+- DECLARE_SOCKADDR(struct sockaddr_llc *, addr, msg->msg_name);
+ int flags = msg->msg_flags;
+ int noblock = flags & MSG_DONTWAIT;
++ int rc = -EINVAL, copied = 0, hdrlen, hh_len;
+ struct sk_buff *skb = NULL;
++ struct net_device *dev;
+ size_t size = 0;
+- int rc = -EINVAL, copied = 0, hdrlen;
+
+ dprintk("%s: sending from %02X to %02X\n", __func__,
+ llc->laddr.lsap, llc->daddr.lsap);
+@@ -955,22 +956,29 @@ static int llc_ui_sendmsg(struct socket *sock, struct msghdr *msg, size_t len)
+ if (rc)
+ goto out;
+ }
+- hdrlen = llc->dev->hard_header_len + llc_ui_header_len(sk, addr);
++ dev = llc->dev;
++ hh_len = LL_RESERVED_SPACE(dev);
++ hdrlen = llc_ui_header_len(sk, addr);
+ size = hdrlen + len;
+- if (size > llc->dev->mtu)
+- size = llc->dev->mtu;
++ size = min_t(size_t, size, READ_ONCE(dev->mtu));
+ copied = size - hdrlen;
+ rc = -EINVAL;
+ if (copied < 0)
+ goto out;
+ release_sock(sk);
+- skb = sock_alloc_send_skb(sk, size, noblock, &rc);
++ skb = sock_alloc_send_skb(sk, hh_len + size, noblock, &rc);
+ lock_sock(sk);
+ if (!skb)
+ goto out;
+- skb->dev = llc->dev;
++ if (sock_flag(sk, SOCK_ZAPPED) ||
++ llc->dev != dev ||
++ hdrlen != llc_ui_header_len(sk, addr) ||
++ hh_len != LL_RESERVED_SPACE(dev) ||
++ size > READ_ONCE(dev->mtu))
++ goto out;
++ skb->dev = dev;
+ skb->protocol = llc_proto_type(addr->sllc_arphrd);
+- skb_reserve(skb, hdrlen);
++ skb_reserve(skb, hh_len + hdrlen);
+ rc = memcpy_from_msg(skb_put(skb, copied), msg, copied);
+ if (rc)
+ goto out;
+--
+2.43.0
+
--- /dev/null
+From 673b7bbd4cec69e76f7c6790c24d1041bcf45ca4 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Tue, 23 Jan 2024 10:51:41 -0600
+Subject: net: fec: fix the unhandled context fault from smmu
+
+From: Shenwei Wang <shenwei.wang@nxp.com>
+
+[ Upstream commit 5e344807735023cd3a67c37a1852b849caa42620 ]
+
+When repeatedly changing the interface link speed using the commands below:
+
+ethtool -s eth0 speed 100 duplex full
+ethtool -s eth0 speed 1000 duplex full
+
+The following errors may sometimes be reported by the ARM SMMU driver:
+
+[ 5395.035364] fec 5b040000.ethernet eth0: Link is Down
+[ 5395.039255] arm-smmu 51400000.iommu: Unhandled context fault:
+fsr=0x402, iova=0x00000000, fsynr=0x100001, cbfrsynra=0x852, cb=2
+[ 5398.108460] fec 5b040000.ethernet eth0: Link is Up - 100Mbps/Full -
+flow control off
+
+It was identified that the FEC driver does not properly stop the TX queue
+during link speed transitions, which results in invalid virtual
+I/O address translations from the SMMU and causes the context faults.
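+
+As a rough sketch of the corrected ordering (not the driver's exact
+code; my_priv and my_restart_mac() are hypothetical stand-ins for the
+FEC private struct and fec_restart()):
+
+ #include <linux/netdevice.h>
+
+ struct my_priv {
+         struct napi_struct napi;
+ };
+
+ static void my_restart_mac(struct net_device *ndev); /* hypothetical */
+
+ static void my_adjust_link_restart(struct net_device *ndev,
+                                    struct my_priv *fep)
+ {
+         netif_stop_queue(ndev);       /* block new TX before touching HW */
+         napi_disable(&fep->napi);     /* quiesce the polling context */
+         netif_tx_lock_bh(ndev);
+         my_restart_mac(ndev);         /* hypothetical: reprogram the MAC */
+         netif_tx_unlock_bh(ndev);
+         napi_enable(&fep->napi);
+         netif_wake_queue(ndev);       /* allow TX once HW is consistent */
+ }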
+
+Fixes: dbc64a8ea231 ("net: fec: move calls to quiesce/resume packet processing out of fec_restart()")
+Signed-off-by: Shenwei Wang <shenwei.wang@nxp.com>
+Link: https://lore.kernel.org/r/20240123165141.2008104-1-shenwei.wang@nxp.com
+Signed-off-by: Paolo Abeni <pabeni@redhat.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/net/ethernet/freescale/fec_main.c | 2 ++
+ 1 file changed, 2 insertions(+)
+
+diff --git a/drivers/net/ethernet/freescale/fec_main.c b/drivers/net/ethernet/freescale/fec_main.c
+index e08c7b572497..c107680985e4 100644
+--- a/drivers/net/ethernet/freescale/fec_main.c
++++ b/drivers/net/ethernet/freescale/fec_main.c
+@@ -2036,6 +2036,7 @@ static void fec_enet_adjust_link(struct net_device *ndev)
+
+ /* if any of the above changed restart the FEC */
+ if (status_change) {
++ netif_stop_queue(ndev);
+ napi_disable(&fep->napi);
+ netif_tx_lock_bh(ndev);
+ fec_restart(ndev);
+@@ -2045,6 +2046,7 @@ static void fec_enet_adjust_link(struct net_device *ndev)
+ }
+ } else {
+ if (fep->link) {
++ netif_stop_queue(ndev);
+ napi_disable(&fep->napi);
+ netif_tx_lock_bh(ndev);
+ fec_stop(ndev);
+--
+2.43.0
+
--- /dev/null
+From 4638a989e1c2afb54acc2b7dc2890487a0a9c764 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Thu, 18 Jan 2024 16:58:59 -0800
+Subject: net: fix removing a namespace with conflicting altnames
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+From: Jakub Kicinski <kuba@kernel.org>
+
+[ Upstream commit d09486a04f5da0a812c26217213b89a3b1acf836 ]
+
+Mark reports a BUG() when a net namespace is removed.
+
+ kernel BUG at net/core/dev.c:11520!
+
+Physical interfaces moved outside of init_net get "refunded"
+to init_net when that namespace disappears. The main interface
+name may get overwritten in the process if it would have
+conflicted. We need to also discard all conflicting altnames.
+Recent fixes ensured that altnames get moved
+with the main interface, which surfaced this problem.
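+
+A minimal sketch of why a _safe iterator is needed here: the loop body
+unlinks the current entry, so the next pointer must be cached up front
+(name_in_use() is a hypothetical conflict check):
+
+ #include <linux/list.h>
+ #include <linux/slab.h>
+ #include <linux/types.h>
+
+ struct name_node {
+         struct list_head list;
+         char name[16];
+ };
+
+ static bool name_in_use(const char *name);      /* hypothetical */
+
+ static void drop_conflicting_names(struct list_head *altnames)
+ {
+         struct name_node *node, *tmp;
+
+         list_for_each_entry_safe(node, tmp, altnames, list) {
+                 if (name_in_use(node->name)) {
+                         list_del(&node->list);  /* safe: 'tmp' is cached */
+                         kfree(node);
+                 }
+         }
+ }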
+
+Reported-by: Марк Коренберг <socketpair@gmail.com>
+Link: https://lore.kernel.org/all/CAEmTpZFZ4Sv3KwqFOY2WKDHeZYdi0O7N5H1nTvcGp=SAEavtDg@mail.gmail.com/
+Fixes: 7663d522099e ("net: check for altname conflicts when changing netdev's netns")
+Signed-off-by: Jakub Kicinski <kuba@kernel.org>
+Reviewed-by: Eric Dumazet <edumazet@google.com>
+Reviewed-by: Jiri Pirko <jiri@nvidia.com>
+Reviewed-by: Xin Long <lucien.xin@gmail.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ net/core/dev.c | 9 +++++++++
+ net/core/dev.h | 3 +++
+ 2 files changed, 12 insertions(+)
+
+diff --git a/net/core/dev.c b/net/core/dev.c
+index ad20bebe153f..add22ca0dff9 100644
+--- a/net/core/dev.c
++++ b/net/core/dev.c
+@@ -11509,6 +11509,7 @@ static struct pernet_operations __net_initdata netdev_net_ops = {
+
+ static void __net_exit default_device_exit_net(struct net *net)
+ {
++ struct netdev_name_node *name_node, *tmp;
+ struct net_device *dev, *aux;
+ /*
+ * Push all migratable network devices back to the
+@@ -11531,6 +11532,14 @@ static void __net_exit default_device_exit_net(struct net *net)
+ snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex);
+ if (netdev_name_in_use(&init_net, fb_name))
+ snprintf(fb_name, IFNAMSIZ, "dev%%d");
++
++ netdev_for_each_altname_safe(dev, name_node, tmp)
++ if (netdev_name_in_use(&init_net, name_node->name)) {
++ netdev_name_node_del(name_node);
++ synchronize_rcu();
++ __netdev_name_node_alt_destroy(name_node);
++ }
++
+ err = dev_change_net_namespace(dev, &init_net, fb_name);
+ if (err) {
+ pr_emerg("%s: failed to move %s to init_net: %d\n",
+diff --git a/net/core/dev.h b/net/core/dev.h
+index 5aa45f0fd4ae..3f5eb92396b6 100644
+--- a/net/core/dev.h
++++ b/net/core/dev.h
+@@ -64,6 +64,9 @@ int dev_change_name(struct net_device *dev, const char *newname);
+
+ #define netdev_for_each_altname(dev, namenode) \
+ list_for_each_entry((namenode), &(dev)->name_node->list, list)
++#define netdev_for_each_altname_safe(dev, namenode, next) \
++ list_for_each_entry_safe((namenode), (next), &(dev)->name_node->list, \
++ list)
+
+ int netdev_name_node_alt_create(struct net_device *dev, const char *name);
+ int netdev_name_node_alt_destroy(struct net_device *dev, const char *name);
+--
+2.43.0
+
--- /dev/null
+From 846480519c20c1853b3e360271f8c0182d65e2a0 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Fri, 19 Jan 2024 11:47:50 +0100
+Subject: net: micrel: Fix PTP frame parsing for lan8814
+
+From: Horatiu Vultur <horatiu.vultur@microchip.com>
+
+[ Upstream commit aaf632f7ab6dec57bc9329a438f94504fe8034b9 ]
+
+The HW has the capability to check each frame: whether it is a PTP
+frame, which domain it belongs to, which PTP frame type it is, and the
+IP addresses in the frame. If any of these checks fails, the frame is
+not timestamped. Most of these checks were disabled, except the check
+of the minorVersionPTP field inside the PTP header. This means that
+once a partner sends a frame compliant with 802.1AS, which has
+minorVersionPTP set to 1, the frame was not timestamped because the HW
+expected by default a value of 0 in minorVersionPTP. This is exactly
+the same issue as on the lan8841. Fix this issue by removing the check
+so that userspace can decide on it.
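+
+A standalone userspace sketch of the value these writes program
+(macros re-created from the diff below): accept any minorVersionPTP
+from 0x00 up to 0xff instead of only 0:
+
+ #include <stdio.h>
+ #include <stdint.h>
+
+ #define GENMASK(h, l)      (((~0U) << (l)) & (~0U >> (31 - (h))))
+ #define PTP_MAX_VERSION(x) (((x) & GENMASK(7, 0)) << 8)
+ #define PTP_MIN_VERSION(x) ((x) & GENMASK(7, 0))
+
+ int main(void)
+ {
+         /* Max accepted version in the high byte, min in the low byte. */
+         uint16_t val = PTP_MAX_VERSION(0xff) | PTP_MIN_VERSION(0x0);
+
+         printf("PTP_{RX,TX}_VERSION value = 0x%04x\n", val); /* 0xff00 */
+         return 0;
+ }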
+
+Fixes: ece19502834d ("net: phy: micrel: 1588 support for LAN8814 phy")
+Signed-off-by: Horatiu Vultur <horatiu.vultur@microchip.com>
+Reviewed-by: Maxime Chevallier <maxime.chevallier@bootlin.com>
+Reviewed-by: Divya Koppera <divya.koppera@microchip.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/net/phy/micrel.c | 11 +++++++++++
+ 1 file changed, 11 insertions(+)
+
+diff --git a/drivers/net/phy/micrel.c b/drivers/net/phy/micrel.c
+index ce5ad4a82481..858175ca58cd 100644
+--- a/drivers/net/phy/micrel.c
++++ b/drivers/net/phy/micrel.c
+@@ -120,6 +120,11 @@
+ */
+ #define LAN8814_1PPM_FORMAT 17179
+
++#define PTP_RX_VERSION 0x0248
++#define PTP_TX_VERSION 0x0288
++#define PTP_MAX_VERSION(x) (((x) & GENMASK(7, 0)) << 8)
++#define PTP_MIN_VERSION(x) ((x) & GENMASK(7, 0))
++
+ #define PTP_RX_MOD 0x024F
+ #define PTP_RX_MOD_BAD_UDPV4_CHKSUM_FORCE_FCS_DIS_ BIT(3)
+ #define PTP_RX_TIMESTAMP_EN 0x024D
+@@ -3147,6 +3152,12 @@ static void lan8814_ptp_init(struct phy_device *phydev)
+ lanphy_write_page_reg(phydev, 5, PTP_TX_PARSE_IP_ADDR_EN, 0);
+ lanphy_write_page_reg(phydev, 5, PTP_RX_PARSE_IP_ADDR_EN, 0);
+
++ /* Disable checking for minorVersionPTP field */
++ lanphy_write_page_reg(phydev, 5, PTP_RX_VERSION,
++ PTP_MAX_VERSION(0xff) | PTP_MIN_VERSION(0x0));
++ lanphy_write_page_reg(phydev, 5, PTP_TX_VERSION,
++ PTP_MAX_VERSION(0xff) | PTP_MIN_VERSION(0x0));
++
+ skb_queue_head_init(&ptp_priv->tx_queue);
+ skb_queue_head_init(&ptp_priv->rx_queue);
+ INIT_LIST_HEAD(&ptp_priv->rx_ts_list);
+--
+2.43.0
+
--- /dev/null
+From 72948aec8a138908291cc00d2584c02eb45d3574 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Sat, 30 Dec 2023 22:40:37 +0200
+Subject: net/mlx5: Bridge, fix multicast packets sent to uplink
+
+From: Moshe Shemesh <moshe@nvidia.com>
+
+[ Upstream commit ec7cc38ef9f83553102e84c82536971a81630739 ]
+
+To allow multicast packets that are offloaded in bridge multicast
+offload mode to also be sent to the uplink, the FTE bit
+uplink_hairpin_en should be set. Add this bit to the FTE for the
+bridge multicast offload rules.
+
+Fixes: 18c2916cee12 ("net/mlx5: Bridge, snoop igmp/mld packets")
+Signed-off-by: Moshe Shemesh <moshe@nvidia.com>
+Reviewed-by: Gal Pressman <gal@nvidia.com>
+Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/net/ethernet/mellanox/mlx5/core/esw/bridge_mcast.c | 3 +++
+ drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c | 2 ++
+ include/linux/mlx5/fs.h | 1 +
+ include/linux/mlx5/mlx5_ifc.h | 2 +-
+ 4 files changed, 7 insertions(+), 1 deletion(-)
+
+diff --git a/drivers/net/ethernet/mellanox/mlx5/core/esw/bridge_mcast.c b/drivers/net/ethernet/mellanox/mlx5/core/esw/bridge_mcast.c
+index a7ed87e9d842..22dd30cf8033 100644
+--- a/drivers/net/ethernet/mellanox/mlx5/core/esw/bridge_mcast.c
++++ b/drivers/net/ethernet/mellanox/mlx5/core/esw/bridge_mcast.c
+@@ -83,6 +83,7 @@ mlx5_esw_bridge_mdb_flow_create(u16 esw_owner_vhca_id, struct mlx5_esw_bridge_md
+ i++;
+ }
+
++ rule_spec->flow_context.flags |= FLOW_CONTEXT_UPLINK_HAIRPIN_EN;
+ rule_spec->match_criteria_enable = MLX5_MATCH_OUTER_HEADERS;
+ dmac_v = MLX5_ADDR_OF(fte_match_param, rule_spec->match_value, outer_headers.dmac_47_16);
+ ether_addr_copy(dmac_v, entry->key.addr);
+@@ -587,6 +588,7 @@ mlx5_esw_bridge_mcast_vlan_flow_create(u16 vlan_proto, struct mlx5_esw_bridge_po
+ if (!rule_spec)
+ return ERR_PTR(-ENOMEM);
+
++ rule_spec->flow_context.flags |= FLOW_CONTEXT_UPLINK_HAIRPIN_EN;
+ rule_spec->match_criteria_enable = MLX5_MATCH_OUTER_HEADERS;
+
+ flow_act.action |= MLX5_FLOW_CONTEXT_ACTION_PACKET_REFORMAT;
+@@ -662,6 +664,7 @@ mlx5_esw_bridge_mcast_fwd_flow_create(struct mlx5_esw_bridge_port *port)
+ dest.vport.flags = MLX5_FLOW_DEST_VPORT_VHCA_ID;
+ dest.vport.vhca_id = port->esw_owner_vhca_id;
+ }
++ rule_spec->flow_context.flags |= FLOW_CONTEXT_UPLINK_HAIRPIN_EN;
+ handle = mlx5_add_flow_rules(port->mcast.ft, rule_spec, &flow_act, &dest, 1);
+
+ kvfree(rule_spec);
+diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c b/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c
+index a4b925331661..b29299c49ab3 100644
+--- a/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c
++++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c
+@@ -566,6 +566,8 @@ static int mlx5_cmd_set_fte(struct mlx5_core_dev *dev,
+ fte->flow_context.flow_tag);
+ MLX5_SET(flow_context, in_flow_context, flow_source,
+ fte->flow_context.flow_source);
++ MLX5_SET(flow_context, in_flow_context, uplink_hairpin_en,
++ !!(fte->flow_context.flags & FLOW_CONTEXT_UPLINK_HAIRPIN_EN));
+
+ MLX5_SET(flow_context, in_flow_context, extended_destination,
+ extended_dest);
+diff --git a/include/linux/mlx5/fs.h b/include/linux/mlx5/fs.h
+index 6f7725238abc..3fb428ce7d1c 100644
+--- a/include/linux/mlx5/fs.h
++++ b/include/linux/mlx5/fs.h
+@@ -132,6 +132,7 @@ struct mlx5_flow_handle;
+
+ enum {
+ FLOW_CONTEXT_HAS_TAG = BIT(0),
++ FLOW_CONTEXT_UPLINK_HAIRPIN_EN = BIT(1),
+ };
+
+ struct mlx5_flow_context {
+diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h
+index 3f7b664d625b..fb8d26a15df4 100644
+--- a/include/linux/mlx5/mlx5_ifc.h
++++ b/include/linux/mlx5/mlx5_ifc.h
+@@ -3557,7 +3557,7 @@ struct mlx5_ifc_flow_context_bits {
+ u8 action[0x10];
+
+ u8 extended_destination[0x1];
+- u8 reserved_at_81[0x1];
++ u8 uplink_hairpin_en[0x1];
+ u8 flow_source[0x2];
+ u8 encrypt_decrypt_type[0x4];
+ u8 destination_list_size[0x18];
+--
+2.43.0
+
--- /dev/null
+From 1dd205824d3c300e4463cf69b20f524b19dbc73d Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Sun, 17 Dec 2023 13:20:36 +0200
+Subject: net/mlx5: DR, Can't go to uplink vport on RX rule
+
+From: Yevgeny Kliteynik <kliteyn@nvidia.com>
+
+[ Upstream commit 5b2a2523eeea5f03d39a9d1ff1bad2e9f8eb98d2 ]
+
+A Go-To-Vport action on RX is not allowed when the vport is the uplink.
+In such a case, the packet should be dropped.
+
+Fixes: 9db810ed2d37 ("net/mlx5: DR, Expose steering action functionality")
+Signed-off-by: Yevgeny Kliteynik <kliteyn@nvidia.com>
+Reviewed-by: Erez Shitrit <erezsh@nvidia.com>
+Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ .../mellanox/mlx5/core/steering/dr_action.c | 16 +++++++++++-----
+ 1 file changed, 11 insertions(+), 5 deletions(-)
+
+diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_action.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_action.c
+index 74fc318b5027..d2b65a0ce47b 100644
+--- a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_action.c
++++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_action.c
+@@ -874,11 +874,17 @@ int mlx5dr_actions_build_ste_arr(struct mlx5dr_matcher *matcher,
+ action->sampler->tx_icm_addr;
+ break;
+ case DR_ACTION_TYP_VPORT:
+- attr.hit_gvmi = action->vport->caps->vhca_gvmi;
+- dest_action = action;
+- attr.final_icm_addr = rx_rule ?
+- action->vport->caps->icm_address_rx :
+- action->vport->caps->icm_address_tx;
++ if (unlikely(rx_rule && action->vport->caps->num == MLX5_VPORT_UPLINK)) {
++ /* can't go to uplink on RX rule - dropping instead */
++ attr.final_icm_addr = nic_dmn->drop_icm_addr;
++ attr.hit_gvmi = nic_dmn->drop_icm_addr >> 48;
++ } else {
++ attr.hit_gvmi = action->vport->caps->vhca_gvmi;
++ dest_action = action;
++ attr.final_icm_addr = rx_rule ?
++ action->vport->caps->icm_address_rx :
++ action->vport->caps->icm_address_tx;
++ }
+ break;
+ case DR_ACTION_TYP_POP_VLAN:
+ if (!rx_rule && !(dmn->ste_ctx->actions_caps &
+--
+2.43.0
+
--- /dev/null
+From f996a3313ebd84e0efdb7d61b590eec93fc7c6a2 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Sun, 17 Dec 2023 11:24:08 +0200
+Subject: net/mlx5: DR, Use the right GVMI number for drop action
+
+From: Yevgeny Kliteynik <kliteyn@nvidia.com>
+
+[ Upstream commit 5665954293f13642f9c052ead83c1e9d8cff186f ]
+
+When FW provides ICM addresses for drop RX/TX, the provided capability
+is a 64-bit value that contains the GVMI as well as the ICM address
+itself. In the case of TX DROP this GVMI is different from the GVMI
+that the domain is operating on.
+
+This patch fixes the action to use these GVMI IDs, as provided by FW.
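+
+A standalone sketch of the layout described above (the exact bit split
+is an assumption read off the '>> 48' in the diff below): the top 16
+bits of the capability carry the GVMI, the rest the ICM address:
+
+ #include <stdio.h>
+ #include <stdint.h>
+
+ static uint16_t drop_icm_gvmi(uint64_t cap)
+ {
+         return (uint16_t)(cap >> 48);          /* GVMI in bits 63:48 */
+ }
+
+ static uint64_t drop_icm_address(uint64_t cap)
+ {
+         return cap & ((1ULL << 48) - 1);       /* address in bits 47:0 */
+ }
+
+ int main(void)
+ {
+         uint64_t cap = 0x00a5000012345678ULL;  /* made-up example value */
+
+         printf("gvmi=0x%04x addr=0x%llx\n", drop_icm_gvmi(cap),
+                (unsigned long long)drop_icm_address(cap));
+         return 0;
+ }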
+
+Fixes: 9db810ed2d37 ("net/mlx5: DR, Expose steering action functionality")
+Signed-off-by: Yevgeny Kliteynik <kliteyn@nvidia.com>
+Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/net/ethernet/mellanox/mlx5/core/steering/dr_action.c | 1 +
+ 1 file changed, 1 insertion(+)
+
+diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_action.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_action.c
+index e3ec559369fa..74fc318b5027 100644
+--- a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_action.c
++++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_action.c
+@@ -788,6 +788,7 @@ int mlx5dr_actions_build_ste_arr(struct mlx5dr_matcher *matcher,
+ switch (action_type) {
+ case DR_ACTION_TYP_DROP:
+ attr.final_icm_addr = nic_dmn->drop_icm_addr;
++ attr.hit_gvmi = nic_dmn->drop_icm_addr >> 48;
+ break;
+ case DR_ACTION_TYP_FT:
+ dest_action = action;
+--
+2.43.0
+
--- /dev/null
+From 35519d6efbd840bc026139266c7df4248df4cc79 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Sun, 31 Dec 2023 15:19:50 +0200
+Subject: net/mlx5: Fix a WARN upon a callback command failure
+
+From: Yishai Hadas <yishaih@nvidia.com>
+
+[ Upstream commit cc8091587779cfaddb6b29c9e9edb9079a282cad ]
+
+The WARN below [1] is reported once a callback command fails.
+
+As the callback runs in interrupt context, it needs to use the IRQ
+save/restore variant.
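+
+A minimal sketch of the locking rule (illustrative struct, not the
+mlx5 stats object): any lock that may be taken from interrupt context
+must use the save/restore variants, because spin_unlock_irq()
+unconditionally re-enables interrupts:
+
+ #include <linux/spinlock.h>
+ #include <linux/types.h>
+
+ struct my_stats {
+         spinlock_t lock;
+         u64 failed;
+         int last_errno;
+ };
+
+ static void my_stats_log_failure(struct my_stats *stats, int err)
+ {
+         unsigned long flags;
+
+         spin_lock_irqsave(&stats->lock, flags);      /* saves IRQ state */
+         stats->failed++;
+         stats->last_errno = -err;
+         spin_unlock_irqrestore(&stats->lock, flags); /* restores it as-is */
+ }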
+
+[1]
+DEBUG_LOCKS_WARN_ON(lockdep_hardirq_context())
+WARNING: CPU: 15 PID: 0 at kernel/locking/lockdep.c:4353
+ lockdep_hardirqs_on_prepare+0x11b/0x180
+Modules linked in: vhost_net vhost tap mlx5_vfio_pci
+vfio_pci vfio_pci_core vfio_iommu_type1 vfio mlx5_vdpa vringh
+vhost_iotlb vdpa nfnetlink_cttimeout openvswitch nsh ip6table_mangle
+ip6table_nat ip6table_filter ip6_tables iptable_mangle
+xt_conntrackxt_MASQUERADE nf_conntrack_netlink nfnetlink
+xt_addrtype iptable_nat nf_nat br_netfilter rpcsec_gss_krb5
+auth_rpcgss oid_registry overlay rpcrdma rdma_ucm ib_iser libiscsi
+scsi_transport_iscsi rdma_cm iw_cm ib_umad ib_ipoib ib_cm
+mlx5_ib ib_uverbs ib_core fuse mlx5_core
+CPU: 15 PID: 0 Comm: swapper/15 Tainted: G W 6.7.0-rc4+ #1587
+Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS
+rel-1.13.0-0-gf21b5a4aeb02-prebuilt.qemu.org 04/01/2014
+RIP: 0010:lockdep_hardirqs_on_prepare+0x11b/0x180
+Code: 00 5b c3 c3 e8 e6 0d 58 00 85 c0 74 d6 8b 15 f0 c3
+ 76 01 85 d2 75 cc 48 c7 c6 04 a5 3b 82 48 c7 c7 f1
+ e9 39 82 e8 95 12 f9 ff <0f> 0b 5b c3 e8 bc 0d 58 00
+ 85 c0 74 ac 8b 3d c6 c3 76 01 85 ff 75
+RSP: 0018:ffffc900003ecd18 EFLAGS: 00010086
+RAX: 0000000000000000 RBX: 0000000000000000 RCX: 0000000000000027
+RDX: 0000000000000000 RSI: ffff88885fbdb880 RDI: ffff88885fbdb888
+RBP: 00000000ffffff87 R08: 0000000000000000 R09: 0000000000000001
+R10: 0000000000000000 R11: 284e4f5f4e524157 R12: 00000000002c9aa1
+R13: ffff88810aace980 R14: ffff88810aace9b8 R15: 0000000000000003
+FS: 0000000000000000(0000) GS:ffff88885fbc0000(0000)
+knlGS:0000000000000000
+CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
+CR2: 00007f731436f4c8 CR3: 000000010aae6001 CR4: 0000000000372eb0
+DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
+DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
+Call Trace:
+ <IRQ>
+? __warn+0x81/0x170
+? lockdep_hardirqs_on_prepare+0x11b/0x180
+? report_bug+0xf8/0x1c0
+? handle_bug+0x3f/0x70
+? exc_invalid_op+0x13/0x60
+? asm_exc_invalid_op+0x16/0x20
+? lockdep_hardirqs_on_prepare+0x11b/0x180
+? lockdep_hardirqs_on_prepare+0x11b/0x180
+trace_hardirqs_on+0x4a/0xa0
+raw_spin_unlock_irq+0x24/0x30
+cmd_status_err+0xc0/0x1a0 [mlx5_core]
+cmd_status_err+0x1a0/0x1a0 [mlx5_core]
+mlx5_cmd_exec_cb_handler+0x24/0x40 [mlx5_core]
+mlx5_cmd_comp_handler+0x129/0x4b0 [mlx5_core]
+cmd_comp_notifier+0x1a/0x20 [mlx5_core]
+notifier_call_chain+0x3e/0xe0
+atomic_notifier_call_chain+0x5f/0x130
+mlx5_eq_async_int+0xe7/0x200 [mlx5_core]
+notifier_call_chain+0x3e/0xe0
+atomic_notifier_call_chain+0x5f/0x130
+irq_int_handler+0x11/0x20 [mlx5_core]
+__handle_irq_event_percpu+0x99/0x220
+? tick_irq_enter+0x5d/0x80
+handle_irq_event_percpu+0xf/0x40
+handle_irq_event+0x3a/0x60
+handle_edge_irq+0xa2/0x1c0
+__common_interrupt+0x55/0x140
+common_interrupt+0x7d/0xa0
+</IRQ>
+<TASK>
+asm_common_interrupt+0x22/0x40
+RIP: 0010:default_idle+0x13/0x20
+Code: c0 08 00 00 00 4d 29 c8 4c 01 c7 4c 29 c2 e9 72 ff
+ff ff cc cc cc cc 8b 05 ea 08 25 01 85 c0 7e 07 0f 00 2d 7f b0 26 00 fb
+f4 <fa> c3 90 66 2e 0f 1f 84 00 00 00 00 00 65 48 8b 04 25 80 d0 02 00
+RSP: 0018:ffffc9000010fec8 EFLAGS: 00000242
+RAX: 0000000000000001 RBX: 000000000000000f RCX: 4000000000000000
+RDX: 0000000000000000 RSI: 0000000000000000 RDI: ffffffff811c410c
+RBP: ffffffff829478c0 R08: 0000000000000001 R09: 0000000000000001
+R10: 0000000000000000 R11: 0000000000000000 R12: 0000000000000000
+R13: 0000000000000000 R14: 0000000000000000 R15: 0000000000000000
+? do_idle+0x1ec/0x210
+default_idle_call+0x6c/0x90
+do_idle+0x1ec/0x210
+cpu_startup_entry+0x26/0x30
+start_secondary+0x11b/0x150
+secondary_startup_64_no_verify+0x165/0x16b
+</TASK>
+irq event stamp: 833284
+hardirqs last enabled at (833283): [<ffffffff811c410c>]
+do_idle+0x1ec/0x210
+hardirqs last disabled at (833284): [<ffffffff81daf9ef>]
+common_interrupt+0xf/0xa0
+softirqs last enabled at (833224): [<ffffffff81dc199f>]
+__do_softirq+0x2bf/0x40e
+softirqs last disabled at (833177): [<ffffffff81178ddf>]
+irq_exit_rcu+0x7f/0xa0
+
+Fixes: 34f46ae0d4b3 ("net/mlx5: Add command failures data to debugfs")
+Signed-off-by: Yishai Hadas <yishaih@nvidia.com>
+Reviewed-by: Moshe Shemesh <moshe@nvidia.com>
+Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/net/ethernet/mellanox/mlx5/core/cmd.c | 5 +++--
+ 1 file changed, 3 insertions(+), 2 deletions(-)
+
+diff --git a/drivers/net/ethernet/mellanox/mlx5/core/cmd.c b/drivers/net/ethernet/mellanox/mlx5/core/cmd.c
+index a7b1f9686c09..4957412ff1f6 100644
+--- a/drivers/net/ethernet/mellanox/mlx5/core/cmd.c
++++ b/drivers/net/ethernet/mellanox/mlx5/core/cmd.c
+@@ -1923,6 +1923,7 @@ static void cmd_status_log(struct mlx5_core_dev *dev, u16 opcode, u8 status,
+ {
+ const char *namep = mlx5_command_str(opcode);
+ struct mlx5_cmd_stats *stats;
++ unsigned long flags;
+
+ if (!err || !(strcmp(namep, "unknown command opcode")))
+ return;
+@@ -1930,7 +1931,7 @@ static void cmd_status_log(struct mlx5_core_dev *dev, u16 opcode, u8 status,
+ stats = xa_load(&dev->cmd.stats, opcode);
+ if (!stats)
+ return;
+- spin_lock_irq(&stats->lock);
++ spin_lock_irqsave(&stats->lock, flags);
+ stats->failed++;
+ if (err < 0)
+ stats->last_failed_errno = -err;
+@@ -1939,7 +1940,7 @@ static void cmd_status_log(struct mlx5_core_dev *dev, u16 opcode, u8 status,
+ stats->last_failed_mbox_status = status;
+ stats->last_failed_syndrome = syndrome;
+ }
+- spin_unlock_irq(&stats->lock);
++ spin_unlock_irqrestore(&stats->lock, flags);
+ }
+
+ /* preserve -EREMOTEIO for outbox.status != OK, otherwise return err as is */
+--
+2.43.0
+
--- /dev/null
+From fee9cf89467830f61dd17acaae0daeed512142b9 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Tue, 28 Nov 2023 14:01:54 -0800
+Subject: net/mlx5: Use mlx5 device constant for selecting CQ period mode for
+ ASO
+
+From: Rahul Rameshbabu <rrameshbabu@nvidia.com>
+
+[ Upstream commit 20cbf8cbb827094197f3b17db60d71449415db1e ]
+
+mlx5 devices have specific constants for choosing the CQ period mode. These
+constants do not have to match the constants used by the kernel software
+API for DIM period mode selection.
+
+Fixes: cdd04f4d4d71 ("net/mlx5: Add support to create SQ and CQ for ASO")
+Signed-off-by: Rahul Rameshbabu <rrameshbabu@nvidia.com>
+Reviewed-by: Jianbo Liu <jianbol@nvidia.com>
+Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/net/ethernet/mellanox/mlx5/core/lib/aso.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lib/aso.c b/drivers/net/ethernet/mellanox/mlx5/core/lib/aso.c
+index 40c7be124041..58bd749b5e4d 100644
+--- a/drivers/net/ethernet/mellanox/mlx5/core/lib/aso.c
++++ b/drivers/net/ethernet/mellanox/mlx5/core/lib/aso.c
+@@ -98,7 +98,7 @@ static int create_aso_cq(struct mlx5_aso_cq *cq, void *cqc_data)
+ mlx5_fill_page_frag_array(&cq->wq_ctrl.buf,
+ (__be64 *)MLX5_ADDR_OF(create_cq_in, in, pas));
+
+- MLX5_SET(cqc, cqc, cq_period_mode, DIM_CQ_PERIOD_MODE_START_FROM_EQE);
++ MLX5_SET(cqc, cqc, cq_period_mode, MLX5_CQ_PERIOD_MODE_START_FROM_EQE);
+ MLX5_SET(cqc, cqc, c_eqn_or_apu_element, eqn);
+ MLX5_SET(cqc, cqc, uar_page, mdev->priv.uar->index);
+ MLX5_SET(cqc, cqc, log_page_size, cq->wq_ctrl.buf.page_shift -
+--
+2.43.0
+
--- /dev/null
+From 67398c780b74733917ea136778b926b754cfccd1 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Tue, 12 Dec 2023 13:52:55 +0200
+Subject: net/mlx5e: Allow software parsing when IPsec crypto is enabled
+
+From: Leon Romanovsky <leonro@nvidia.com>
+
+[ Upstream commit 20f5468a7988dedd94a57ba8acd65ebda6a59723 ]
+
+All ConnectX devices have the software parsing capability enabled, but it
+is more correct to set allow_swp only if the capability exists, which for
+IPsec means that crypto offload is supported.
+
+Fixes: 2451da081a34 ("net/mlx5: Unify device IPsec capabilities check")
+Signed-off-by: Leon Romanovsky <leonro@nvidia.com>
+Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/net/ethernet/mellanox/mlx5/core/en/params.c | 4 ++--
+ 1 file changed, 2 insertions(+), 2 deletions(-)
+
+diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/params.c b/drivers/net/ethernet/mellanox/mlx5/core/en/params.c
+index e097f336e1c4..30507b7c2fb1 100644
+--- a/drivers/net/ethernet/mellanox/mlx5/core/en/params.c
++++ b/drivers/net/ethernet/mellanox/mlx5/core/en/params.c
+@@ -1062,8 +1062,8 @@ void mlx5e_build_sq_param(struct mlx5_core_dev *mdev,
+ void *wq = MLX5_ADDR_OF(sqc, sqc, wq);
+ bool allow_swp;
+
+- allow_swp =
+- mlx5_geneve_tx_allowed(mdev) || !!mlx5_ipsec_device_caps(mdev);
++ allow_swp = mlx5_geneve_tx_allowed(mdev) ||
++ (mlx5_ipsec_device_caps(mdev) & MLX5_IPSEC_CAP_CRYPTO);
+ mlx5e_build_sq_param_common(mdev, param);
+ MLX5_SET(wq, wq, log_wq_sz, params->log_sq_size);
+ MLX5_SET(sqc, sqc, allow_swp, allow_swp);
+--
+2.43.0
+
--- /dev/null
+From fcb41c119b911abd8b24a93d58e8f373a36e3784 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Wed, 17 Jan 2024 15:17:36 +0800
+Subject: net/mlx5e: fix a double-free in arfs_create_groups
+
+From: Zhipeng Lu <alexious@zju.edu.cn>
+
+[ Upstream commit 3c6d5189246f590e4e1f167991558bdb72a4738b ]
+
+When the kvzalloc() of `in` fails, arfs_create_groups() will free
+ft->g and return an error. However, arfs_create_table(), the only caller
+of arfs_create_groups(), will hold on to this error and call
+mlx5e_destroy_flow_table(), in which ft->g will be freed again.
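+
+The shape of the fix is a conventional goto unwind ladder; a standalone
+sketch with illustrative names and userspace allocators standing in for
+kcalloc()/kvzalloc():
+
+ #include <stdlib.h>
+
+ struct flow_table { void **g; };
+
+ static int create_groups(struct flow_table *ft)
+ {
+         void *in;
+         int err;
+
+         ft->g = calloc(8, sizeof(*ft->g));
+         if (!ft->g)
+                 return -1;
+
+         in = calloc(1, 256);
+         if (!in) {
+                 err = -1;
+                 goto err_free_g;
+         }
+
+         /* ... group creation would go here ... */
+
+         free(in);
+         return 0;
+
+ err_free_g:
+         free(ft->g);
+         ft->g = NULL;   /* the caller's cleanup must not free it again */
+         return err;
+ }
+
+ int main(void)
+ {
+         struct flow_table ft = { 0 };
+
+         if (create_groups(&ft) == 0)
+                 free(ft.g);
+         return 0;
+ }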
+
+Fixes: 1cabe6b0965e ("net/mlx5e: Create aRFS flow tables")
+Signed-off-by: Zhipeng Lu <alexious@zju.edu.cn>
+Reviewed-by: Simon Horman <horms@kernel.org>
+Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ .../net/ethernet/mellanox/mlx5/core/en_arfs.c | 26 +++++++++++--------
+ 1 file changed, 15 insertions(+), 11 deletions(-)
+
+diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_arfs.c b/drivers/net/ethernet/mellanox/mlx5/core/en_arfs.c
+index bb7f86c993e5..e66f486faafe 100644
+--- a/drivers/net/ethernet/mellanox/mlx5/core/en_arfs.c
++++ b/drivers/net/ethernet/mellanox/mlx5/core/en_arfs.c
+@@ -254,11 +254,13 @@ static int arfs_create_groups(struct mlx5e_flow_table *ft,
+
+ ft->g = kcalloc(MLX5E_ARFS_NUM_GROUPS,
+ sizeof(*ft->g), GFP_KERNEL);
+- in = kvzalloc(inlen, GFP_KERNEL);
+- if (!in || !ft->g) {
+- kfree(ft->g);
+- kvfree(in);
++ if (!ft->g)
+ return -ENOMEM;
++
++ in = kvzalloc(inlen, GFP_KERNEL);
++ if (!in) {
++ err = -ENOMEM;
++ goto err_free_g;
+ }
+
+ mc = MLX5_ADDR_OF(create_flow_group_in, in, match_criteria);
+@@ -278,7 +280,7 @@ static int arfs_create_groups(struct mlx5e_flow_table *ft,
+ break;
+ default:
+ err = -EINVAL;
+- goto out;
++ goto err_free_in;
+ }
+
+ switch (type) {
+@@ -300,7 +302,7 @@ static int arfs_create_groups(struct mlx5e_flow_table *ft,
+ break;
+ default:
+ err = -EINVAL;
+- goto out;
++ goto err_free_in;
+ }
+
+ MLX5_SET_CFG(in, match_criteria_enable, MLX5_MATCH_OUTER_HEADERS);
+@@ -309,7 +311,7 @@ static int arfs_create_groups(struct mlx5e_flow_table *ft,
+ MLX5_SET_CFG(in, end_flow_index, ix - 1);
+ ft->g[ft->num_groups] = mlx5_create_flow_group(ft->t, in);
+ if (IS_ERR(ft->g[ft->num_groups]))
+- goto err;
++ goto err_clean_group;
+ ft->num_groups++;
+
+ memset(in, 0, inlen);
+@@ -318,18 +320,20 @@ static int arfs_create_groups(struct mlx5e_flow_table *ft,
+ MLX5_SET_CFG(in, end_flow_index, ix - 1);
+ ft->g[ft->num_groups] = mlx5_create_flow_group(ft->t, in);
+ if (IS_ERR(ft->g[ft->num_groups]))
+- goto err;
++ goto err_clean_group;
+ ft->num_groups++;
+
+ kvfree(in);
+ return 0;
+
+-err:
++err_clean_group:
+ err = PTR_ERR(ft->g[ft->num_groups]);
+ ft->g[ft->num_groups] = NULL;
+-out:
++err_free_in:
+ kvfree(in);
+-
++err_free_g:
++ kfree(ft->g);
++ ft->g = NULL;
+ return err;
+ }
+
+--
+2.43.0
+
--- /dev/null
+From a4f214befe9daf31c7e22b11d18352c1e952afd2 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Tue, 28 Nov 2023 17:29:01 +0800
+Subject: net/mlx5e: fix a potential double-free in fs_any_create_groups
+
+From: Dinghao Liu <dinghao.liu@zju.edu.cn>
+
+[ Upstream commit aef855df7e1bbd5aa4484851561211500b22707e ]
+
+When kcalloc() for ft->g succeeds but kvzalloc() for in fails,
+fs_any_create_groups() will free ft->g. However, its caller
+fs_any_create_table() will free ft->g again by calling
+mlx5e_destroy_flow_table(), which leads to a double-free.
+Fix this by setting ft->g to NULL in fs_any_create_groups().
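+
+The same class of bug in miniature (standalone sketch, illustrative
+names): without the NULL assignment, the caller's cleanup frees the
+same pointer a second time:
+
+ #include <stdlib.h>
+
+ struct flow_table { void **g; };
+
+ static int create_groups(struct flow_table *ft, int fail_second_alloc)
+ {
+         void *in;
+
+         ft->g = calloc(8, sizeof(*ft->g));
+         in = fail_second_alloc ? NULL : calloc(1, 256);
+         if (!in || !ft->g) {
+                 free(ft->g);
+                 ft->g = NULL;   /* the fix: leave no stale pointer behind */
+                 free(in);
+                 return -1;
+         }
+
+         free(in);
+         return 0;
+ }
+
+ int main(void)
+ {
+         struct flow_table ft = { 0 };
+
+         create_groups(&ft, 1);
+         free(ft.g);     /* caller cleanup: a no-op on NULL */
+         return 0;
+ }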
+
+Fixes: 0f575c20bf06 ("net/mlx5e: Introduce Flow Steering ANY API")
+Signed-off-by: Dinghao Liu <dinghao.liu@zju.edu.cn>
+Reviewed-by: Tariq Toukan <tariqt@nvidia.com>
+Reviewed-by: Simon Horman <horms@kernel.org>
+Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/net/ethernet/mellanox/mlx5/core/en/fs_tt_redirect.c | 1 +
+ 1 file changed, 1 insertion(+)
+
+diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/fs_tt_redirect.c b/drivers/net/ethernet/mellanox/mlx5/core/en/fs_tt_redirect.c
+index e1283531e0b8..671adbad0a40 100644
+--- a/drivers/net/ethernet/mellanox/mlx5/core/en/fs_tt_redirect.c
++++ b/drivers/net/ethernet/mellanox/mlx5/core/en/fs_tt_redirect.c
+@@ -436,6 +436,7 @@ static int fs_any_create_groups(struct mlx5e_flow_table *ft)
+ in = kvzalloc(inlen, GFP_KERNEL);
+ if (!in || !ft->g) {
+ kfree(ft->g);
++ ft->g = NULL;
+ kvfree(in);
+ return -ENOMEM;
+ }
+--
+2.43.0
+
--- /dev/null
+From 23db336dc559a45fa3dfa9a231d8d09a0fb1c25a Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Sun, 5 Nov 2023 17:09:46 +0200
+Subject: net/mlx5e: Fix inconsistent hairpin RQT sizes
+
+From: Tariq Toukan <tariqt@nvidia.com>
+
+[ Upstream commit c20767fd45e82d64352db82d4fc8d281a43e4783 ]
+
+The processing of traffic in hairpin queues occurs in HW/FW and does not
+involve the CPUs, hence the upper bound on the max number of channels does
+not apply to them. Using this bound for the hairpin RQT max_table_size is
+wrong. It could be too small and cause the error below [1]. As the
+RQT size provided on init does not get modified later, use the same
+value for both the actual and max table sizes.
+
+[1]
+mlx5_core 0000:08:00.1: mlx5_cmd_out_err:805:(pid 1200): CREATE_RQT(0x916) op_mod(0x0) failed, status bad parameter(0x3), syndrome (0x538faf), err(-22)
+
+Fixes: 74a8dadac17e ("net/mlx5e: Preparations for supporting larger number of channels")
+Signed-off-by: Tariq Toukan <tariqt@nvidia.com>
+Reviewed-by: Gal Pressman <gal@nvidia.com>
+Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/net/ethernet/mellanox/mlx5/core/en_tc.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
+index 96af9e2ab1d8..b61d82f08e65 100644
+--- a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
++++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
+@@ -761,7 +761,7 @@ static int mlx5e_hairpin_create_indirect_rqt(struct mlx5e_hairpin *hp)
+
+ err = mlx5e_rss_params_indir_init(&indir, mdev,
+ mlx5e_rqt_size(mdev, hp->num_channels),
+- mlx5e_rqt_size(mdev, priv->max_nch));
++ mlx5e_rqt_size(mdev, hp->num_channels));
+ if (err)
+ return err;
+
+--
+2.43.0
+
--- /dev/null
+From fa8499340268fb830497b9a417e690b2a2ee1e41 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Wed, 22 Nov 2023 18:32:11 -0800
+Subject: net/mlx5e: Fix operation precedence bug in port timestamping
+ napi_poll context
+
+From: Rahul Rameshbabu <rrameshbabu@nvidia.com>
+
+[ Upstream commit 3876638b2c7ebb2c9d181de1191db0de8cac143a ]
+
+Indirection (*) is of lower precedence than postfix increment (++). The
+logic in napi_poll context would cause an out-of-bounds read by first
+incrementing the pointer address by the byte address space and then
+dereferencing the value. Rather, the intended logic was to dereference
+first and then increment the underlying value.
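+
+A standalone demo of the precedence trap: '*p++' parses as '*(p++)',
+so the buggy form advanced the pointer instead of the counter it
+points to:
+
+ #include <stdio.h>
+
+ int main(void)
+ {
+         unsigned char md_buff[4] = { 0 };
+         int sz = 0;
+         int *md_buff_sz = &sz;
+
+         /* Intended: store at the current count, then bump the count. */
+         md_buff[(*md_buff_sz)++] = 0xab;
+
+         /* The buggy 'md_buff[*md_buff_sz++]' would instead read through
+          * the old pointer and then advance the pointer itself, leaving
+          * 'sz' at 0 and pointing past the variable for the next call. */
+         printf("count=%d first=0x%x\n", sz, md_buff[0]);
+         return 0;
+ }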
+
+Fixes: 92214be5979c ("net/mlx5e: Update doorbell for port timestamping CQ before the software counter")
+Signed-off-by: Rahul Rameshbabu <rrameshbabu@nvidia.com>
+Reviewed-by: Tariq Toukan <tariqt@nvidia.com>
+Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/net/ethernet/mellanox/mlx5/core/en/ptp.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/ptp.c b/drivers/net/ethernet/mellanox/mlx5/core/en/ptp.c
+index af3928eddafd..803035d4e597 100644
+--- a/drivers/net/ethernet/mellanox/mlx5/core/en/ptp.c
++++ b/drivers/net/ethernet/mellanox/mlx5/core/en/ptp.c
+@@ -213,7 +213,7 @@ static void mlx5e_ptp_handle_ts_cqe(struct mlx5e_ptpsq *ptpsq,
+ mlx5e_ptpsq_mark_ts_cqes_undelivered(ptpsq, hwtstamp);
+ out:
+ napi_consume_skb(skb, budget);
+- md_buff[*md_buff_sz++] = metadata_id;
++ md_buff[(*md_buff_sz)++] = metadata_id;
+ if (unlikely(mlx5e_ptp_metadata_map_unhealthy(&ptpsq->metadata_map)) &&
+ !test_and_set_bit(MLX5E_SQ_STATE_RECOVERING, &sq->state))
+ queue_work(ptpsq->txqsq.priv->wq, &ptpsq->report_unhealthy_work);
+--
+2.43.0
+
--- /dev/null
+From 77963fd716035e94aa889dcddb5f193a5ac0f276 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Fri, 10 Nov 2023 11:10:22 +0100
+Subject: net/mlx5e: Fix peer flow lists handling
+
+From: Vlad Buslov <vladbu@nvidia.com>
+
+[ Upstream commit d76fdd31f953ac5046555171620f2562715e9b71 ]
+
+The cited change refactored mlx5e_tc_del_fdb_peer_flow() to only clear DUP
+flag when list of peer flows has become empty. However, if any concurrent
+user holds a reference to a peer flow (for example, the neighbor update
+workqueue task is updating peer flow's parent encap entry concurrently),
+then the flow will not be removed from the peer list and, consequently,
+the DUP flag will remain set. Since mlx5e_tc_del_fdb_peers_flow() calls
+mlx5e_tc_del_fdb_peer_flow() for every possible peer index the algorithm
+will try to remove the flow from eswitch instances that it has never peered
+with causing either NULL pointer dereference when trying to remove the flow
+peer list head of peer_index that was never initialized or a warning if the
+list debug config is enabled[0].
+
+Fix the issue by always removing the peer flow from the list even when not
+releasing the last reference to it.
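+
+A sketch of the corrected teardown order (names follow the diff below;
+the point is that the unlink no longer depends on holding the last
+reference):
+
+ list_for_each_entry_safe(peer_flow, tmp, &flow->peer_flows, peer_flows) {
+         if (peer_index != mlx5_get_dev_index(peer_flow->priv->mdev))
+                 continue;
+
+         list_del(&peer_flow->peer_flows);       /* always unlink */
+         if (refcount_dec_and_test(&peer_flow->refcnt)) {
+                 mlx5e_tc_del_fdb_flow(peer_flow->priv, peer_flow);
+                 kfree(peer_flow);               /* free only on last ref */
+         }
+ }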
+
+[0]:
+
+[ 3102.985806] ------------[ cut here ]------------
+[ 3102.986223] list_del corruption, ffff888139110698->next is NULL
+[ 3102.986757] WARNING: CPU: 2 PID: 22109 at lib/list_debug.c:53 __list_del_entry_valid_or_report+0x4f/0xc0
+[ 3102.987561] Modules linked in: act_ct nf_flow_table bonding act_tunnel_key act_mirred act_skbedit vxlan cls_matchall nfnetlink_cttimeout act_gact cls_flower sch_ingress mlx5_vdpa vringh vhost_iotlb vdpa openvswitch nsh xt_MASQUERADE nf_conntrack_netlink nfnetlink iptable_nat xt_addrtype xt_conntrack nf_nat br_netfilter rpcsec_gss_krb5 auth_rpcg
+ss oid_registry overlay rpcrdma rdma_ucm ib_iser libiscsi scsi_transport_iscsi ib_umad rdma_cm ib_ipoib iw_cm ib_cm mlx5_ib ib_uverbs ib_core mlx5_core [last unloaded: bonding]
+[ 3102.991113] CPU: 2 PID: 22109 Comm: revalidator28 Not tainted 6.6.0-rc6+ #3
+[ 3102.991695] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS rel-1.13.0-0-gf21b5a4aeb02-prebuilt.qemu.org 04/01/2014
+[ 3102.992605] RIP: 0010:__list_del_entry_valid_or_report+0x4f/0xc0
+[ 3102.993122] Code: 39 c2 74 56 48 8b 32 48 39 fe 75 62 48 8b 51 08 48 39 f2 75 73 b8 01 00 00 00 c3 48 89 fe 48 c7 c7 48 fd 0a 82 e8 41 0b ad ff <0f> 0b 31 c0 c3 48 89 fe 48 c7 c7 70 fd 0a 82 e8 2d 0b ad ff 0f 0b
+[ 3102.994615] RSP: 0018:ffff8881383e7710 EFLAGS: 00010286
+[ 3102.995078] RAX: 0000000000000000 RBX: 0000000000000002 RCX: 0000000000000000
+[ 3102.995670] RDX: 0000000000000001 RSI: ffff88885f89b640 RDI: ffff88885f89b640
+[ 3102.997188] DEL flow 00000000be367878 on port 0
+[ 3102.998594] RBP: dead000000000122 R08: 0000000000000000 R09: c0000000ffffdfff
+[ 3102.999604] R10: 0000000000000008 R11: ffff8881383e7598 R12: dead000000000100
+[ 3103.000198] R13: 0000000000000002 R14: ffff888139110000 R15: ffff888101901240
+[ 3103.000790] FS: 00007f424cde4700(0000) GS:ffff88885f880000(0000) knlGS:0000000000000000
+[ 3103.001486] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
+[ 3103.001986] CR2: 00007fd42e8dcb70 CR3: 000000011e68a003 CR4: 0000000000370ea0
+[ 3103.002596] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
+[ 3103.003190] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
+[ 3103.003787] Call Trace:
+[ 3103.004055] <TASK>
+[ 3103.004297] ? __warn+0x7d/0x130
+[ 3103.004623] ? __list_del_entry_valid_or_report+0x4f/0xc0
+[ 3103.005094] ? report_bug+0xf1/0x1c0
+[ 3103.005439] ? console_unlock+0x4a/0xd0
+[ 3103.005806] ? handle_bug+0x3f/0x70
+[ 3103.006149] ? exc_invalid_op+0x13/0x60
+[ 3103.006531] ? asm_exc_invalid_op+0x16/0x20
+[ 3103.007430] ? __list_del_entry_valid_or_report+0x4f/0xc0
+[ 3103.007910] mlx5e_tc_del_fdb_peers_flow+0xcf/0x240 [mlx5_core]
+[ 3103.008463] mlx5e_tc_del_flow+0x46/0x270 [mlx5_core]
+[ 3103.008944] mlx5e_flow_put+0x26/0x50 [mlx5_core]
+[ 3103.009401] mlx5e_delete_flower+0x25f/0x380 [mlx5_core]
+[ 3103.009901] tc_setup_cb_destroy+0xab/0x180
+[ 3103.010292] fl_hw_destroy_filter+0x99/0xc0 [cls_flower]
+[ 3103.010779] __fl_delete+0x2d4/0x2f0 [cls_flower]
+[ 3103.011207] fl_delete+0x36/0x80 [cls_flower]
+[ 3103.011614] tc_del_tfilter+0x56f/0x750
+[ 3103.011982] rtnetlink_rcv_msg+0xff/0x3a0
+[ 3103.012362] ? netlink_ack+0x1c7/0x4e0
+[ 3103.012719] ? rtnl_calcit.isra.44+0x130/0x130
+[ 3103.013134] netlink_rcv_skb+0x54/0x100
+[ 3103.013533] netlink_unicast+0x1ca/0x2b0
+[ 3103.013902] netlink_sendmsg+0x361/0x4d0
+[ 3103.014269] __sock_sendmsg+0x38/0x60
+[ 3103.014643] ____sys_sendmsg+0x1f2/0x200
+[ 3103.015018] ? copy_msghdr_from_user+0x72/0xa0
+[ 3103.015265] ___sys_sendmsg+0x87/0xd0
+[ 3103.016608] ? copy_msghdr_from_user+0x72/0xa0
+[ 3103.017014] ? ___sys_recvmsg+0x9b/0xd0
+[ 3103.017381] ? ttwu_do_activate.isra.137+0x58/0x180
+[ 3103.017821] ? wake_up_q+0x49/0x90
+[ 3103.018157] ? futex_wake+0x137/0x160
+[ 3103.018521] ? __sys_sendmsg+0x51/0x90
+[ 3103.018882] __sys_sendmsg+0x51/0x90
+[ 3103.019230] ? exit_to_user_mode_prepare+0x56/0x130
+[ 3103.019670] do_syscall_64+0x3c/0x80
+[ 3103.020017] entry_SYSCALL_64_after_hwframe+0x46/0xb0
+[ 3103.020469] RIP: 0033:0x7f4254811ef4
+[ 3103.020816] Code: 89 f3 48 83 ec 10 48 89 7c 24 08 48 89 14 24 e8 42 eb ff ff 48 8b 14 24 41 89 c0 48 89 de 48 8b 7c 24 08 b8 2e 00 00 00 0f 05 <48> 3d 00 f0 ff ff 77 30 44 89 c7 48 89 04 24 e8 78 eb ff ff 48 8b
+[ 3103.022290] RSP: 002b:00007f424cdd9480 EFLAGS: 00000293 ORIG_RAX: 000000000000002e
+[ 3103.022970] RAX: ffffffffffffffda RBX: 00007f424cdd9510 RCX: 00007f4254811ef4
+[ 3103.023564] RDX: 0000000000000000 RSI: 00007f424cdd9510 RDI: 0000000000000012
+[ 3103.024158] RBP: 00007f424cdda238 R08: 0000000000000000 R09: 00007f41d801a4b0
+[ 3103.024748] R10: 0000000000000000 R11: 0000000000000293 R12: 0000000000000001
+[ 3103.025341] R13: 00007f424cdd9510 R14: 00007f424cdda240 R15: 00007f424cdd99a0
+[ 3103.025931] </TASK>
+[ 3103.026182] ---[ end trace 0000000000000000 ]---
+[ 3103.027033] ------------[ cut here ]------------
+
+Fixes: 9be6c21fdcf8 ("net/mlx5e: Handle offloads flows per peer")
+Signed-off-by: Vlad Buslov <vladbu@nvidia.com>
+Reviewed-by: Mark Bloch <mbloch@nvidia.com>
+Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/net/ethernet/mellanox/mlx5/core/en_tc.c | 3 ++-
+ 1 file changed, 2 insertions(+), 1 deletion(-)
+
+diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
+index b61d82f08e65..404dd1d9b28b 100644
+--- a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
++++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
+@@ -2014,9 +2014,10 @@ static void mlx5e_tc_del_fdb_peer_flow(struct mlx5e_tc_flow *flow,
+ list_for_each_entry_safe(peer_flow, tmp, &flow->peer_flows, peer_flows) {
+ if (peer_index != mlx5_get_dev_index(peer_flow->priv->mdev))
+ continue;
++
++ list_del(&peer_flow->peer_flows);
+ if (refcount_dec_and_test(&peer_flow->refcnt)) {
+ mlx5e_tc_del_fdb_flow(peer_flow->priv, peer_flow);
+- list_del(&peer_flow->peer_flows);
+ kfree(peer_flow);
+ }
+ }
+--
+2.43.0
+
--- /dev/null
+From 87127fb62eda0f2490d2a3d442cd9fdbdbf2732a Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Sun, 26 Nov 2023 11:08:10 +0200
+Subject: net/mlx5e: Ignore IPsec replay window values on sender side
+
+From: Leon Romanovsky <leonro@nvidia.com>
+
+[ Upstream commit 315a597f9bcfe7fe9980985031413457bee95510 ]
+
+The XFRM stack doesn't prevent users from configuring a replay window
+on the TX side, and strongSwan sets replay_window to 1. This causes
+failures in the validation logic when trying to offload the SA.
+
+The replay window is not relevant on the TX side and should be ignored.
+
+Fixes: cded6d80129b ("net/mlx5e: Store replay window in XFRM attributes")
+Signed-off-by: Aya Levin <ayal@nvidia.com>
+Signed-off-by: Leon Romanovsky <leonro@nvidia.com>
+Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ .../net/ethernet/mellanox/mlx5/core/en_accel/ipsec.c | 10 ++++++++--
+ 1 file changed, 8 insertions(+), 2 deletions(-)
+
+diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec.c b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec.c
+index 161c5190c236..05612d9c6080 100644
+--- a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec.c
++++ b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec.c
+@@ -336,12 +336,17 @@ void mlx5e_ipsec_build_accel_xfrm_attrs(struct mlx5e_ipsec_sa_entry *sa_entry,
+ /* iv len */
+ aes_gcm->icv_len = x->aead->alg_icv_len;
+
++ attrs->dir = x->xso.dir;
++
+ /* esn */
+ if (x->props.flags & XFRM_STATE_ESN) {
+ attrs->replay_esn.trigger = true;
+ attrs->replay_esn.esn = sa_entry->esn_state.esn;
+ attrs->replay_esn.esn_msb = sa_entry->esn_state.esn_msb;
+ attrs->replay_esn.overlap = sa_entry->esn_state.overlap;
++ if (attrs->dir == XFRM_DEV_OFFLOAD_OUT)
++ goto skip_replay_window;
++
+ switch (x->replay_esn->replay_window) {
+ case 32:
+ attrs->replay_esn.replay_window =
+@@ -365,7 +370,7 @@ void mlx5e_ipsec_build_accel_xfrm_attrs(struct mlx5e_ipsec_sa_entry *sa_entry,
+ }
+ }
+
+- attrs->dir = x->xso.dir;
++skip_replay_window:
+ /* spi */
+ attrs->spi = be32_to_cpu(x->id.spi);
+
+@@ -501,7 +506,8 @@ static int mlx5e_xfrm_validate_state(struct mlx5_core_dev *mdev,
+ return -EINVAL;
+ }
+
+- if (x->replay_esn && x->replay_esn->replay_window != 32 &&
++ if (x->replay_esn && x->xso.dir == XFRM_DEV_OFFLOAD_IN &&
++ x->replay_esn->replay_window != 32 &&
+ x->replay_esn->replay_window != 64 &&
+ x->replay_esn->replay_window != 128 &&
+ x->replay_esn->replay_window != 256) {
+--
+2.43.0
+
--- /dev/null
+From ab880a9dc6e8c24858ed3edae56d7d323782c2f6 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Thu, 18 Jan 2024 19:59:14 -0800
+Subject: net: mvpp2: clear BM pool before initialization
+
+From: Jenishkumar Maheshbhai Patel <jpatel2@marvell.com>
+
+[ Upstream commit 9f538b415db862e74b8c5d3abbccfc1b2b6caa38 ]
+
+Register values persist after booting the kernel using
+kexec, which results in a kernel panic. Thus clear the
+BM pool registers before initialisation to fix the issue.
+
+Fixes: 3f518509dedc ("ethernet: Add new driver for Marvell Armada 375 network unit")
+Signed-off-by: Jenishkumar Maheshbhai Patel <jpatel2@marvell.com>
+Reviewed-by: Maxime Chevallier <maxime.chevallier@bootlin.com>
+Link: https://lore.kernel.org/r/20240119035914.2595665-1-jpatel2@marvell.com
+Signed-off-by: Jakub Kicinski <kuba@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ .../net/ethernet/marvell/mvpp2/mvpp2_main.c | 27 ++++++++++++++++++-
+ 1 file changed, 26 insertions(+), 1 deletion(-)
+
+diff --git a/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c b/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c
+index 93137606869e..065f07392c96 100644
+--- a/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c
++++ b/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c
+@@ -614,12 +614,38 @@ static void mvpp23_bm_set_8pool_mode(struct mvpp2 *priv)
+ mvpp2_write(priv, MVPP22_BM_POOL_BASE_ADDR_HIGH_REG, val);
+ }
+
++/* Cleanup pool before actual initialization in the OS */
++static void mvpp2_bm_pool_cleanup(struct mvpp2 *priv, int pool_id)
++{
++ unsigned int thread = mvpp2_cpu_to_thread(priv, get_cpu());
++ u32 val;
++ int i;
++
++ /* Drain the BM from all possible residues left by firmware */
++ for (i = 0; i < MVPP2_BM_POOL_SIZE_MAX; i++)
++ mvpp2_thread_read(priv, thread, MVPP2_BM_PHY_ALLOC_REG(pool_id));
++
++ put_cpu();
++
++ /* Stop the BM pool */
++ val = mvpp2_read(priv, MVPP2_BM_POOL_CTRL_REG(pool_id));
++ val |= MVPP2_BM_STOP_MASK;
++ mvpp2_write(priv, MVPP2_BM_POOL_CTRL_REG(pool_id), val);
++}
++
+ static int mvpp2_bm_init(struct device *dev, struct mvpp2 *priv)
+ {
+ enum dma_data_direction dma_dir = DMA_FROM_DEVICE;
+ int i, err, poolnum = MVPP2_BM_POOLS_NUM;
+ struct mvpp2_port *port;
+
++ if (priv->percpu_pools)
++ poolnum = mvpp2_get_nrxqs(priv) * 2;
++
++ /* Clean up the pool state in case it contains stale state */
++ for (i = 0; i < poolnum; i++)
++ mvpp2_bm_pool_cleanup(priv, i);
++
+ if (priv->percpu_pools) {
+ for (i = 0; i < priv->port_count; i++) {
+ port = priv->port_list[i];
+@@ -629,7 +655,6 @@ static int mvpp2_bm_init(struct device *dev, struct mvpp2 *priv)
+ }
+ }
+
+- poolnum = mvpp2_get_nrxqs(priv) * 2;
+ for (i = 0; i < poolnum; i++) {
+ /* the pool in use */
+ int pn = i / (poolnum / 2);
+--
+2.43.0
+
--- /dev/null
+From 3a2f87de96b7fc7b6543d41e9fefa4c13dfa7cfc Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Fri, 19 Jan 2024 17:48:39 -0800
+Subject: net/rds: Fix UBSAN: array-index-out-of-bounds in rds_cmsg_recv
+
+From: Sharath Srinivasan <sharath.srinivasan@oracle.com>
+
+[ Upstream commit 13e788deb7348cc88df34bed736c3b3b9927ea52 ]
+
+A syzkaller UBSAN crash occurs in rds_cmsg_recv(),
+which reads inc->i_rx_lat_trace[j + 1] with index 4 (3 + 1),
+but with an array size of 4 (RDS_RX_MAX_TRACES).
+Here 'j' is assigned from rs->rs_rx_trace[i] and, in turn, from
+trace.rx_trace_pos[i] in rds_recv_track_latency(),
+with both arrays sized 3 (RDS_MSG_RX_DGRAM_TRACE_MAX). So fix the
+off-by-one bounds check in rds_recv_track_latency() to prevent
+a potential crash in rds_cmsg_recv().
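+
+A standalone sketch of the bounds rule: each stored position is later
+used as 'pos + 1' into a 4-entry array, so a position equal to
+RDS_MSG_RX_DGRAM_TRACE_MAX (3) must already be rejected:
+
+ #include <stdio.h>
+
+ #define RDS_MSG_RX_DGRAM_TRACE_MAX 3
+
+ static int trace_pos_valid(unsigned int pos)
+ {
+         /* The old check used '>', which let pos == 3 through and made
+          * the later 'pos + 1' access index 4 of a u64[4] array. */
+         return pos < RDS_MSG_RX_DGRAM_TRACE_MAX;
+ }
+
+ int main(void)
+ {
+         for (unsigned int pos = 0; pos <= 4; pos++)
+                 printf("pos=%u valid=%d\n", pos, trace_pos_valid(pos));
+         return 0;
+ }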
+
+Found by syzkaller:
+=================================================================
+UBSAN: array-index-out-of-bounds in net/rds/recv.c:585:39
+index 4 is out of range for type 'u64 [4]'
+CPU: 1 PID: 8058 Comm: syz-executor228 Not tainted 6.6.0-gd2f51b3516da #1
+Hardware name: QEMU Standard PC (i440FX + PIIX, 1996),
+BIOS 1.15.0-1 04/01/2014
+Call Trace:
+ <TASK>
+ __dump_stack lib/dump_stack.c:88 [inline]
+ dump_stack_lvl+0x136/0x150 lib/dump_stack.c:106
+ ubsan_epilogue lib/ubsan.c:217 [inline]
+ __ubsan_handle_out_of_bounds+0xd5/0x130 lib/ubsan.c:348
+ rds_cmsg_recv+0x60d/0x700 net/rds/recv.c:585
+ rds_recvmsg+0x3fb/0x1610 net/rds/recv.c:716
+ sock_recvmsg_nosec net/socket.c:1044 [inline]
+ sock_recvmsg+0xe2/0x160 net/socket.c:1066
+ __sys_recvfrom+0x1b6/0x2f0 net/socket.c:2246
+ __do_sys_recvfrom net/socket.c:2264 [inline]
+ __se_sys_recvfrom net/socket.c:2260 [inline]
+ __x64_sys_recvfrom+0xe0/0x1b0 net/socket.c:2260
+ do_syscall_x64 arch/x86/entry/common.c:51 [inline]
+ do_syscall_64+0x40/0x110 arch/x86/entry/common.c:82
+ entry_SYSCALL_64_after_hwframe+0x63/0x6b
+==================================================================
+
+Fixes: 3289025aedc0 ("RDS: add receive message trace used by application")
+Reported-by: Chenyuan Yang <chenyuan0y@gmail.com>
+Closes: https://lore.kernel.org/linux-rdma/CALGdzuoVdq-wtQ4Az9iottBqC5cv9ZhcE5q8N7LfYFvkRsOVcw@mail.gmail.com/
+Signed-off-by: Sharath Srinivasan <sharath.srinivasan@oracle.com>
+Reviewed-by: Simon Horman <horms@kernel.org>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ net/rds/af_rds.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+diff --git a/net/rds/af_rds.c b/net/rds/af_rds.c
+index 01c4cdfef45d..8435a20968ef 100644
+--- a/net/rds/af_rds.c
++++ b/net/rds/af_rds.c
+@@ -419,7 +419,7 @@ static int rds_recv_track_latency(struct rds_sock *rs, sockptr_t optval,
+
+ rs->rs_rx_traces = trace.rx_traces;
+ for (i = 0; i < rs->rs_rx_traces; i++) {
+- if (trace.rx_trace_pos[i] > RDS_MSG_RX_DGRAM_TRACE_MAX) {
++ if (trace.rx_trace_pos[i] >= RDS_MSG_RX_DGRAM_TRACE_MAX) {
+ rs->rs_rx_traces = 0;
+ return -EFAULT;
+ }
+--
+2.43.0
+
--- /dev/null
+From 95f0d3dbe719a42cc4c5614e70cc0a6a71b2f833 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Mon, 22 Jan 2024 15:28:43 +0200
+Subject: net/sched: flower: Fix chain template offload
+
+From: Ido Schimmel <idosch@nvidia.com>
+
+[ Upstream commit 32f2a0afa95fae0d1ceec2ff06e0e816939964b8 ]
+
+When a qdisc is deleted from a net device the stack instructs the
+underlying driver to remove its flow offload callback from the
+associated filter block using the 'FLOW_BLOCK_UNBIND' command. The stack
+then continues to replay the removal of the filters in the block for
+this driver by iterating over the chains in the block and invoking the
+'reoffload' operation of the classifier being used. In turn, the
+classifier in its 'reoffload' operation prepares and emits a
+'FLOW_CLS_DESTROY' command for each filter.
+
+However, the stack does not do the same for chain templates and the
+underlying driver never receives a 'FLOW_CLS_TMPLT_DESTROY' command when
+a qdisc is deleted. This results in a memory leak [1] which can be
+reproduced using [2].
+
+Fix by introducing a 'tmplt_reoffload' operation and have the stack
+invoke it with the appropriate arguments as part of the replay.
+Implement the operation in the sole classifier that supports chain
+templates (flower) by emitting the 'FLOW_CLS_TMPLT_{CREATE,DESTROY}'
+command based on whether a flow offload callback is being bound to a
+filter block or being unbound from one.
+
+As far as I can tell, the issue happens since cited commit which
+reordered tcf_block_offload_unbind() before tcf_block_flush_all_chains()
+in __tcf_block_put(). The order cannot be reversed as the filter block
+is expected to be freed after flushing all the chains.
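+
+On the driver side, the replay reaches the same block callback that
+handles filters; a sketch of the handler shape (my_tmplt_create/destroy
+are hypothetical driver helpers):
+
+ static int my_flow_block_cb(enum tc_setup_type type, void *type_data,
+                             void *cb_priv)
+ {
+         struct flow_cls_offload *f = type_data;
+
+         if (type != TC_SETUP_CLSFLOWER)
+                 return -EOPNOTSUPP;
+
+         switch (f->command) {
+         case FLOW_CLS_TMPLT_CREATE:
+                 return my_tmplt_create(cb_priv, f);     /* hypothetical */
+         case FLOW_CLS_TMPLT_DESTROY:
+                 my_tmplt_destroy(cb_priv, f);           /* hypothetical */
+                 return 0;
+         default:
+                 return -EOPNOTSUPP;
+         }
+ }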
+
+[1]
+unreferenced object 0xffff888107e28800 (size 2048):
+ comm "tc", pid 1079, jiffies 4294958525 (age 3074.287s)
+ hex dump (first 32 bytes):
+ b1 a6 7c 11 81 88 ff ff e0 5b b3 10 81 88 ff ff ..|......[......
+ 01 00 00 00 00 00 00 00 e0 aa b0 84 ff ff ff ff ................
+ backtrace:
+ [<ffffffff81c06a68>] __kmem_cache_alloc_node+0x1e8/0x320
+ [<ffffffff81ab374e>] __kmalloc+0x4e/0x90
+ [<ffffffff832aec6d>] mlxsw_sp_acl_ruleset_get+0x34d/0x7a0
+ [<ffffffff832bc195>] mlxsw_sp_flower_tmplt_create+0x145/0x180
+ [<ffffffff832b2e1a>] mlxsw_sp_flow_block_cb+0x1ea/0x280
+ [<ffffffff83a10613>] tc_setup_cb_call+0x183/0x340
+ [<ffffffff83a9f85a>] fl_tmplt_create+0x3da/0x4c0
+ [<ffffffff83a22435>] tc_ctl_chain+0xa15/0x1170
+ [<ffffffff838a863c>] rtnetlink_rcv_msg+0x3cc/0xed0
+ [<ffffffff83ac87f0>] netlink_rcv_skb+0x170/0x440
+ [<ffffffff83ac6270>] netlink_unicast+0x540/0x820
+ [<ffffffff83ac6e28>] netlink_sendmsg+0x8d8/0xda0
+ [<ffffffff83793def>] ____sys_sendmsg+0x30f/0xa80
+ [<ffffffff8379d29a>] ___sys_sendmsg+0x13a/0x1e0
+ [<ffffffff8379d50c>] __sys_sendmsg+0x11c/0x1f0
+ [<ffffffff843b9ce0>] do_syscall_64+0x40/0xe0
+unreferenced object 0xffff88816d2c0400 (size 1024):
+ comm "tc", pid 1079, jiffies 4294958525 (age 3074.287s)
+ hex dump (first 32 bytes):
+ 40 00 00 00 00 00 00 00 57 f6 38 be 00 00 00 00 @.......W.8.....
+ 10 04 2c 6d 81 88 ff ff 10 04 2c 6d 81 88 ff ff ..,m......,m....
+ backtrace:
+ [<ffffffff81c06a68>] __kmem_cache_alloc_node+0x1e8/0x320
+ [<ffffffff81ab36c1>] __kmalloc_node+0x51/0x90
+ [<ffffffff81a8ed96>] kvmalloc_node+0xa6/0x1f0
+ [<ffffffff82827d03>] bucket_table_alloc.isra.0+0x83/0x460
+ [<ffffffff82828d2b>] rhashtable_init+0x43b/0x7c0
+ [<ffffffff832aed48>] mlxsw_sp_acl_ruleset_get+0x428/0x7a0
+ [<ffffffff832bc195>] mlxsw_sp_flower_tmplt_create+0x145/0x180
+ [<ffffffff832b2e1a>] mlxsw_sp_flow_block_cb+0x1ea/0x280
+ [<ffffffff83a10613>] tc_setup_cb_call+0x183/0x340
+ [<ffffffff83a9f85a>] fl_tmplt_create+0x3da/0x4c0
+ [<ffffffff83a22435>] tc_ctl_chain+0xa15/0x1170
+ [<ffffffff838a863c>] rtnetlink_rcv_msg+0x3cc/0xed0
+ [<ffffffff83ac87f0>] netlink_rcv_skb+0x170/0x440
+ [<ffffffff83ac6270>] netlink_unicast+0x540/0x820
+ [<ffffffff83ac6e28>] netlink_sendmsg+0x8d8/0xda0
+ [<ffffffff83793def>] ____sys_sendmsg+0x30f/0xa80
+
+[2]
+ # tc qdisc add dev swp1 clsact
+ # tc chain add dev swp1 ingress proto ip chain 1 flower dst_ip 0.0.0.0/32
+ # tc qdisc del dev swp1 clsact
+ # devlink dev reload pci/0000:06:00.0
+
+Fixes: bbf73830cd48 ("net: sched: traverse chains in block with tcf_get_next_chain()")
+Signed-off-by: Ido Schimmel <idosch@nvidia.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ include/net/sch_generic.h | 4 ++++
+ net/sched/cls_api.c | 9 ++++++++-
+ net/sched/cls_flower.c | 23 +++++++++++++++++++++++
+ 3 files changed, 35 insertions(+), 1 deletion(-)
+
+diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
+index dcb9160e6467..959a7725c27b 100644
+--- a/include/net/sch_generic.h
++++ b/include/net/sch_generic.h
+@@ -375,6 +375,10 @@ struct tcf_proto_ops {
+ struct nlattr **tca,
+ struct netlink_ext_ack *extack);
+ void (*tmplt_destroy)(void *tmplt_priv);
++ void (*tmplt_reoffload)(struct tcf_chain *chain,
++ bool add,
++ flow_setup_cb_t *cb,
++ void *cb_priv);
+ struct tcf_exts * (*get_exts)(const struct tcf_proto *tp,
+ u32 handle);
+
+diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c
+index 1976bd163986..02c594baa1d9 100644
+--- a/net/sched/cls_api.c
++++ b/net/sched/cls_api.c
+@@ -1536,6 +1536,9 @@ tcf_block_playback_offloads(struct tcf_block *block, flow_setup_cb_t *cb,
+ chain_prev = chain,
+ chain = __tcf_get_next_chain(block, chain),
+ tcf_chain_put(chain_prev)) {
++ if (chain->tmplt_ops && add)
++ chain->tmplt_ops->tmplt_reoffload(chain, true, cb,
++ cb_priv);
+ for (tp = __tcf_get_next_proto(chain, NULL); tp;
+ tp_prev = tp,
+ tp = __tcf_get_next_proto(chain, tp),
+@@ -1551,6 +1554,9 @@ tcf_block_playback_offloads(struct tcf_block *block, flow_setup_cb_t *cb,
+ goto err_playback_remove;
+ }
+ }
++ if (chain->tmplt_ops && !add)
++ chain->tmplt_ops->tmplt_reoffload(chain, false, cb,
++ cb_priv);
+ }
+
+ return 0;
+@@ -2971,7 +2977,8 @@ static int tc_chain_tmplt_add(struct tcf_chain *chain, struct net *net,
+ ops = tcf_proto_lookup_ops(name, true, extack);
+ if (IS_ERR(ops))
+ return PTR_ERR(ops);
+- if (!ops->tmplt_create || !ops->tmplt_destroy || !ops->tmplt_dump) {
++ if (!ops->tmplt_create || !ops->tmplt_destroy || !ops->tmplt_dump ||
++ !ops->tmplt_reoffload) {
+ NL_SET_ERR_MSG(extack, "Chain templates are not supported with specified classifier");
+ module_put(ops->owner);
+ return -EOPNOTSUPP;
+diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c
+index e5314a31f75a..efb9d2811b73 100644
+--- a/net/sched/cls_flower.c
++++ b/net/sched/cls_flower.c
+@@ -2721,6 +2721,28 @@ static void fl_tmplt_destroy(void *tmplt_priv)
+ kfree(tmplt);
+ }
+
++static void fl_tmplt_reoffload(struct tcf_chain *chain, bool add,
++ flow_setup_cb_t *cb, void *cb_priv)
++{
++ struct fl_flow_tmplt *tmplt = chain->tmplt_priv;
++ struct flow_cls_offload cls_flower = {};
++
++ cls_flower.rule = flow_rule_alloc(0);
++ if (!cls_flower.rule)
++ return;
++
++ cls_flower.common.chain_index = chain->index;
++ cls_flower.command = add ? FLOW_CLS_TMPLT_CREATE :
++ FLOW_CLS_TMPLT_DESTROY;
++ cls_flower.cookie = (unsigned long) tmplt;
++ cls_flower.rule->match.dissector = &tmplt->dissector;
++ cls_flower.rule->match.mask = &tmplt->mask;
++ cls_flower.rule->match.key = &tmplt->dummy_key;
++
++ cb(TC_SETUP_CLSFLOWER, &cls_flower, cb_priv);
++ kfree(cls_flower.rule);
++}
++
+ static int fl_dump_key_val(struct sk_buff *skb,
+ void *val, int val_type,
+ void *mask, int mask_type, int len)
+@@ -3628,6 +3650,7 @@ static struct tcf_proto_ops cls_fl_ops __read_mostly = {
+ .bind_class = fl_bind_class,
+ .tmplt_create = fl_tmplt_create,
+ .tmplt_destroy = fl_tmplt_destroy,
++ .tmplt_reoffload = fl_tmplt_reoffload,
+ .tmplt_dump = fl_tmplt_dump,
+ .get_exts = fl_get_exts,
+ .owner = THIS_MODULE,
+--
+2.43.0
+
--- /dev/null
+From 1777c87b90b1b1898a582ff004304306f589756b Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Thu, 18 Jan 2024 12:32:10 +0800
+Subject: net/smc: fix illegal rmb_desc access in SMC-D connection dump
+
+From: Wen Gu <guwen@linux.alibaba.com>
+
+[ Upstream commit dbc153fd3c142909e564bb256da087e13fbf239c ]
+
+A crash was found when dumping SMC-D connections. It can be reproduced
+by the following steps:
+
+- run nginx/wrk test:
+ smc_run nginx
+ smc_run wrk -t 16 -c 1000 -d <duration> -H 'Connection: Close' <URL>
+
+- continuously dump SMC-D connections in parallel:
+ watch -n 1 'smcss -D'
+
+ BUG: kernel NULL pointer dereference, address: 0000000000000030
+ CPU: 2 PID: 7204 Comm: smcss Kdump: loaded Tainted: G E 6.7.0+ #55
+ RIP: 0010:__smc_diag_dump.constprop.0+0x5e5/0x620 [smc_diag]
+ Call Trace:
+ <TASK>
+ ? __die+0x24/0x70
+ ? page_fault_oops+0x66/0x150
+ ? exc_page_fault+0x69/0x140
+ ? asm_exc_page_fault+0x26/0x30
+ ? __smc_diag_dump.constprop.0+0x5e5/0x620 [smc_diag]
+ ? __kmalloc_node_track_caller+0x35d/0x430
+ ? __alloc_skb+0x77/0x170
+ smc_diag_dump_proto+0xd0/0xf0 [smc_diag]
+ smc_diag_dump+0x26/0x60 [smc_diag]
+ netlink_dump+0x19f/0x320
+ __netlink_dump_start+0x1dc/0x300
+ smc_diag_handler_dump+0x6a/0x80 [smc_diag]
+ ? __pfx_smc_diag_dump+0x10/0x10 [smc_diag]
+ sock_diag_rcv_msg+0x121/0x140
+ ? __pfx_sock_diag_rcv_msg+0x10/0x10
+ netlink_rcv_skb+0x5a/0x110
+ sock_diag_rcv+0x28/0x40
+ netlink_unicast+0x22a/0x330
+ netlink_sendmsg+0x1f8/0x420
+ __sock_sendmsg+0xb0/0xc0
+ ____sys_sendmsg+0x24e/0x300
+ ? copy_msghdr_from_user+0x62/0x80
+ ___sys_sendmsg+0x7c/0xd0
+ ? __do_fault+0x34/0x160
+ ? do_read_fault+0x5f/0x100
+ ? do_fault+0xb0/0x110
+ ? __handle_mm_fault+0x2b0/0x6c0
+ __sys_sendmsg+0x4d/0x80
+ do_syscall_64+0x69/0x180
+ entry_SYSCALL_64_after_hwframe+0x6e/0x76
+
+It is possible that the connection is in the process of being established
+when we dump it. Assume that the connection has been registered in a
+link group by smc_conn_create() but the rmb_desc has not yet been
+initialized by smc_buf_create(); dumping then causes the illegal access
+to conn->rmb_desc. So fix it by checking rmb_desc before dumping.
+
+Fixes: 4b1b7d3b30a6 ("net/smc: add SMC-D diag support")
+Signed-off-by: Wen Gu <guwen@linux.alibaba.com>
+Reviewed-by: Dust Li <dust.li@linux.alibaba.com>
+Reviewed-by: Wenjia Zhang <wenjia@linux.ibm.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ net/smc/smc_diag.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+diff --git a/net/smc/smc_diag.c b/net/smc/smc_diag.c
+index 5cc376834c57..fb9e5cc1285e 100644
+--- a/net/smc/smc_diag.c
++++ b/net/smc/smc_diag.c
+@@ -163,7 +163,7 @@ static int __smc_diag_dump(struct sock *sk, struct sk_buff *skb,
+ }
+ if (smc_conn_lgr_valid(&smc->conn) && smc->conn.lgr->is_smcd &&
+ (req->diag_ext & (1 << (SMC_DIAG_DMBINFO - 1))) &&
+- !list_empty(&smc->conn.lgr->list)) {
++ !list_empty(&smc->conn.lgr->list) && smc->conn.rmb_desc) {
+ struct smc_connection *conn = &smc->conn;
+ struct smcd_diag_dmbinfo dinfo;
+ struct smcd_dev *smcd = conn->lgr->smcd;
+--
+2.43.0
+
--- /dev/null
+From 174002e959ef2d8df58a9f06047e3f6d941a7e96 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Mon, 22 Jan 2024 19:19:09 +0100
+Subject: net: stmmac: Wait a bit for the reset to take effect
+
+From: Bernd Edlinger <bernd.edlinger@hotmail.de>
+
+[ Upstream commit a5f5eee282a0aae80227697e1d9c811b1726d31d ]
+
+otherwise the synopsys_id value may be read out wrongly,
+because the GMAC_VERSION register might still be in the reset
+state for at least 1 us after the reset is de-asserted.
+
+Add a wait of 10 us before continuing, to be on the safe side.
+
+> From what have you got that delay value?
+
+Just trial and error: with very old Linux versions and old gcc versions
+the synopsys_id was read out correctly most of the time (but not always);
+with recent Linux versions and recent gcc versions it was read out
+wrongly most of the time, but again not always.
+I don't have access to the VHDL code in question, so I cannot
+tell why it takes so long to get the correct values. I also do not
+have more than a few hardware samples, so I cannot tell how long
+this timeout must be in the worst case.
+Experimentally I can tell that the register is read several times
+as zero immediately after the reset is de-asserted; adding several
+no-ops is not enough, adding a printk is enough, and udelay(1) seems to
+be enough, but I did not try that very often and do not have access to
+many hardware samples, so I cannot be 100% sure about the necessary delay.
+And since the udelay here is only executed once per device instance,
+it seems acceptable to delay the boot for 10 us.
+
+BTW: my hardware's synopsys id is 0x37.
+
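+If guessing a delay ever proves fragile, a bounded poll of the version
+register would be an alternative.  A hypothetical sketch (not the
+applied fix; the register macro and the 100 us budget are assumptions)
+using the kernel's readl_poll_timeout() helper:
+
+  #include <linux/iopoll.h>
+
+  /* Poll GMAC_VERSION every 1 us until it reads non-zero,
+   * giving up after 100 us.  Returns 0 or -ETIMEDOUT.
+   */
+  static int stmmac_wait_version_ready(void __iomem *ioaddr)
+  {
+          u32 version;
+
+          return readl_poll_timeout(ioaddr + GMAC_VERSION, version,
+                                    version != 0, 1, 100);
+  }
+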
+Fixes: c5e4ddbdfa11 ("net: stmmac: Add support for optional reset control")
+Signed-off-by: Bernd Edlinger <bernd.edlinger@hotmail.de>
+Reviewed-by: Jiri Pirko <jiri@nvidia.com>
+Reviewed-by: Serge Semin <fancer.lancer@gmail.com>
+Link: https://lore.kernel.org/r/AS8P193MB1285A810BD78C111E7F6AA34E4752@AS8P193MB1285.EURP193.PROD.OUTLOOK.COM
+Signed-off-by: Jakub Kicinski <kuba@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/net/ethernet/stmicro/stmmac/stmmac_main.c | 3 +++
+ 1 file changed, 3 insertions(+)
+
+diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
+index 49b81daf7411..d094c3c1e2ee 100644
+--- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
++++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
+@@ -7467,6 +7467,9 @@ int stmmac_dvr_probe(struct device *device,
+ dev_err(priv->device, "unable to bring out of ahb reset: %pe\n",
+ ERR_PTR(ret));
+
++ /* Wait a bit for the reset to take effect */
++ udelay(10);
++
+ /* Init MAC and get the capabilities */
+ ret = stmmac_hw_init(priv);
+ if (ret)
+--
+2.43.0
+
--- /dev/null
+From 541e41a88a00522cc6ce415e5481902002c27b4a Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Fri, 19 Jan 2024 13:34:32 +0100
+Subject: netfilter: nf_tables: restrict anonymous set and map names to 16
+ bytes
+
+From: Florian Westphal <fw@strlen.de>
+
+[ Upstream commit b462579b2b86a8f5230543cadd3a4836be27baf7 ]
+
+nftables has two types of sets/maps, one where userspace defines the
+name, and anonymous sets/maps, where userspace defines a template name.
+
+For the latter, kernel requires presence of exactly one "%d".
+nftables uses "__set%d" and "__map%d" for this. The kernel will
+expand the format specifier and replaces it with the smallest unused
+number.
+
+As-is, userspace could define a template name that allows moving
+the set name past the 256-byte upper limit (post-expansion).
+
+I don't see how this could be a problem, but I would prefer that
+userspace cannot do this, so add a limit of 16 bytes for the '%d'
+template name.
+
+16 bytes is the old total upper limit for set names that existed when
+nf_tables was merged initially.
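+
+A userspace sketch of the new bound (NFT_SET_MAX_ANONLEN here mirrors
+the kernel constant; the over-long template below, 17 bytes, is made
+up for illustration):
+
+  #include <stdio.h>
+  #include <string.h>
+
+  #define NFT_SET_MAX_ANONLEN 16
+
+  static int anon_name_ok(const char *name)
+  {
+          /* Same check as the patch: reject names of 16+ bytes. */
+          return strnlen(name, NFT_SET_MAX_ANONLEN) < NFT_SET_MAX_ANONLEN;
+  }
+
+  int main(void)
+  {
+          printf("%d\n", anon_name_ok("__set%d"));           /* 1: accepted */
+          printf("%d\n", anon_name_ok("__averylongname%d")); /* 0: rejected */
+          return 0;
+  }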
+
+Fixes: 387454901bd6 ("netfilter: nf_tables: Allow set names of up to 255 chars")
+Signed-off-by: Florian Westphal <fw@strlen.de>
+Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ net/netfilter/nf_tables_api.c | 4 ++++
+ 1 file changed, 4 insertions(+)
+
+diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
+index f032c29f1da6..5282e8377782 100644
+--- a/net/netfilter/nf_tables_api.c
++++ b/net/netfilter/nf_tables_api.c
+@@ -24,6 +24,7 @@
+ #include <net/sock.h>
+
+ #define NFT_MODULE_AUTOLOAD_LIMIT (MODULE_NAME_LEN - sizeof("nft-expr-255-"))
++#define NFT_SET_MAX_ANONLEN 16
+
+ unsigned int nf_tables_net_id __read_mostly;
+
+@@ -4411,6 +4412,9 @@ static int nf_tables_set_alloc_name(struct nft_ctx *ctx, struct nft_set *set,
+ if (p[1] != 'd' || strchr(p + 2, '%'))
+ return -EINVAL;
+
++ if (strnlen(name, NFT_SET_MAX_ANONLEN) >= NFT_SET_MAX_ANONLEN)
++ return -EINVAL;
++
+ inuse = (unsigned long *)get_zeroed_page(GFP_KERNEL);
+ if (inuse == NULL)
+ return -ENOMEM;
+--
+2.43.0
+
--- /dev/null
+From 3f0829fa1a89fd1aab6d15af0a0d0f7eb428dd1d Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Tue, 23 Jan 2024 16:38:25 +0100
+Subject: netfilter: nf_tables: validate NFPROTO_* family
+
+From: Pablo Neira Ayuso <pablo@netfilter.org>
+
+[ Upstream commit d0009effa8862c20a13af4cb7475d9771b905693 ]
+
+Several expressions explicitly refer to NF_INET_* hook definitions
+from expr->ops->validate; however, the family is not validated.
+
+Bail out with EOPNOTSUPP in case they are used from unsupported
+families.
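+
+The repeated guard could, hypothetically, be factored into a helper
+like the one below; the patch deliberately open-codes it in each
+->validate callback instead (and nft_compat additionally accepts
+NFPROTO_BRIDGE and NFPROTO_ARP):
+
+  #include <linux/netfilter.h>
+
+  /* Hypothetical helper, not part of the patch. */
+  static bool nft_family_is_supported(u8 family)
+  {
+          return family == NFPROTO_IPV4 ||
+                 family == NFPROTO_IPV6 ||
+                 family == NFPROTO_INET;
+  }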
+
+Fixes: 0ca743a55991 ("netfilter: nf_tables: add compatibility layer for x_tables")
+Fixes: a3c90f7a2323 ("netfilter: nf_tables: flow offload expression")
+Fixes: 2fa841938c64 ("netfilter: nf_tables: introduce routing expression")
+Fixes: 554ced0a6e29 ("netfilter: nf_tables: add support for native socket matching")
+Fixes: ad49d86e07a4 ("netfilter: nf_tables: Add synproxy support")
+Fixes: 4ed8eb6570a4 ("netfilter: nf_tables: Add native tproxy support")
+Fixes: 6c47260250fc ("netfilter: nf_tables: add xfrm expression")
+Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ net/netfilter/nft_compat.c | 12 ++++++++++++
+ net/netfilter/nft_flow_offload.c | 5 +++++
+ net/netfilter/nft_nat.c | 5 +++++
+ net/netfilter/nft_rt.c | 5 +++++
+ net/netfilter/nft_socket.c | 5 +++++
+ net/netfilter/nft_synproxy.c | 7 +++++--
+ net/netfilter/nft_tproxy.c | 5 +++++
+ net/netfilter/nft_xfrm.c | 5 +++++
+ 8 files changed, 47 insertions(+), 2 deletions(-)
+
+diff --git a/net/netfilter/nft_compat.c b/net/netfilter/nft_compat.c
+index 5284cd2ad532..f0eeda97bfcd 100644
+--- a/net/netfilter/nft_compat.c
++++ b/net/netfilter/nft_compat.c
+@@ -350,6 +350,12 @@ static int nft_target_validate(const struct nft_ctx *ctx,
+ unsigned int hook_mask = 0;
+ int ret;
+
++ if (ctx->family != NFPROTO_IPV4 &&
++ ctx->family != NFPROTO_IPV6 &&
++ ctx->family != NFPROTO_BRIDGE &&
++ ctx->family != NFPROTO_ARP)
++ return -EOPNOTSUPP;
++
+ if (nft_is_base_chain(ctx->chain)) {
+ const struct nft_base_chain *basechain =
+ nft_base_chain(ctx->chain);
+@@ -595,6 +601,12 @@ static int nft_match_validate(const struct nft_ctx *ctx,
+ unsigned int hook_mask = 0;
+ int ret;
+
++ if (ctx->family != NFPROTO_IPV4 &&
++ ctx->family != NFPROTO_IPV6 &&
++ ctx->family != NFPROTO_BRIDGE &&
++ ctx->family != NFPROTO_ARP)
++ return -EOPNOTSUPP;
++
+ if (nft_is_base_chain(ctx->chain)) {
+ const struct nft_base_chain *basechain =
+ nft_base_chain(ctx->chain);
+diff --git a/net/netfilter/nft_flow_offload.c b/net/netfilter/nft_flow_offload.c
+index ab3362c483b4..397351fa4d5f 100644
+--- a/net/netfilter/nft_flow_offload.c
++++ b/net/netfilter/nft_flow_offload.c
+@@ -384,6 +384,11 @@ static int nft_flow_offload_validate(const struct nft_ctx *ctx,
+ {
+ unsigned int hook_mask = (1 << NF_INET_FORWARD);
+
++ if (ctx->family != NFPROTO_IPV4 &&
++ ctx->family != NFPROTO_IPV6 &&
++ ctx->family != NFPROTO_INET)
++ return -EOPNOTSUPP;
++
+ return nft_chain_validate_hooks(ctx->chain, hook_mask);
+ }
+
+diff --git a/net/netfilter/nft_nat.c b/net/netfilter/nft_nat.c
+index 583885ce7232..808f5802c270 100644
+--- a/net/netfilter/nft_nat.c
++++ b/net/netfilter/nft_nat.c
+@@ -143,6 +143,11 @@ static int nft_nat_validate(const struct nft_ctx *ctx,
+ struct nft_nat *priv = nft_expr_priv(expr);
+ int err;
+
++ if (ctx->family != NFPROTO_IPV4 &&
++ ctx->family != NFPROTO_IPV6 &&
++ ctx->family != NFPROTO_INET)
++ return -EOPNOTSUPP;
++
+ err = nft_chain_validate_dependency(ctx->chain, NFT_CHAIN_T_NAT);
+ if (err < 0)
+ return err;
+diff --git a/net/netfilter/nft_rt.c b/net/netfilter/nft_rt.c
+index 35a2c28caa60..24d977138572 100644
+--- a/net/netfilter/nft_rt.c
++++ b/net/netfilter/nft_rt.c
+@@ -166,6 +166,11 @@ static int nft_rt_validate(const struct nft_ctx *ctx, const struct nft_expr *exp
+ const struct nft_rt *priv = nft_expr_priv(expr);
+ unsigned int hooks;
+
++ if (ctx->family != NFPROTO_IPV4 &&
++ ctx->family != NFPROTO_IPV6 &&
++ ctx->family != NFPROTO_INET)
++ return -EOPNOTSUPP;
++
+ switch (priv->key) {
+ case NFT_RT_NEXTHOP4:
+ case NFT_RT_NEXTHOP6:
+diff --git a/net/netfilter/nft_socket.c b/net/netfilter/nft_socket.c
+index 9ed85be79452..f30163e2ca62 100644
+--- a/net/netfilter/nft_socket.c
++++ b/net/netfilter/nft_socket.c
+@@ -242,6 +242,11 @@ static int nft_socket_validate(const struct nft_ctx *ctx,
+ const struct nft_expr *expr,
+ const struct nft_data **data)
+ {
++ if (ctx->family != NFPROTO_IPV4 &&
++ ctx->family != NFPROTO_IPV6 &&
++ ctx->family != NFPROTO_INET)
++ return -EOPNOTSUPP;
++
+ return nft_chain_validate_hooks(ctx->chain,
+ (1 << NF_INET_PRE_ROUTING) |
+ (1 << NF_INET_LOCAL_IN) |
+diff --git a/net/netfilter/nft_synproxy.c b/net/netfilter/nft_synproxy.c
+index 13da882669a4..1d737f89dfc1 100644
+--- a/net/netfilter/nft_synproxy.c
++++ b/net/netfilter/nft_synproxy.c
+@@ -186,7 +186,6 @@ static int nft_synproxy_do_init(const struct nft_ctx *ctx,
+ break;
+ #endif
+ case NFPROTO_INET:
+- case NFPROTO_BRIDGE:
+ err = nf_synproxy_ipv4_init(snet, ctx->net);
+ if (err)
+ goto nf_ct_failure;
+@@ -219,7 +218,6 @@ static void nft_synproxy_do_destroy(const struct nft_ctx *ctx)
+ break;
+ #endif
+ case NFPROTO_INET:
+- case NFPROTO_BRIDGE:
+ nf_synproxy_ipv4_fini(snet, ctx->net);
+ nf_synproxy_ipv6_fini(snet, ctx->net);
+ break;
+@@ -253,6 +251,11 @@ static int nft_synproxy_validate(const struct nft_ctx *ctx,
+ const struct nft_expr *expr,
+ const struct nft_data **data)
+ {
++ if (ctx->family != NFPROTO_IPV4 &&
++ ctx->family != NFPROTO_IPV6 &&
++ ctx->family != NFPROTO_INET)
++ return -EOPNOTSUPP;
++
+ return nft_chain_validate_hooks(ctx->chain, (1 << NF_INET_LOCAL_IN) |
+ (1 << NF_INET_FORWARD));
+ }
+diff --git a/net/netfilter/nft_tproxy.c b/net/netfilter/nft_tproxy.c
+index ae15cd693f0e..71412adb73d4 100644
+--- a/net/netfilter/nft_tproxy.c
++++ b/net/netfilter/nft_tproxy.c
+@@ -316,6 +316,11 @@ static int nft_tproxy_validate(const struct nft_ctx *ctx,
+ const struct nft_expr *expr,
+ const struct nft_data **data)
+ {
++ if (ctx->family != NFPROTO_IPV4 &&
++ ctx->family != NFPROTO_IPV6 &&
++ ctx->family != NFPROTO_INET)
++ return -EOPNOTSUPP;
++
+ return nft_chain_validate_hooks(ctx->chain, 1 << NF_INET_PRE_ROUTING);
+ }
+
+diff --git a/net/netfilter/nft_xfrm.c b/net/netfilter/nft_xfrm.c
+index 452f8587adda..1c866757db55 100644
+--- a/net/netfilter/nft_xfrm.c
++++ b/net/netfilter/nft_xfrm.c
+@@ -235,6 +235,11 @@ static int nft_xfrm_validate(const struct nft_ctx *ctx, const struct nft_expr *e
+ const struct nft_xfrm *priv = nft_expr_priv(expr);
+ unsigned int hooks;
+
++ if (ctx->family != NFPROTO_IPV4 &&
++ ctx->family != NFPROTO_IPV6 &&
++ ctx->family != NFPROTO_INET)
++ return -EOPNOTSUPP;
++
+ switch (priv->dir) {
+ case XFRM_POLICY_IN:
+ hooks = (1 << NF_INET_FORWARD) |
+--
+2.43.0
+
--- /dev/null
+From 03c58469dab167d91526495d6be164f559516dbe Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Fri, 19 Jan 2024 13:11:32 +0100
+Subject: netfilter: nft_limit: reject configurations that cause integer
+ overflow
+
+From: Florian Westphal <fw@strlen.de>
+
+[ Upstream commit c9d9eb9c53d37cdebbad56b91e40baf42d5a97aa ]
+
+Reject bogus configs where the internal token counter wraps around.
+This only occurs with very, very large requests, such as 17 Gbyte/s.
+
+It's better to reject this than to apply an incorrect rate limit.
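+
+Back-of-the-envelope: with unit = 1 second, nsecs = 10^9, and the
+byte-mode bucket size is nsecs * (rate + burst) / rate, so the
+intermediate product wraps once rate + burst exceeds 2^64 / 10^9,
+roughly 1.8e10 bytes/s, i.e. the ~17 Gbyte/s quoted above.  A small
+userspace demonstration of the wrap check (values are illustrative):
+
+  #include <stdint.h>
+  #include <stdio.h>
+
+  int main(void)
+  {
+          uint64_t nsecs = 1000000000ULL;  /* unit = 1 second */
+          uint64_t rate  = 18500000000ULL; /* ~1.85e10 bytes/s */
+          uint64_t prod;
+
+          /* 1e9 * 1.85e10 = 1.85e19 > 2^64 - 1 (~1.84e19): wraps. */
+          if (__builtin_mul_overflow(nsecs, rate, &prod))
+                  puts("overflow: reject this config");
+          return 0;
+  }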
+
+Fixes: d2168e849ebf ("netfilter: nft_limit: add per-byte limiting")
+Signed-off-by: Florian Westphal <fw@strlen.de>
+Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ net/netfilter/nft_limit.c | 23 ++++++++++++++++-------
+ 1 file changed, 16 insertions(+), 7 deletions(-)
+
+diff --git a/net/netfilter/nft_limit.c b/net/netfilter/nft_limit.c
+index 79039afde34e..cefa25e0dbb0 100644
+--- a/net/netfilter/nft_limit.c
++++ b/net/netfilter/nft_limit.c
+@@ -58,17 +58,19 @@ static inline bool nft_limit_eval(struct nft_limit_priv *priv, u64 cost)
+ static int nft_limit_init(struct nft_limit_priv *priv,
+ const struct nlattr * const tb[], bool pkts)
+ {
++ u64 unit, tokens, rate_with_burst;
+ bool invert = false;
+- u64 unit, tokens;
+
+ if (tb[NFTA_LIMIT_RATE] == NULL ||
+ tb[NFTA_LIMIT_UNIT] == NULL)
+ return -EINVAL;
+
+ priv->rate = be64_to_cpu(nla_get_be64(tb[NFTA_LIMIT_RATE]));
++ if (priv->rate == 0)
++ return -EINVAL;
++
+ unit = be64_to_cpu(nla_get_be64(tb[NFTA_LIMIT_UNIT]));
+- priv->nsecs = unit * NSEC_PER_SEC;
+- if (priv->rate == 0 || priv->nsecs < unit)
++ if (check_mul_overflow(unit, NSEC_PER_SEC, &priv->nsecs))
+ return -EOVERFLOW;
+
+ if (tb[NFTA_LIMIT_BURST])
+@@ -77,18 +79,25 @@ static int nft_limit_init(struct nft_limit_priv *priv,
+ if (pkts && priv->burst == 0)
+ priv->burst = NFT_LIMIT_PKT_BURST_DEFAULT;
+
+- if (priv->rate + priv->burst < priv->rate)
++ if (check_add_overflow(priv->rate, priv->burst, &rate_with_burst))
+ return -EOVERFLOW;
+
+ if (pkts) {
+- tokens = div64_u64(priv->nsecs, priv->rate) * priv->burst;
++ u64 tmp = div64_u64(priv->nsecs, priv->rate);
++
++ if (check_mul_overflow(tmp, priv->burst, &tokens))
++ return -EOVERFLOW;
+ } else {
++ u64 tmp;
++
+ /* The token bucket size limits the number of tokens can be
+ * accumulated. tokens_max specifies the bucket size.
+ * tokens_max = unit * (rate + burst) / rate.
+ */
+- tokens = div64_u64(priv->nsecs * (priv->rate + priv->burst),
+- priv->rate);
++ if (check_mul_overflow(priv->nsecs, rate_with_burst, &tmp))
++ return -EOVERFLOW;
++
++ tokens = div64_u64(tmp, priv->rate);
+ }
+
+ if (tb[NFTA_LIMIT_FLAGS]) {
+--
+2.43.0
+
--- /dev/null
+From 9b0508de9d77bca0679a689ebf30f1cb59ba3392 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Fri, 12 Jan 2024 09:59:41 +0300
+Subject: netfs, fscache: Prevent Oops in fscache_put_cache()
+
+From: Dan Carpenter <dan.carpenter@linaro.org>
+
+[ Upstream commit 3be0b3ed1d76c6703b9ee482b55f7e01c369cc68 ]
+
+This function dereferences "cache" and only then checks it with
+IS_ERR_OR_NULL(). Check first, then dereference.
+
+Fixes: 9549332df4ed ("fscache: Implement cache registration")
+Signed-off-by: Dan Carpenter <dan.carpenter@linaro.org>
+Signed-off-by: David Howells <dhowells@redhat.com>
+Link: https://lore.kernel.org/r/e84bc740-3502-4f16-982a-a40d5676615c@moroto.mountain/ # v2
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ fs/fscache/cache.c | 3 ++-
+ 1 file changed, 2 insertions(+), 1 deletion(-)
+
+diff --git a/fs/fscache/cache.c b/fs/fscache/cache.c
+index d645f8b302a2..9397ed39b0b4 100644
+--- a/fs/fscache/cache.c
++++ b/fs/fscache/cache.c
+@@ -179,13 +179,14 @@ EXPORT_SYMBOL(fscache_acquire_cache);
+ void fscache_put_cache(struct fscache_cache *cache,
+ enum fscache_cache_trace where)
+ {
+- unsigned int debug_id = cache->debug_id;
++ unsigned int debug_id;
+ bool zero;
+ int ref;
+
+ if (IS_ERR_OR_NULL(cache))
+ return;
+
++ debug_id = cache->debug_id;
+ zero = __refcount_dec_and_test(&cache->ref, &ref);
+ trace_fscache_cache(debug_id, ref - 1, where);
+
+--
+2.43.0
+
--- /dev/null
+From e0e707fa22ae61e59a539bf4dbd7beb2f21590eb Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Mon, 22 Jan 2024 09:18:07 +0800
+Subject: netlink: fix potential sleeping issue in mqueue_flush_file
+
+From: Zhengchao Shao <shaozhengchao@huawei.com>
+
+[ Upstream commit 234ec0b6034b16869d45128b8cd2dc6ffe596f04 ]
+
+I analyzed the potential sleeping issue in the following processes:
+Thread A Thread B
+... netlink_create //ref = 1
+do_mq_notify ...
+ sock = netlink_getsockbyfilp ... //ref = 2
+ info->notify_sock = sock; ...
+... netlink_sendmsg
+... skb = netlink_alloc_large_skb //skb->head is vmalloced
+... netlink_unicast
+... sk = netlink_getsockbyportid //ref = 3
+... netlink_sendskb
+... __netlink_sendskb
+... skb_queue_tail //put skb to sk_receive_queue
+... sock_put //ref = 2
+... ...
+... netlink_release
+... deferred_put_nlk_sk //ref = 1
+mqueue_flush_file
+ spin_lock
+ remove_notification
+ netlink_sendskb
+ sock_put //ref = 0
+ sk_free
+ ...
+ __sk_destruct
+ netlink_sock_destruct
+ skb_queue_purge //get skb from sk_receive_queue
+ ...
+ __skb_queue_purge_reason
+ kfree_skb_reason
+ __kfree_skb
+ ...
+ skb_release_all
+ skb_release_head_state
+ netlink_skb_destructor
+ vfree(skb->head) //sleeping while holding spinlock
+
+In netlink_sendmsg, the memory pointed to by skb->head may be allocated
+by vmalloc, and the skb may sit on sk_receive_queue without having been
+freed. When the mqueue then flushes, the sleeping bug occurs. Use
+vfree_atomic instead of vfree in netlink_skb_destructor to solve the issue.
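+
+The rule being applied, as a minimal hypothetical sketch (the actual
+change is the one-liner below): vfree() may sleep, while vfree_atomic()
+defers the real free to a workqueue and is safe under a spinlock.
+
+  #include <linux/spinlock.h>
+  #include <linux/vmalloc.h>
+
+  static void free_buf_under_lock(spinlock_t *lock, void *vbuf)
+  {
+          spin_lock(lock);
+          /* ... unlink vbuf from the structure the lock protects ... */
+          vfree_atomic(vbuf);     /* never sleeps, safe here */
+          spin_unlock(lock);
+  }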
+
+Fixes: c05cdb1b864f ("netlink: allow large data transfers from user-space")
+Signed-off-by: Zhengchao Shao <shaozhengchao@huawei.com>
+Link: https://lore.kernel.org/r/20240122011807.2110357-1-shaozhengchao@huawei.com
+Signed-off-by: Paolo Abeni <pabeni@redhat.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ net/netlink/af_netlink.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c
+index eb086b06d60d..d9107b545d36 100644
+--- a/net/netlink/af_netlink.c
++++ b/net/netlink/af_netlink.c
+@@ -374,7 +374,7 @@ static void netlink_skb_destructor(struct sk_buff *skb)
+ if (is_vmalloc_addr(skb->head)) {
+ if (!skb->cloned ||
+ !atomic_dec_return(&(skb_shinfo(skb)->dataref)))
+- vfree(skb->head);
++ vfree_atomic(skb->head);
+
+ skb->head = NULL;
+ }
+--
+2.43.0
+
--- /dev/null
+From 3ee2c71d0f6a323db1850cf4f2af474918609467 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Tue, 19 Dec 2023 00:19:15 +0100
+Subject: rcu: Defer RCU kthreads wakeup when CPU is dying
+
+From: Frederic Weisbecker <frederic@kernel.org>
+
+[ Upstream commit e787644caf7628ad3269c1fbd321c3255cf51710 ]
+
+When the CPU goes idle for the last time during the CPU down hotplug
+process, RCU reports a final quiescent state for the current CPU. If
+this quiescent state propagates up to the top, some tasks may then be
+woken up to complete the grace period: the main grace period kthread
+and/or the expedited main workqueue (or kworker).
+
+If those kthreads have a SCHED_FIFO policy, the wake up can indirectly
+arm the RT bandwidth timer on the local offline CPU. Since this happens
+after hrtimers have been migrated at CPUHP_AP_HRTIMERS_DYING stage, the
+timer gets ignored. Therefore if the RCU kthreads are waiting for RT
+bandwidth to be available, they may never be actually scheduled.
+
+This triggers TREE03 rcutorture hangs:
+
+ rcu: INFO: rcu_preempt self-detected stall on CPU
+ rcu: 4-...!: (1 GPs behind) idle=9874/1/0x4000000000000000 softirq=0/0 fqs=20 rcuc=21071 jiffies(starved)
+ rcu: (t=21035 jiffies g=938281 q=40787 ncpus=6)
+ rcu: rcu_preempt kthread starved for 20964 jiffies! g938281 f0x0 RCU_GP_WAIT_FQS(5) ->state=0x0 ->cpu=0
+ rcu: Unless rcu_preempt kthread gets sufficient CPU time, OOM is now expected behavior.
+ rcu: RCU grace-period kthread stack dump:
+ task:rcu_preempt state:R running task stack:14896 pid:14 tgid:14 ppid:2 flags:0x00004000
+ Call Trace:
+ <TASK>
+ __schedule+0x2eb/0xa80
+ schedule+0x1f/0x90
+ schedule_timeout+0x163/0x270
+ ? __pfx_process_timeout+0x10/0x10
+ rcu_gp_fqs_loop+0x37c/0x5b0
+ ? __pfx_rcu_gp_kthread+0x10/0x10
+ rcu_gp_kthread+0x17c/0x200
+ kthread+0xde/0x110
+ ? __pfx_kthread+0x10/0x10
+ ret_from_fork+0x2b/0x40
+ ? __pfx_kthread+0x10/0x10
+ ret_from_fork_asm+0x1b/0x30
+ </TASK>
+
+The situation can't be solved by just unpinning the timer. The hrtimer
+infrastructure and the nohz heuristics involved in finding the best
+remote target for an unpinned timer would then also need to handle
+enqueues from an offline CPU in the most horrendous way.
+
+So fix this on the RCU side instead and defer the wake up to an online
+CPU if it's too late for the local one.
+
+Reported-by: Paul E. McKenney <paulmck@kernel.org>
+Fixes: 5c0930ccaad5 ("hrtimers: Push pending hrtimers away from outgoing CPU earlier")
+Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
+Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
+Signed-off-by: Neeraj Upadhyay (AMD) <neeraj.iitr10@gmail.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ kernel/rcu/tree.c | 34 +++++++++++++++++++++++++++++++++-
+ kernel/rcu/tree_exp.h | 3 +--
+ 2 files changed, 34 insertions(+), 3 deletions(-)
+
+diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
+index 3ac3c846105f..157f3ca2a9b5 100644
+--- a/kernel/rcu/tree.c
++++ b/kernel/rcu/tree.c
+@@ -1013,6 +1013,38 @@ static bool rcu_future_gp_cleanup(struct rcu_node *rnp)
+ return needmore;
+ }
+
++static void swake_up_one_online_ipi(void *arg)
++{
++ struct swait_queue_head *wqh = arg;
++
++ swake_up_one(wqh);
++}
++
++static void swake_up_one_online(struct swait_queue_head *wqh)
++{
++ int cpu = get_cpu();
++
++ /*
++ * If called from rcutree_report_cpu_starting(), wake up
++ * is dangerous that late in the CPU-down hotplug process. The
++ * scheduler might queue an ignored hrtimer. Defer the wake up
++ * to an online CPU instead.
++ */
++ if (unlikely(cpu_is_offline(cpu))) {
++ int target;
++
++ target = cpumask_any_and(housekeeping_cpumask(HK_TYPE_RCU),
++ cpu_online_mask);
++
++ smp_call_function_single(target, swake_up_one_online_ipi,
++ wqh, 0);
++ put_cpu();
++ } else {
++ put_cpu();
++ swake_up_one(wqh);
++ }
++}
++
+ /*
+ * Awaken the grace-period kthread. Don't do a self-awaken (unless in an
+ * interrupt or softirq handler, in which case we just might immediately
+@@ -1037,7 +1069,7 @@ static void rcu_gp_kthread_wake(void)
+ return;
+ WRITE_ONCE(rcu_state.gp_wake_time, jiffies);
+ WRITE_ONCE(rcu_state.gp_wake_seq, READ_ONCE(rcu_state.gp_seq));
+- swake_up_one(&rcu_state.gp_wq);
++ swake_up_one_online(&rcu_state.gp_wq);
+ }
+
+ /*
+diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h
+index 6d7cea5d591f..2ac440bc7e10 100644
+--- a/kernel/rcu/tree_exp.h
++++ b/kernel/rcu/tree_exp.h
+@@ -173,7 +173,6 @@ static bool sync_rcu_exp_done_unlocked(struct rcu_node *rnp)
+ return ret;
+ }
+
+-
+ /*
+ * Report the exit from RCU read-side critical section for the last task
+ * that queued itself during or before the current expedited preemptible-RCU
+@@ -201,7 +200,7 @@ static void __rcu_report_exp_rnp(struct rcu_node *rnp,
+ raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
+ if (wake) {
+ smp_mb(); /* EGP done before wake_up(). */
+- swake_up_one(&rcu_state.expedited_wq);
++ swake_up_one_online(&rcu_state.expedited_wq);
+ }
+ break;
+ }
+--
+2.43.0
+
--- /dev/null
+From 5f7f956733145671f4d20cd9caf937583e2b50ad Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Thu, 19 Oct 2023 12:55:11 +0100
+Subject: rxrpc, afs: Allow afs to pin rxrpc_peer objects
+
+From: David Howells <dhowells@redhat.com>
+
+[ Upstream commit 72904d7b9bfbf2dd146254edea93958bc35bbbfe ]
+
+Change rxrpc's API such that:
+
+ (1) A new function, rxrpc_kernel_lookup_peer(), is provided to look up an
+ rxrpc_peer record for a remote address and a corresponding function,
+ rxrpc_kernel_put_peer(), is provided to dispose of it again.
+
+ (2) When setting up a call, the rxrpc_peer object used during a call is
+ now passed in rather than being set up by rxrpc_connect_call(). For
+ afs, this meant passing it to rxrpc_kernel_begin_call() rather than
+ the full address (the service ID then has to be passed in as a
+ separate parameter).
+
+ (3) A new function, rxrpc_kernel_remote_addr(), is added so that afs can
+ get a pointer to the transport address for display purposes, and
+ another, rxrpc_kernel_remote_srx(), to gain a pointer to the full
+ rxrpc address.
+
+ (4) The function to retrieve the RTT from a call, rxrpc_kernel_get_srtt(),
+ is then altered to take a peer. This now returns the RTT or -1 if
+ there are insufficient samples.
+
+ (5) Rename rxrpc_kernel_get_peer() to rxrpc_kernel_call_get_peer().
+
+ (6) Provide a new function, rxrpc_kernel_get_peer(), to get a ref on a
+ peer the caller already has.
+
+This allows the afs filesystem to pin the rxrpc_peer records that it is
+using, allowing faster lookups and pointer comparisons rather than
+comparing sockaddr_rxrpc contents.  It also makes it easier to get hold of
+the RTT (see the usage sketch after the list below).  The following
+changes are made to afs:
+
+ (1) The addr_list struct's addrs[] elements now hold a peer struct pointer
+ and a service ID rather than a sockaddr_rxrpc.
+
+ (2) When displaying the transport address, rxrpc_kernel_remote_addr() is
+ used.
+
+ (3) The port arg is removed from afs_alloc_addrlist() since it's always
+ overridden.
+
+ (4) afs_merge_fs_addr4() and afs_merge_fs_addr6() do peer lookup and may
+ now return an error that must be handled.
+
+ (5) afs_find_server() now takes a peer pointer to specify the address.
+
+ (6) afs_find_server(), afs_compare_fs_alists() and afs_merge_fs_addr[46]{}
+ now do peer pointer comparison rather than address comparison.
+
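+A usage sketch of the reworked API, inferred only from the signatures
+this patch adds to include/net/af_rxrpc.h (the call ID and timeout
+values are placeholders, and the ref-handling comment reflects a
+reading of the diff, not text from the patch):
+
+  #include <linux/err.h>
+  #include <net/af_rxrpc.h>
+
+  static struct rxrpc_call *example_call(struct socket *sock,
+                                         struct sockaddr_rxrpc *srx,
+                                         struct key *key, u16 service_id)
+  {
+          struct rxrpc_peer *peer;
+          struct rxrpc_call *call;
+
+          /* May return NULL (alloc failure) or an ERR_PTR. */
+          peer = rxrpc_kernel_lookup_peer(sock, srx, GFP_KERNEL);
+          if (IS_ERR_OR_NULL(peer))
+                  return ERR_PTR(-ENOMEM);
+
+          call = rxrpc_kernel_begin_call(sock, peer, key,
+                                         1,      /* user_call_ID */
+                                         -1,     /* tx_total_len */
+                                         0,      /* hard_timeout */
+                                         GFP_KERNEL,
+                                         NULL,   /* notify_rx */
+                                         service_id,
+                                         false,  /* upgrade */
+                                         RXRPC_PREINTERRUPTIBLE,
+                                         0);     /* debug_id */
+          /* The call takes its own ref on the peer; a caller that wants
+           * to pin the peer (as afs does in its address lists) would
+           * instead keep this ref and drop it later with
+           * rxrpc_kernel_put_peer(peer).
+           */
+          rxrpc_kernel_put_peer(peer);
+          return call;
+  }
+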
+Signed-off-by: David Howells <dhowells@redhat.com>
+cc: Marc Dionne <marc.dionne@auristor.com>
+cc: linux-afs@lists.infradead.org
+Stable-dep-of: 17ba6f0bd14f ("afs: Fix error handling with lookup via FS.InlineBulkStatus")
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ fs/afs/addr_list.c | 125 ++++++++++++++++++-----------------
+ fs/afs/cmservice.c | 5 +-
+ fs/afs/fs_probe.c | 11 +--
+ fs/afs/internal.h | 26 ++++----
+ fs/afs/proc.c | 9 +--
+ fs/afs/rotate.c | 6 +-
+ fs/afs/rxrpc.c | 10 +--
+ fs/afs/server.c | 41 ++----------
+ fs/afs/vl_alias.c | 55 +--------------
+ fs/afs/vl_list.c | 15 +++--
+ fs/afs/vl_probe.c | 12 ++--
+ fs/afs/vl_rotate.c | 6 +-
+ fs/afs/vlclient.c | 22 ++++--
+ include/net/af_rxrpc.h | 15 +++--
+ include/trace/events/rxrpc.h | 3 +
+ net/rxrpc/af_rxrpc.c | 62 ++++++++++++++---
+ net/rxrpc/ar-internal.h | 2 +-
+ net/rxrpc/call_object.c | 17 ++---
+ net/rxrpc/peer_object.c | 58 ++++++++++------
+ net/rxrpc/sendmsg.c | 11 ++-
+ 20 files changed, 273 insertions(+), 238 deletions(-)
+
+diff --git a/fs/afs/addr_list.c b/fs/afs/addr_list.c
+index ac05a59e9d46..519821f5aedc 100644
+--- a/fs/afs/addr_list.c
++++ b/fs/afs/addr_list.c
+@@ -13,26 +13,33 @@
+ #include "internal.h"
+ #include "afs_fs.h"
+
++static void afs_free_addrlist(struct rcu_head *rcu)
++{
++ struct afs_addr_list *alist = container_of(rcu, struct afs_addr_list, rcu);
++ unsigned int i;
++
++ for (i = 0; i < alist->nr_addrs; i++)
++ rxrpc_kernel_put_peer(alist->addrs[i].peer);
++}
++
+ /*
+ * Release an address list.
+ */
+ void afs_put_addrlist(struct afs_addr_list *alist)
+ {
+ if (alist && refcount_dec_and_test(&alist->usage))
+- kfree_rcu(alist, rcu);
++ call_rcu(&alist->rcu, afs_free_addrlist);
+ }
+
+ /*
+ * Allocate an address list.
+ */
+-struct afs_addr_list *afs_alloc_addrlist(unsigned int nr,
+- unsigned short service,
+- unsigned short port)
++struct afs_addr_list *afs_alloc_addrlist(unsigned int nr, u16 service_id)
+ {
+ struct afs_addr_list *alist;
+ unsigned int i;
+
+- _enter("%u,%u,%u", nr, service, port);
++ _enter("%u,%u", nr, service_id);
+
+ if (nr > AFS_MAX_ADDRESSES)
+ nr = AFS_MAX_ADDRESSES;
+@@ -44,16 +51,8 @@ struct afs_addr_list *afs_alloc_addrlist(unsigned int nr,
+ refcount_set(&alist->usage, 1);
+ alist->max_addrs = nr;
+
+- for (i = 0; i < nr; i++) {
+- struct sockaddr_rxrpc *srx = &alist->addrs[i].srx;
+- srx->srx_family = AF_RXRPC;
+- srx->srx_service = service;
+- srx->transport_type = SOCK_DGRAM;
+- srx->transport_len = sizeof(srx->transport.sin6);
+- srx->transport.sin6.sin6_family = AF_INET6;
+- srx->transport.sin6.sin6_port = htons(port);
+- }
+-
++ for (i = 0; i < nr; i++)
++ alist->addrs[i].service_id = service_id;
+ return alist;
+ }
+
+@@ -126,7 +125,7 @@ struct afs_vlserver_list *afs_parse_text_addrs(struct afs_net *net,
+ if (!vllist->servers[0].server)
+ goto error_vl;
+
+- alist = afs_alloc_addrlist(nr, service, AFS_VL_PORT);
++ alist = afs_alloc_addrlist(nr, service);
+ if (!alist)
+ goto error;
+
+@@ -197,9 +196,11 @@ struct afs_vlserver_list *afs_parse_text_addrs(struct afs_net *net,
+ }
+
+ if (family == AF_INET)
+- afs_merge_fs_addr4(alist, x[0], xport);
++ ret = afs_merge_fs_addr4(net, alist, x[0], xport);
+ else
+- afs_merge_fs_addr6(alist, x, xport);
++ ret = afs_merge_fs_addr6(net, alist, x, xport);
++ if (ret < 0)
++ goto error;
+
+ } while (p < end);
+
+@@ -271,25 +272,33 @@ struct afs_vlserver_list *afs_dns_query(struct afs_cell *cell, time64_t *_expiry
+ /*
+ * Merge an IPv4 entry into a fileserver address list.
+ */
+-void afs_merge_fs_addr4(struct afs_addr_list *alist, __be32 xdr, u16 port)
++int afs_merge_fs_addr4(struct afs_net *net, struct afs_addr_list *alist,
++ __be32 xdr, u16 port)
+ {
+- struct sockaddr_rxrpc *srx;
+- u32 addr = ntohl(xdr);
++ struct sockaddr_rxrpc srx;
++ struct rxrpc_peer *peer;
+ int i;
+
+ if (alist->nr_addrs >= alist->max_addrs)
+- return;
++ return 0;
+
+- for (i = 0; i < alist->nr_ipv4; i++) {
+- struct sockaddr_in *a = &alist->addrs[i].srx.transport.sin;
+- u32 a_addr = ntohl(a->sin_addr.s_addr);
+- u16 a_port = ntohs(a->sin_port);
++ srx.srx_family = AF_RXRPC;
++ srx.transport_type = SOCK_DGRAM;
++ srx.transport_len = sizeof(srx.transport.sin);
++ srx.transport.sin.sin_family = AF_INET;
++ srx.transport.sin.sin_port = htons(port);
++ srx.transport.sin.sin_addr.s_addr = xdr;
+
+- if (addr == a_addr && port == a_port)
+- return;
+- if (addr == a_addr && port < a_port)
+- break;
+- if (addr < a_addr)
++ peer = rxrpc_kernel_lookup_peer(net->socket, &srx, GFP_KERNEL);
++ if (!peer)
++ return -ENOMEM;
++
++ for (i = 0; i < alist->nr_ipv4; i++) {
++ if (peer == alist->addrs[i].peer) {
++ rxrpc_kernel_put_peer(peer);
++ return 0;
++ }
++ if (peer <= alist->addrs[i].peer)
+ break;
+ }
+
+@@ -298,38 +307,42 @@ void afs_merge_fs_addr4(struct afs_addr_list *alist, __be32 xdr, u16 port)
+ alist->addrs + i,
+ sizeof(alist->addrs[0]) * (alist->nr_addrs - i));
+
+- srx = &alist->addrs[i].srx;
+- srx->srx_family = AF_RXRPC;
+- srx->transport_type = SOCK_DGRAM;
+- srx->transport_len = sizeof(srx->transport.sin);
+- srx->transport.sin.sin_family = AF_INET;
+- srx->transport.sin.sin_port = htons(port);
+- srx->transport.sin.sin_addr.s_addr = xdr;
++ alist->addrs[i].peer = peer;
+ alist->nr_ipv4++;
+ alist->nr_addrs++;
++ return 0;
+ }
+
+ /*
+ * Merge an IPv6 entry into a fileserver address list.
+ */
+-void afs_merge_fs_addr6(struct afs_addr_list *alist, __be32 *xdr, u16 port)
++int afs_merge_fs_addr6(struct afs_net *net, struct afs_addr_list *alist,
++ __be32 *xdr, u16 port)
+ {
+- struct sockaddr_rxrpc *srx;
+- int i, diff;
++ struct sockaddr_rxrpc srx;
++ struct rxrpc_peer *peer;
++ int i;
+
+ if (alist->nr_addrs >= alist->max_addrs)
+- return;
++ return 0;
+
+- for (i = alist->nr_ipv4; i < alist->nr_addrs; i++) {
+- struct sockaddr_in6 *a = &alist->addrs[i].srx.transport.sin6;
+- u16 a_port = ntohs(a->sin6_port);
++ srx.srx_family = AF_RXRPC;
++ srx.transport_type = SOCK_DGRAM;
++ srx.transport_len = sizeof(srx.transport.sin6);
++ srx.transport.sin6.sin6_family = AF_INET6;
++ srx.transport.sin6.sin6_port = htons(port);
++ memcpy(&srx.transport.sin6.sin6_addr, xdr, 16);
+
+- diff = memcmp(xdr, &a->sin6_addr, 16);
+- if (diff == 0 && port == a_port)
+- return;
+- if (diff == 0 && port < a_port)
+- break;
+- if (diff < 0)
++ peer = rxrpc_kernel_lookup_peer(net->socket, &srx, GFP_KERNEL);
++ if (!peer)
++ return -ENOMEM;
++
++ for (i = alist->nr_ipv4; i < alist->nr_addrs; i++) {
++ if (peer == alist->addrs[i].peer) {
++ rxrpc_kernel_put_peer(peer);
++ return 0;
++ }
++ if (peer <= alist->addrs[i].peer)
+ break;
+ }
+
+@@ -337,15 +350,9 @@ void afs_merge_fs_addr6(struct afs_addr_list *alist, __be32 *xdr, u16 port)
+ memmove(alist->addrs + i + 1,
+ alist->addrs + i,
+ sizeof(alist->addrs[0]) * (alist->nr_addrs - i));
+-
+- srx = &alist->addrs[i].srx;
+- srx->srx_family = AF_RXRPC;
+- srx->transport_type = SOCK_DGRAM;
+- srx->transport_len = sizeof(srx->transport.sin6);
+- srx->transport.sin6.sin6_family = AF_INET6;
+- srx->transport.sin6.sin6_port = htons(port);
+- memcpy(&srx->transport.sin6.sin6_addr, xdr, 16);
++ alist->addrs[i].peer = peer;
+ alist->nr_addrs++;
++ return 0;
+ }
+
+ /*
+diff --git a/fs/afs/cmservice.c b/fs/afs/cmservice.c
+index d4ddb20d6732..99a3f20bc786 100644
+--- a/fs/afs/cmservice.c
++++ b/fs/afs/cmservice.c
+@@ -146,10 +146,11 @@ static int afs_find_cm_server_by_peer(struct afs_call *call)
+ {
+ struct sockaddr_rxrpc srx;
+ struct afs_server *server;
++ struct rxrpc_peer *peer;
+
+- rxrpc_kernel_get_peer(call->net->socket, call->rxcall, &srx);
++ peer = rxrpc_kernel_get_call_peer(call->net->socket, call->rxcall);
+
+- server = afs_find_server(call->net, &srx);
++ server = afs_find_server(call->net, peer);
+ if (!server) {
+ trace_afs_cm_no_server(call, &srx);
+ return 0;
+diff --git a/fs/afs/fs_probe.c b/fs/afs/fs_probe.c
+index 3dd24842f277..58d28b82571e 100644
+--- a/fs/afs/fs_probe.c
++++ b/fs/afs/fs_probe.c
+@@ -101,6 +101,7 @@ static void afs_fs_probe_not_done(struct afs_net *net,
+ void afs_fileserver_probe_result(struct afs_call *call)
+ {
+ struct afs_addr_list *alist = call->alist;
++ struct afs_address *addr = &alist->addrs[call->addr_ix];
+ struct afs_server *server = call->server;
+ unsigned int index = call->addr_ix;
+ unsigned int rtt_us = 0, cap0;
+@@ -153,12 +154,12 @@ void afs_fileserver_probe_result(struct afs_call *call)
+ if (call->service_id == YFS_FS_SERVICE) {
+ server->probe.is_yfs = true;
+ set_bit(AFS_SERVER_FL_IS_YFS, &server->flags);
+- alist->addrs[index].srx.srx_service = call->service_id;
++ addr->service_id = call->service_id;
+ } else {
+ server->probe.not_yfs = true;
+ if (!server->probe.is_yfs) {
+ clear_bit(AFS_SERVER_FL_IS_YFS, &server->flags);
+- alist->addrs[index].srx.srx_service = call->service_id;
++ addr->service_id = call->service_id;
+ }
+ cap0 = ntohl(call->tmp);
+ if (cap0 & AFS3_VICED_CAPABILITY_64BITFILES)
+@@ -167,7 +168,7 @@ void afs_fileserver_probe_result(struct afs_call *call)
+ clear_bit(AFS_SERVER_FL_HAS_FS64, &server->flags);
+ }
+
+- rxrpc_kernel_get_srtt(call->net->socket, call->rxcall, &rtt_us);
++ rtt_us = rxrpc_kernel_get_srtt(addr->peer);
+ if (rtt_us < server->probe.rtt) {
+ server->probe.rtt = rtt_us;
+ server->rtt = rtt_us;
+@@ -181,8 +182,8 @@ void afs_fileserver_probe_result(struct afs_call *call)
+ out:
+ spin_unlock(&server->probe_lock);
+
+- _debug("probe %pU [%u] %pISpc rtt=%u ret=%d",
+- &server->uuid, index, &alist->addrs[index].srx.transport,
++ _debug("probe %pU [%u] %pISpc rtt=%d ret=%d",
++ &server->uuid, index, rxrpc_kernel_remote_addr(alist->addrs[index].peer),
+ rtt_us, ret);
+
+ return afs_done_one_fs_probe(call->net, server);
+diff --git a/fs/afs/internal.h b/fs/afs/internal.h
+index e2adb314ab6a..ec08b4a7e499 100644
+--- a/fs/afs/internal.h
++++ b/fs/afs/internal.h
+@@ -72,6 +72,11 @@ enum afs_call_state {
+ AFS_CALL_COMPLETE, /* Completed or failed */
+ };
+
++struct afs_address {
++ struct rxrpc_peer *peer;
++ u16 service_id;
++};
++
+ /*
+ * List of server addresses.
+ */
+@@ -87,9 +92,7 @@ struct afs_addr_list {
+ enum dns_lookup_status status:8;
+ unsigned long failed; /* Mask of addrs that failed locally/ICMP */
+ unsigned long responded; /* Mask of addrs that responded */
+- struct {
+- struct sockaddr_rxrpc srx;
+- } addrs[] __counted_by(max_addrs);
++ struct afs_address addrs[] __counted_by(max_addrs);
+ #define AFS_MAX_ADDRESSES ((unsigned int)(sizeof(unsigned long) * 8))
+ };
+
+@@ -420,7 +423,7 @@ struct afs_vlserver {
+ atomic_t probe_outstanding;
+ spinlock_t probe_lock;
+ struct {
+- unsigned int rtt; /* RTT in uS */
++ unsigned int rtt; /* Best RTT in uS (or UINT_MAX) */
+ u32 abort_code;
+ short error;
+ unsigned short flags;
+@@ -537,7 +540,7 @@ struct afs_server {
+ atomic_t probe_outstanding;
+ spinlock_t probe_lock;
+ struct {
+- unsigned int rtt; /* RTT in uS */
++ unsigned int rtt; /* Best RTT in uS (or UINT_MAX) */
+ u32 abort_code;
+ short error;
+ bool responded:1;
+@@ -964,9 +967,7 @@ static inline struct afs_addr_list *afs_get_addrlist(struct afs_addr_list *alist
+ refcount_inc(&alist->usage);
+ return alist;
+ }
+-extern struct afs_addr_list *afs_alloc_addrlist(unsigned int,
+- unsigned short,
+- unsigned short);
++extern struct afs_addr_list *afs_alloc_addrlist(unsigned int nr, u16 service_id);
+ extern void afs_put_addrlist(struct afs_addr_list *);
+ extern struct afs_vlserver_list *afs_parse_text_addrs(struct afs_net *,
+ const char *, size_t, char,
+@@ -977,8 +978,10 @@ extern struct afs_vlserver_list *afs_dns_query(struct afs_cell *, time64_t *);
+ extern bool afs_iterate_addresses(struct afs_addr_cursor *);
+ extern int afs_end_cursor(struct afs_addr_cursor *);
+
+-extern void afs_merge_fs_addr4(struct afs_addr_list *, __be32, u16);
+-extern void afs_merge_fs_addr6(struct afs_addr_list *, __be32 *, u16);
++extern int afs_merge_fs_addr4(struct afs_net *net, struct afs_addr_list *addr,
++ __be32 xdr, u16 port);
++extern int afs_merge_fs_addr6(struct afs_net *net, struct afs_addr_list *addr,
++ __be32 *xdr, u16 port);
+
+ /*
+ * callback.c
+@@ -1405,8 +1408,7 @@ extern void __exit afs_clean_up_permit_cache(void);
+ */
+ extern spinlock_t afs_server_peer_lock;
+
+-extern struct afs_server *afs_find_server(struct afs_net *,
+- const struct sockaddr_rxrpc *);
++extern struct afs_server *afs_find_server(struct afs_net *, const struct rxrpc_peer *);
+ extern struct afs_server *afs_find_server_by_uuid(struct afs_net *, const uuid_t *);
+ extern struct afs_server *afs_lookup_server(struct afs_cell *, struct key *, const uuid_t *, u32);
+ extern struct afs_server *afs_get_server(struct afs_server *, enum afs_server_trace);
+diff --git a/fs/afs/proc.c b/fs/afs/proc.c
+index ab9cd986cfd9..8a65a06908d2 100644
+--- a/fs/afs/proc.c
++++ b/fs/afs/proc.c
+@@ -307,7 +307,7 @@ static int afs_proc_cell_vlservers_show(struct seq_file *m, void *v)
+ for (i = 0; i < alist->nr_addrs; i++)
+ seq_printf(m, " %c %pISpc\n",
+ alist->preferred == i ? '>' : '-',
+- &alist->addrs[i].srx.transport);
++ rxrpc_kernel_remote_addr(alist->addrs[i].peer));
+ }
+ seq_printf(m, " info: fl=%lx rtt=%d\n", vlserver->flags, vlserver->rtt);
+ seq_printf(m, " probe: fl=%x e=%d ac=%d out=%d\n",
+@@ -398,9 +398,10 @@ static int afs_proc_servers_show(struct seq_file *m, void *v)
+ seq_printf(m, " - ALIST v=%u rsp=%lx f=%lx\n",
+ alist->version, alist->responded, alist->failed);
+ for (i = 0; i < alist->nr_addrs; i++)
+- seq_printf(m, " [%x] %pISpc%s\n",
+- i, &alist->addrs[i].srx.transport,
+- alist->preferred == i ? "*" : "");
++ seq_printf(m, " [%x] %pISpc%s rtt=%d\n",
++ i, rxrpc_kernel_remote_addr(alist->addrs[i].peer),
++ alist->preferred == i ? "*" : "",
++ rxrpc_kernel_get_srtt(alist->addrs[i].peer));
+ return 0;
+ }
+
+diff --git a/fs/afs/rotate.c b/fs/afs/rotate.c
+index 46081e5da6f5..59aed7a6dd11 100644
+--- a/fs/afs/rotate.c
++++ b/fs/afs/rotate.c
+@@ -113,7 +113,7 @@ bool afs_select_fileserver(struct afs_operation *op)
+ struct afs_server *server;
+ struct afs_vnode *vnode = op->file[0].vnode;
+ struct afs_error e;
+- u32 rtt;
++ unsigned int rtt;
+ int error = op->ac.error, i;
+
+ _enter("%lx[%d],%lx[%d],%d,%d",
+@@ -420,7 +420,7 @@ bool afs_select_fileserver(struct afs_operation *op)
+ }
+
+ op->index = -1;
+- rtt = U32_MAX;
++ rtt = UINT_MAX;
+ for (i = 0; i < op->server_list->nr_servers; i++) {
+ struct afs_server *s = op->server_list->servers[i].server;
+
+@@ -488,7 +488,7 @@ bool afs_select_fileserver(struct afs_operation *op)
+
+ _debug("address [%u] %u/%u %pISp",
+ op->index, op->ac.index, op->ac.alist->nr_addrs,
+- &op->ac.alist->addrs[op->ac.index].srx.transport);
++ rxrpc_kernel_remote_addr(op->ac.alist->addrs[op->ac.index].peer));
+
+ _leave(" = t");
+ return true;
+diff --git a/fs/afs/rxrpc.c b/fs/afs/rxrpc.c
+index 181317126e43..2603db03b7ff 100644
+--- a/fs/afs/rxrpc.c
++++ b/fs/afs/rxrpc.c
+@@ -296,7 +296,8 @@ static void afs_notify_end_request_tx(struct sock *sock,
+ */
+ void afs_make_call(struct afs_addr_cursor *ac, struct afs_call *call, gfp_t gfp)
+ {
+- struct sockaddr_rxrpc *srx = &ac->alist->addrs[ac->index].srx;
++ struct afs_address *addr = &ac->alist->addrs[ac->index];
++ struct rxrpc_peer *peer = addr->peer;
+ struct rxrpc_call *rxcall;
+ struct msghdr msg;
+ struct kvec iov[1];
+@@ -304,7 +305,7 @@ void afs_make_call(struct afs_addr_cursor *ac, struct afs_call *call, gfp_t gfp)
+ s64 tx_total_len;
+ int ret;
+
+- _enter(",{%pISp},", &srx->transport);
++ _enter(",{%pISp},", rxrpc_kernel_remote_addr(addr->peer));
+
+ ASSERT(call->type != NULL);
+ ASSERT(call->type->name != NULL);
+@@ -333,7 +334,7 @@ void afs_make_call(struct afs_addr_cursor *ac, struct afs_call *call, gfp_t gfp)
+ }
+
+ /* create a call */
+- rxcall = rxrpc_kernel_begin_call(call->net->socket, srx, call->key,
++ rxcall = rxrpc_kernel_begin_call(call->net->socket, peer, call->key,
+ (unsigned long)call,
+ tx_total_len,
+ call->max_lifespan,
+@@ -341,6 +342,7 @@ void afs_make_call(struct afs_addr_cursor *ac, struct afs_call *call, gfp_t gfp)
+ (call->async ?
+ afs_wake_up_async_call :
+ afs_wake_up_call_waiter),
++ addr->service_id,
+ call->upgrade,
+ (call->intr ? RXRPC_PREINTERRUPTIBLE :
+ RXRPC_UNINTERRUPTIBLE),
+@@ -461,7 +463,7 @@ static void afs_log_error(struct afs_call *call, s32 remote_abort)
+ max = m + 1;
+ pr_notice("kAFS: Peer reported %s failure on %s [%pISp]\n",
+ msg, call->type->name,
+- &call->alist->addrs[call->addr_ix].srx.transport);
++ rxrpc_kernel_remote_addr(call->alist->addrs[call->addr_ix].peer));
+ }
+ }
+
+diff --git a/fs/afs/server.c b/fs/afs/server.c
+index b8e2d211d4a1..5b5fa94005c9 100644
+--- a/fs/afs/server.c
++++ b/fs/afs/server.c
+@@ -21,13 +21,12 @@ static void __afs_put_server(struct afs_net *, struct afs_server *);
+ /*
+ * Find a server by one of its addresses.
+ */
+-struct afs_server *afs_find_server(struct afs_net *net,
+- const struct sockaddr_rxrpc *srx)
++struct afs_server *afs_find_server(struct afs_net *net, const struct rxrpc_peer *peer)
+ {
+ const struct afs_addr_list *alist;
+ struct afs_server *server = NULL;
+ unsigned int i;
+- int seq = 1, diff;
++ int seq = 1;
+
+ rcu_read_lock();
+
+@@ -38,37 +37,11 @@ struct afs_server *afs_find_server(struct afs_net *net,
+ seq++; /* 2 on the 1st/lockless path, otherwise odd */
+ read_seqbegin_or_lock(&net->fs_addr_lock, &seq);
+
+- if (srx->transport.family == AF_INET6) {
+- const struct sockaddr_in6 *a = &srx->transport.sin6, *b;
+- hlist_for_each_entry_rcu(server, &net->fs_addresses6, addr6_link) {
+- alist = rcu_dereference(server->addresses);
+- for (i = alist->nr_ipv4; i < alist->nr_addrs; i++) {
+- b = &alist->addrs[i].srx.transport.sin6;
+- diff = ((u16 __force)a->sin6_port -
+- (u16 __force)b->sin6_port);
+- if (diff == 0)
+- diff = memcmp(&a->sin6_addr,
+- &b->sin6_addr,
+- sizeof(struct in6_addr));
+- if (diff == 0)
+- goto found;
+- }
+- }
+- } else {
+- const struct sockaddr_in *a = &srx->transport.sin, *b;
+- hlist_for_each_entry_rcu(server, &net->fs_addresses4, addr4_link) {
+- alist = rcu_dereference(server->addresses);
+- for (i = 0; i < alist->nr_ipv4; i++) {
+- b = &alist->addrs[i].srx.transport.sin;
+- diff = ((u16 __force)a->sin_port -
+- (u16 __force)b->sin_port);
+- if (diff == 0)
+- diff = ((u32 __force)a->sin_addr.s_addr -
+- (u32 __force)b->sin_addr.s_addr);
+- if (diff == 0)
+- goto found;
+- }
+- }
++ hlist_for_each_entry_rcu(server, &net->fs_addresses6, addr6_link) {
++ alist = rcu_dereference(server->addresses);
++ for (i = 0; i < alist->nr_addrs; i++)
++ if (alist->addrs[i].peer == peer)
++ goto found;
+ }
+
+ server = NULL;
+diff --git a/fs/afs/vl_alias.c b/fs/afs/vl_alias.c
+index d3c0df70a1a5..6fdf9f1bedc0 100644
+--- a/fs/afs/vl_alias.c
++++ b/fs/afs/vl_alias.c
+@@ -32,55 +32,6 @@ static struct afs_volume *afs_sample_volume(struct afs_cell *cell, struct key *k
+ return volume;
+ }
+
+-/*
+- * Compare two addresses.
+- */
+-static int afs_compare_addrs(const struct sockaddr_rxrpc *srx_a,
+- const struct sockaddr_rxrpc *srx_b)
+-{
+- short port_a, port_b;
+- int addr_a, addr_b, diff;
+-
+- diff = (short)srx_a->transport_type - (short)srx_b->transport_type;
+- if (diff)
+- goto out;
+-
+- switch (srx_a->transport_type) {
+- case AF_INET: {
+- const struct sockaddr_in *a = &srx_a->transport.sin;
+- const struct sockaddr_in *b = &srx_b->transport.sin;
+- addr_a = ntohl(a->sin_addr.s_addr);
+- addr_b = ntohl(b->sin_addr.s_addr);
+- diff = addr_a - addr_b;
+- if (diff == 0) {
+- port_a = ntohs(a->sin_port);
+- port_b = ntohs(b->sin_port);
+- diff = port_a - port_b;
+- }
+- break;
+- }
+-
+- case AF_INET6: {
+- const struct sockaddr_in6 *a = &srx_a->transport.sin6;
+- const struct sockaddr_in6 *b = &srx_b->transport.sin6;
+- diff = memcmp(&a->sin6_addr, &b->sin6_addr, 16);
+- if (diff == 0) {
+- port_a = ntohs(a->sin6_port);
+- port_b = ntohs(b->sin6_port);
+- diff = port_a - port_b;
+- }
+- break;
+- }
+-
+- default:
+- WARN_ON(1);
+- diff = 1;
+- }
+-
+-out:
+- return diff;
+-}
+-
+ /*
+ * Compare the address lists of a pair of fileservers.
+ */
+@@ -94,9 +45,9 @@ static int afs_compare_fs_alists(const struct afs_server *server_a,
+ lb = rcu_dereference(server_b->addresses);
+
+ while (a < la->nr_addrs && b < lb->nr_addrs) {
+- const struct sockaddr_rxrpc *srx_a = &la->addrs[a].srx;
+- const struct sockaddr_rxrpc *srx_b = &lb->addrs[b].srx;
+- int diff = afs_compare_addrs(srx_a, srx_b);
++ unsigned long pa = (unsigned long)la->addrs[a].peer;
++ unsigned long pb = (unsigned long)lb->addrs[b].peer;
++ long diff = pa - pb;
+
+ if (diff < 0) {
+ a++;
+diff --git a/fs/afs/vl_list.c b/fs/afs/vl_list.c
+index acc48216136a..ba89140eee9e 100644
+--- a/fs/afs/vl_list.c
++++ b/fs/afs/vl_list.c
+@@ -83,14 +83,15 @@ static u16 afs_extract_le16(const u8 **_b)
+ /*
+ * Build a VL server address list from a DNS queried server list.
+ */
+-static struct afs_addr_list *afs_extract_vl_addrs(const u8 **_b, const u8 *end,
++static struct afs_addr_list *afs_extract_vl_addrs(struct afs_net *net,
++ const u8 **_b, const u8 *end,
+ u8 nr_addrs, u16 port)
+ {
+ struct afs_addr_list *alist;
+ const u8 *b = *_b;
+ int ret = -EINVAL;
+
+- alist = afs_alloc_addrlist(nr_addrs, VL_SERVICE, port);
++ alist = afs_alloc_addrlist(nr_addrs, VL_SERVICE);
+ if (!alist)
+ return ERR_PTR(-ENOMEM);
+ if (nr_addrs == 0)
+@@ -109,7 +110,9 @@ static struct afs_addr_list *afs_extract_vl_addrs(const u8 **_b, const u8 *end,
+ goto error;
+ }
+ memcpy(x, b, 4);
+- afs_merge_fs_addr4(alist, x[0], port);
++ ret = afs_merge_fs_addr4(net, alist, x[0], port);
++ if (ret < 0)
++ goto error;
+ b += 4;
+ break;
+
+@@ -119,7 +122,9 @@ static struct afs_addr_list *afs_extract_vl_addrs(const u8 **_b, const u8 *end,
+ goto error;
+ }
+ memcpy(x, b, 16);
+- afs_merge_fs_addr6(alist, x, port);
++ ret = afs_merge_fs_addr6(net, alist, x, port);
++ if (ret < 0)
++ goto error;
+ b += 16;
+ break;
+
+@@ -247,7 +252,7 @@ struct afs_vlserver_list *afs_extract_vlserver_list(struct afs_cell *cell,
+ /* Extract the addresses - note that we can't skip this as we
+ * have to advance the payload pointer.
+ */
+- addrs = afs_extract_vl_addrs(&b, end, bs.nr_addrs, bs.port);
++ addrs = afs_extract_vl_addrs(cell->net, &b, end, bs.nr_addrs, bs.port);
+ if (IS_ERR(addrs)) {
+ ret = PTR_ERR(addrs);
+ goto error_2;
+diff --git a/fs/afs/vl_probe.c b/fs/afs/vl_probe.c
+index bdd9372e3fb2..9551aef07cee 100644
+--- a/fs/afs/vl_probe.c
++++ b/fs/afs/vl_probe.c
+@@ -48,6 +48,7 @@ void afs_vlserver_probe_result(struct afs_call *call)
+ {
+ struct afs_addr_list *alist = call->alist;
+ struct afs_vlserver *server = call->vlserver;
++ struct afs_address *addr = &alist->addrs[call->addr_ix];
+ unsigned int server_index = call->server_index;
+ unsigned int rtt_us = 0;
+ unsigned int index = call->addr_ix;
+@@ -106,16 +107,16 @@ void afs_vlserver_probe_result(struct afs_call *call)
+ if (call->service_id == YFS_VL_SERVICE) {
+ server->probe.flags |= AFS_VLSERVER_PROBE_IS_YFS;
+ set_bit(AFS_VLSERVER_FL_IS_YFS, &server->flags);
+- alist->addrs[index].srx.srx_service = call->service_id;
++ addr->service_id = call->service_id;
+ } else {
+ server->probe.flags |= AFS_VLSERVER_PROBE_NOT_YFS;
+ if (!(server->probe.flags & AFS_VLSERVER_PROBE_IS_YFS)) {
+ clear_bit(AFS_VLSERVER_FL_IS_YFS, &server->flags);
+- alist->addrs[index].srx.srx_service = call->service_id;
++ addr->service_id = call->service_id;
+ }
+ }
+
+- rxrpc_kernel_get_srtt(call->net->socket, call->rxcall, &rtt_us);
++ rtt_us = rxrpc_kernel_get_srtt(addr->peer);
+ if (rtt_us < server->probe.rtt) {
+ server->probe.rtt = rtt_us;
+ server->rtt = rtt_us;
+@@ -130,8 +131,9 @@ void afs_vlserver_probe_result(struct afs_call *call)
+ out:
+ spin_unlock(&server->probe_lock);
+
+- _debug("probe [%u][%u] %pISpc rtt=%u ret=%d",
+- server_index, index, &alist->addrs[index].srx.transport, rtt_us, ret);
++ _debug("probe [%u][%u] %pISpc rtt=%d ret=%d",
++ server_index, index, rxrpc_kernel_remote_addr(addr->peer),
++ rtt_us, ret);
+
+ afs_done_one_vl_probe(server, have_result);
+ }
+diff --git a/fs/afs/vl_rotate.c b/fs/afs/vl_rotate.c
+index e52b9d4c8a0a..f8f255c966ae 100644
+--- a/fs/afs/vl_rotate.c
++++ b/fs/afs/vl_rotate.c
+@@ -92,7 +92,7 @@ bool afs_select_vlserver(struct afs_vl_cursor *vc)
+ struct afs_addr_list *alist;
+ struct afs_vlserver *vlserver;
+ struct afs_error e;
+- u32 rtt;
++ unsigned int rtt;
+ int error = vc->ac.error, i;
+
+ _enter("%lx[%d],%lx[%d],%d,%d",
+@@ -194,7 +194,7 @@ bool afs_select_vlserver(struct afs_vl_cursor *vc)
+ goto selected_server;
+
+ vc->index = -1;
+- rtt = U32_MAX;
++ rtt = UINT_MAX;
+ for (i = 0; i < vc->server_list->nr_servers; i++) {
+ struct afs_vlserver *s = vc->server_list->servers[i].server;
+
+@@ -249,7 +249,7 @@ bool afs_select_vlserver(struct afs_vl_cursor *vc)
+
+ _debug("VL address %d/%d", vc->ac.index, vc->ac.alist->nr_addrs);
+
+- _leave(" = t %pISpc", &vc->ac.alist->addrs[vc->ac.index].srx.transport);
++ _leave(" = t %pISpc", rxrpc_kernel_remote_addr(vc->ac.alist->addrs[vc->ac.index].peer));
+ return true;
+
+ next_server:
+diff --git a/fs/afs/vlclient.c b/fs/afs/vlclient.c
+index 00fca3c66ba6..41e7932d75c6 100644
+--- a/fs/afs/vlclient.c
++++ b/fs/afs/vlclient.c
+@@ -208,7 +208,7 @@ static int afs_deliver_vl_get_addrs_u(struct afs_call *call)
+ count = ntohl(*bp);
+
+ nentries = min(nentries, count);
+- alist = afs_alloc_addrlist(nentries, FS_SERVICE, AFS_FS_PORT);
++ alist = afs_alloc_addrlist(nentries, FS_SERVICE);
+ if (!alist)
+ return -ENOMEM;
+ alist->version = uniquifier;
+@@ -230,9 +230,13 @@ static int afs_deliver_vl_get_addrs_u(struct afs_call *call)
+ alist = call->ret_alist;
+ bp = call->buffer;
+ count = min(call->count, 4U);
+- for (i = 0; i < count; i++)
+- if (alist->nr_addrs < call->count2)
+- afs_merge_fs_addr4(alist, *bp++, AFS_FS_PORT);
++ for (i = 0; i < count; i++) {
++ if (alist->nr_addrs < call->count2) {
++ ret = afs_merge_fs_addr4(call->net, alist, *bp++, AFS_FS_PORT);
++ if (ret < 0)
++ return ret;
++ }
++ }
+
+ call->count -= count;
+ if (call->count > 0)
+@@ -450,7 +454,7 @@ static int afs_deliver_yfsvl_get_endpoints(struct afs_call *call)
+ if (call->count > YFS_MAXENDPOINTS)
+ return afs_protocol_error(call, afs_eproto_yvl_fsendpt_num);
+
+- alist = afs_alloc_addrlist(call->count, FS_SERVICE, AFS_FS_PORT);
++ alist = afs_alloc_addrlist(call->count, FS_SERVICE);
+ if (!alist)
+ return -ENOMEM;
+ alist->version = uniquifier;
+@@ -488,14 +492,18 @@ static int afs_deliver_yfsvl_get_endpoints(struct afs_call *call)
+ if (ntohl(bp[0]) != sizeof(__be32) * 2)
+ return afs_protocol_error(
+ call, afs_eproto_yvl_fsendpt4_len);
+- afs_merge_fs_addr4(alist, bp[1], ntohl(bp[2]));
++ ret = afs_merge_fs_addr4(call->net, alist, bp[1], ntohl(bp[2]));
++ if (ret < 0)
++ return ret;
+ bp += 3;
+ break;
+ case YFS_ENDPOINT_IPV6:
+ if (ntohl(bp[0]) != sizeof(__be32) * 5)
+ return afs_protocol_error(
+ call, afs_eproto_yvl_fsendpt6_len);
+- afs_merge_fs_addr6(alist, bp + 1, ntohl(bp[5]));
++ ret = afs_merge_fs_addr6(call->net, alist, bp + 1, ntohl(bp[5]));
++ if (ret < 0)
++ return ret;
+ bp += 6;
+ break;
+ default:
+diff --git a/include/net/af_rxrpc.h b/include/net/af_rxrpc.h
+index 5531dd08061e..0754c463224a 100644
+--- a/include/net/af_rxrpc.h
++++ b/include/net/af_rxrpc.h
+@@ -15,6 +15,7 @@ struct key;
+ struct sock;
+ struct socket;
+ struct rxrpc_call;
++struct rxrpc_peer;
+ enum rxrpc_abort_reason;
+
+ enum rxrpc_interruptibility {
+@@ -41,13 +42,14 @@ void rxrpc_kernel_new_call_notification(struct socket *,
+ rxrpc_notify_new_call_t,
+ rxrpc_discard_new_call_t);
+ struct rxrpc_call *rxrpc_kernel_begin_call(struct socket *sock,
+- struct sockaddr_rxrpc *srx,
++ struct rxrpc_peer *peer,
+ struct key *key,
+ unsigned long user_call_ID,
+ s64 tx_total_len,
+ u32 hard_timeout,
+ gfp_t gfp,
+ rxrpc_notify_rx_t notify_rx,
++ u16 service_id,
+ bool upgrade,
+ enum rxrpc_interruptibility interruptibility,
+ unsigned int debug_id);
+@@ -60,9 +62,14 @@ bool rxrpc_kernel_abort_call(struct socket *, struct rxrpc_call *,
+ u32, int, enum rxrpc_abort_reason);
+ void rxrpc_kernel_shutdown_call(struct socket *sock, struct rxrpc_call *call);
+ void rxrpc_kernel_put_call(struct socket *sock, struct rxrpc_call *call);
+-void rxrpc_kernel_get_peer(struct socket *, struct rxrpc_call *,
+- struct sockaddr_rxrpc *);
+-bool rxrpc_kernel_get_srtt(struct socket *, struct rxrpc_call *, u32 *);
++struct rxrpc_peer *rxrpc_kernel_lookup_peer(struct socket *sock,
++ struct sockaddr_rxrpc *srx, gfp_t gfp);
++void rxrpc_kernel_put_peer(struct rxrpc_peer *peer);
++struct rxrpc_peer *rxrpc_kernel_get_peer(struct rxrpc_peer *peer);
++struct rxrpc_peer *rxrpc_kernel_get_call_peer(struct socket *sock, struct rxrpc_call *call);
++const struct sockaddr_rxrpc *rxrpc_kernel_remote_srx(const struct rxrpc_peer *peer);
++const struct sockaddr *rxrpc_kernel_remote_addr(const struct rxrpc_peer *peer);
++unsigned int rxrpc_kernel_get_srtt(const struct rxrpc_peer *);
+ int rxrpc_kernel_charge_accept(struct socket *, rxrpc_notify_rx_t,
+ rxrpc_user_attach_call_t, unsigned long, gfp_t,
+ unsigned int);
+diff --git a/include/trace/events/rxrpc.h b/include/trace/events/rxrpc.h
+index f7e537f64db4..4c1ef7b3705c 100644
+--- a/include/trace/events/rxrpc.h
++++ b/include/trace/events/rxrpc.h
+@@ -178,7 +178,9 @@
+ #define rxrpc_peer_traces \
+ EM(rxrpc_peer_free, "FREE ") \
+ EM(rxrpc_peer_get_accept, "GET accept ") \
++ EM(rxrpc_peer_get_application, "GET app ") \
+ EM(rxrpc_peer_get_bundle, "GET bundle ") \
++ EM(rxrpc_peer_get_call, "GET call ") \
+ EM(rxrpc_peer_get_client_conn, "GET cln-conn") \
+ EM(rxrpc_peer_get_input, "GET input ") \
+ EM(rxrpc_peer_get_input_error, "GET inpt-err") \
+@@ -187,6 +189,7 @@
+ EM(rxrpc_peer_get_service_conn, "GET srv-conn") \
+ EM(rxrpc_peer_new_client, "NEW client ") \
+ EM(rxrpc_peer_new_prealloc, "NEW prealloc") \
++ EM(rxrpc_peer_put_application, "PUT app ") \
+ EM(rxrpc_peer_put_bundle, "PUT bundle ") \
+ EM(rxrpc_peer_put_call, "PUT call ") \
+ EM(rxrpc_peer_put_conn, "PUT conn ") \
+diff --git a/net/rxrpc/af_rxrpc.c b/net/rxrpc/af_rxrpc.c
+index fa8aec78f63d..465bfe5eb061 100644
+--- a/net/rxrpc/af_rxrpc.c
++++ b/net/rxrpc/af_rxrpc.c
+@@ -258,16 +258,62 @@ static int rxrpc_listen(struct socket *sock, int backlog)
+ return ret;
+ }
+
++/**
++ * rxrpc_kernel_lookup_peer - Obtain remote transport endpoint for an address
++ * @sock: The socket through which it will be accessed
++ * @srx: The network address
++ * @gfp: Allocation flags
++ *
++ * Lookup or create a remote transport endpoint record for the specified
++ * address and return it with a ref held.
++ */
++struct rxrpc_peer *rxrpc_kernel_lookup_peer(struct socket *sock,
++ struct sockaddr_rxrpc *srx, gfp_t gfp)
++{
++ struct rxrpc_sock *rx = rxrpc_sk(sock->sk);
++ int ret;
++
++ ret = rxrpc_validate_address(rx, srx, sizeof(*srx));
++ if (ret < 0)
++ return ERR_PTR(ret);
++
++ return rxrpc_lookup_peer(rx->local, srx, gfp);
++}
++EXPORT_SYMBOL(rxrpc_kernel_lookup_peer);
++
++/**
++ * rxrpc_kernel_get_peer - Get a reference on a peer
++ * @peer: The peer to get a reference on.
++ *
++ * Get a record for the remote peer in a call.
++ */
++struct rxrpc_peer *rxrpc_kernel_get_peer(struct rxrpc_peer *peer)
++{
++ return peer ? rxrpc_get_peer(peer, rxrpc_peer_get_application) : NULL;
++}
++EXPORT_SYMBOL(rxrpc_kernel_get_peer);
++
++/**
++ * rxrpc_kernel_put_peer - Allow a kernel app to drop a peer reference
++ * @peer: The peer to drop a ref on
++ */
++void rxrpc_kernel_put_peer(struct rxrpc_peer *peer)
++{
++ rxrpc_put_peer(peer, rxrpc_peer_put_application);
++}
++EXPORT_SYMBOL(rxrpc_kernel_put_peer);
++
+ /**
+ * rxrpc_kernel_begin_call - Allow a kernel service to begin a call
+ * @sock: The socket on which to make the call
+- * @srx: The address of the peer to contact
++ * @peer: The peer to contact
+ * @key: The security context to use (defaults to socket setting)
+ * @user_call_ID: The ID to use
+ * @tx_total_len: Total length of data to transmit during the call (or -1)
+ * @hard_timeout: The maximum lifespan of the call in sec
+ * @gfp: The allocation constraints
+ * @notify_rx: Where to send notifications instead of socket queue
++ * @service_id: The ID of the service to contact
+ * @upgrade: Request service upgrade for call
+ * @interruptibility: The call is interruptible, or can be canceled.
+ * @debug_id: The debug ID for tracing to be assigned to the call
+@@ -280,13 +326,14 @@ static int rxrpc_listen(struct socket *sock, int backlog)
+ * supplying @srx and @key.
+ */
+ struct rxrpc_call *rxrpc_kernel_begin_call(struct socket *sock,
+- struct sockaddr_rxrpc *srx,
++ struct rxrpc_peer *peer,
+ struct key *key,
+ unsigned long user_call_ID,
+ s64 tx_total_len,
+ u32 hard_timeout,
+ gfp_t gfp,
+ rxrpc_notify_rx_t notify_rx,
++ u16 service_id,
+ bool upgrade,
+ enum rxrpc_interruptibility interruptibility,
+ unsigned int debug_id)
+@@ -295,13 +342,11 @@ struct rxrpc_call *rxrpc_kernel_begin_call(struct socket *sock,
+ struct rxrpc_call_params p;
+ struct rxrpc_call *call;
+ struct rxrpc_sock *rx = rxrpc_sk(sock->sk);
+- int ret;
+
+ _enter(",,%x,%lx", key_serial(key), user_call_ID);
+
+- ret = rxrpc_validate_address(rx, srx, sizeof(*srx));
+- if (ret < 0)
+- return ERR_PTR(ret);
++ if (WARN_ON_ONCE(peer->local != rx->local))
++ return ERR_PTR(-EIO);
+
+ lock_sock(&rx->sk);
+
+@@ -319,12 +364,13 @@ struct rxrpc_call *rxrpc_kernel_begin_call(struct socket *sock,
+
+ memset(&cp, 0, sizeof(cp));
+ cp.local = rx->local;
++ cp.peer = peer;
+ cp.key = key;
+ cp.security_level = rx->min_sec_level;
+ cp.exclusive = false;
+ cp.upgrade = upgrade;
+- cp.service_id = srx->srx_service;
+- call = rxrpc_new_client_call(rx, &cp, srx, &p, gfp, debug_id);
++ cp.service_id = service_id;
++ call = rxrpc_new_client_call(rx, &cp, &p, gfp, debug_id);
+ /* The socket has been unlocked. */
+ if (!IS_ERR(call)) {
+ call->notify_rx = notify_rx;
+diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h
+index e8b43408136a..5d5b19f20d1e 100644
+--- a/net/rxrpc/ar-internal.h
++++ b/net/rxrpc/ar-internal.h
+@@ -364,6 +364,7 @@ struct rxrpc_conn_proto {
+
+ struct rxrpc_conn_parameters {
+ struct rxrpc_local *local; /* Representation of local endpoint */
++ struct rxrpc_peer *peer; /* Representation of remote endpoint */
+ struct key *key; /* Security details */
+ bool exclusive; /* T if conn is exclusive */
+ bool upgrade; /* T if service ID can be upgraded */
+@@ -867,7 +868,6 @@ struct rxrpc_call *rxrpc_find_call_by_user_ID(struct rxrpc_sock *, unsigned long
+ struct rxrpc_call *rxrpc_alloc_call(struct rxrpc_sock *, gfp_t, unsigned int);
+ struct rxrpc_call *rxrpc_new_client_call(struct rxrpc_sock *,
+ struct rxrpc_conn_parameters *,
+- struct sockaddr_rxrpc *,
+ struct rxrpc_call_params *, gfp_t,
+ unsigned int);
+ void rxrpc_start_call_timer(struct rxrpc_call *call);
+diff --git a/net/rxrpc/call_object.c b/net/rxrpc/call_object.c
+index f10b37c14772..0943e54370ba 100644
+--- a/net/rxrpc/call_object.c
++++ b/net/rxrpc/call_object.c
+@@ -193,7 +193,6 @@ struct rxrpc_call *rxrpc_alloc_call(struct rxrpc_sock *rx, gfp_t gfp,
+ * Allocate a new client call.
+ */
+ static struct rxrpc_call *rxrpc_alloc_client_call(struct rxrpc_sock *rx,
+- struct sockaddr_rxrpc *srx,
+ struct rxrpc_conn_parameters *cp,
+ struct rxrpc_call_params *p,
+ gfp_t gfp,
+@@ -211,10 +210,12 @@ static struct rxrpc_call *rxrpc_alloc_client_call(struct rxrpc_sock *rx,
+ now = ktime_get_real();
+ call->acks_latest_ts = now;
+ call->cong_tstamp = now;
+- call->dest_srx = *srx;
++ call->dest_srx = cp->peer->srx;
++ call->dest_srx.srx_service = cp->service_id;
+ call->interruptibility = p->interruptibility;
+ call->tx_total_len = p->tx_total_len;
+ call->key = key_get(cp->key);
++ call->peer = rxrpc_get_peer(cp->peer, rxrpc_peer_get_call);
+ call->local = rxrpc_get_local(cp->local, rxrpc_local_get_call);
+ call->security_level = cp->security_level;
+ if (p->kernel)
+@@ -306,10 +307,6 @@ static int rxrpc_connect_call(struct rxrpc_call *call, gfp_t gfp)
+
+ _enter("{%d,%lx},", call->debug_id, call->user_call_ID);
+
+- call->peer = rxrpc_lookup_peer(local, &call->dest_srx, gfp);
+- if (!call->peer)
+- goto error;
+-
+ ret = rxrpc_look_up_bundle(call, gfp);
+ if (ret < 0)
+ goto error;
+@@ -334,7 +331,6 @@ static int rxrpc_connect_call(struct rxrpc_call *call, gfp_t gfp)
+ */
+ struct rxrpc_call *rxrpc_new_client_call(struct rxrpc_sock *rx,
+ struct rxrpc_conn_parameters *cp,
+- struct sockaddr_rxrpc *srx,
+ struct rxrpc_call_params *p,
+ gfp_t gfp,
+ unsigned int debug_id)
+@@ -349,13 +345,18 @@ struct rxrpc_call *rxrpc_new_client_call(struct rxrpc_sock *rx,
+
+ _enter("%p,%lx", rx, p->user_call_ID);
+
++ if (WARN_ON_ONCE(!cp->peer)) {
++ release_sock(&rx->sk);
++ return ERR_PTR(-EIO);
++ }
++
+ limiter = rxrpc_get_call_slot(p, gfp);
+ if (!limiter) {
+ release_sock(&rx->sk);
+ return ERR_PTR(-ERESTARTSYS);
+ }
+
+- call = rxrpc_alloc_client_call(rx, srx, cp, p, gfp, debug_id);
++ call = rxrpc_alloc_client_call(rx, cp, p, gfp, debug_id);
+ if (IS_ERR(call)) {
+ release_sock(&rx->sk);
+ up(limiter);
+diff --git a/net/rxrpc/peer_object.c b/net/rxrpc/peer_object.c
+index 8d7a715a0bb1..49dcda67a0d5 100644
+--- a/net/rxrpc/peer_object.c
++++ b/net/rxrpc/peer_object.c
+@@ -22,6 +22,8 @@
+ #include <net/ip6_route.h>
+ #include "ar-internal.h"
+
++static const struct sockaddr_rxrpc rxrpc_null_addr;
++
+ /*
+ * Hash a peer key.
+ */
+@@ -457,39 +459,53 @@ void rxrpc_destroy_all_peers(struct rxrpc_net *rxnet)
+ }
+
+ /**
+- * rxrpc_kernel_get_peer - Get the peer address of a call
++ * rxrpc_kernel_get_call_peer - Get the peer address of a call
+ * @sock: The socket on which the call is in progress.
+ * @call: The call to query
+- * @_srx: Where to place the result
+ *
+- * Get the address of the remote peer in a call.
++ * Get a record for the remote peer in a call.
+ */
+-void rxrpc_kernel_get_peer(struct socket *sock, struct rxrpc_call *call,
+- struct sockaddr_rxrpc *_srx)
++struct rxrpc_peer *rxrpc_kernel_get_call_peer(struct socket *sock, struct rxrpc_call *call)
+ {
+- *_srx = call->peer->srx;
++ return call->peer;
+ }
+-EXPORT_SYMBOL(rxrpc_kernel_get_peer);
++EXPORT_SYMBOL(rxrpc_kernel_get_call_peer);
+
+ /**
+ * rxrpc_kernel_get_srtt - Get a call's peer smoothed RTT
+- * @sock: The socket on which the call is in progress.
+- * @call: The call to query
+- * @_srtt: Where to store the SRTT value.
++ * @peer: The peer to query
+ *
+- * Get the call's peer smoothed RTT in uS.
++ * Get the call's peer smoothed RTT in uS or UINT_MAX if we have no samples.
+ */
+-bool rxrpc_kernel_get_srtt(struct socket *sock, struct rxrpc_call *call,
+- u32 *_srtt)
++unsigned int rxrpc_kernel_get_srtt(const struct rxrpc_peer *peer)
+ {
+- struct rxrpc_peer *peer = call->peer;
++ return peer->rtt_count > 0 ? peer->srtt_us >> 3 : UINT_MAX;
++}
++EXPORT_SYMBOL(rxrpc_kernel_get_srtt);
+
+- if (peer->rtt_count == 0) {
+- *_srtt = 1000000; /* 1S */
+- return false;
+- }
++/**
++ * rxrpc_kernel_remote_srx - Get the address of a peer
++ * @peer: The peer to query
++ *
++ * Get a pointer to the address from a peer record. The caller is responsible
++ * for making sure that the address is not deallocated.
++ */
++const struct sockaddr_rxrpc *rxrpc_kernel_remote_srx(const struct rxrpc_peer *peer)
++{
++ return peer ? &peer->srx : &rxrpc_null_addr;
++}
++EXPORT_SYMBOL(rxrpc_kernel_remote_srx);
+
+- *_srtt = call->peer->srtt_us >> 3;
+- return true;
++/**
++ * rxrpc_kernel_remote_addr - Get the peer transport address of a call
++ * @peer: The peer to query
++ *
++ * Get a pointer to the transport address from a peer record. The caller is
++ * responsible for making sure that the address is not deallocated.
++ */
++const struct sockaddr *rxrpc_kernel_remote_addr(const struct rxrpc_peer *peer)
++{
++ return (const struct sockaddr *)
++ (peer ? &peer->srx.transport : &rxrpc_null_addr.transport);
+ }
+-EXPORT_SYMBOL(rxrpc_kernel_get_srtt);
++EXPORT_SYMBOL(rxrpc_kernel_remote_addr);
+diff --git a/net/rxrpc/sendmsg.c b/net/rxrpc/sendmsg.c
+index 8e0b94714e84..5677d5690a02 100644
+--- a/net/rxrpc/sendmsg.c
++++ b/net/rxrpc/sendmsg.c
+@@ -572,6 +572,7 @@ rxrpc_new_client_call_for_sendmsg(struct rxrpc_sock *rx, struct msghdr *msg,
+ __acquires(&call->user_mutex)
+ {
+ struct rxrpc_conn_parameters cp;
++ struct rxrpc_peer *peer;
+ struct rxrpc_call *call;
+ struct key *key;
+
+@@ -584,21 +585,29 @@ rxrpc_new_client_call_for_sendmsg(struct rxrpc_sock *rx, struct msghdr *msg,
+ return ERR_PTR(-EDESTADDRREQ);
+ }
+
++ peer = rxrpc_lookup_peer(rx->local, srx, GFP_KERNEL);
++ if (!peer) {
++ release_sock(&rx->sk);
++ return ERR_PTR(-ENOMEM);
++ }
++
+ key = rx->key;
+ if (key && !rx->key->payload.data[0])
+ key = NULL;
+
+ memset(&cp, 0, sizeof(cp));
+ cp.local = rx->local;
++ cp.peer = peer;
+ cp.key = rx->key;
+ cp.security_level = rx->min_sec_level;
+ cp.exclusive = rx->exclusive | p->exclusive;
+ cp.upgrade = p->upgrade;
+ cp.service_id = srx->srx_service;
+- call = rxrpc_new_client_call(rx, &cp, srx, &p->call, GFP_KERNEL,
++ call = rxrpc_new_client_call(rx, &cp, &p->call, GFP_KERNEL,
+ atomic_inc_return(&rxrpc_debug_id));
+ /* The socket is now unlocked */
+
++ rxrpc_put_peer(peer, rxrpc_peer_put_application);
+ _leave(" = %p\n", call);
+ return call;
+ }
+--
+2.43.0
+
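+With the peer API above, a kernel service now pins the transport
+endpoint before starting calls. A minimal sketch, assuming an already
+set-up AF_RXRPC socket, a filled-in sockaddr_rxrpc srx and AFS's
+FS_SERVICE id; key, notify_rx and debug_id stand in for the caller's
+own values:
+
+    struct rxrpc_peer *peer;
+    struct rxrpc_call *call;
+
+    /* Pin the remote endpoint; the ref outlives any number of calls. */
+    peer = rxrpc_kernel_lookup_peer(sock, &srx, GFP_KERNEL);
+    if (IS_ERR_OR_NULL(peer))
+            return peer ? PTR_ERR(peer) : -ENOMEM;
+
+    call = rxrpc_kernel_begin_call(sock, peer, key, user_call_ID,
+                                   tx_total_len, hard_timeout, GFP_KERNEL,
+                                   notify_rx, FS_SERVICE, false,
+                                   RXRPC_PREINTERRUPTIBLE, debug_id);
+    /* ... transact on the call, then shut it down and put it ... */
+
+    rxrpc_kernel_put_peer(peer);    /* drop the pin when done */
+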
--- /dev/null
+From 7f04c082cab699672ebde045251234ff678693af Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Fri, 19 Jan 2024 19:16:42 -0800
+Subject: selftest: Don't reuse port for SO_INCOMING_CPU test.
+
+From: Kuniyuki Iwashima <kuniyu@amazon.com>
+
+[ Upstream commit 97de5a15edf2d22184f5ff588656030bbb7fa358 ]
+
+Jakub reported that ASSERT_EQ(cpu, i) in so_incoming_cpu.c seems to
+fire somewhat randomly.
+
+ # # RUN so_incoming_cpu.before_reuseport.test3 ...
+ # # so_incoming_cpu.c:191:test3:Expected cpu (32) == i (0)
+ # # test3: Test terminated by assertion
+ # # FAIL so_incoming_cpu.before_reuseport.test3
+ # not ok 3 so_incoming_cpu.before_reuseport.test3
+
+When the test failed, not-yet-accepted CLOSE_WAIT sockets received a SYN
+with a "challenging" SEQ number, which had been sent from an unexpected
+CPU that did not create the receiver.
+
+The test basically does:
+
+ 1. for each cpu:
+ 1-1. create a server
+ 1-2. set SO_INCOMING_CPU
+
+ 2. for each cpu:
+ 2-1. set cpu affinity
+ 2-2. create some clients
+ 2-3. let clients connect() to the server on the same cpu
+ 2-4. close() clients
+
+ 3. for each server:
+ 3-1. accept() all child sockets
+ 3-2. check if all children have the same SO_INCOMING_CPU with the server
+
+The root cause was the close() in 2-4. and net.ipv4.tcp_tw_reuse.
+
+In a loop of 2., close() changed the client state to FIN_WAIT_2, and
+the peer transitioned to CLOSE_WAIT.
+
+In another loop of 2., connect() happened to select the same port as
+the FIN_WAIT_2 socket, and it was reused, as the default value of
+net.ipv4.tcp_tw_reuse is 2.
+
+As a result, the new client sent SYN to the CLOSE_WAIT socket from
+a different CPU, and the receiver's sk_incoming_cpu was overwritten
+with an unexpected CPU ID.
+
+Also, the SYN had a different SEQ number, so the CLOSE_WAIT socket
+responded with Challenge ACK. The new client properly returned RST
+and effectively killed the CLOSE_WAIT socket.
+
+This way, all clients were created successfully, but the error was
+detected later by 3-2., ASSERT_EQ(cpu, i).
+
+To avoid the failure, let's make sure that (i) the number of clients
+is less than the number of available ports and (ii) such reuse never
+happens.
+
+Fixes: 6df96146b202 ("selftest: Add test for SO_INCOMING_CPU.")
+Reported-by: Jakub Kicinski <kuba@kernel.org>
+Signed-off-by: Kuniyuki Iwashima <kuniyu@amazon.com>
+Tested-by: Jakub Kicinski <kuba@kernel.org>
+Link: https://lore.kernel.org/r/20240120031642.67014-1-kuniyu@amazon.com
+Signed-off-by: Paolo Abeni <pabeni@redhat.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ tools/testing/selftests/net/so_incoming_cpu.c | 68 ++++++++++++++-----
+ 1 file changed, 50 insertions(+), 18 deletions(-)
+
+diff --git a/tools/testing/selftests/net/so_incoming_cpu.c b/tools/testing/selftests/net/so_incoming_cpu.c
+index a14818164102..e9fa14e10732 100644
+--- a/tools/testing/selftests/net/so_incoming_cpu.c
++++ b/tools/testing/selftests/net/so_incoming_cpu.c
+@@ -3,19 +3,16 @@
+ #define _GNU_SOURCE
+ #include <sched.h>
+
++#include <fcntl.h>
++
+ #include <netinet/in.h>
+ #include <sys/socket.h>
+ #include <sys/sysinfo.h>
+
+ #include "../kselftest_harness.h"
+
+-#define CLIENT_PER_SERVER 32 /* More sockets, more reliable */
+-#define NR_SERVER self->nproc
+-#define NR_CLIENT (CLIENT_PER_SERVER * NR_SERVER)
+-
+ FIXTURE(so_incoming_cpu)
+ {
+- int nproc;
+ int *servers;
+ union {
+ struct sockaddr addr;
+@@ -56,12 +53,47 @@ FIXTURE_VARIANT_ADD(so_incoming_cpu, after_all_listen)
+ .when_to_set = AFTER_ALL_LISTEN,
+ };
+
++static void write_sysctl(struct __test_metadata *_metadata,
++ char *filename, char *string)
++{
++ int fd, len, ret;
++
++ fd = open(filename, O_WRONLY);
++ ASSERT_NE(fd, -1);
++
++ len = strlen(string);
++ ret = write(fd, string, len);
++ ASSERT_EQ(ret, len);
++}
++
++static void setup_netns(struct __test_metadata *_metadata)
++{
++ ASSERT_EQ(unshare(CLONE_NEWNET), 0);
++ ASSERT_EQ(system("ip link set lo up"), 0);
++
++ write_sysctl(_metadata, "/proc/sys/net/ipv4/ip_local_port_range", "10000 60001");
++ write_sysctl(_metadata, "/proc/sys/net/ipv4/tcp_tw_reuse", "0");
++}
++
++#define NR_PORT (60001 - 10000 - 1)
++#define NR_CLIENT_PER_SERVER_DEFAULT 32
++static int nr_client_per_server, nr_server, nr_client;
++
+ FIXTURE_SETUP(so_incoming_cpu)
+ {
+- self->nproc = get_nprocs();
+- ASSERT_LE(2, self->nproc);
++ setup_netns(_metadata);
++
++ nr_server = get_nprocs();
++ ASSERT_LE(2, nr_server);
++
++ if (NR_CLIENT_PER_SERVER_DEFAULT * nr_server < NR_PORT)
++ nr_client_per_server = NR_CLIENT_PER_SERVER_DEFAULT;
++ else
++ nr_client_per_server = NR_PORT / nr_server;
++
++ nr_client = nr_client_per_server * nr_server;
+
+- self->servers = malloc(sizeof(int) * NR_SERVER);
++ self->servers = malloc(sizeof(int) * nr_server);
+ ASSERT_NE(self->servers, NULL);
+
+ self->in_addr.sin_family = AF_INET;
+@@ -74,7 +106,7 @@ FIXTURE_TEARDOWN(so_incoming_cpu)
+ {
+ int i;
+
+- for (i = 0; i < NR_SERVER; i++)
++ for (i = 0; i < nr_server; i++)
+ close(self->servers[i]);
+
+ free(self->servers);
+@@ -110,10 +142,10 @@ int create_server(struct __test_metadata *_metadata,
+ if (variant->when_to_set == BEFORE_LISTEN)
+ set_so_incoming_cpu(_metadata, fd, cpu);
+
+- /* We don't use CLIENT_PER_SERVER here not to block
++ /* We don't use nr_client_per_server here not to block
+ * this test at connect() if SO_INCOMING_CPU is broken.
+ */
+- ret = listen(fd, NR_CLIENT);
++ ret = listen(fd, nr_client);
+ ASSERT_EQ(ret, 0);
+
+ if (variant->when_to_set == AFTER_LISTEN)
+@@ -128,7 +160,7 @@ void create_servers(struct __test_metadata *_metadata,
+ {
+ int i, ret;
+
+- for (i = 0; i < NR_SERVER; i++) {
++ for (i = 0; i < nr_server; i++) {
+ self->servers[i] = create_server(_metadata, self, variant, i);
+
+ if (i == 0) {
+@@ -138,7 +170,7 @@ void create_servers(struct __test_metadata *_metadata,
+ }
+
+ if (variant->when_to_set == AFTER_ALL_LISTEN) {
+- for (i = 0; i < NR_SERVER; i++)
++ for (i = 0; i < nr_server; i++)
+ set_so_incoming_cpu(_metadata, self->servers[i], i);
+ }
+ }
+@@ -149,7 +181,7 @@ void create_clients(struct __test_metadata *_metadata,
+ cpu_set_t cpu_set;
+ int i, j, fd, ret;
+
+- for (i = 0; i < NR_SERVER; i++) {
++ for (i = 0; i < nr_server; i++) {
+ CPU_ZERO(&cpu_set);
+
+ CPU_SET(i, &cpu_set);
+@@ -162,7 +194,7 @@ void create_clients(struct __test_metadata *_metadata,
+ ret = sched_setaffinity(0, sizeof(cpu_set), &cpu_set);
+ ASSERT_EQ(ret, 0);
+
+- for (j = 0; j < CLIENT_PER_SERVER; j++) {
++ for (j = 0; j < nr_client_per_server; j++) {
+ fd = socket(AF_INET, SOCK_STREAM, 0);
+ ASSERT_NE(fd, -1);
+
+@@ -180,8 +212,8 @@ void verify_incoming_cpu(struct __test_metadata *_metadata,
+ int i, j, fd, cpu, ret, total = 0;
+ socklen_t len = sizeof(int);
+
+- for (i = 0; i < NR_SERVER; i++) {
+- for (j = 0; j < CLIENT_PER_SERVER; j++) {
++ for (i = 0; i < nr_server; i++) {
++ for (j = 0; j < nr_client_per_server; j++) {
+ /* If we see -EAGAIN here, SO_INCOMING_CPU is broken */
+ fd = accept(self->servers[i], &self->addr, &self->addrlen);
+ ASSERT_NE(fd, -1);
+@@ -195,7 +227,7 @@ void verify_incoming_cpu(struct __test_metadata *_metadata,
+ }
+ }
+
+- ASSERT_EQ(total, NR_CLIENT);
++ ASSERT_EQ(total, nr_client);
+ TH_LOG("SO_INCOMING_CPU is very likely to be "
+ "working correctly with %d sockets.", total);
+ }
+--
+2.43.0
+
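+For background, SO_INCOMING_CPU is a plain int socket option. A minimal
+userspace sketch of the set-and-verify steps the harness performs (the
+listener and child descriptors are assumed to come from the usual
+socket()/listen()/accept() sequence):
+
+    #include <assert.h>
+    #include <err.h>
+    #include <sys/socket.h>
+
+    static void pin_and_check(int listener, int child, int cpu)
+    {
+            int got;
+            socklen_t len = sizeof(got);
+
+            /* Steer wakeups for this listener to one CPU. */
+            if (setsockopt(listener, SOL_SOCKET, SO_INCOMING_CPU,
+                           &cpu, sizeof(cpu)) < 0)
+                    err(1, "setsockopt(SO_INCOMING_CPU)");
+
+            /* An accepted child reports the CPU that handled its packets. */
+            if (getsockopt(child, SOL_SOCKET, SO_INCOMING_CPU,
+                           &got, &len) < 0)
+                    err(1, "getsockopt(SO_INCOMING_CPU)");
+            assert(got == cpu);
+    }
+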
--- /dev/null
+From e1c2375e884d125e1bb994d6ca1b503b345d0b2c Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Tue, 23 Jan 2024 15:59:17 +0800
+Subject: selftests: bonding: do not test arp/ns target with mode
+ balance-alb/tlb
+
+From: Hangbin Liu <liuhangbin@gmail.com>
+
+[ Upstream commit a2933a8759a62269754e54733d993b19de870e84 ]
+
+The prio_arp/ns tests hard-code the mode to active-backup. At the same
+time, the balance-alb/tlb modes do not support arp/ns targets. So remove
+the prio_arp/ns tests from the loop and only test the active-backup mode.
+
+Fixes: 481b56e0391e ("selftests: bonding: re-format bond option tests")
+Reported-by: Jay Vosburgh <jay.vosburgh@canonical.com>
+Closes: https://lore.kernel.org/netdev/17415.1705965957@famine/
+Signed-off-by: Hangbin Liu <liuhangbin@gmail.com>
+Acked-by: Jay Vosburgh <jay.vosburgh@canonical.com>
+Link: https://lore.kernel.org/r/20240123075917.1576360-1-liuhangbin@gmail.com
+Signed-off-by: Paolo Abeni <pabeni@redhat.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ .../testing/selftests/drivers/net/bonding/bond_options.sh | 8 ++++----
+ 1 file changed, 4 insertions(+), 4 deletions(-)
+
+diff --git a/tools/testing/selftests/drivers/net/bonding/bond_options.sh b/tools/testing/selftests/drivers/net/bonding/bond_options.sh
+index c54d1697f439..d508486cc0bd 100755
+--- a/tools/testing/selftests/drivers/net/bonding/bond_options.sh
++++ b/tools/testing/selftests/drivers/net/bonding/bond_options.sh
+@@ -162,7 +162,7 @@ prio_arp()
+ local mode=$1
+
+ for primary_reselect in 0 1 2; do
+- prio_test "mode active-backup arp_interval 100 arp_ip_target ${g_ip4} primary eth1 primary_reselect $primary_reselect"
++ prio_test "mode $mode arp_interval 100 arp_ip_target ${g_ip4} primary eth1 primary_reselect $primary_reselect"
+ log_test "prio" "$mode arp_ip_target primary_reselect $primary_reselect"
+ done
+ }
+@@ -178,7 +178,7 @@ prio_ns()
+ fi
+
+ for primary_reselect in 0 1 2; do
+- prio_test "mode active-backup arp_interval 100 ns_ip6_target ${g_ip6} primary eth1 primary_reselect $primary_reselect"
++ prio_test "mode $mode arp_interval 100 ns_ip6_target ${g_ip6} primary eth1 primary_reselect $primary_reselect"
+ log_test "prio" "$mode ns_ip6_target primary_reselect $primary_reselect"
+ done
+ }
+@@ -194,9 +194,9 @@ prio()
+
+ for mode in $modes; do
+ prio_miimon $mode
+- prio_arp $mode
+- prio_ns $mode
+ done
++ prio_arp "active-backup"
++ prio_ns "active-backup"
+ }
+
+ arp_validate_test()
+--
+2.43.0
+
--- /dev/null
+From 7ea5a6e9ccc49ca9a909ce3a6a447474f910df84 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Wed, 17 Jan 2024 19:12:32 -0500
+Subject: selftests: bonding: Increase timeout to 1200s
+
+From: Benjamin Poirier <bpoirier@nvidia.com>
+
+[ Upstream commit b01f15a7571b7aa222458bc9bf26ab59bd84e384 ]
+
+When tests are run by runner.sh, bond_options.sh gets killed before
+it can complete:
+
+make -C tools/testing/selftests run_tests TARGETS="drivers/net/bonding"
+ [...]
+ # timeout set to 120
+ # selftests: drivers/net/bonding: bond_options.sh
+ # TEST: prio (active-backup miimon primary_reselect 0) [ OK ]
+ # TEST: prio (active-backup miimon primary_reselect 1) [ OK ]
+ # TEST: prio (active-backup miimon primary_reselect 2) [ OK ]
+ # TEST: prio (active-backup arp_ip_target primary_reselect 0) [ OK ]
+ # TEST: prio (active-backup arp_ip_target primary_reselect 1) [ OK ]
+ # TEST: prio (active-backup arp_ip_target primary_reselect 2) [ OK ]
+ #
+ not ok 7 selftests: drivers/net/bonding: bond_options.sh # TIMEOUT 120 seconds
+
+This test includes many sleep statements, at least some of which are
+related to timers in the operation of the bonding driver itself. Increase
+the test timeout to allow the test to complete.
+
+I ran the test in slightly different VMs (including one without HW
+virtualization support) and got runtimes of 13m39.760s, 13m31.238s, and
+13m2.956s. Use a ~1.5x "safety factor" and set the timeout to 1200s.
+
+Fixes: 42a8d4aaea84 ("selftests: bonding: add bonding prio option test")
+Reported-by: Jakub Kicinski <kuba@kernel.org>
+Closes: https://lore.kernel.org/netdev/20240116104402.1203850a@kernel.org/#t
+Suggested-by: Jakub Kicinski <kuba@kernel.org>
+Signed-off-by: Benjamin Poirier <bpoirier@nvidia.com>
+Reviewed-by: Hangbin Liu <liuhangbin@gmail.com>
+Link: https://lore.kernel.org/r/20240118001233.304759-1-bpoirier@nvidia.com
+Signed-off-by: Jakub Kicinski <kuba@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ tools/testing/selftests/drivers/net/bonding/settings | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+diff --git a/tools/testing/selftests/drivers/net/bonding/settings b/tools/testing/selftests/drivers/net/bonding/settings
+index 6091b45d226b..79b65bdf05db 100644
+--- a/tools/testing/selftests/drivers/net/bonding/settings
++++ b/tools/testing/selftests/drivers/net/bonding/settings
+@@ -1 +1 @@
+-timeout=120
++timeout=1200
+--
+2.43.0
+
--- /dev/null
+From 0e8d9de5247c4b0e9e3d8c0b504d84ca340c109d Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Mon, 22 Jan 2024 12:35:28 -0800
+Subject: selftests: fill in some missing configs for net
+
+From: Jakub Kicinski <kuba@kernel.org>
+
+[ Upstream commit 04fe7c5029cbdbcdb28917f09a958d939a8f19f7 ]
+
+We are missing a lot of config options from net selftests,
+it seems:
+
+tun/tap: CONFIG_TUN, CONFIG_MACVLAN, CONFIG_MACVTAP
+fib_tests: CONFIG_NET_SCH_FQ_CODEL
+l2tp: CONFIG_L2TP, CONFIG_L2TP_V3, CONFIG_L2TP_IP, CONFIG_L2TP_ETH
+sctp-vrf: CONFIG_INET_DIAG
+txtimestamp: CONFIG_NET_CLS_U32
+vxlan_mdb: CONFIG_BRIDGE_VLAN_FILTERING
+gre_gso: CONFIG_NET_IPGRE_DEMUX, CONFIG_IP_GRE, CONFIG_IPV6_GRE
+srv6_end_dt*_l3vpn: CONFIG_IPV6_SEG6_LWTUNNEL
+ip_local_port_range: CONFIG_MPTCP
+fib_test: CONFIG_NET_CLS_BASIC
+rtnetlink: CONFIG_MACSEC, CONFIG_NET_SCH_HTB, CONFIG_XFRM_INTERFACE
+ CONFIG_NET_IPGRE, CONFIG_BONDING
+fib_nexthops: CONFIG_MPLS, CONFIG_MPLS_ROUTING
+vxlan_mdb: CONFIG_NET_ACT_GACT
+tls: CONFIG_TLS, CONFIG_CRYPTO_CHACHA20POLY1305
+psample: CONFIG_PSAMPLE
+fcnal: CONFIG_TCP_MD5SIG
+
+Try to add them in a semi-alphabetical order.
+
+Fixes: 62199e3f1658 ("selftests: net: Add VXLAN MDB test")
+Fixes: c12e0d5f267d ("self-tests: introduce self-tests for RPS default mask")
+Fixes: 122db5e3634b ("selftests/net: add MPTCP coverage for IP_LOCAL_PORT_RANGE")
+Link: https://lore.kernel.org/r/20240122203528.672004-1-kuba@kernel.org
+Signed-off-by: Jakub Kicinski <kuba@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ tools/testing/selftests/net/config | 28 ++++++++++++++++++++++++++++
+ 1 file changed, 28 insertions(+)
+
+diff --git a/tools/testing/selftests/net/config b/tools/testing/selftests/net/config
+index 8da562a9ae87..19ff75051660 100644
+--- a/tools/testing/selftests/net/config
++++ b/tools/testing/selftests/net/config
+@@ -1,5 +1,6 @@
+ CONFIG_USER_NS=y
+ CONFIG_NET_NS=y
++CONFIG_BONDING=m
+ CONFIG_BPF_SYSCALL=y
+ CONFIG_TEST_BPF=m
+ CONFIG_NUMA=y
+@@ -14,9 +15,13 @@ CONFIG_VETH=y
+ CONFIG_NET_IPVTI=y
+ CONFIG_IPV6_VTI=y
+ CONFIG_DUMMY=y
++CONFIG_BRIDGE_VLAN_FILTERING=y
+ CONFIG_BRIDGE=y
++CONFIG_CRYPTO_CHACHA20POLY1305=m
+ CONFIG_VLAN_8021Q=y
+ CONFIG_IFB=y
++CONFIG_INET_DIAG=y
++CONFIG_IP_GRE=m
+ CONFIG_NETFILTER=y
+ CONFIG_NETFILTER_ADVANCED=y
+ CONFIG_NF_CONNTRACK=m
+@@ -25,15 +30,36 @@ CONFIG_IP6_NF_IPTABLES=m
+ CONFIG_IP_NF_IPTABLES=m
+ CONFIG_IP6_NF_NAT=m
+ CONFIG_IP_NF_NAT=m
++CONFIG_IPV6_GRE=m
++CONFIG_IPV6_SEG6_LWTUNNEL=y
++CONFIG_L2TP_ETH=m
++CONFIG_L2TP_IP=m
++CONFIG_L2TP=m
++CONFIG_L2TP_V3=y
++CONFIG_MACSEC=m
++CONFIG_MACVLAN=y
++CONFIG_MACVTAP=y
++CONFIG_MPLS=y
++CONFIG_MPTCP=y
+ CONFIG_NF_TABLES=m
+ CONFIG_NF_TABLES_IPV6=y
+ CONFIG_NF_TABLES_IPV4=y
+ CONFIG_NFT_NAT=m
++CONFIG_NET_ACT_GACT=m
++CONFIG_NET_CLS_BASIC=m
++CONFIG_NET_CLS_U32=m
++CONFIG_NET_IPGRE_DEMUX=m
++CONFIG_NET_IPGRE=m
++CONFIG_NET_SCH_FQ_CODEL=m
++CONFIG_NET_SCH_HTB=m
+ CONFIG_NET_SCH_FQ=m
+ CONFIG_NET_SCH_ETF=m
+ CONFIG_NET_SCH_NETEM=y
++CONFIG_PSAMPLE=m
++CONFIG_TCP_MD5SIG=y
+ CONFIG_TEST_BLACKHOLE_DEV=m
+ CONFIG_KALLSYMS=y
++CONFIG_TLS=m
+ CONFIG_TRACEPOINTS=y
+ CONFIG_NET_DROP_MONITOR=m
+ CONFIG_NETDEVSIM=m
+@@ -48,7 +74,9 @@ CONFIG_BAREUDP=m
+ CONFIG_IPV6_IOAM6_LWTUNNEL=y
+ CONFIG_CRYPTO_SM4_GENERIC=y
+ CONFIG_AMT=m
++CONFIG_TUN=y
+ CONFIG_VXLAN=m
+ CONFIG_IP_SCTP=m
+ CONFIG_NETFILTER_XT_MATCH_POLICY=m
+ CONFIG_CRYPTO_ARIA=y
++CONFIG_XFRM_INTERFACE=m
+--
+2.43.0
+
--- /dev/null
+From 59a87296fd28cff283ab62c5fcad3bbbbda290a4 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Mon, 22 Jan 2024 11:58:15 -0800
+Subject: selftests: net: fix rps_default_mask with >32 CPUs
+
+From: Jakub Kicinski <kuba@kernel.org>
+
+[ Upstream commit 0719b5338a0cbe80d1637a5fb03d8141b5bfc7a1 ]
+
+If there are more than 32 CPUs, the bitmask will start to contain
+commas, leading to:
+
+./rps_default_mask.sh: line 36: [: 00000000,00000000: integer expression expected
+
+Remove the commas; bash doesn't interpret leading zeroes as octal,
+so that should be good enough. Switch to bash, as Simon reports that
+not all shells support this type of substitution.
+
+Fixes: c12e0d5f267d ("self-tests: introduce self-tests for RPS default mask")
+Reviewed-by: Simon Horman <horms@kernel.org>
+Link: https://lore.kernel.org/r/20240122195815.638997-1-kuba@kernel.org
+Signed-off-by: Jakub Kicinski <kuba@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ tools/testing/selftests/net/rps_default_mask.sh | 6 +++++-
+ 1 file changed, 5 insertions(+), 1 deletion(-)
+
+diff --git a/tools/testing/selftests/net/rps_default_mask.sh b/tools/testing/selftests/net/rps_default_mask.sh
+index a26c5624429f..4287a8529890 100755
+--- a/tools/testing/selftests/net/rps_default_mask.sh
++++ b/tools/testing/selftests/net/rps_default_mask.sh
+@@ -1,4 +1,4 @@
+-#!/bin/sh
++#!/bin/bash
+ # SPDX-License-Identifier: GPL-2.0
+
+ readonly ksft_skip=4
+@@ -33,6 +33,10 @@ chk_rps() {
+
+ rps_mask=$($cmd /sys/class/net/$dev_name/queues/rx-0/rps_cpus)
+ printf "%-60s" "$msg"
++
++ # In case there is more than 32 CPUs we need to remove commas from masks
++ rps_mask=${rps_mask//,}
++ expected_rps_mask=${expected_rps_mask//,}
+ if [ $rps_mask -eq $expected_rps_mask ]; then
+ echo "[ ok ]"
+ else
+--
+2.43.0
+
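+The same normalization can be done outside the shell. A small C sketch
+of stripping the comma separators before comparing masks numerically
+(it assumes the mask fits in 64 bits, which the real sysfs format does
+not guarantee):
+
+    #include <stdio.h>
+    #include <stdlib.h>
+
+    /* Drop the "," group separators, then parse the mask as hex. */
+    static unsigned long long parse_rps_mask(char *s)
+    {
+            char *src = s, *dst = s;
+
+            while (*src) {
+                    if (*src != ',')
+                            *dst++ = *src;
+                    src++;
+            }
+            *dst = '\0';
+            return strtoull(s, NULL, 16);
+    }
+
+    int main(void)
+    {
+            char mask[] = "00000000,00000001";
+
+            printf("%llx\n", parse_rps_mask(mask)); /* prints 1 */
+            return 0;
+    }
+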
--- /dev/null
+From dfca3552b6e84cfec030272e59570cf545b3148a Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Mon, 22 Jan 2024 22:05:29 -0800
+Subject: selftests: netdevsim: fix the udp_tunnel_nic test
+
+From: Jakub Kicinski <kuba@kernel.org>
+
+[ Upstream commit 0879020a7817e7ce636372c016b4528f541c9f4d ]
+
+This test is missing a whole bunch of checks for interface
+renaming and one ifup. Presumably it was only used on a system
+with renaming disabled and NetworkManager running.
+
+Fixes: 91f430b2c49d ("selftests: net: add a test for UDP tunnel info infra")
+Acked-by: Paolo Abeni <pabeni@redhat.com>
+Reviewed-by: Simon Horman <horms@kernel.org>
+Link: https://lore.kernel.org/r/20240123060529.1033912-1-kuba@kernel.org
+Signed-off-by: Jakub Kicinski <kuba@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ .../selftests/drivers/net/netdevsim/udp_tunnel_nic.sh | 9 +++++++++
+ 1 file changed, 9 insertions(+)
+
+diff --git a/tools/testing/selftests/drivers/net/netdevsim/udp_tunnel_nic.sh b/tools/testing/selftests/drivers/net/netdevsim/udp_tunnel_nic.sh
+index 1b08e042cf94..185b02d2d4cd 100755
+--- a/tools/testing/selftests/drivers/net/netdevsim/udp_tunnel_nic.sh
++++ b/tools/testing/selftests/drivers/net/netdevsim/udp_tunnel_nic.sh
+@@ -269,6 +269,7 @@ for port in 0 1; do
+ echo 1 > $NSIM_DEV_SYS/new_port
+ fi
+ NSIM_NETDEV=`get_netdev_name old_netdevs`
++ ifconfig $NSIM_NETDEV up
+
+ msg="new NIC device created"
+ exp0=( 0 0 0 0 )
+@@ -430,6 +431,7 @@ for port in 0 1; do
+ fi
+
+ echo $port > $NSIM_DEV_SYS/new_port
++ NSIM_NETDEV=`get_netdev_name old_netdevs`
+ ifconfig $NSIM_NETDEV up
+
+ overflow_table0 "overflow NIC table"
+@@ -487,6 +489,7 @@ for port in 0 1; do
+ fi
+
+ echo $port > $NSIM_DEV_SYS/new_port
++ NSIM_NETDEV=`get_netdev_name old_netdevs`
+ ifconfig $NSIM_NETDEV up
+
+ overflow_table0 "overflow NIC table"
+@@ -543,6 +546,7 @@ for port in 0 1; do
+ fi
+
+ echo $port > $NSIM_DEV_SYS/new_port
++ NSIM_NETDEV=`get_netdev_name old_netdevs`
+ ifconfig $NSIM_NETDEV up
+
+ overflow_table0 "destroy NIC"
+@@ -572,6 +576,7 @@ for port in 0 1; do
+ fi
+
+ echo $port > $NSIM_DEV_SYS/new_port
++ NSIM_NETDEV=`get_netdev_name old_netdevs`
+ ifconfig $NSIM_NETDEV up
+
+ msg="create VxLANs v6"
+@@ -632,6 +637,7 @@ for port in 0 1; do
+ fi
+
+ echo $port > $NSIM_DEV_SYS/new_port
++ NSIM_NETDEV=`get_netdev_name old_netdevs`
+ ifconfig $NSIM_NETDEV up
+
+ echo 110 > $NSIM_DEV_DFS/ports/$port/udp_ports_inject_error
+@@ -687,6 +693,7 @@ for port in 0 1; do
+ fi
+
+ echo $port > $NSIM_DEV_SYS/new_port
++ NSIM_NETDEV=`get_netdev_name old_netdevs`
+ ifconfig $NSIM_NETDEV up
+
+ msg="create VxLANs v6"
+@@ -746,6 +753,7 @@ for port in 0 1; do
+ fi
+
+ echo $port > $NSIM_DEV_SYS/new_port
++ NSIM_NETDEV=`get_netdev_name old_netdevs`
+ ifconfig $NSIM_NETDEV up
+
+ msg="create VxLANs v6"
+@@ -876,6 +884,7 @@ msg="re-add a port"
+
+ echo 2 > $NSIM_DEV_SYS/del_port
+ echo 2 > $NSIM_DEV_SYS/new_port
++NSIM_NETDEV=`get_netdev_name old_netdevs`
+ check_tables
+
+ msg="replace VxLAN in overflow table"
+--
+2.43.0
+
ksmbd-don-t-increment-epoch-if-current-state-and-request-state-are-same.patch
ksmbd-send-lease-break-notification-on-file_rename_information.patch
ksmbd-add-missing-set_freezable-for-freezable-kthread.patch
+sunrpc-use-request-size-to-initialize-bio_vec-in-svc.patch
+wifi-mac80211-fix-potential-sta-link-leak.patch
+btrfs-scrub-avoid-use-after-free-when-chunk-length-i.patch
+net-smc-fix-illegal-rmb_desc-access-in-smc-d-connect.patch
+selftests-bonding-increase-timeout-to-1200s.patch
+tcp-make-sure-init-the-accept_queue-s-spinlocks-once.patch
+bnxt_en-wait-for-flr-to-complete-during-probe.patch
+bnxt_en-prevent-kernel-warning-when-running-offline-.patch
+vlan-skip-nested-type-that-is-not-ifla_vlan_qos_mapp.patch
+llc-make-llc_ui_sendmsg-more-robust-against-bonding-.patch
+llc-drop-support-for-eth_p_tr_802_2.patch
+udp-fix-busy-polling.patch
+idpf-distinguish-vports-by-the-dev_port-attribute.patch
+net-fix-removing-a-namespace-with-conflicting-altnam.patch
+tun-fix-missing-dropped-counter-in-tun_xdp_act.patch
+tun-add-missing-rx-stats-accounting-in-tun_xdp_act.patch
+dpll-fix-broken-error-path-in-dpll_pin_alloc.patch
+dpll-fix-pin-dump-crash-for-rebound-module.patch
+dpll-fix-userspace-availability-of-pins.patch
+dpll-fix-register-pin-with-unregistered-parent-pin.patch
+net-micrel-fix-ptp-frame-parsing-for-lan8814.patch
+net-rds-fix-ubsan-array-index-out-of-bounds-in-rds_c.patch
+netfs-fscache-prevent-oops-in-fscache_put_cache.patch
+tracing-ensure-visibility-when-inserting-an-element-.patch
+afs-hide-silly-rename-files-from-userspace.patch
+afs-fix-the-usage-of-read_seqbegin_or_lock-in-afs_fi.patch
+afs-add-comments-on-abort-handling.patch
+afs-turn-the-afs_addr_list-address-array-into-an-arr.patch
+rxrpc-afs-allow-afs-to-pin-rxrpc_peer-objects.patch
+afs-handle-the-vio-and-uaeio-aborts-explicitly.patch
+afs-use-op-nr_iterations-1-to-indicate-to-begin-file.patch
+afs-wrap-most-op-error-accesses-with-inline-funcs.patch
+afs-don-t-put-afs_call-in-afs_wait_for_call_to_compl.patch
+afs-simplify-error-handling.patch
+afs-fix-error-handling-with-lookup-via-fs.inlinebulk.patch
+tcp-add-memory-barrier-to-tcp_push.patch
+selftest-don-t-reuse-port-for-so_incoming_cpu-test.patch
+netlink-fix-potential-sleeping-issue-in-mqueue_flush.patch
+ipv6-init-the-accept_queue-s-spinlocks-in-inet6_crea.patch
+selftests-fill-in-some-missing-configs-for-net.patch
+net-sched-flower-fix-chain-template-offload.patch
+net-mlx5e-fix-operation-precedence-bug-in-port-times.patch
+net-mlx5e-fix-inconsistent-hairpin-rqt-sizes.patch
+net-mlx5e-fix-peer-flow-lists-handling.patch
+net-mlx5-fix-a-warn-upon-a-callback-command-failure.patch
+net-mlx5-bridge-fix-multicast-packets-sent-to-uplink.patch
+net-mlx5-dr-use-the-right-gvmi-number-for-drop-actio.patch
+net-mlx5-dr-can-t-go-to-uplink-vport-on-rx-rule.patch
+net-mlx5-use-mlx5-device-constant-for-selecting-cq-p.patch
+net-mlx5e-allow-software-parsing-when-ipsec-crypto-i.patch
+net-mlx5e-ignore-ipsec-replay-window-values-on-sende.patch
+net-mlx5e-fix-a-double-free-in-arfs_create_groups.patch
+net-mlx5e-fix-a-potential-double-free-in-fs_any_crea.patch
+rcu-defer-rcu-kthreads-wakeup-when-cpu-is-dying.patch
+netfilter-nft_limit-reject-configurations-that-cause.patch
+netfilter-nf_tables-restrict-anonymous-set-and-map-n.patch
+netfilter-nf_tables-validate-nfproto_-family.patch
+net-stmmac-wait-a-bit-for-the-reset-to-take-effect.patch
+net-mvpp2-clear-bm-pool-before-initialization.patch
+selftests-net-fix-rps_default_mask-with-32-cpus.patch
+selftests-netdevsim-fix-the-udp_tunnel_nic-test.patch
+xsk-recycle-buffer-in-case-rx-queue-was-full.patch
+xsk-make-xsk_buff_pool-responsible-for-clearing-xdp_.patch
+xsk-fix-usage-of-multi-buffer-bpf-helpers-for-zc-xdp.patch
+ice-work-on-pre-xdp-prog-frag-count.patch
+i40e-handle-multi-buffer-packets-that-are-shrunk-by-.patch
+ice-remove-redundant-xdp_rxq_info-registration.patch
+intel-xsk-initialize-skb_frag_t-bv_offset-in-zc-driv.patch
+ice-update-xdp_rxq_info-frag_size-for-zc-enabled-rx-.patch
+xdp-reflect-tail-increase-for-mem_type_xsk_buff_pool.patch
+i40e-set-xdp_rxq_info-frag_size.patch
+i40e-update-xdp_rxq_info-frag_size-for-zc-enabled-rx.patch
+fjes-fix-memleaks-in-fjes_hw_setup.patch
+selftests-bonding-do-not-test-arp-ns-target-with-mod.patch
+net-fec-fix-the-unhandled-context-fault-from-smmu.patch
+tsnep-remove-fcs-for-xdp-data-path.patch
+tsnep-fix-xdp_ring_need_wakeup-for-empty-fill-ring.patch
--- /dev/null
+From 58c1e7163139f42ce19e100c31cfa906196959e8 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Wed, 17 Jan 2024 22:06:28 +0100
+Subject: SUNRPC: use request size to initialize bio_vec in svc_udp_sendto()
+
+From: Lucas Stach <l.stach@pengutronix.de>
+
+[ Upstream commit 1d9cabe2817edd215779dc9c2fe5e7ab9aac0704 ]
+
+Use the proper size when setting up the bio_vec, as otherwise only
+zero-length UDP packets will be sent.
+
+Fixes: baabf59c2414 ("SUNRPC: Convert svc_udp_sendto() to use the per-socket bio_vec array")
+Signed-off-by: Lucas Stach <l.stach@pengutronix.de>
+Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ net/sunrpc/svcsock.c | 4 ++--
+ 1 file changed, 2 insertions(+), 2 deletions(-)
+
+diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c
+index 998687421fa6..e0ce4276274b 100644
+--- a/net/sunrpc/svcsock.c
++++ b/net/sunrpc/svcsock.c
+@@ -717,12 +717,12 @@ static int svc_udp_sendto(struct svc_rqst *rqstp)
+ ARRAY_SIZE(rqstp->rq_bvec), xdr);
+
+ iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, rqstp->rq_bvec,
+- count, 0);
++ count, rqstp->rq_res.len);
+ err = sock_sendmsg(svsk->sk_sock, &msg);
+ if (err == -ECONNREFUSED) {
+ /* ICMP error on earlier request. */
+ iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, rqstp->rq_bvec,
+- count, 0);
++ count, rqstp->rq_res.len);
+ err = sock_sendmsg(svsk->sk_sock, &msg);
+ }
+
+--
+2.43.0
+
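+The final argument of iov_iter_bvec() is the total byte count of the
+iterator, not a flags word. A hedged sketch of the intended pattern
+(page, payload_len and offset are illustrative names):
+
+    struct bio_vec bv;
+    struct iov_iter iter;
+
+    bvec_set_page(&bv, page, payload_len, offset);
+    /* nr_segs counts bio_vecs; the last argument sizes the iterator.
+     * Passing 0 there is what produced the empty UDP packets. */
+    iov_iter_bvec(&iter, ITER_SOURCE, &bv, 1, payload_len);
+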
--- /dev/null
+From 38e4f18aaf076cdae4088f54afc329320ce2f0f3 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Fri, 19 Jan 2024 11:01:33 -0800
+Subject: tcp: Add memory barrier to tcp_push()
+
+From: Salvatore Dipietro <dipiets@amazon.com>
+
+[ Upstream commit 7267e8dcad6b2f9fce05a6a06335d7040acbc2b6 ]
+
+On CPUs with weak memory models, reads and updates performed by tcp_push()
+on the sk variables can get reordered, leaving the socket throttled when
+it should not be. The tasklet running tcp_wfree() may also not observe the
+memory updates in time and will skip flushing any packets throttled by
+tcp_push(), delaying the sending. This can pathologically cause 40ms
+extra latency due to bad interactions with delayed acks.
+
+Adding a memory barrier in tcp_push removes the bug, similarly to the
+previous commit bf06200e732d ("tcp: tsq: fix nonagle handling").
+smp_mb__after_atomic() is used to avoid incurring unnecessary overhead
+on x86, which is not affected.
+
+The patch has been tested using an AWS c7g.2xlarge instance with Ubuntu
+22.04 and Apache Tomcat 9.0.83 running the basic servlet below:
+
+import java.io.IOException;
+import java.io.OutputStreamWriter;
+import java.io.PrintWriter;
+import javax.servlet.ServletException;
+import javax.servlet.http.HttpServlet;
+import javax.servlet.http.HttpServletRequest;
+import javax.servlet.http.HttpServletResponse;
+
+public class HelloWorldServlet extends HttpServlet {
+ @Override
+ protected void doGet(HttpServletRequest request, HttpServletResponse response)
+ throws ServletException, IOException {
+ response.setContentType("text/html;charset=utf-8");
+ OutputStreamWriter osw = new OutputStreamWriter(response.getOutputStream(),"UTF-8");
+ String s = "a".repeat(3096);
+ osw.write(s,0,s.length());
+ osw.flush();
+ }
+}
+
+Load was applied using wrk2 (https://github.com/kinvolk/wrk2) from an AWS
+c6i.8xlarge instance. Before the patch an additional 40ms latency from P99.99+
+values is observed while, with the patch, the extra latency disappears.
+
+No patch and tcp_autocorking=1
+./wrk -t32 -c128 -d40s --latency -R10000 http://172.31.60.173:8080/hello/hello
+ ...
+ 50.000% 0.91ms
+ 75.000% 1.13ms
+ 90.000% 1.46ms
+ 99.000% 1.74ms
+ 99.900% 1.89ms
+ 99.990% 41.95ms <<< 40+ ms extra latency
+ 99.999% 48.32ms
+100.000% 48.96ms
+
+With patch and tcp_autocorking=1
+./wrk -t32 -c128 -d40s --latency -R10000 http://172.31.60.173:8080/hello/hello
+ ...
+ 50.000% 0.90ms
+ 75.000% 1.13ms
+ 90.000% 1.45ms
+ 99.000% 1.72ms
+ 99.900% 1.83ms
+ 99.990% 2.11ms <<< no 40+ ms extra latency
+ 99.999% 2.53ms
+100.000% 2.62ms
+
+The patch has also been tested on x86 (m7i.2xlarge instance), which is
+not affected by this issue, and it doesn't introduce any additional
+delay.
+
+Fixes: 7aa5470c2c09 ("tcp: tsq: move tsq_flags close to sk_wmem_alloc")
+Signed-off-by: Salvatore Dipietro <dipiets@amazon.com>
+Reviewed-by: Eric Dumazet <edumazet@google.com>
+Link: https://lore.kernel.org/r/20240119190133.43698-1-dipiets@amazon.com
+Signed-off-by: Paolo Abeni <pabeni@redhat.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ net/ipv4/tcp.c | 1 +
+ 1 file changed, 1 insertion(+)
+
+diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
+index ff6838ca2e58..7bce79beca2b 100644
+--- a/net/ipv4/tcp.c
++++ b/net/ipv4/tcp.c
+@@ -722,6 +722,7 @@ void tcp_push(struct sock *sk, int flags, int mss_now,
+ if (!test_bit(TSQ_THROTTLED, &sk->sk_tsq_flags)) {
+ NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPAUTOCORKING);
+ set_bit(TSQ_THROTTLED, &sk->sk_tsq_flags);
++ smp_mb__after_atomic();
+ }
+ /* It is possible TX completion already happened
+ * before we set TSQ_THROTTLED.
+--
+2.43.0
+
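+The pairing being fixed is the classic flag-then-recheck pattern
+between the sending path and the TX-completion path; schematically (a
+sketch of the relevant lines, not the complete functions):
+
+    /* tcp_push(): mark throttled, then re-check the wmem count */
+    set_bit(TSQ_THROTTLED, &sk->sk_tsq_flags);
+    smp_mb__after_atomic(); /* order set_bit before the read below */
+    if (refcount_read(&sk->sk_wmem_alloc) > skb->truesize)
+            return;         /* the completion path will flush for us */
+
+    /* tcp_wfree(): drops sk_wmem_alloc first, then tests
+     * TSQ_THROTTLED and flushes any deferred packets if set */
+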
--- /dev/null
+From a96f3a5cb5848f1ff49b6839fc043d33bc94ec3b Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Thu, 18 Jan 2024 09:20:19 +0800
+Subject: tcp: make sure init the accept_queue's spinlocks once
+
+From: Zhengchao Shao <shaozhengchao@huawei.com>
+
+[ Upstream commit 198bc90e0e734e5f98c3d2833e8390cac3df61b2 ]
+
+When I run syz's reproduction C program locally, it causes the following
+issue:
+pvqspinlock: lock 0xffff9d181cd5c660 has corrupted value 0x0!
+WARNING: CPU: 19 PID: 21160 at __pv_queued_spin_unlock_slowpath (kernel/locking/qspinlock_paravirt.h:508)
+Hardware name: Red Hat KVM, BIOS 0.5.1 01/01/2011
+RIP: 0010:__pv_queued_spin_unlock_slowpath (kernel/locking/qspinlock_paravirt.h:508)
+Code: 73 56 3a ff 90 c3 cc cc cc cc 8b 05 bb 1f 48 01 85 c0 74 05 c3 cc cc cc cc 8b 17 48 89 fe 48 c7 c7
+30 20 ce 8f e8 ad 56 42 ff <0f> 0b c3 cc cc cc cc 0f 0b 0f 1f 40 00 90 90 90 90 90 90 90 90 90
+RSP: 0018:ffffa8d200604cb8 EFLAGS: 00010282
+RAX: 0000000000000000 RBX: 0000000000000000 RCX: ffff9d1ef60e0908
+RDX: 00000000ffffffd8 RSI: 0000000000000027 RDI: ffff9d1ef60e0900
+RBP: ffff9d181cd5c280 R08: 0000000000000000 R09: 00000000ffff7fff
+R10: ffffa8d200604b68 R11: ffffffff907dcdc8 R12: 0000000000000000
+R13: ffff9d181cd5c660 R14: ffff9d1813a3f330 R15: 0000000000001000
+FS: 00007fa110184640(0000) GS:ffff9d1ef60c0000(0000) knlGS:0000000000000000
+CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
+CR2: 0000000020000000 CR3: 000000011f65e000 CR4: 00000000000006f0
+Call Trace:
+<IRQ>
+ _raw_spin_unlock (kernel/locking/spinlock.c:186)
+ inet_csk_reqsk_queue_add (net/ipv4/inet_connection_sock.c:1321)
+ inet_csk_complete_hashdance (net/ipv4/inet_connection_sock.c:1358)
+ tcp_check_req (net/ipv4/tcp_minisocks.c:868)
+ tcp_v4_rcv (net/ipv4/tcp_ipv4.c:2260)
+ ip_protocol_deliver_rcu (net/ipv4/ip_input.c:205)
+ ip_local_deliver_finish (net/ipv4/ip_input.c:234)
+ __netif_receive_skb_one_core (net/core/dev.c:5529)
+ process_backlog (./include/linux/rcupdate.h:779)
+ __napi_poll (net/core/dev.c:6533)
+ net_rx_action (net/core/dev.c:6604)
+ __do_softirq (./arch/x86/include/asm/jump_label.h:27)
+ do_softirq (kernel/softirq.c:454 kernel/softirq.c:441)
+</IRQ>
+<TASK>
+ __local_bh_enable_ip (kernel/softirq.c:381)
+ __dev_queue_xmit (net/core/dev.c:4374)
+ ip_finish_output2 (./include/net/neighbour.h:540 net/ipv4/ip_output.c:235)
+ __ip_queue_xmit (net/ipv4/ip_output.c:535)
+ __tcp_transmit_skb (net/ipv4/tcp_output.c:1462)
+ tcp_rcv_synsent_state_process (net/ipv4/tcp_input.c:6469)
+ tcp_rcv_state_process (net/ipv4/tcp_input.c:6657)
+ tcp_v4_do_rcv (net/ipv4/tcp_ipv4.c:1929)
+ __release_sock (./include/net/sock.h:1121 net/core/sock.c:2968)
+ release_sock (net/core/sock.c:3536)
+ inet_wait_for_connect (net/ipv4/af_inet.c:609)
+ __inet_stream_connect (net/ipv4/af_inet.c:702)
+ inet_stream_connect (net/ipv4/af_inet.c:748)
+ __sys_connect (./include/linux/file.h:45 net/socket.c:2064)
+ __x64_sys_connect (net/socket.c:2073 net/socket.c:2070 net/socket.c:2070)
+ do_syscall_64 (arch/x86/entry/common.c:51 arch/x86/entry/common.c:82)
+ entry_SYSCALL_64_after_hwframe (arch/x86/entry/entry_64.S:129)
+ RIP: 0033:0x7fa10ff05a3d
+ Code: 5b 41 5c c3 66 0f 1f 84 00 00 00 00 00 f3 0f 1e fa 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89
+ c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 8b 0d ab a3 0e 00 f7 d8 64 89 01 48
+ RSP: 002b:00007fa110183de8 EFLAGS: 00000202 ORIG_RAX: 000000000000002a
+ RAX: ffffffffffffffda RBX: 0000000020000054 RCX: 00007fa10ff05a3d
+ RDX: 000000000000001c RSI: 0000000020000040 RDI: 0000000000000003
+ RBP: 00007fa110183e20 R08: 0000000000000000 R09: 0000000000000000
+ R10: 0000000000000000 R11: 0000000000000202 R12: 00007fa110184640
+ R13: 0000000000000000 R14: 00007fa10fe8b060 R15: 00007fff73e23b20
+</TASK>
+
+The issue triggering process is analyzed as follows:
+Thread A Thread B
+tcp_v4_rcv //receive ack TCP packet inet_shutdown
+ tcp_check_req tcp_disconnect //disconnect sock
+ ... tcp_set_state(sk, TCP_CLOSE)
+ inet_csk_complete_hashdance ...
+ inet_csk_reqsk_queue_add inet_listen //start listen
+ spin_lock(&queue->rskq_lock) inet_csk_listen_start
+ ... reqsk_queue_alloc
+ ... spin_lock_init
+ spin_unlock(&queue->rskq_lock) //warning
+
+When the socket receives the ACK packet during the three-way handshake,
+it holds the spinlock. If the user then actively shuts down the socket
+and immediately listens on it again, the spinlock is re-initialized while
+held. When the socket goes on to release the spinlock, a warning is
+generated. The same issue also applies to fastopenq.lock.
+
+Move the spinlock initialization to inet_create and inet_accept to make
+sure the accept_queue's spinlocks are initialized only once.
+
+Fixes: fff1f3001cc5 ("tcp: add a spinlock to protect struct request_sock_queue")
+Fixes: 168a8f58059a ("tcp: TCP Fast Open Server - main code path")
+Reported-by: Ming Shu <sming56@aliyun.com>
+Signed-off-by: Zhengchao Shao <shaozhengchao@huawei.com>
+Reviewed-by: Eric Dumazet <edumazet@google.com>
+Link: https://lore.kernel.org/r/20240118012019.1751966-1-shaozhengchao@huawei.com
+Signed-off-by: Jakub Kicinski <kuba@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ include/net/inet_connection_sock.h | 8 ++++++++
+ net/core/request_sock.c | 3 ---
+ net/ipv4/af_inet.c | 3 +++
+ net/ipv4/inet_connection_sock.c | 4 ++++
+ 4 files changed, 15 insertions(+), 3 deletions(-)
+
+diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h
+index d0a2f827d5f2..9ab4bf704e86 100644
+--- a/include/net/inet_connection_sock.h
++++ b/include/net/inet_connection_sock.h
+@@ -357,4 +357,12 @@ static inline bool inet_csk_has_ulp(const struct sock *sk)
+ return inet_test_bit(IS_ICSK, sk) && !!inet_csk(sk)->icsk_ulp_ops;
+ }
+
++static inline void inet_init_csk_locks(struct sock *sk)
++{
++ struct inet_connection_sock *icsk = inet_csk(sk);
++
++ spin_lock_init(&icsk->icsk_accept_queue.rskq_lock);
++ spin_lock_init(&icsk->icsk_accept_queue.fastopenq.lock);
++}
++
+ #endif /* _INET_CONNECTION_SOCK_H */
+diff --git a/net/core/request_sock.c b/net/core/request_sock.c
+index f35c2e998406..63de5c635842 100644
+--- a/net/core/request_sock.c
++++ b/net/core/request_sock.c
+@@ -33,9 +33,6 @@
+
+ void reqsk_queue_alloc(struct request_sock_queue *queue)
+ {
+- spin_lock_init(&queue->rskq_lock);
+-
+- spin_lock_init(&queue->fastopenq.lock);
+ queue->fastopenq.rskq_rst_head = NULL;
+ queue->fastopenq.rskq_rst_tail = NULL;
+ queue->fastopenq.qlen = 0;
+diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
+index ea0b0334a0fb..1c58bd72e124 100644
+--- a/net/ipv4/af_inet.c
++++ b/net/ipv4/af_inet.c
+@@ -330,6 +330,9 @@ static int inet_create(struct net *net, struct socket *sock, int protocol,
+ if (INET_PROTOSW_REUSE & answer_flags)
+ sk->sk_reuse = SK_CAN_REUSE;
+
++ if (INET_PROTOSW_ICSK & answer_flags)
++ inet_init_csk_locks(sk);
++
+ inet = inet_sk(sk);
+ inet_assign_bit(IS_ICSK, sk, INET_PROTOSW_ICSK & answer_flags);
+
+diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
+index 394a498c2823..762817d6c8d7 100644
+--- a/net/ipv4/inet_connection_sock.c
++++ b/net/ipv4/inet_connection_sock.c
+@@ -730,6 +730,10 @@ struct sock *inet_csk_accept(struct sock *sk, int flags, int *err, bool kern)
+ }
+ if (req)
+ reqsk_put(req);
++
++ if (newsk)
++ inet_init_csk_locks(newsk);
++
+ return newsk;
+ out_err:
+ newsk = NULL;
+--
+2.43.0
+
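+The rule the fix restores: a lock that may be held on one CPU must
+never go through spin_lock_init() again on another, so locks belong in
+the object's one-time construction path. Condensed from the diff above:
+
+    /* inet_create()/inet_accept(): construction, runs once per sock */
+    spin_lock_init(&icsk->icsk_accept_queue.rskq_lock);
+    spin_lock_init(&icsk->icsk_accept_queue.fastopenq.lock);
+
+    /* reqsk_queue_alloc(), called from every listen(): reset only
+     * the queue contents, never the locks themselves */
+    queue->fastopenq.rskq_rst_head = NULL;
+    queue->fastopenq.rskq_rst_tail = NULL;
+    queue->fastopenq.qlen = 0;
+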
--- /dev/null
+From bb9fda497e2b1f6927c979fc39505e3da60e56bb Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Mon, 22 Jan 2024 16:09:28 +0100
+Subject: tracing: Ensure visibility when inserting an element into tracing_map
+
+From: Petr Pavlu <petr.pavlu@suse.com>
+
+[ Upstream commit 2b44760609e9eaafc9d234a6883d042fc21132a7 ]
+
+Running the following two commands in parallel on a multi-processor
+AArch64 machine can sporadically produce an unexpected warning about
+duplicate histogram entries:
+
+ $ while true; do
+ echo hist:key=id.syscall:val=hitcount > \
+ /sys/kernel/debug/tracing/events/raw_syscalls/sys_enter/trigger
+ cat /sys/kernel/debug/tracing/events/raw_syscalls/sys_enter/hist
+ sleep 0.001
+ done
+ $ stress-ng --sysbadaddr $(nproc)
+
+The warning looks as follows:
+
+[ 2911.172474] ------------[ cut here ]------------
+[ 2911.173111] Duplicates detected: 1
+[ 2911.173574] WARNING: CPU: 2 PID: 12247 at kernel/trace/tracing_map.c:983 tracing_map_sort_entries+0x3e0/0x408
+[ 2911.174702] Modules linked in: iscsi_ibft(E) iscsi_boot_sysfs(E) rfkill(E) af_packet(E) nls_iso8859_1(E) nls_cp437(E) vfat(E) fat(E) ena(E) tiny_power_button(E) qemu_fw_cfg(E) button(E) fuse(E) efi_pstore(E) ip_tables(E) x_tables(E) xfs(E) libcrc32c(E) aes_ce_blk(E) aes_ce_cipher(E) crct10dif_ce(E) polyval_ce(E) polyval_generic(E) ghash_ce(E) gf128mul(E) sm4_ce_gcm(E) sm4_ce_ccm(E) sm4_ce(E) sm4_ce_cipher(E) sm4(E) sm3_ce(E) sm3(E) sha3_ce(E) sha512_ce(E) sha512_arm64(E) sha2_ce(E) sha256_arm64(E) nvme(E) sha1_ce(E) nvme_core(E) nvme_auth(E) t10_pi(E) sg(E) scsi_mod(E) scsi_common(E) efivarfs(E)
+[ 2911.174738] Unloaded tainted modules: cppc_cpufreq(E):1
+[ 2911.180985] CPU: 2 PID: 12247 Comm: cat Kdump: loaded Tainted: G E 6.7.0-default #2 1b58bbb22c97e4399dc09f92d309344f69c44a01
+[ 2911.182398] Hardware name: Amazon EC2 c7g.8xlarge/, BIOS 1.0 11/1/2018
+[ 2911.183208] pstate: 61400005 (nZCv daif +PAN -UAO -TCO +DIT -SSBS BTYPE=--)
+[ 2911.184038] pc : tracing_map_sort_entries+0x3e0/0x408
+[ 2911.184667] lr : tracing_map_sort_entries+0x3e0/0x408
+[ 2911.185310] sp : ffff8000a1513900
+[ 2911.185750] x29: ffff8000a1513900 x28: ffff0003f272fe80 x27: 0000000000000001
+[ 2911.186600] x26: ffff0003f272fe80 x25: 0000000000000030 x24: 0000000000000008
+[ 2911.187458] x23: ffff0003c5788000 x22: ffff0003c16710c8 x21: ffff80008017f180
+[ 2911.188310] x20: ffff80008017f000 x19: ffff80008017f180 x18: ffffffffffffffff
+[ 2911.189160] x17: 0000000000000000 x16: 0000000000000000 x15: ffff8000a15134b8
+[ 2911.190015] x14: 0000000000000000 x13: 205d373432323154 x12: 5b5d313131333731
+[ 2911.190844] x11: 00000000fffeffff x10: 00000000fffeffff x9 : ffffd1b78274a13c
+[ 2911.191716] x8 : 000000000017ffe8 x7 : c0000000fffeffff x6 : 000000000057ffa8
+[ 2911.192554] x5 : ffff0012f6c24ec0 x4 : 0000000000000000 x3 : ffff2e5b72b5d000
+[ 2911.193404] x2 : 0000000000000000 x1 : 0000000000000000 x0 : ffff0003ff254480
+[ 2911.194259] Call trace:
+[ 2911.194626] tracing_map_sort_entries+0x3e0/0x408
+[ 2911.195220] hist_show+0x124/0x800
+[ 2911.195692] seq_read_iter+0x1d4/0x4e8
+[ 2911.196193] seq_read+0xe8/0x138
+[ 2911.196638] vfs_read+0xc8/0x300
+[ 2911.197078] ksys_read+0x70/0x108
+[ 2911.197534] __arm64_sys_read+0x24/0x38
+[ 2911.198046] invoke_syscall+0x78/0x108
+[ 2911.198553] el0_svc_common.constprop.0+0xd0/0xf8
+[ 2911.199157] do_el0_svc+0x28/0x40
+[ 2911.199613] el0_svc+0x40/0x178
+[ 2911.200048] el0t_64_sync_handler+0x13c/0x158
+[ 2911.200621] el0t_64_sync+0x1a8/0x1b0
+[ 2911.201115] ---[ end trace 0000000000000000 ]---
+
+The problem appears to be caused by CPU reordering of writes issued from
+__tracing_map_insert().
+
+The check for the presence of an element with a given key in this
+function is:
+
+ val = READ_ONCE(entry->val);
+ if (val && keys_match(key, val->key, map->key_size)) ...
+
+The write of a new entry is:
+
+ elt = get_free_elt(map);
+ memcpy(elt->key, key, map->key_size);
+ entry->val = elt;
+
+The "memcpy(elt->key, key, map->key_size);" and "entry->val = elt;"
+stores may become visible in the reversed order on another CPU. This
+second CPU might then incorrectly determine that a new key doesn't match
+an already present val->key and subsequently insert a new element,
+resulting in a duplicate.
+
+Fix the problem by adding a write barrier between
+"memcpy(elt->key, key, map->key_size);" and "entry->val = elt;", and for
+good measure, also use WRITE_ONCE(entry->val, elt) for publishing the
+element. The sequence pairs with the mentioned "READ_ONCE(entry->val);"
+and the "val->key" check which has an address dependency.
+
+The barrier is placed on a path executed when adding an element for
+a new key. Subsequent updates targeting the same key remain unaffected.
+
+From the user's perspective, the issue was introduced by commit
+c193707dde77 ("tracing: Remove code which merges duplicates"), which
+followed commit cbf4100efb8f ("tracing: Add support to detect and avoid
+duplicates"). The previous code operated differently; it inherently
+expected potential races which result in duplicates but merged them
+later when they occurred.
+
+Link: https://lore.kernel.org/linux-trace-kernel/20240122150928.27725-1-petr.pavlu@suse.com
+
+Fixes: c193707dde77 ("tracing: Remove code which merges duplicates")
+Signed-off-by: Petr Pavlu <petr.pavlu@suse.com>
+Acked-by: Tom Zanussi <tom.zanussi@linux.intel.com>
+Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ kernel/trace/tracing_map.c | 7 ++++++-
+ 1 file changed, 6 insertions(+), 1 deletion(-)
+
+diff --git a/kernel/trace/tracing_map.c b/kernel/trace/tracing_map.c
+index c774e560f2f9..a4dcf0f24352 100644
+--- a/kernel/trace/tracing_map.c
++++ b/kernel/trace/tracing_map.c
+@@ -574,7 +574,12 @@ __tracing_map_insert(struct tracing_map *map, void *key, bool lookup_only)
+ }
+
+ memcpy(elt->key, key, map->key_size);
+- entry->val = elt;
++ /*
++ * Ensure the initialization is visible and
++ * publish the elt.
++ */
++ smp_wmb();
++ WRITE_ONCE(entry->val, elt);
+ atomic64_inc(&map->hits);
+
+ return entry->val;
+--
+2.43.0
+
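+The resulting publish/consume pairing, shown schematically (a condensed
+sketch of the code above and its reader):
+
+    /* writer, __tracing_map_insert(): initialize, then publish */
+    memcpy(elt->key, key, map->key_size);
+    smp_wmb();              /* key bytes visible before the pointer */
+    WRITE_ONCE(entry->val, elt);
+
+    /* reader: the address dependency on 'val' orders the key load */
+    val = READ_ONCE(entry->val);
+    if (val && keys_match(key, val->key, map->key_size))
+            /* found an existing element for this key */;
+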
--- /dev/null
+From 96e806d8a4c5fc2829444a2c28f953e512431242 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Tue, 23 Jan 2024 21:09:18 +0100
+Subject: tsnep: Fix XDP_RING_NEED_WAKEUP for empty fill ring
+
+From: Gerhard Engleder <gerhard@engleder-embedded.com>
+
+[ Upstream commit 9a91c05f4bd6f6bdd6b8f90445e0da92e3ac956c ]
+
+The fill ring of the XDP socket may not contain enough buffers to
+completely fill the RX queue during socket creation. In this case the
+flag XDP_RING_NEED_WAKEUP is not set, as this flag is only set if the RX
+queue is not completely filled during polling.
+
+Set XDP_RING_NEED_WAKEUP flag also if RX queue is not completely filled
+during XDP socket creation.
+
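+For reference, the user-space side of this contract (a sketch using
+libxdp's xsk.h helpers; the fill_q and xsk variable names are
+illustrative):
+
+ /* if the kernel flagged the fill queue, kick the kernel so it
+  * can refill the RX queue after buffers become available */
+ if (xsk_ring_prod__needs_wakeup(&fill_q))
+         recvfrom(xsk_socket__fd(xsk), NULL, 0, MSG_DONTWAIT,
+                  NULL, NULL);
+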
+Fixes: 3fc2333933fd ("tsnep: Add XDP socket zero-copy RX support")
+Signed-off-by: Gerhard Engleder <gerhard@engleder-embedded.com>
+Signed-off-by: Paolo Abeni <pabeni@redhat.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/net/ethernet/engleder/tsnep_main.c | 13 +++++++++++++
+ 1 file changed, 13 insertions(+)
+
+diff --git a/drivers/net/ethernet/engleder/tsnep_main.c b/drivers/net/ethernet/engleder/tsnep_main.c
+index 456e0336f3f6..9aeff2b37a61 100644
+--- a/drivers/net/ethernet/engleder/tsnep_main.c
++++ b/drivers/net/ethernet/engleder/tsnep_main.c
+@@ -1762,6 +1762,19 @@ static void tsnep_rx_reopen_xsk(struct tsnep_rx *rx)
+ allocated--;
+ }
+ }
++
++ /* set need wakeup flag immediately if ring is not filled completely,
++ * first polling would be too late as need wakeup signalisation would
++ * be delayed for an indefinite time
++ */
++ if (xsk_uses_need_wakeup(rx->xsk_pool)) {
++ int desc_available = tsnep_rx_desc_available(rx);
++
++ if (desc_available)
++ xsk_set_rx_need_wakeup(rx->xsk_pool);
++ else
++ xsk_clear_rx_need_wakeup(rx->xsk_pool);
++ }
+ }
+
+ static bool tsnep_pending(struct tsnep_queue *queue)
+--
+2.43.0
+
--- /dev/null
+From 2d21d1e8559b9b89588155510d926751ac77c1ba Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Tue, 23 Jan 2024 21:09:17 +0100
+Subject: tsnep: Remove FCS for XDP data path
+
+From: Gerhard Engleder <gerhard@engleder-embedded.com>
+
+[ Upstream commit 50bad6f797d4d501c5ef416a6f92e1912ab5aa8b ]
+
+The RX data buffer includes the FCS. The FCS is already stripped for
+the normal data path, but for the XDP data path it is still included
+and acts as additional, useless data.
+
+Remove the FCS from the RX data buffer for XDP as well.
+
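+The arithmetic is simply dropping the trailing 4 bytes (a sketch;
+desc_len stands for the hardware-reported descriptor length):
+
+ /* DMA'd buffer = metadata + frame + FCS; XDP must not see the
+  * trailing FCS */
+ length = desc_len - ETH_FCS_LEN; /* ETH_FCS_LEN == 4 */
+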
+Fixes: 65b28c810035 ("tsnep: Add XDP RX support")
+Fixes: 3fc2333933fd ("tsnep: Add XDP socket zero-copy RX support")
+Signed-off-by: Gerhard Engleder <gerhard@engleder-embedded.com>
+Signed-off-by: Paolo Abeni <pabeni@redhat.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/net/ethernet/engleder/tsnep_main.c | 4 ++--
+ 1 file changed, 2 insertions(+), 2 deletions(-)
+
+diff --git a/drivers/net/ethernet/engleder/tsnep_main.c b/drivers/net/ethernet/engleder/tsnep_main.c
+index df40c720e7b2..456e0336f3f6 100644
+--- a/drivers/net/ethernet/engleder/tsnep_main.c
++++ b/drivers/net/ethernet/engleder/tsnep_main.c
+@@ -1485,7 +1485,7 @@ static int tsnep_rx_poll(struct tsnep_rx *rx, struct napi_struct *napi,
+
+ xdp_prepare_buff(&xdp, page_address(entry->page),
+ XDP_PACKET_HEADROOM + TSNEP_RX_INLINE_METADATA_SIZE,
+- length, false);
++ length - ETH_FCS_LEN, false);
+
+ consume = tsnep_xdp_run_prog(rx, prog, &xdp,
+ &xdp_status, tx_nq, tx);
+@@ -1568,7 +1568,7 @@ static int tsnep_rx_poll_zc(struct tsnep_rx *rx, struct napi_struct *napi,
+ prefetch(entry->xdp->data);
+ length = __le32_to_cpu(entry->desc_wb->properties) &
+ TSNEP_DESC_LENGTH_MASK;
+- xsk_buff_set_size(entry->xdp, length);
++ xsk_buff_set_size(entry->xdp, length - ETH_FCS_LEN);
+ xsk_buff_dma_sync_for_cpu(entry->xdp, rx->xsk_pool);
+
+ /* RX metadata with timestamps is in front of actual data,
+--
+2.43.0
+
--- /dev/null
+From 1cc6dc39a13a171888b77d808036b7e7b1013f78 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Fri, 19 Jan 2024 18:22:56 +0800
+Subject: tun: add missing rx stats accounting in tun_xdp_act
+
+From: Yunjian Wang <wangyunjian@huawei.com>
+
+[ Upstream commit f1084c427f55d573fcd5688d9ba7b31b78019716 ]
+
+TUN can be used as a vhost-net backend, and it is necessary to count
+the packets transmitted from TUN to vhost-net/virtio-net. However, some
+places in the receive path were not taken into account when using XDP.
+Add the missing accounting of successfully received bytes using
+dev_sw_netstats_rx_add.
+
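+For context, dev_sw_netstats_rx_add() (include/linux/netdevice.h)
+roughly amounts to the following per-CPU update (paraphrased sketch):
+
+ struct pcpu_sw_netstats *tstats = this_cpu_ptr(dev->tstats);
+
+ u64_stats_update_begin(&tstats->syncp);
+ u64_stats_inc(&tstats->rx_packets);
+ u64_stats_add(&tstats->rx_bytes, len);
+ u64_stats_update_end(&tstats->syncp);
+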
+Fixes: 761876c857cb ("tap: XDP support")
+Signed-off-by: Yunjian Wang <wangyunjian@huawei.com>
+Reviewed-by: Willem de Bruijn <willemb@google.com>
+Acked-by: Jason Wang <jasowang@redhat.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/net/tun.c | 2 ++
+ 1 file changed, 2 insertions(+)
+
+diff --git a/drivers/net/tun.c b/drivers/net/tun.c
+index 237fef557ba5..4a4f8c8e79fa 100644
+--- a/drivers/net/tun.c
++++ b/drivers/net/tun.c
+@@ -1634,6 +1634,7 @@ static int tun_xdp_act(struct tun_struct *tun, struct bpf_prog *xdp_prog,
+ dev_core_stats_rx_dropped_inc(tun->dev);
+ return err;
+ }
++ dev_sw_netstats_rx_add(tun->dev, xdp->data_end - xdp->data);
+ break;
+ case XDP_TX:
+ err = tun_xdp_tx(tun->dev, xdp);
+@@ -1641,6 +1642,7 @@ static int tun_xdp_act(struct tun_struct *tun, struct bpf_prog *xdp_prog,
+ dev_core_stats_rx_dropped_inc(tun->dev);
+ return err;
+ }
++ dev_sw_netstats_rx_add(tun->dev, xdp->data_end - xdp->data);
+ break;
+ case XDP_PASS:
+ break;
+--
+2.43.0
+
--- /dev/null
+From c8a46f874ec240aa00fd746cab6f00c704ef999a Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Fri, 19 Jan 2024 18:22:35 +0800
+Subject: tun: fix missing dropped counter in tun_xdp_act
+
+From: Yunjian Wang <wangyunjian@huawei.com>
+
+[ Upstream commit 5744ba05e7c4bff8fec133dd0f9e51ddffba92f5 ]
+
+Commit 8ae1aff0b331 ("tuntap: split out XDP logic") added a dropped
+counter for the XDP_DROP, XDP_ABORTED, and invalid XDP actions.
+Unfortunately, that commit missed the dropped counter when an error
+occurs during the XDP_TX and XDP_REDIRECT actions. This patch fixes
+that.
+
+Fixes: 8ae1aff0b331 ("tuntap: split out XDP logic")
+Signed-off-by: Yunjian Wang <wangyunjian@huawei.com>
+Reviewed-by: Willem de Bruijn <willemb@google.com>
+Acked-by: Jason Wang <jasowang@redhat.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/net/tun.c | 8 ++++++--
+ 1 file changed, 6 insertions(+), 2 deletions(-)
+
+diff --git a/drivers/net/tun.c b/drivers/net/tun.c
+index afa5497f7c35..237fef557ba5 100644
+--- a/drivers/net/tun.c
++++ b/drivers/net/tun.c
+@@ -1630,13 +1630,17 @@ static int tun_xdp_act(struct tun_struct *tun, struct bpf_prog *xdp_prog,
+ switch (act) {
+ case XDP_REDIRECT:
+ err = xdp_do_redirect(tun->dev, xdp, xdp_prog);
+- if (err)
++ if (err) {
++ dev_core_stats_rx_dropped_inc(tun->dev);
+ return err;
++ }
+ break;
+ case XDP_TX:
+ err = tun_xdp_tx(tun->dev, xdp);
+- if (err < 0)
++ if (err < 0) {
++ dev_core_stats_rx_dropped_inc(tun->dev);
+ return err;
++ }
+ break;
+ case XDP_PASS:
+ break;
+--
+2.43.0
+
--- /dev/null
+From 40295213484304936b40dafc21ab65a5dd7cce8d Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Thu, 18 Jan 2024 20:17:49 +0000
+Subject: udp: fix busy polling
+
+From: Eric Dumazet <edumazet@google.com>
+
+[ Upstream commit a54d51fb2dfb846aedf3751af501e9688db447f5 ]
+
+Generic sk_busy_loop_end() only looks at sk->sk_receive_queue
+for presence of packets.
+
+The problem is that, for UDP sockets, after the blamed commit some
+packets can be present in another queue: udp_sk(sk)->reader_queue.
+
+In some cases, a busy poller could spin until timeout expiration,
+even if some packets are available in udp_sk(sk)->reader_queue.
+
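+For reference, a receiver opts into busy polling with SO_BUSY_POLL
+(illustrative user-space sketch; the 50us budget is arbitrary):
+
+ int usec = 50;
+
+ setsockopt(fd, SOL_SOCKET, SO_BUSY_POLL, &usec, sizeof(usec));
+ /* with the bug, a read on this socket could busy-spin for the
+  * whole budget even though datagrams already sat in
+  * udp_sk(sk)->reader_queue */
+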
+v3: - make sk_busy_loop_end() nicer (Willem)
+
+v2: - add a READ_ONCE(sk->sk_family) in sk_is_inet() to avoid KCSAN splats.
+ - add a sk_is_inet() check in sk_is_udp() (Willem feedback)
+ - add a sk_is_inet() check in sk_is_tcp().
+
+Fixes: 2276f58ac589 ("udp: use a separate rx queue for packet reception")
+Signed-off-by: Eric Dumazet <edumazet@google.com>
+Reviewed-by: Paolo Abeni <pabeni@redhat.com>
+Reviewed-by: Willem de Bruijn <willemb@google.com>
+Reviewed-by: Kuniyuki Iwashima <kuniyu@amazon.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ include/linux/skmsg.h | 6 ------
+ include/net/inet_sock.h | 5 -----
+ include/net/sock.h | 18 +++++++++++++++++-
+ net/core/sock.c | 11 +++++++++--
+ 4 files changed, 26 insertions(+), 14 deletions(-)
+
+diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h
+index c953b8c0d2f4..bd4418377bac 100644
+--- a/include/linux/skmsg.h
++++ b/include/linux/skmsg.h
+@@ -500,12 +500,6 @@ static inline bool sk_psock_strp_enabled(struct sk_psock *psock)
+ return !!psock->saved_data_ready;
+ }
+
+-static inline bool sk_is_udp(const struct sock *sk)
+-{
+- return sk->sk_type == SOCK_DGRAM &&
+- sk->sk_protocol == IPPROTO_UDP;
+-}
+-
+ #if IS_ENABLED(CONFIG_NET_SOCK_MSG)
+
+ #define BPF_F_STRPARSER (1UL << 1)
+diff --git a/include/net/inet_sock.h b/include/net/inet_sock.h
+index 74db6d97cae1..8d5fe15b0f6f 100644
+--- a/include/net/inet_sock.h
++++ b/include/net/inet_sock.h
+@@ -310,11 +310,6 @@ static inline unsigned long inet_cmsg_flags(const struct inet_sock *inet)
+ #define inet_assign_bit(nr, sk, val) \
+ assign_bit(INET_FLAGS_##nr, &inet_sk(sk)->inet_flags, val)
+
+-static inline bool sk_is_inet(struct sock *sk)
+-{
+- return sk->sk_family == AF_INET || sk->sk_family == AF_INET6;
+-}
+-
+ /**
+ * sk_to_full_sk - Access to a full socket
+ * @sk: pointer to a socket
+diff --git a/include/net/sock.h b/include/net/sock.h
+index 0201136b0b9c..f9a9f61fa122 100644
+--- a/include/net/sock.h
++++ b/include/net/sock.h
+@@ -2794,9 +2794,25 @@ static inline void skb_setup_tx_timestamp(struct sk_buff *skb, __u16 tsflags)
+ &skb_shinfo(skb)->tskey);
+ }
+
++static inline bool sk_is_inet(const struct sock *sk)
++{
++ int family = READ_ONCE(sk->sk_family);
++
++ return family == AF_INET || family == AF_INET6;
++}
++
+ static inline bool sk_is_tcp(const struct sock *sk)
+ {
+- return sk->sk_type == SOCK_STREAM && sk->sk_protocol == IPPROTO_TCP;
++ return sk_is_inet(sk) &&
++ sk->sk_type == SOCK_STREAM &&
++ sk->sk_protocol == IPPROTO_TCP;
++}
++
++static inline bool sk_is_udp(const struct sock *sk)
++{
++ return sk_is_inet(sk) &&
++ sk->sk_type == SOCK_DGRAM &&
++ sk->sk_protocol == IPPROTO_UDP;
+ }
+
+ static inline bool sk_is_stream_unix(const struct sock *sk)
+diff --git a/net/core/sock.c b/net/core/sock.c
+index d02534c77413..e5d43a068f8e 100644
+--- a/net/core/sock.c
++++ b/net/core/sock.c
+@@ -107,6 +107,7 @@
+ #include <linux/interrupt.h>
+ #include <linux/poll.h>
+ #include <linux/tcp.h>
++#include <linux/udp.h>
+ #include <linux/init.h>
+ #include <linux/highmem.h>
+ #include <linux/user_namespace.h>
+@@ -4148,8 +4149,14 @@ bool sk_busy_loop_end(void *p, unsigned long start_time)
+ {
+ struct sock *sk = p;
+
+- return !skb_queue_empty_lockless(&sk->sk_receive_queue) ||
+- sk_busy_loop_timeout(sk, start_time);
++ if (!skb_queue_empty_lockless(&sk->sk_receive_queue))
++ return true;
++
++ if (sk_is_udp(sk) &&
++ !skb_queue_empty_lockless(&udp_sk(sk)->reader_queue))
++ return true;
++
++ return sk_busy_loop_timeout(sk, start_time);
+ }
+ EXPORT_SYMBOL(sk_busy_loop_end);
+ #endif /* CONFIG_NET_RX_BUSY_POLL */
+--
+2.43.0
+
--- /dev/null
+From b3cc206ba9b0e1375f74191078ca0f84f0e06365 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Thu, 18 Jan 2024 21:03:06 +0800
+Subject: vlan: skip nested type that is not IFLA_VLAN_QOS_MAPPING
+
+From: Lin Ma <linma@zju.edu.cn>
+
+[ Upstream commit 6c21660fe221a15c789dee2bc2fd95516bc5aeaf ]
+
+In the vlan_changelink function, a loop is used to parse the nested
+attributes IFLA_VLAN_EGRESS_QOS and IFLA_VLAN_INGRESS_QOS in order to
+obtain the struct ifla_vlan_qos_mapping. These two nested attributes are
+checked in the vlan_validate_qos_map function, which calls
+nla_validate_nested_deprecated with the vlan_map_policy.
+
+However, this deprecated validator applies a LIBERAL strictness, allowing
+the presence of an attribute with the type IFLA_VLAN_QOS_UNSPEC.
+Consequently, the loop in vlan_changelink may parse an attribute of type
+IFLA_VLAN_QOS_UNSPEC and believe it carries a payload of
+struct ifla_vlan_qos_mapping, which is not necessarily true.
+
+To address this issue and ensure compatibility, this patch introduces two
+type checks that skip attributes whose type is not IFLA_VLAN_QOS_MAPPING.
+
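+The liberal behaviour stems from the policy itself, which only
+describes the one expected attribute type (a sketch matching
+net/8021q/vlan_netlink.c):
+
+ static const struct nla_policy vlan_map_policy[IFLA_VLAN_QOS_MAX + 1] = {
+         [IFLA_VLAN_QOS_MAPPING] = { .len = sizeof(struct ifla_vlan_qos_mapping) },
+ };
+
+Deprecated validation lets attribute types outside this policy (such
+as IFLA_VLAN_QOS_UNSPEC) pass through, hence the explicit nla_type()
+checks.
+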
+Fixes: 07b5b17e157b ("[VLAN]: Use rtnl_link API")
+Signed-off-by: Lin Ma <linma@zju.edu.cn>
+Reviewed-by: Simon Horman <horms@kernel.org>
+Link: https://lore.kernel.org/r/20240118130306.1644001-1-linma@zju.edu.cn
+Signed-off-by: Jakub Kicinski <kuba@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ net/8021q/vlan_netlink.c | 4 ++++
+ 1 file changed, 4 insertions(+)
+
+diff --git a/net/8021q/vlan_netlink.c b/net/8021q/vlan_netlink.c
+index 214532173536..a3b68243fd4b 100644
+--- a/net/8021q/vlan_netlink.c
++++ b/net/8021q/vlan_netlink.c
+@@ -118,12 +118,16 @@ static int vlan_changelink(struct net_device *dev, struct nlattr *tb[],
+ }
+ if (data[IFLA_VLAN_INGRESS_QOS]) {
+ nla_for_each_nested(attr, data[IFLA_VLAN_INGRESS_QOS], rem) {
++ if (nla_type(attr) != IFLA_VLAN_QOS_MAPPING)
++ continue;
+ m = nla_data(attr);
+ vlan_dev_set_ingress_priority(dev, m->to, m->from);
+ }
+ }
+ if (data[IFLA_VLAN_EGRESS_QOS]) {
+ nla_for_each_nested(attr, data[IFLA_VLAN_EGRESS_QOS], rem) {
++ if (nla_type(attr) != IFLA_VLAN_QOS_MAPPING)
++ continue;
+ m = nla_data(attr);
+ err = vlan_dev_set_egress_priority(dev, m->from, m->to);
+ if (err)
+--
+2.43.0
+
--- /dev/null
+From fd2890505f1dd291a5dc74f190d704c71b303d92 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Thu, 11 Jan 2024 18:17:44 +0200
+Subject: wifi: mac80211: fix potential sta-link leak
+
+From: Johannes Berg <johannes.berg@intel.com>
+
+[ Upstream commit b01a74b3ca6fd51b62c67733ba7c3280fa6c5d26 ]
+
+When a station is allocated, links are added but not
+set to valid yet (e.g. during connection to an AP MLD),
+we might remove the station without ever marking links
+valid, and leak them. Fix that.
+
+Fixes: cb71f1d136a6 ("wifi: mac80211: add sta link addition/removal")
+Signed-off-by: Johannes Berg <johannes.berg@intel.com>
+Reviewed-by: Ilan Peer <ilan.peer@intel.com>
+Signed-off-by: Miri Korenblit <miriam.rachel.korenblit@intel.com>
+Link: https://msgid.link/20240111181514.6573998beaf8.I09ac2e1d41c80f82a5a616b8bd1d9d8dd709a6a6@changeid
+Signed-off-by: Johannes Berg <johannes.berg@intel.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ net/mac80211/sta_info.c | 5 ++++-
+ 1 file changed, 4 insertions(+), 1 deletion(-)
+
+diff --git a/net/mac80211/sta_info.c b/net/mac80211/sta_info.c
+index 0ba613dd1cc4..c33decbb97f2 100644
+--- a/net/mac80211/sta_info.c
++++ b/net/mac80211/sta_info.c
+@@ -404,7 +404,10 @@ void sta_info_free(struct ieee80211_local *local, struct sta_info *sta)
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(sta->link); i++) {
+- if (!(sta->sta.valid_links & BIT(i)))
++ struct link_sta_info *link_sta;
++
++ link_sta = rcu_access_pointer(sta->link[i]);
++ if (!link_sta)
+ continue;
+
+ sta_remove_link(sta, i, false);
+--
+2.43.0
+
--- /dev/null
+From ce88e3847c9d2c03cbd9e1a47d10d20adfe0bdc3 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Wed, 24 Jan 2024 20:16:00 +0100
+Subject: xdp: reflect tail increase for MEM_TYPE_XSK_BUFF_POOL
+
+From: Maciej Fijalkowski <maciej.fijalkowski@intel.com>
+
+[ Upstream commit fbadd83a612c3b7aad2987893faca6bd24aaebb3 ]
+
+The XSK ZC Rx path calculates the size of the data that will be posted
+to the XSK Rx queue by subtracting xdp_buff::data from
+xdp_buff::data_end.
+
+In bpf_xdp_frags_increase_tail(), when the underlying memory type of
+xdp_rxq_info is MEM_TYPE_XSK_BUFF_POOL, add the offset to data_end in
+the tail fragment, so that later on user space will be able to take
+into account the amount of bytes added by the XDP program.
+
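+A minimal BPF-side trigger for this path (a sketch; the 16-byte growth
+is arbitrary):
+
+ SEC("xdp.frags")
+ int grow_tail(struct xdp_md *ctx)
+ {
+         /* grow the multi-buffer frame; the added bytes land in the
+          * tail fragment adjusted above */
+         if (bpf_xdp_adjust_tail(ctx, 16))
+                 return XDP_DROP;
+         return XDP_PASS;
+ }
+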
+Fixes: 24ea50127ecf ("xsk: support mbuf on ZC RX")
+Signed-off-by: Maciej Fijalkowski <maciej.fijalkowski@intel.com>
+Link: https://lore.kernel.org/r/20240124191602.566724-10-maciej.fijalkowski@intel.com
+Signed-off-by: Alexei Starovoitov <ast@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ net/core/filter.c | 2 ++
+ 1 file changed, 2 insertions(+)
+
+diff --git a/net/core/filter.c b/net/core/filter.c
+index 6575288b8580..cee53838310f 100644
+--- a/net/core/filter.c
++++ b/net/core/filter.c
+@@ -4091,6 +4091,8 @@ static int bpf_xdp_frags_increase_tail(struct xdp_buff *xdp, int offset)
+ memset(skb_frag_address(frag) + skb_frag_size(frag), 0, offset);
+ skb_frag_size_add(frag, offset);
+ sinfo->xdp_frags_size += offset;
++ if (rxq->mem.type == MEM_TYPE_XSK_BUFF_POOL)
++ xsk_buff_get_tail(xdp)->data_end += offset;
+
+ return 0;
+ }
+--
+2.43.0
+
--- /dev/null
+From b345185d903cd3418f8b01e7cdd56bdcb02fcac4 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Wed, 24 Jan 2024 20:15:54 +0100
+Subject: xsk: fix usage of multi-buffer BPF helpers for ZC XDP
+
+From: Maciej Fijalkowski <maciej.fijalkowski@intel.com>
+
+[ Upstream commit c5114710c8ce86b8317e9b448f4fd15c711c2a82 ]
+
+Currently when packet is shrunk via bpf_xdp_adjust_tail() and memory
+type is set to MEM_TYPE_XSK_BUFF_POOL, null ptr dereference happens:
+
+[1136314.192256] BUG: kernel NULL pointer dereference, address: 0000000000000034
+[1136314.203943] #PF: supervisor read access in kernel mode
+[1136314.213768] #PF: error_code(0x0000) - not-present page
+[1136314.223550] PGD 0 P4D 0
+[1136314.230684] Oops: 0000 [#1] PREEMPT SMP NOPTI
+[1136314.239621] CPU: 8 PID: 54203 Comm: xdpsock Not tainted 6.6.0+ #257
+[1136314.250469] Hardware name: Intel Corporation S2600WFT/S2600WFT, BIOS SE5C620.86B.02.01.0008.031920191559 03/19/2019
+[1136314.265615] RIP: 0010:__xdp_return+0x6c/0x210
+[1136314.274653] Code: ad 00 48 8b 47 08 49 89 f8 a8 01 0f 85 9b 01 00 00 0f 1f 44 00 00 f0 41 ff 48 34 75 32 4c 89 c7 e9 79 cd 80 ff 83 fe 03 75 17 <f6> 41 34 01 0f 85 02 01 00 00 48 89 cf e9 22 cc 1e 00 e9 3d d2 86
+[1136314.302907] RSP: 0018:ffffc900089f8db0 EFLAGS: 00010246
+[1136314.312967] RAX: ffffc9003168aed0 RBX: ffff8881c3300000 RCX: 0000000000000000
+[1136314.324953] RDX: 0000000000000000 RSI: 0000000000000003 RDI: ffffc9003168c000
+[1136314.336929] RBP: 0000000000000ae0 R08: 0000000000000002 R09: 0000000000010000
+[1136314.348844] R10: ffffc9000e495000 R11: 0000000000000040 R12: 0000000000000001
+[1136314.360706] R13: 0000000000000524 R14: ffffc9003168aec0 R15: 0000000000000001
+[1136314.373298] FS: 00007f8df8bbcb80(0000) GS:ffff8897e0e00000(0000) knlGS:0000000000000000
+[1136314.386105] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
+[1136314.396532] CR2: 0000000000000034 CR3: 00000001aa912002 CR4: 00000000007706f0
+[1136314.408377] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
+[1136314.420173] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
+[1136314.431890] PKRU: 55555554
+[1136314.439143] Call Trace:
+[1136314.446058] <IRQ>
+[1136314.452465] ? __die+0x20/0x70
+[1136314.459881] ? page_fault_oops+0x15b/0x440
+[1136314.468305] ? exc_page_fault+0x6a/0x150
+[1136314.476491] ? asm_exc_page_fault+0x22/0x30
+[1136314.484927] ? __xdp_return+0x6c/0x210
+[1136314.492863] bpf_xdp_adjust_tail+0x155/0x1d0
+[1136314.501269] bpf_prog_ccc47ae29d3b6570_xdp_sock_prog+0x15/0x60
+[1136314.511263] ice_clean_rx_irq_zc+0x206/0xc60 [ice]
+[1136314.520222] ? ice_xmit_zc+0x6e/0x150 [ice]
+[1136314.528506] ice_napi_poll+0x467/0x670 [ice]
+[1136314.536858] ? ttwu_do_activate.constprop.0+0x8f/0x1a0
+[1136314.546010] __napi_poll+0x29/0x1b0
+[1136314.553462] net_rx_action+0x133/0x270
+[1136314.561619] __do_softirq+0xbe/0x28e
+[1136314.569303] do_softirq+0x3f/0x60
+
+This comes from a __xdp_return() call whose xdp_buff argument, which
+xsk_buff_free() is supposed to consume, is passed as NULL.
+
+To address this properly, in the ZC case, the node that represents the
+frag being removed has to be pulled out of xskb_list. Introduce
+appropriate xsk helpers to do such a node operation and use them
+accordingly within bpf_xdp_adjust_tail().
+
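+A minimal BPF-side reproducer (a sketch; shrinking by at least the
+tail fragment's size forces the frag-release path):
+
+ SEC("xdp.frags")
+ int shrink_tail(struct xdp_md *ctx)
+ {
+         if (bpf_xdp_adjust_tail(ctx, -128))
+                 return XDP_DROP;
+         return XDP_PASS;
+ }
+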
+Fixes: 24ea50127ecf ("xsk: support mbuf on ZC RX")
+Acked-by: Magnus Karlsson <magnus.karlsson@intel.com> # For the xsk header part
+Signed-off-by: Maciej Fijalkowski <maciej.fijalkowski@intel.com>
+Link: https://lore.kernel.org/r/20240124191602.566724-4-maciej.fijalkowski@intel.com
+Signed-off-by: Alexei Starovoitov <ast@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ include/net/xdp_sock_drv.h | 26 +++++++++++++++++++++++
+ net/core/filter.c | 42 ++++++++++++++++++++++++++++++++------
+ 2 files changed, 62 insertions(+), 6 deletions(-)
+
+diff --git a/include/net/xdp_sock_drv.h b/include/net/xdp_sock_drv.h
+index 7290eb721c07..5425f7ad5ebd 100644
+--- a/include/net/xdp_sock_drv.h
++++ b/include/net/xdp_sock_drv.h
+@@ -147,6 +147,23 @@ static inline struct xdp_buff *xsk_buff_get_frag(struct xdp_buff *first)
+ return ret;
+ }
+
++static inline void xsk_buff_del_tail(struct xdp_buff *tail)
++{
++ struct xdp_buff_xsk *xskb = container_of(tail, struct xdp_buff_xsk, xdp);
++
++ list_del(&xskb->xskb_list_node);
++}
++
++static inline struct xdp_buff *xsk_buff_get_tail(struct xdp_buff *first)
++{
++ struct xdp_buff_xsk *xskb = container_of(first, struct xdp_buff_xsk, xdp);
++ struct xdp_buff_xsk *frag;
++
++ frag = list_last_entry(&xskb->pool->xskb_list, struct xdp_buff_xsk,
++ xskb_list_node);
++ return &frag->xdp;
++}
++
+ static inline void xsk_buff_set_size(struct xdp_buff *xdp, u32 size)
+ {
+ xdp->data = xdp->data_hard_start + XDP_PACKET_HEADROOM;
+@@ -310,6 +327,15 @@ static inline struct xdp_buff *xsk_buff_get_frag(struct xdp_buff *first)
+ return NULL;
+ }
+
++static inline void xsk_buff_del_tail(struct xdp_buff *tail)
++{
++}
++
++static inline struct xdp_buff *xsk_buff_get_tail(struct xdp_buff *first)
++{
++ return NULL;
++}
++
+ static inline void xsk_buff_set_size(struct xdp_buff *xdp, u32 size)
+ {
+ }
+diff --git a/net/core/filter.c b/net/core/filter.c
+index 1737884be52f..6575288b8580 100644
+--- a/net/core/filter.c
++++ b/net/core/filter.c
+@@ -83,6 +83,7 @@
+ #include <net/netfilter/nf_conntrack_bpf.h>
+ #include <net/netkit.h>
+ #include <linux/un.h>
++#include <net/xdp_sock_drv.h>
+
+ #include "dev.h"
+
+@@ -4094,6 +4095,40 @@ static int bpf_xdp_frags_increase_tail(struct xdp_buff *xdp, int offset)
+ return 0;
+ }
+
++static void bpf_xdp_shrink_data_zc(struct xdp_buff *xdp, int shrink,
++ struct xdp_mem_info *mem_info, bool release)
++{
++ struct xdp_buff *zc_frag = xsk_buff_get_tail(xdp);
++
++ if (release) {
++ xsk_buff_del_tail(zc_frag);
++ __xdp_return(NULL, mem_info, false, zc_frag);
++ } else {
++ zc_frag->data_end -= shrink;
++ }
++}
++
++static bool bpf_xdp_shrink_data(struct xdp_buff *xdp, skb_frag_t *frag,
++ int shrink)
++{
++ struct xdp_mem_info *mem_info = &xdp->rxq->mem;
++ bool release = skb_frag_size(frag) == shrink;
++
++ if (mem_info->type == MEM_TYPE_XSK_BUFF_POOL) {
++ bpf_xdp_shrink_data_zc(xdp, shrink, mem_info, release);
++ goto out;
++ }
++
++ if (release) {
++ struct page *page = skb_frag_page(frag);
++
++ __xdp_return(page_address(page), mem_info, false, NULL);
++ }
++
++out:
++ return release;
++}
++
+ static int bpf_xdp_frags_shrink_tail(struct xdp_buff *xdp, int offset)
+ {
+ struct skb_shared_info *sinfo = xdp_get_shared_info_from_buff(xdp);
+@@ -4108,12 +4143,7 @@ static int bpf_xdp_frags_shrink_tail(struct xdp_buff *xdp, int offset)
+
+ len_free += shrink;
+ offset -= shrink;
+-
+- if (skb_frag_size(frag) == shrink) {
+- struct page *page = skb_frag_page(frag);
+-
+- __xdp_return(page_address(page), &xdp->rxq->mem,
+- false, NULL);
++ if (bpf_xdp_shrink_data(xdp, frag, shrink)) {
+ n_frags_free++;
+ } else {
+ skb_frag_size_sub(frag, shrink);
+--
+2.43.0
+
--- /dev/null
+From 4b6c54fb2e4f8e8a3de0a7e6e4fd37f4ccaf58d0 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Wed, 24 Jan 2024 20:15:53 +0100
+Subject: xsk: make xsk_buff_pool responsible for clearing xdp_buff::flags
+
+From: Maciej Fijalkowski <maciej.fijalkowski@intel.com>
+
+[ Upstream commit f7f6aa8e24383fbb11ac55942e66da9660110f80 ]
+
+XDP multi-buffer support introduced the XDP_FLAGS_HAS_FRAGS flag, which
+is used by drivers to tell the data path whether an xdp_buff contains
+fragments or not. The data path looks the flag up on the first buffer,
+the one that occupies the linear part of the xdp_buff, so drivers only
+modify it there. This is sufficient for SKB and XDP_DRV modes, as the
+xdp_buff is usually allocated on the stack or resides within the struct
+representing the driver's queue, and fragments are carried via
+skb_frag_t structs. IOW, we are dealing with only one xdp_buff.
+
+ZC mode, though, relies on a list of xdp_buff structs that is carried
+via xsk_buff_pool::xskb_list, so the ZC data path has to make sure that
+fragments do *not* have XDP_FLAGS_HAS_FRAGS set. Otherwise,
+xsk_buff_free() could misbehave if it were executed against an xdp_buff
+that carries a frag with the XDP_FLAGS_HAS_FRAGS flag set. Such a
+scenario can take place when, within the supplied XDP program,
+bpf_xdp_adjust_tail() is used with a negative offset that in turn
+releases the tail fragment from the multi-buffer frame.
+
+Calling xsk_buff_free() on a tail fragment with XDP_FLAGS_HAS_FRAGS set
+would release all the nodes from xskb_list that were produced by the
+driver before XDP program execution, which is not what is intended:
+only the tail fragment should be deleted from xskb_list and then put
+onto xsk_buff_pool::free_list. Such a multi-buffer frame will never
+make it up to user space, so from the AF_XDP application's POV there
+would be no traffic running; however, because free_list constantly
+gets new nodes, the driver will still be able to feed the HW Rx queue
+with recycled buffers. The bottom line is that instead of traffic being
+redirected to user space, it would be continuously dropped.
+
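+For reference, xsk_buff_free() walks the whole pool list once the buff
+claims to carry frags (paraphrased from include/net/xdp_sock_drv.h):
+
+ if (likely(!xdp_buff_has_frags(xdp)))
+         goto out;
+
+ list_for_each_entry_safe(pos, tmp, &xskb->pool->xskb_list,
+                          xskb_list_node) {
+         list_del(&pos->xskb_list_node);
+         xp_free(pos);
+ }
+
+A stale XDP_FLAGS_HAS_FRAGS on a tail fragment therefore frees every
+node on xskb_list, not just that fragment.
+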
+To fix this, clear the mentioned flag on the xsk_buff_pool side during
+xdp_buff initialization, which is what should have been done right from
+the start of XSK multi-buffer support.
+
+Fixes: 1bbc04de607b ("ice: xsk: add RX multi-buffer support")
+Fixes: 1c9ba9c14658 ("i40e: xsk: add RX multi-buffer support")
+Fixes: 24ea50127ecf ("xsk: support mbuf on ZC RX")
+Signed-off-by: Maciej Fijalkowski <maciej.fijalkowski@intel.com>
+Link: https://lore.kernel.org/r/20240124191602.566724-3-maciej.fijalkowski@intel.com
+Signed-off-by: Alexei Starovoitov <ast@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/net/ethernet/intel/i40e/i40e_xsk.c | 1 -
+ drivers/net/ethernet/intel/ice/ice_xsk.c | 1 -
+ include/net/xdp_sock_drv.h | 1 +
+ net/xdp/xsk_buff_pool.c | 1 +
+ 4 files changed, 2 insertions(+), 2 deletions(-)
+
+diff --git a/drivers/net/ethernet/intel/i40e/i40e_xsk.c b/drivers/net/ethernet/intel/i40e/i40e_xsk.c
+index e99fa854d17f..fede0bb3e047 100644
+--- a/drivers/net/ethernet/intel/i40e/i40e_xsk.c
++++ b/drivers/net/ethernet/intel/i40e/i40e_xsk.c
+@@ -499,7 +499,6 @@ int i40e_clean_rx_irq_zc(struct i40e_ring *rx_ring, int budget)
+ xdp_res = i40e_run_xdp_zc(rx_ring, first, xdp_prog);
+ i40e_handle_xdp_result_zc(rx_ring, first, rx_desc, &rx_packets,
+ &rx_bytes, xdp_res, &failure);
+- first->flags = 0;
+ next_to_clean = next_to_process;
+ if (failure)
+ break;
+diff --git a/drivers/net/ethernet/intel/ice/ice_xsk.c b/drivers/net/ethernet/intel/ice/ice_xsk.c
+index 99954508184f..951f84bfdf2b 100644
+--- a/drivers/net/ethernet/intel/ice/ice_xsk.c
++++ b/drivers/net/ethernet/intel/ice/ice_xsk.c
+@@ -891,7 +891,6 @@ int ice_clean_rx_irq_zc(struct ice_rx_ring *rx_ring, int budget)
+
+ if (!first) {
+ first = xdp;
+- xdp_buff_clear_frags_flag(first);
+ } else if (ice_add_xsk_frag(rx_ring, first, xdp, size)) {
+ break;
+ }
+diff --git a/include/net/xdp_sock_drv.h b/include/net/xdp_sock_drv.h
+index 1f6fc8c7a84c..7290eb721c07 100644
+--- a/include/net/xdp_sock_drv.h
++++ b/include/net/xdp_sock_drv.h
+@@ -152,6 +152,7 @@ static inline void xsk_buff_set_size(struct xdp_buff *xdp, u32 size)
+ xdp->data = xdp->data_hard_start + XDP_PACKET_HEADROOM;
+ xdp->data_meta = xdp->data;
+ xdp->data_end = xdp->data + size;
++ xdp->flags = 0;
+ }
+
+ static inline dma_addr_t xsk_buff_raw_get_dma(struct xsk_buff_pool *pool,
+diff --git a/net/xdp/xsk_buff_pool.c b/net/xdp/xsk_buff_pool.c
+index 49cb9f9a09be..b0a611677865 100644
+--- a/net/xdp/xsk_buff_pool.c
++++ b/net/xdp/xsk_buff_pool.c
+@@ -541,6 +541,7 @@ struct xdp_buff *xp_alloc(struct xsk_buff_pool *pool)
+
+ xskb->xdp.data = xskb->xdp.data_hard_start + XDP_PACKET_HEADROOM;
+ xskb->xdp.data_meta = xskb->xdp.data;
++ xskb->xdp.flags = 0;
+
+ if (pool->dma_need_sync) {
+ dma_sync_single_range_for_device(pool->dev, xskb->dma, 0,
+--
+2.43.0
+
--- /dev/null
+From c2fe6af64698a43889e90bbda82f4a926d00e464 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Wed, 24 Jan 2024 20:15:52 +0100
+Subject: xsk: recycle buffer in case Rx queue was full
+
+From: Maciej Fijalkowski <maciej.fijalkowski@intel.com>
+
+[ Upstream commit 269009893146c495f41e9572dd9319e787c2eba9 ]
+
+Add the missing xsk_buff_free() call for when __xsk_rcv_zc() fails to
+produce a descriptor to the XSK Rx queue.
+
+Fixes: 24ea50127ecf ("xsk: support mbuf on ZC RX")
+Acked-by: Magnus Karlsson <magnus.karlsson@intel.com>
+Signed-off-by: Maciej Fijalkowski <maciej.fijalkowski@intel.com>
+Link: https://lore.kernel.org/r/20240124191602.566724-2-maciej.fijalkowski@intel.com
+Signed-off-by: Alexei Starovoitov <ast@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ net/xdp/xsk.c | 12 ++++++++----
+ 1 file changed, 8 insertions(+), 4 deletions(-)
+
+diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c
+index 3da0b52f308d..688e641cd278 100644
+--- a/net/xdp/xsk.c
++++ b/net/xdp/xsk.c
+@@ -167,8 +167,10 @@ static int xsk_rcv_zc(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len)
+ contd = XDP_PKT_CONTD;
+
+ err = __xsk_rcv_zc(xs, xskb, len, contd);
+- if (err || likely(!frags))
+- goto out;
++ if (err)
++ goto err;
++ if (likely(!frags))
++ return 0;
+
+ xskb_list = &xskb->pool->xskb_list;
+ list_for_each_entry_safe(pos, tmp, xskb_list, xskb_list_node) {
+@@ -177,11 +179,13 @@ static int xsk_rcv_zc(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len)
+ len = pos->xdp.data_end - pos->xdp.data;
+ err = __xsk_rcv_zc(xs, pos, len, contd);
+ if (err)
+- return err;
++ goto err;
+ list_del(&pos->xskb_list_node);
+ }
+
+-out:
++ return 0;
++err:
++ xsk_buff_free(xdp);
+ return err;
+ }
+
+--
+2.43.0
+