From: Sasha Levin Date: Sat, 27 Jan 2024 12:47:04 +0000 (-0500) Subject: Fixes for 6.7 X-Git-Tag: v6.1.76~69 X-Git-Url: http://git.ipfire.org/cgi-bin/gitweb.cgi?a=commitdiff_plain;h=1a92676e05b7a84e498cba8ba00db21238c02245;p=thirdparty%2Fkernel%2Fstable-queue.git Fixes for 6.7 Signed-off-by: Sasha Levin --- diff --git a/queue-6.7/afs-add-comments-on-abort-handling.patch b/queue-6.7/afs-add-comments-on-abort-handling.patch new file mode 100644 index 00000000000..3aca94d00ea --- /dev/null +++ b/queue-6.7/afs-add-comments-on-abort-handling.patch @@ -0,0 +1,189 @@ +From d4c784e91102c0fe5bfb9431f7842247c55a3d6d Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Wed, 18 Oct 2023 08:42:18 +0100 +Subject: afs: Add comments on abort handling + +From: David Howells + +[ Upstream commit fe245c8fcdac339e6b42076c828a6bede3a5e948 ] + +Add some comments on AFS abort code handling in the rotation algorithm and +adjust the errors produced to match. + +Reported-by: Jeffrey E Altman +Signed-off-by: David Howells +Reviewed-by: Jeffrey Altman +cc: Marc Dionne +cc: linux-afs@lists.infradead.org +Stable-dep-of: 17ba6f0bd14f ("afs: Fix error handling with lookup via FS.InlineBulkStatus") +Signed-off-by: Sasha Levin +--- + fs/afs/rotate.c | 101 ++++++++++++++++++++++++++++++++++++++++++------ + 1 file changed, 90 insertions(+), 11 deletions(-) + +diff --git a/fs/afs/rotate.c b/fs/afs/rotate.c +index a840c3588ebb..a3d127953ac6 100644 +--- a/fs/afs/rotate.c ++++ b/fs/afs/rotate.c +@@ -13,6 +13,7 @@ + #include + #include "internal.h" + #include "afs_fs.h" ++#include "protocol_uae.h" + + /* + * Begin iteration through a server list, starting with the vnode's last used +@@ -143,6 +144,11 @@ bool afs_select_fileserver(struct afs_operation *op) + case -ECONNABORTED: + /* The far side rejected the operation on some grounds. This + * might involve the server being busy or the volume having been moved. 
++ * ++ * Note that various V* errors should not be sent to a cache manager ++ * by a fileserver as they should be translated to more modern UAE* ++ * errors instead. IBM AFS and OpenAFS fileservers, however, do leak ++ * these abort codes. + */ + switch (op->ac.abort_code) { + case VNOVOL: +@@ -150,6 +156,11 @@ bool afs_select_fileserver(struct afs_operation *op) + * - May indicate that the VL is wrong - retry once and compare + * the results. + * - May indicate that the fileserver couldn't attach to the vol. ++ * - The volume might have been temporarily removed so that it can ++ * be replaced by a volume restore. "vos" might have ended one ++ * transaction and has yet to create the next. ++ * - The volume might not be blessed or might not be in-service ++ * (administrative action). + */ + if (op->flags & AFS_OPERATION_VNOVOL) { + op->error = -EREMOTEIO; +@@ -183,16 +194,56 @@ bool afs_select_fileserver(struct afs_operation *op) + _leave(" = t [vnovol]"); + return true; + +- case VSALVAGE: /* TODO: Should this return an error or iterate? */ + case VVOLEXISTS: +- case VNOSERVICE: + case VONLINE: +- case VDISKFULL: +- case VOVERQUOTA: +- op->error = afs_abort_to_error(op->ac.abort_code); ++ /* These should not be returned from the fileserver. */ ++ pr_warn("Fileserver returned unexpected abort %d\n", ++ op->ac.abort_code); ++ op->error = -EREMOTEIO; ++ goto next_server; ++ ++ case VNOSERVICE: ++ /* Prior to AFS 3.2 VNOSERVICE was returned from the fileserver ++ * if the volume was neither in-service nor administratively ++ * blessed. All usage was replaced by VNOVOL because AFS 3.1 and ++ * earlier cache managers did not handle VNOSERVICE and assumed ++ * it was the client OSes errno 105. ++ * ++ * Starting with OpenAFS 1.4.8 VNOSERVICE was repurposed as the ++ * fileserver idle dead time error which was sent in place of ++ * RX_CALL_TIMEOUT (-3). The error was intended to be sent if the ++ * fileserver took too long to send a reply to the client. 
++ * RX_CALL_TIMEOUT would have caused the cache manager to mark the ++ * server down whereas VNOSERVICE since AFS 3.2 would cause cache ++ * manager to temporarily (up to 15 minutes) mark the volume ++ * instance as unusable. ++ * ++ * The idle dead logic resulted in cache inconsistency since a ++ * state changing call that the cache manager assumed was dead ++ * could still be processed to completion by the fileserver. This ++ * logic was removed in OpenAFS 1.8.0 and VNOSERVICE is no longer ++ * returned. However, many 1.4.8 through 1.6.24 fileservers are ++ * still in existence. ++ * ++ * AuriStorFS fileservers have never returned VNOSERVICE. ++ * ++ * VNOSERVICE should be treated as an alias for RX_CALL_TIMEOUT. ++ */ ++ case RX_CALL_TIMEOUT: ++ op->error = -ETIMEDOUT; + goto next_server; + ++ case VSALVAGING: /* This error should not be leaked to cache managers ++ * but is from OpenAFS demand attach fileservers. ++ * It should be treated as an alias for VOFFLINE. ++ */ ++ case VSALVAGE: /* VSALVAGE should be treated as a synonym of VOFFLINE */ + case VOFFLINE: ++ /* The volume is in use by the volserver or another volume utility ++ * for an operation that might alter the contents. The volume is ++ * expected to come back but it might take a long time (could be ++ * days). ++ */ + if (!test_and_set_bit(AFS_VOLUME_OFFLINE, &op->volume->flags)) { + afs_busy(op->volume, op->ac.abort_code); + clear_bit(AFS_VOLUME_BUSY, &op->volume->flags); +@@ -207,11 +258,20 @@ bool afs_select_fileserver(struct afs_operation *op) + } + goto busy; + +- case VSALVAGING: +- case VRESTARTING: ++ case VRESTARTING: /* The fileserver is either shutting down or starting up. */ + case VBUSY: +- /* Retry after going round all the servers unless we +- * have a file lock we need to maintain. ++ /* The volume is in use by the volserver or another volume ++ * utility for an operation that is not expected to alter the ++ * contents of the volume. 
VBUSY does not need to be returned ++ * for a ROVOL or BACKVOL bound to an ITBusy volserver ++ * transaction. The fileserver is permitted to continue serving ++ * content from ROVOLs and BACKVOLs during an ITBusy transaction ++ * because the content will not change. However, many fileserver ++ * releases do return VBUSY for ROVOL and BACKVOL instances under ++ * many circumstances. ++ * ++ * Retry after going round all the servers unless we have a file ++ * lock we need to maintain. + */ + if (op->flags & AFS_OPERATION_NO_VSLEEP) { + op->error = -EBUSY; +@@ -226,7 +286,7 @@ bool afs_select_fileserver(struct afs_operation *op) + if (!afs_sleep_and_retry(op)) + goto failed; + +- /* Retry with same server & address */ ++ /* Retry with same server & address */ + _leave(" = t [vbusy]"); + return true; + } +@@ -270,10 +330,29 @@ bool afs_select_fileserver(struct afs_operation *op) + + goto restart_from_beginning; + ++ case VDISKFULL: ++ case UAENOSPC: ++ /* The partition is full. Only applies to RWVOLs. ++ * Translate locally and return ENOSPC. ++ * No replicas to failover to. ++ */ ++ op->error = -ENOSPC; ++ goto failed_but_online; ++ ++ case VOVERQUOTA: ++ case UAEDQUOT: ++ /* Volume is full. Only applies to RWVOLs. ++ * Translate locally and return EDQUOT. ++ * No replicas to failover to. 
++ */ ++ op->error = -EDQUOT; ++ goto failed_but_online; ++ + default: ++ op->error = afs_abort_to_error(op->ac.abort_code); ++ failed_but_online: + clear_bit(AFS_VOLUME_OFFLINE, &op->volume->flags); + clear_bit(AFS_VOLUME_BUSY, &op->volume->flags); +- op->error = afs_abort_to_error(op->ac.abort_code); + goto failed; + } + +-- +2.43.0 + diff --git a/queue-6.7/afs-don-t-put-afs_call-in-afs_wait_for_call_to_compl.patch b/queue-6.7/afs-don-t-put-afs_call-in-afs_wait_for_call_to_compl.patch new file mode 100644 index 00000000000..8188664c71f --- /dev/null +++ b/queue-6.7/afs-don-t-put-afs_call-in-afs_wait_for_call_to_compl.patch @@ -0,0 +1,352 @@ +From 2885f7375cc37abe94fcd4895fa663b6b24e4904 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Thu, 26 Oct 2023 09:54:07 +0100 +Subject: afs: Don't put afs_call in afs_wait_for_call_to_complete() + +From: David Howells + +[ Upstream commit 6f2ff7e89bd05677f4c08fccafcf625ca3e09c1c ] + +Don't put the afs_call struct in afs_wait_for_call_to_complete() but rather +have the caller do it. This will allow the caller to fish stuff out of the +afs_call struct rather than the afs_addr_cursor struct, thereby allowing a +subsequent patch to subsume it. 
+ +Signed-off-by: David Howells +cc: Marc Dionne +cc: linux-afs@lists.infradead.org +Stable-dep-of: 17ba6f0bd14f ("afs: Fix error handling with lookup via FS.InlineBulkStatus") +Signed-off-by: Sasha Levin +--- + fs/afs/fs_operation.c | 7 +++-- + fs/afs/fsclient.c | 5 ++- + fs/afs/internal.h | 2 +- + fs/afs/rxrpc.c | 73 ++++++++++++++++--------------------------- + fs/afs/vlclient.c | 64 ++++++++++++++++++++++--------------- + 5 files changed, 75 insertions(+), 76 deletions(-) + +diff --git a/fs/afs/fs_operation.c b/fs/afs/fs_operation.c +index bfb9a7634bd9..1c22d6e77846 100644 +--- a/fs/afs/fs_operation.c ++++ b/fs/afs/fs_operation.c +@@ -191,8 +191,11 @@ void afs_wait_for_operation(struct afs_operation *op) + else + op->ac.error = -ENOTSUPP; + +- if (op->call) +- op->error = afs_wait_for_call_to_complete(op->call, &op->ac); ++ if (op->call) { ++ afs_wait_for_call_to_complete(op->call, &op->ac); ++ op->error = op->ac.error; ++ afs_put_call(op->call); ++ } + } + + switch (op->error) { +diff --git a/fs/afs/fsclient.c b/fs/afs/fsclient.c +index 6821ce0f9d63..020073387111 100644 +--- a/fs/afs/fsclient.c ++++ b/fs/afs/fsclient.c +@@ -1612,6 +1612,7 @@ int afs_fs_give_up_all_callbacks(struct afs_net *net, + { + struct afs_call *call; + __be32 *bp; ++ int ret; + + _enter(""); + +@@ -1627,7 +1628,9 @@ int afs_fs_give_up_all_callbacks(struct afs_net *net, + + call->server = afs_use_server(server, afs_server_trace_give_up_cb); + afs_make_call(ac, call, GFP_NOFS); +- return afs_wait_for_call_to_complete(call, ac); ++ afs_wait_for_call_to_complete(call, ac); ++ afs_put_call(call); ++ return ret; + } + + /* +diff --git a/fs/afs/internal.h b/fs/afs/internal.h +index 1a306df267b0..45c4526b56be 100644 +--- a/fs/afs/internal.h ++++ b/fs/afs/internal.h +@@ -1291,7 +1291,7 @@ extern void __net_exit afs_close_socket(struct afs_net *); + extern void afs_charge_preallocation(struct work_struct *); + extern void afs_put_call(struct afs_call *); + extern void afs_make_call(struct 
afs_addr_cursor *, struct afs_call *, gfp_t); +-extern long afs_wait_for_call_to_complete(struct afs_call *, struct afs_addr_cursor *); ++void afs_wait_for_call_to_complete(struct afs_call *call, struct afs_addr_cursor *ac); + extern struct afs_call *afs_alloc_flat_call(struct afs_net *, + const struct afs_call_type *, + size_t, size_t); +diff --git a/fs/afs/rxrpc.c b/fs/afs/rxrpc.c +index 2603db03b7ff..dad8efadbc44 100644 +--- a/fs/afs/rxrpc.c ++++ b/fs/afs/rxrpc.c +@@ -575,48 +575,44 @@ static void afs_deliver_to_call(struct afs_call *call) + /* + * Wait synchronously for a call to complete and clean up the call struct. + */ +-long afs_wait_for_call_to_complete(struct afs_call *call, +- struct afs_addr_cursor *ac) ++void afs_wait_for_call_to_complete(struct afs_call *call, struct afs_addr_cursor *ac) + { +- long ret; + bool rxrpc_complete = false; + +- DECLARE_WAITQUEUE(myself, current); +- + _enter(""); + +- ret = call->error; +- if (ret < 0) +- goto out; ++ if (!afs_check_call_state(call, AFS_CALL_COMPLETE)) { ++ DECLARE_WAITQUEUE(myself, current); ++ ++ add_wait_queue(&call->waitq, &myself); ++ for (;;) { ++ set_current_state(TASK_UNINTERRUPTIBLE); ++ ++ /* deliver any messages that are in the queue */ ++ if (!afs_check_call_state(call, AFS_CALL_COMPLETE) && ++ call->need_attention) { ++ call->need_attention = false; ++ __set_current_state(TASK_RUNNING); ++ afs_deliver_to_call(call); ++ continue; ++ } + +- add_wait_queue(&call->waitq, &myself); +- for (;;) { +- set_current_state(TASK_UNINTERRUPTIBLE); +- +- /* deliver any messages that are in the queue */ +- if (!afs_check_call_state(call, AFS_CALL_COMPLETE) && +- call->need_attention) { +- call->need_attention = false; +- __set_current_state(TASK_RUNNING); +- afs_deliver_to_call(call); +- continue; +- } ++ if (afs_check_call_state(call, AFS_CALL_COMPLETE)) ++ break; + +- if (afs_check_call_state(call, AFS_CALL_COMPLETE)) +- break; ++ if (!rxrpc_kernel_check_life(call->net->socket, call->rxcall)) { ++ /* rxrpc 
terminated the call. */ ++ rxrpc_complete = true; ++ break; ++ } + +- if (!rxrpc_kernel_check_life(call->net->socket, call->rxcall)) { +- /* rxrpc terminated the call. */ +- rxrpc_complete = true; +- break; ++ schedule(); + } + +- schedule(); ++ remove_wait_queue(&call->waitq, &myself); ++ __set_current_state(TASK_RUNNING); + } + +- remove_wait_queue(&call->waitq, &myself); +- __set_current_state(TASK_RUNNING); +- + if (!afs_check_call_state(call, AFS_CALL_COMPLETE)) { + if (rxrpc_complete) { + afs_set_call_complete(call, call->error, call->abort_code); +@@ -635,23 +631,8 @@ long afs_wait_for_call_to_complete(struct afs_call *call, + ac->error = call->error; + spin_unlock_bh(&call->state_lock); + +- ret = ac->error; +- switch (ret) { +- case 0: +- ret = call->ret0; +- call->ret0 = 0; +- +- fallthrough; +- case -ECONNABORTED: ++ if (call->error == 0 || call->error == -ECONNABORTED) + ac->responded = true; +- break; +- } +- +-out: +- _debug("call complete"); +- afs_put_call(call); +- _leave(" = %p", (void *)ret); +- return ret; + } + + /* +diff --git a/fs/afs/vlclient.c b/fs/afs/vlclient.c +index 41e7932d75c6..650534892a20 100644 +--- a/fs/afs/vlclient.c ++++ b/fs/afs/vlclient.c +@@ -106,12 +106,6 @@ static int afs_deliver_vl_get_entry_by_name_u(struct afs_call *call) + return 0; + } + +-static void afs_destroy_vl_get_entry_by_name_u(struct afs_call *call) +-{ +- kfree(call->ret_vldb); +- afs_flat_call_destructor(call); +-} +- + /* + * VL.GetEntryByNameU operation type. 
+ */ +@@ -119,7 +113,7 @@ static const struct afs_call_type afs_RXVLGetEntryByNameU = { + .name = "VL.GetEntryByNameU", + .op = afs_VL_GetEntryByNameU, + .deliver = afs_deliver_vl_get_entry_by_name_u, +- .destructor = afs_destroy_vl_get_entry_by_name_u, ++ .destructor = afs_flat_call_destructor, + }; + + /* +@@ -166,7 +160,13 @@ struct afs_vldb_entry *afs_vl_get_entry_by_name_u(struct afs_vl_cursor *vc, + + trace_afs_make_vl_call(call); + afs_make_call(&vc->ac, call, GFP_KERNEL); +- return (struct afs_vldb_entry *)afs_wait_for_call_to_complete(call, &vc->ac); ++ afs_wait_for_call_to_complete(call, &vc->ac); ++ afs_put_call(call); ++ if (vc->ac.error) { ++ kfree(entry); ++ return ERR_PTR(vc->ac.error); ++ } ++ return entry; + } + + /* +@@ -249,12 +249,6 @@ static int afs_deliver_vl_get_addrs_u(struct afs_call *call) + return 0; + } + +-static void afs_vl_get_addrs_u_destructor(struct afs_call *call) +-{ +- afs_put_addrlist(call->ret_alist); +- return afs_flat_call_destructor(call); +-} +- + /* + * VL.GetAddrsU operation type. 
+ */ +@@ -262,7 +256,7 @@ static const struct afs_call_type afs_RXVLGetAddrsU = { + .name = "VL.GetAddrsU", + .op = afs_VL_GetAddrsU, + .deliver = afs_deliver_vl_get_addrs_u, +- .destructor = afs_vl_get_addrs_u_destructor, ++ .destructor = afs_flat_call_destructor, + }; + + /* +@@ -273,6 +267,7 @@ struct afs_addr_list *afs_vl_get_addrs_u(struct afs_vl_cursor *vc, + const uuid_t *uuid) + { + struct afs_ListAddrByAttributes__xdr *r; ++ struct afs_addr_list *alist; + const struct afs_uuid *u = (const struct afs_uuid *)uuid; + struct afs_call *call; + struct afs_net *net = vc->cell->net; +@@ -309,7 +304,14 @@ struct afs_addr_list *afs_vl_get_addrs_u(struct afs_vl_cursor *vc, + + trace_afs_make_vl_call(call); + afs_make_call(&vc->ac, call, GFP_KERNEL); +- return (struct afs_addr_list *)afs_wait_for_call_to_complete(call, &vc->ac); ++ afs_wait_for_call_to_complete(call, &vc->ac); ++ alist = call->ret_alist; ++ afs_put_call(call); ++ if (vc->ac.error) { ++ afs_put_addrlist(alist); ++ return ERR_PTR(vc->ac.error); ++ } ++ return alist; + } + + /* +@@ -618,7 +620,7 @@ static const struct afs_call_type afs_YFSVLGetEndpoints = { + .name = "YFSVL.GetEndpoints", + .op = afs_YFSVL_GetEndpoints, + .deliver = afs_deliver_yfsvl_get_endpoints, +- .destructor = afs_vl_get_addrs_u_destructor, ++ .destructor = afs_flat_call_destructor, + }; + + /* +@@ -628,6 +630,7 @@ static const struct afs_call_type afs_YFSVLGetEndpoints = { + struct afs_addr_list *afs_yfsvl_get_endpoints(struct afs_vl_cursor *vc, + const uuid_t *uuid) + { ++ struct afs_addr_list *alist; + struct afs_call *call; + struct afs_net *net = vc->cell->net; + __be32 *bp; +@@ -652,7 +655,14 @@ struct afs_addr_list *afs_yfsvl_get_endpoints(struct afs_vl_cursor *vc, + + trace_afs_make_vl_call(call); + afs_make_call(&vc->ac, call, GFP_KERNEL); +- return (struct afs_addr_list *)afs_wait_for_call_to_complete(call, &vc->ac); ++ afs_wait_for_call_to_complete(call, &vc->ac); ++ alist = call->ret_alist; ++ afs_put_call(call); ++ if 
(vc->ac.error) { ++ afs_put_addrlist(alist); ++ return ERR_PTR(vc->ac.error); ++ } ++ return alist; + } + + /* +@@ -717,12 +727,6 @@ static int afs_deliver_yfsvl_get_cell_name(struct afs_call *call) + return 0; + } + +-static void afs_destroy_yfsvl_get_cell_name(struct afs_call *call) +-{ +- kfree(call->ret_str); +- afs_flat_call_destructor(call); +-} +- + /* + * VL.GetCapabilities operation type + */ +@@ -730,7 +734,7 @@ static const struct afs_call_type afs_YFSVLGetCellName = { + .name = "YFSVL.GetCellName", + .op = afs_YFSVL_GetCellName, + .deliver = afs_deliver_yfsvl_get_cell_name, +- .destructor = afs_destroy_yfsvl_get_cell_name, ++ .destructor = afs_flat_call_destructor, + }; + + /* +@@ -745,6 +749,7 @@ char *afs_yfsvl_get_cell_name(struct afs_vl_cursor *vc) + struct afs_call *call; + struct afs_net *net = vc->cell->net; + __be32 *bp; ++ char *cellname; + + _enter(""); + +@@ -763,5 +768,12 @@ char *afs_yfsvl_get_cell_name(struct afs_vl_cursor *vc) + /* Can't take a ref on server */ + trace_afs_make_vl_call(call); + afs_make_call(&vc->ac, call, GFP_KERNEL); +- return (char *)afs_wait_for_call_to_complete(call, &vc->ac); ++ afs_wait_for_call_to_complete(call, &vc->ac); ++ cellname = call->ret_str; ++ afs_put_call(call); ++ if (vc->ac.error) { ++ kfree(cellname); ++ return ERR_PTR(vc->ac.error); ++ } ++ return cellname; + } +-- +2.43.0 + diff --git a/queue-6.7/afs-fix-error-handling-with-lookup-via-fs.inlinebulk.patch b/queue-6.7/afs-fix-error-handling-with-lookup-via-fs.inlinebulk.patch new file mode 100644 index 00000000000..f1ffd647e8f --- /dev/null +++ b/queue-6.7/afs-fix-error-handling-with-lookup-via-fs.inlinebulk.patch @@ -0,0 +1,121 @@ +From 118984ee79c3e4b1b67ee3211d94d7d945c76fec Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Tue, 2 Jan 2024 14:02:37 +0000 +Subject: afs: Fix error handling with lookup via FS.InlineBulkStatus + +From: David Howells + +[ Upstream commit 17ba6f0bd14fe3ac606aac6bebe5e69bdaad8ba1 ] + +When afs does a lookup, it tries to 
use FS.InlineBulkStatus to preemptively +look up a bunch of files in the parent directory and cache this locally, on +the basis that we might want to look at them too (for example if someone +does an ls on a directory, they may want to then stat every file +listed). + +FS.InlineBulkStatus can be considered a compound op with the normal abort +code applying to the compound as a whole. Each status fetch within the +compound is then given its own individual abort code - but, assuming no +error that prevents the bulk fetch from returning, the compound result will +be 0, even if all the constituent status fetches failed. + +At the conclusion of afs_do_lookup(), we should use the abort code from the +appropriate status to determine the error to return, if any - but instead +it is assumed that we were successful if the op as a whole succeeded and we +return an incompletely initialised inode, resulting in ENOENT, no matter +the actual reason. In the particular instance reported, a vnode with no +permission granted to be accessed is being given a UAEACCES abort code +which should be reported as EACCES, but is instead being reported as +ENOENT. + +Fix this by abandoning the inode (which will be cleaned up with the op) if +file[1] has an abort code indicated and turn that abort code into an error +instead. + +Whilst we're at it, add a tracepoint so that the abort codes of the +individual subrequests of FS.InlineBulkStatus can be logged. At the moment +only the container abort code can be 0. 
+ +Fixes: e49c7b2f6de7 ("afs: Build an abstraction around an "operation" concept") +Reported-by: Jeffrey Altman +Signed-off-by: David Howells +Reviewed-by: Marc Dionne +cc: linux-afs@lists.infradead.org +Signed-off-by: Sasha Levin +--- + fs/afs/dir.c | 12 +++++++++--- + include/trace/events/afs.h | 25 +++++++++++++++++++++++++ + 2 files changed, 34 insertions(+), 3 deletions(-) + +diff --git a/fs/afs/dir.c b/fs/afs/dir.c +index 75896a677b96..9140780be5a4 100644 +--- a/fs/afs/dir.c ++++ b/fs/afs/dir.c +@@ -716,6 +716,8 @@ static void afs_do_lookup_success(struct afs_operation *op) + break; + } + ++ if (vp->scb.status.abort_code) ++ trace_afs_bulkstat_error(op, &vp->fid, i, vp->scb.status.abort_code); + if (!vp->scb.have_status && !vp->scb.have_error) + continue; + +@@ -905,12 +907,16 @@ static struct inode *afs_do_lookup(struct inode *dir, struct dentry *dentry, + afs_begin_vnode_operation(op); + afs_wait_for_operation(op); + } +- inode = ERR_PTR(afs_op_error(op)); + + out_op: + if (!afs_op_error(op)) { +- inode = &op->file[1].vnode->netfs.inode; +- op->file[1].vnode = NULL; ++ if (op->file[1].scb.status.abort_code) { ++ afs_op_accumulate_error(op, -ECONNABORTED, ++ op->file[1].scb.status.abort_code); ++ } else { ++ inode = &op->file[1].vnode->netfs.inode; ++ op->file[1].vnode = NULL; ++ } + } + + if (op->file[0].scb.have_status) +diff --git a/include/trace/events/afs.h b/include/trace/events/afs.h +index e9d412d19dbb..caec276515dc 100644 +--- a/include/trace/events/afs.h ++++ b/include/trace/events/afs.h +@@ -1216,6 +1216,31 @@ TRACE_EVENT(afs_file_error, + __print_symbolic(__entry->where, afs_file_errors)) + ); + ++TRACE_EVENT(afs_bulkstat_error, ++ TP_PROTO(struct afs_operation *op, struct afs_fid *fid, unsigned int index, s32 abort), ++ ++ TP_ARGS(op, fid, index, abort), ++ ++ TP_STRUCT__entry( ++ __field_struct(struct afs_fid, fid) ++ __field(unsigned int, op) ++ __field(unsigned int, index) ++ __field(s32, abort) ++ ), ++ ++ TP_fast_assign( ++ __entry->op = 
op->debug_id; ++ __entry->fid = *fid; ++ __entry->index = index; ++ __entry->abort = abort; ++ ), ++ ++ TP_printk("OP=%08x[%02x] %llx:%llx:%x a=%d", ++ __entry->op, __entry->index, ++ __entry->fid.vid, __entry->fid.vnode, __entry->fid.unique, ++ __entry->abort) ++ ); ++ + TRACE_EVENT(afs_cm_no_server, + TP_PROTO(struct afs_call *call, struct sockaddr_rxrpc *srx), + +-- +2.43.0 + diff --git a/queue-6.7/afs-fix-the-usage-of-read_seqbegin_or_lock-in-afs_fi.patch b/queue-6.7/afs-fix-the-usage-of-read_seqbegin_or_lock-in-afs_fi.patch new file mode 100644 index 00000000000..fbccf512332 --- /dev/null +++ b/queue-6.7/afs-fix-the-usage-of-read_seqbegin_or_lock-in-afs_fi.patch @@ -0,0 +1,90 @@ +From f7c86f260e13437b71984ae0f0cd27554335461d Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Thu, 30 Nov 2023 12:56:14 +0100 +Subject: afs: fix the usage of read_seqbegin_or_lock() in afs_find_server*() + +From: Oleg Nesterov + +[ Upstream commit 1702e0654ca9a7bcd7c7619c8a5004db58945b71 ] + +David Howells says: + + (5) afs_find_server(). + + There could be a lot of servers in the list and each server can have + multiple addresses, so I think this would be better with an exclusive + second pass. + + The server list isn't likely to change all that often, but when it does + change, there's a good chance several servers are going to be + added/removed one after the other. Further, this is only going to be + used for incoming cache management/callback requests from the server, + which hopefully aren't going to happen too often - but it is remotely + drivable. + + (6) afs_find_server_by_uuid(). + + Similarly to (5), there could be a lot of servers to search through, but + they are in a tree not a flat list, so it should be faster to process. + Again, it's not likely to change that often and, again, when it does + change it's likely to involve multiple changes. 
This can be driven + remotely by an incoming cache management request but is mostly going to + be driven by setting up or reconfiguring a volume's server list - + something that also isn't likely to happen often. + +Make the "seq" counter odd on the 2nd pass, otherwise read_seqbegin_or_lock() +never takes the lock. + +Signed-off-by: Oleg Nesterov +Signed-off-by: David Howells +cc: Marc Dionne +cc: linux-afs@lists.infradead.org +Link: https://lore.kernel.org/r/20231130115614.GA21581@redhat.com/ +Stable-dep-of: 17ba6f0bd14f ("afs: Fix error handling with lookup via FS.InlineBulkStatus") +Signed-off-by: Sasha Levin +--- + fs/afs/server.c | 7 ++++--- + 1 file changed, 4 insertions(+), 3 deletions(-) + +diff --git a/fs/afs/server.c b/fs/afs/server.c +index b5237206eac3..0bd2f5ba6900 100644 +--- a/fs/afs/server.c ++++ b/fs/afs/server.c +@@ -27,7 +27,7 @@ struct afs_server *afs_find_server(struct afs_net *net, + const struct afs_addr_list *alist; + struct afs_server *server = NULL; + unsigned int i; +- int seq = 0, diff; ++ int seq = 1, diff; + + rcu_read_lock(); + +@@ -35,6 +35,7 @@ struct afs_server *afs_find_server(struct afs_net *net, + if (server) + afs_unuse_server_notime(net, server, afs_server_trace_put_find_rsq); + server = NULL; ++ seq++; /* 2 on the 1st/lockless path, otherwise odd */ + read_seqbegin_or_lock(&net->fs_addr_lock, &seq); + + if (srx->transport.family == AF_INET6) { +@@ -90,7 +91,7 @@ struct afs_server *afs_find_server_by_uuid(struct afs_net *net, const uuid_t *uu + { + struct afs_server *server = NULL; + struct rb_node *p; +- int diff, seq = 0; ++ int diff, seq = 1; + + _enter("%pU", uuid); + +@@ -102,7 +103,7 @@ struct afs_server *afs_find_server_by_uuid(struct afs_net *net, const uuid_t *uu + if (server) + afs_unuse_server(net, server, afs_server_trace_put_uuid_rsq); + server = NULL; +- ++ seq++; /* 2 on the 1st/lockless path, otherwise odd */ + read_seqbegin_or_lock(&net->fs_lock, &seq); + + p = net->fs_servers.rb_node; +-- +2.43.0 + diff --git 
a/queue-6.7/afs-handle-the-vio-and-uaeio-aborts-explicitly.patch b/queue-6.7/afs-handle-the-vio-and-uaeio-aborts-explicitly.patch new file mode 100644 index 00000000000..65a186da907 --- /dev/null +++ b/queue-6.7/afs-handle-the-vio-and-uaeio-aborts-explicitly.patch @@ -0,0 +1,44 @@ +From e9106b5b7e80e3ab85a87fbea7ea3ecdc53673cd Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Fri, 20 Oct 2023 16:00:18 +0100 +Subject: afs: Handle the VIO and UAEIO aborts explicitly + +From: David Howells + +[ Upstream commit eb8eae65f0c713bcef84b082aa919f72c3d83268 ] + +When processing the result of a call, handle the VIO and UAEIO abort +specifically rather than leaving it to a default case. Rather than +erroring out unconditionally, see if there's another server if the volume +has more than one server available, otherwise return -EREMOTEIO. + +Signed-off-by: David Howells +cc: Marc Dionne +cc: linux-afs@lists.infradead.org +Stable-dep-of: 17ba6f0bd14f ("afs: Fix error handling with lookup via FS.InlineBulkStatus") +Signed-off-by: Sasha Levin +--- + fs/afs/rotate.c | 7 +++++++ + 1 file changed, 7 insertions(+) + +diff --git a/fs/afs/rotate.c b/fs/afs/rotate.c +index 59aed7a6dd11..a108cd55bb4e 100644 +--- a/fs/afs/rotate.c ++++ b/fs/afs/rotate.c +@@ -330,6 +330,13 @@ bool afs_select_fileserver(struct afs_operation *op) + + goto restart_from_beginning; + ++ case UAEIO: ++ case VIO: ++ op->error = -EREMOTEIO; ++ if (op->volume->type != AFSVL_RWVOL) ++ goto next_server; ++ goto failed; ++ + case VDISKFULL: + case UAENOSPC: + /* The partition is full. Only applies to RWVOLs. 
+-- +2.43.0 + diff --git a/queue-6.7/afs-hide-silly-rename-files-from-userspace.patch b/queue-6.7/afs-hide-silly-rename-files-from-userspace.patch new file mode 100644 index 00000000000..d285df11aa1 --- /dev/null +++ b/queue-6.7/afs-hide-silly-rename-files-from-userspace.patch @@ -0,0 +1,54 @@ +From b8adfd03eeab12713e571102691a0551705137fc Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Mon, 8 Jan 2024 17:22:36 +0000 +Subject: afs: Hide silly-rename files from userspace + +From: David Howells + +[ Upstream commit 57e9d49c54528c49b8bffe6d99d782ea051ea534 ] + +There appears to be a race between silly-rename files being created/removed +and various userspace tools iterating over the contents of a directory, +leading to such errors as: + + find: './kernel/.tmp_cpio_dir/include/dt-bindings/reset/.__afs2080': No such file or directory + tar: ./include/linux/greybus/.__afs3C95: File removed before we read it + +when building a kernel. + +Fix afs_readdir() so that it doesn't return .__afsXXXX silly-rename files +to userspace. This doesn't stop them being looked up directly by name as +we need to be able to look them up from within the kernel as part of the +silly-rename algorithm. + +Fixes: 79ddbfa500b3 ("afs: Implement sillyrename for unlink and rename") +Signed-off-by: David Howells +cc: Marc Dionne +cc: linux-afs@lists.infradead.org +Signed-off-by: Sasha Levin +--- + fs/afs/dir.c | 8 ++++++++ + 1 file changed, 8 insertions(+) + +diff --git a/fs/afs/dir.c b/fs/afs/dir.c +index 5219182e52e1..2df2e9ee130d 100644 +--- a/fs/afs/dir.c ++++ b/fs/afs/dir.c +@@ -474,6 +474,14 @@ static int afs_dir_iterate_block(struct afs_vnode *dvnode, + continue; + } + ++ /* Don't expose silly rename entries to userspace. */ ++ if (nlen > 6 && ++ dire->u.name[0] == '.' 
&& ++ ctx->actor != afs_lookup_filldir && ++ ctx->actor != afs_lookup_one_filldir && ++ memcmp(dire->u.name, ".__afs", 6) == 0) ++ continue; ++ + /* found the next entry */ + if (!dir_emit(ctx, dire->u.name, nlen, + ntohl(dire->u.vnode), +-- +2.43.0 + diff --git a/queue-6.7/afs-simplify-error-handling.patch b/queue-6.7/afs-simplify-error-handling.patch new file mode 100644 index 00000000000..71b95db7328 --- /dev/null +++ b/queue-6.7/afs-simplify-error-handling.patch @@ -0,0 +1,1028 @@ +From d2f27e70f3691aa4364cbb1f807ed0811f3a3ab0 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Wed, 25 Oct 2023 17:53:33 +0100 +Subject: afs: Simplify error handling + +From: David Howells + +[ Upstream commit aa453becce5d1ae1b94b7fc22f47d7b05d22b14e ] + +Simplify error handling a bit by moving it from the afs_addr_cursor struct +to the afs_operation and afs_vl_cursor structs and using the error +prioritisation function for accumulating errors from multiple sources (AFS +tries to rotate between multiple fileservers, some of which may be +inaccessible or in some state of offlinedness). 
+ +Signed-off-by: David Howells +cc: Marc Dionne +cc: linux-afs@lists.infradead.org +Stable-dep-of: 17ba6f0bd14f ("afs: Fix error handling with lookup via FS.InlineBulkStatus") +Signed-off-by: Sasha Levin +--- + fs/afs/addr_list.c | 8 ++-- + fs/afs/dir.c | 14 ++++--- + fs/afs/dir_silly.c | 2 +- + fs/afs/file.c | 3 -- + fs/afs/fs_operation.c | 24 +++++------ + fs/afs/fsclient.c | 1 + + fs/afs/internal.h | 44 +++++++++++++++------ + fs/afs/misc.c | 10 ++++- + fs/afs/rotate.c | 58 ++++++++++++++------------- + fs/afs/rxrpc.c | 17 ++++---- + fs/afs/server.c | 1 - + fs/afs/vl_alias.c | 2 +- + fs/afs/vl_probe.c | 7 ++-- + fs/afs/vl_rotate.c | 92 +++++++++++++++++++++---------------------- + fs/afs/vlclient.c | 34 ++++++++++------ + 15 files changed, 174 insertions(+), 143 deletions(-) + +diff --git a/fs/afs/addr_list.c b/fs/afs/addr_list.c +index 519821f5aedc..f4837c3b8ae2 100644 +--- a/fs/afs/addr_list.c ++++ b/fs/afs/addr_list.c +@@ -386,26 +386,24 @@ bool afs_iterate_addresses(struct afs_addr_cursor *ac) + selected: + ac->index = index; + set_bit(index, &ac->tried); +- ac->responded = false; ++ ac->call_responded = false; + return true; + } + + /* + * Release an address list cursor. 
+ */ +-int afs_end_cursor(struct afs_addr_cursor *ac) ++void afs_end_cursor(struct afs_addr_cursor *ac) + { + struct afs_addr_list *alist; + + alist = ac->alist; + if (alist) { +- if (ac->responded && ++ if (ac->call_responded && + ac->index != alist->preferred && + test_bit(ac->alist->preferred, &ac->tried)) + WRITE_ONCE(alist->preferred, ac->index); + afs_put_addrlist(alist); + ac->alist = NULL; + } +- +- return ac->error; + } +diff --git a/fs/afs/dir.c b/fs/afs/dir.c +index 15763418a938..75896a677b96 100644 +--- a/fs/afs/dir.c ++++ b/fs/afs/dir.c +@@ -701,8 +701,9 @@ static void afs_do_lookup_success(struct afs_operation *op) + vp = &op->file[0]; + abort_code = vp->scb.status.abort_code; + if (abort_code != 0) { +- op->ac.abort_code = abort_code; +- op->error = afs_abort_to_error(abort_code); ++ op->call_abort_code = abort_code; ++ afs_op_set_error(op, afs_abort_to_error(abort_code)); ++ op->cumul_error.abort_code = abort_code; + } + break; + +@@ -854,13 +855,14 @@ static struct inode *afs_do_lookup(struct inode *dir, struct dentry *dentry, + _debug("nr_files %u", op->nr_files); + + /* Need space for examining all the selected files */ +- op->error = -ENOMEM; + if (op->nr_files > 2) { + op->more_files = kvcalloc(op->nr_files - 2, + sizeof(struct afs_vnode_param), + GFP_KERNEL); +- if (!op->more_files) ++ if (!op->more_files) { ++ afs_op_nomem(op); + goto out_op; ++ } + + for (i = 2; i < op->nr_files; i++) { + vp = &op->more_files[i - 2]; +@@ -1263,7 +1265,7 @@ void afs_check_for_remote_deletion(struct afs_operation *op) + { + struct afs_vnode *vnode = op->file[0].vnode; + +- switch (op->ac.abort_code) { ++ switch (afs_op_abort_code(op)) { + case VNOVNODE: + set_bit(AFS_VNODE_DELETED, &vnode->flags); + afs_break_callback(vnode, afs_cb_break_for_deleted); +@@ -1288,7 +1290,7 @@ static void afs_vnode_new_inode(struct afs_operation *op) + /* ENOMEM or EINTR at a really inconvenient time - just abandon + * the new directory on the server. 
+ */ +- op->error = PTR_ERR(inode); ++ afs_op_accumulate_error(op, PTR_ERR(inode), 0); + return; + } + +diff --git a/fs/afs/dir_silly.c b/fs/afs/dir_silly.c +index bb5807e87fa4..a1e581946b93 100644 +--- a/fs/afs/dir_silly.c ++++ b/fs/afs/dir_silly.c +@@ -218,7 +218,7 @@ static int afs_do_silly_unlink(struct afs_vnode *dvnode, struct afs_vnode *vnode + /* If there was a conflict with a third party, check the status of the + * unlinked vnode. + */ +- if (op->error == 0 && (op->flags & AFS_OPERATION_DIR_CONFLICT)) { ++ if (op->cumul_error.error == 0 && (op->flags & AFS_OPERATION_DIR_CONFLICT)) { + op->file[1].update_ctime = false; + op->fetch_status.which = 1; + op->ops = &afs_fetch_status_operation; +diff --git a/fs/afs/file.c b/fs/afs/file.c +index 0c81c39c32f5..8f9b42427569 100644 +--- a/fs/afs/file.c ++++ b/fs/afs/file.c +@@ -245,10 +245,7 @@ static void afs_fetch_data_notify(struct afs_operation *op) + struct netfs_io_subrequest *subreq = req->subreq; + int error = afs_op_error(op); + +- if (error == -ECONNABORTED) +- error = afs_abort_to_error(op->ac.abort_code); + req->error = error; +- + if (subreq) { + __set_bit(NETFS_SREQ_CLEAR_TAIL, &subreq->flags); + netfs_subreq_terminated(subreq, error ?: req->actual_len, false); +diff --git a/fs/afs/fs_operation.c b/fs/afs/fs_operation.c +index 1c22d6e77846..cebe4fad8192 100644 +--- a/fs/afs/fs_operation.c ++++ b/fs/afs/fs_operation.c +@@ -169,9 +169,6 @@ static void afs_end_vnode_operation(struct afs_operation *op) + } + + afs_drop_io_locks(op); +- +- if (op->error == -ECONNABORTED) +- op->error = afs_abort_to_error(op->ac.abort_code); + } + + /* +@@ -182,6 +179,8 @@ void afs_wait_for_operation(struct afs_operation *op) + _enter(""); + + while (afs_select_fileserver(op)) { ++ op->call_error = 0; ++ op->call_abort_code = 0; + op->cb_s_break = op->server->cb_s_break; + if (test_bit(AFS_SERVER_FL_IS_YFS, &op->server->flags) && + op->ops->issue_yfs_rpc) +@@ -189,28 +188,29 @@ void afs_wait_for_operation(struct 
afs_operation *op) + else if (op->ops->issue_afs_rpc) + op->ops->issue_afs_rpc(op); + else +- op->ac.error = -ENOTSUPP; ++ op->call_error = -ENOTSUPP; + + if (op->call) { + afs_wait_for_call_to_complete(op->call, &op->ac); +- op->error = op->ac.error; ++ op->call_abort_code = op->call->abort_code; ++ op->call_error = op->call->error; ++ op->call_responded = op->call->responded; ++ op->ac.call_responded = true; ++ WRITE_ONCE(op->ac.alist->addrs[op->ac.index].last_error, ++ op->call_error); + afs_put_call(op->call); + } + } + +- switch (op->error) { +- case 0: ++ if (!afs_op_error(op)) { + _debug("success"); + op->ops->success(op); +- break; +- case -ECONNABORTED: ++ } else if (op->cumul_error.aborted) { + if (op->ops->aborted) + op->ops->aborted(op); +- fallthrough; +- default: ++ } else { + if (op->ops->failed) + op->ops->failed(op); +- break; + } + + afs_end_vnode_operation(op); +diff --git a/fs/afs/fsclient.c b/fs/afs/fsclient.c +index 020073387111..2a56dea22519 100644 +--- a/fs/afs/fsclient.c ++++ b/fs/afs/fsclient.c +@@ -1629,6 +1629,7 @@ int afs_fs_give_up_all_callbacks(struct afs_net *net, + call->server = afs_use_server(server, afs_server_trace_give_up_cb); + afs_make_call(ac, call, GFP_NOFS); + afs_wait_for_call_to_complete(call, ac); ++ ret = call->error; + afs_put_call(call); + return ret; + } +diff --git a/fs/afs/internal.h b/fs/afs/internal.h +index 45c4526b56be..5f6db0ac06ac 100644 +--- a/fs/afs/internal.h ++++ b/fs/afs/internal.h +@@ -75,6 +75,7 @@ enum afs_call_state { + struct afs_address { + struct rxrpc_peer *peer; + u16 service_id; ++ short last_error; /* Last error from this address */ + }; + + /* +@@ -121,7 +122,6 @@ struct afs_call { + }; + void *buffer; /* reply receive buffer */ + union { +- long ret0; /* Value to reply with instead of 0 */ + struct afs_addr_list *ret_alist; + struct afs_vldb_entry *ret_vldb; + char *ret_str; +@@ -145,6 +145,7 @@ struct afs_call { + bool upgrade; /* T to request service upgrade */ + bool intr; /* T if 
interruptible */ + bool unmarshalling_error; /* T if an unmarshalling error occurred */ ++ bool responded; /* Got a response from the call (may be abort) */ + u16 service_id; /* Actual service ID (after upgrade) */ + unsigned int debug_id; /* Trace ID */ + u32 operation_ID; /* operation ID for an incoming call */ +@@ -719,8 +720,10 @@ struct afs_permits { + * Error prioritisation and accumulation. + */ + struct afs_error { +- short error; /* Accumulated error */ ++ s32 abort_code; /* Cumulative abort code */ ++ short error; /* Cumulative error */ + bool responded; /* T if server responded */ ++ bool aborted; /* T if ->error is from an abort */ + }; + + /* +@@ -730,10 +733,8 @@ struct afs_addr_cursor { + struct afs_addr_list *alist; /* Current address list (pins ref) */ + unsigned long tried; /* Tried addresses */ + signed char index; /* Current address */ +- bool responded; /* T if the current address responded */ + unsigned short nr_iterations; /* Number of address iterations */ +- short error; +- u32 abort_code; ++ bool call_responded; + }; + + /* +@@ -746,13 +747,16 @@ struct afs_vl_cursor { + struct afs_vlserver *server; /* Server on which this resides */ + struct key *key; /* Key for the server */ + unsigned long untried; /* Bitmask of untried servers */ ++ struct afs_error cumul_error; /* Cumulative error */ ++ s32 call_abort_code; + short index; /* Current server */ +- short error; ++ short call_error; /* Error from single call */ + unsigned short flags; + #define AFS_VL_CURSOR_STOP 0x0001 /* Set to cease iteration */ + #define AFS_VL_CURSOR_RETRY 0x0002 /* Set to do a retry */ + #define AFS_VL_CURSOR_RETRIED 0x0004 /* Set if started a retry */ +- unsigned short nr_iterations; /* Number of server iterations */ ++ short nr_iterations; /* Number of server iterations */ ++ bool call_responded; /* T if the current address responded */ + }; + + /* +@@ -803,8 +807,10 @@ struct afs_operation { + struct dentry *dentry_2; /* Second dentry to be altered */ + struct 
timespec64 mtime; /* Modification time to record */ + struct timespec64 ctime; /* Change time to set */ ++ struct afs_error cumul_error; /* Cumulative error */ + short nr_files; /* Number of entries in file[], more_files */ +- short error; ++ short call_error; /* Error from single call */ ++ s32 call_abort_code; /* Abort code from single call */ + unsigned int debug_id; + + unsigned int cb_v_break; /* Volume break counter before op */ +@@ -860,6 +866,8 @@ struct afs_operation { + unsigned long untried; /* Bitmask of untried servers */ + short index; /* Current server */ + short nr_iterations; /* Number of server iterations */ ++ bool call_responded; /* T if the current address responded */ ++ + + unsigned int flags; + #define AFS_OPERATION_STOP 0x0001 /* Set to cease iteration */ +@@ -976,7 +984,7 @@ bool afs_addr_list_same(const struct afs_addr_list *a, + const struct afs_addr_list *b); + extern struct afs_vlserver_list *afs_dns_query(struct afs_cell *, time64_t *); + extern bool afs_iterate_addresses(struct afs_addr_cursor *); +-extern int afs_end_cursor(struct afs_addr_cursor *); ++extern void afs_end_cursor(struct afs_addr_cursor *ac); + + extern int afs_merge_fs_addr4(struct afs_net *net, struct afs_addr_list *addr, + __be32 xdr, u16 port); +@@ -1235,17 +1243,27 @@ extern void afs_prioritise_error(struct afs_error *, int, u32); + + static inline void afs_op_nomem(struct afs_operation *op) + { +- op->error = -ENOMEM; ++ op->cumul_error.error = -ENOMEM; + } + + static inline int afs_op_error(const struct afs_operation *op) + { +- return op->error; ++ return op->cumul_error.error; ++} ++ ++static inline s32 afs_op_abort_code(const struct afs_operation *op) ++{ ++ return op->cumul_error.abort_code; + } + + static inline int afs_op_set_error(struct afs_operation *op, int error) + { +- return op->error = error; ++ return op->cumul_error.error = error; ++} ++ ++static inline void afs_op_accumulate_error(struct afs_operation *op, int error, s32 abort_code) ++{ ++ 
afs_prioritise_error(&op->cumul_error, error, abort_code); + } + + /* +@@ -1619,7 +1637,7 @@ static inline void afs_update_dentry_version(struct afs_operation *op, + struct afs_vnode_param *dir_vp, + struct dentry *dentry) + { +- if (!op->error) ++ if (!op->cumul_error.error) + dentry->d_fsdata = + (void *)(unsigned long)dir_vp->scb.status.data_version; + } +diff --git a/fs/afs/misc.c b/fs/afs/misc.c +index 805328ca5428..b8180bf2281f 100644 +--- a/fs/afs/misc.c ++++ b/fs/afs/misc.c +@@ -116,6 +116,8 @@ void afs_prioritise_error(struct afs_error *e, int error, u32 abort_code) + { + switch (error) { + case 0: ++ e->aborted = false; ++ e->error = 0; + return; + default: + if (e->error == -ETIMEDOUT || +@@ -161,12 +163,16 @@ void afs_prioritise_error(struct afs_error *e, int error, u32 abort_code) + if (e->responded) + return; + e->error = error; ++ e->aborted = false; + return; + + case -ECONNABORTED: +- error = afs_abort_to_error(abort_code); +- fallthrough; ++ e->error = afs_abort_to_error(abort_code); ++ e->aborted = true; ++ e->responded = true; ++ return; + case -ENETRESET: /* Responded, but we seem to have changed address */ ++ e->aborted = false; + e->responded = true; + e->error = error; + return; +diff --git a/fs/afs/rotate.c b/fs/afs/rotate.c +index d64c1d90faed..68c88e3a0916 100644 +--- a/fs/afs/rotate.c ++++ b/fs/afs/rotate.c +@@ -112,9 +112,9 @@ bool afs_select_fileserver(struct afs_operation *op) + struct afs_addr_list *alist; + struct afs_server *server; + struct afs_vnode *vnode = op->file[0].vnode; +- struct afs_error e; + unsigned int rtt; +- int error = op->ac.error, i; ++ s32 abort_code = op->call_abort_code; ++ int error = op->call_error, i; + + op->nr_iterations++; + +@@ -122,7 +122,7 @@ bool afs_select_fileserver(struct afs_operation *op) + op->debug_id, op->nr_iterations, op->volume->vid, + op->untried, op->index, + op->ac.tried, op->ac.index, +- error, op->ac.abort_code); ++ error, abort_code); + + if (op->flags & AFS_OPERATION_STOP) { + 
_leave(" = f [stopped]"); +@@ -133,8 +133,10 @@ bool afs_select_fileserver(struct afs_operation *op) + goto start; + + /* Evaluate the result of the previous operation, if there was one. */ +- switch (error) { ++ switch (op->call_error) { + case 0: ++ op->cumul_error.responded = true; ++ fallthrough; + default: + /* Success or local failure. Stop. */ + afs_op_set_error(op, error); +@@ -151,7 +153,8 @@ bool afs_select_fileserver(struct afs_operation *op) + * errors instead. IBM AFS and OpenAFS fileservers, however, do leak + * these abort codes. + */ +- switch (op->ac.abort_code) { ++ op->cumul_error.responded = true; ++ switch (abort_code) { + case VNOVOL: + /* This fileserver doesn't know about the volume. + * - May indicate that the VL is wrong - retry once and compare +@@ -164,7 +167,7 @@ bool afs_select_fileserver(struct afs_operation *op) + * (administrative action). + */ + if (op->flags & AFS_OPERATION_VNOVOL) { +- op->error = -EREMOTEIO; ++ afs_op_accumulate_error(op, -EREMOTEIO, abort_code); + goto next_server; + } + +@@ -188,7 +191,7 @@ bool afs_select_fileserver(struct afs_operation *op) + * it's the fileserver having trouble. + */ + if (rcu_access_pointer(op->volume->servers) == op->server_list) { +- op->error = -EREMOTEIO; ++ afs_op_accumulate_error(op, -EREMOTEIO, abort_code); + goto next_server; + } + +@@ -201,8 +204,8 @@ bool afs_select_fileserver(struct afs_operation *op) + case VONLINE: + /* These should not be returned from the fileserver. */ + pr_warn("Fileserver returned unexpected abort %d\n", +- op->ac.abort_code); +- op->error = -EREMOTEIO; ++ abort_code); ++ afs_op_accumulate_error(op, -EREMOTEIO, abort_code); + goto next_server; + + case VNOSERVICE: +@@ -233,7 +236,7 @@ bool afs_select_fileserver(struct afs_operation *op) + * VNOSERVICE should be treated as an alias for RX_CALL_TIMEOUT. 
+ */ + case RX_CALL_TIMEOUT: +- op->error = -ETIMEDOUT; ++ afs_op_accumulate_error(op, -ETIMEDOUT, abort_code); + goto next_server; + + case VSALVAGING: /* This error should not be leaked to cache managers +@@ -248,7 +251,7 @@ bool afs_select_fileserver(struct afs_operation *op) + * days). + */ + if (!test_and_set_bit(AFS_VOLUME_OFFLINE, &op->volume->flags)) { +- afs_busy(op->volume, op->ac.abort_code); ++ afs_busy(op->volume, abort_code); + clear_bit(AFS_VOLUME_BUSY, &op->volume->flags); + } + if (op->flags & AFS_OPERATION_NO_VSLEEP) { +@@ -281,7 +284,7 @@ bool afs_select_fileserver(struct afs_operation *op) + goto failed; + } + if (!test_and_set_bit(AFS_VOLUME_BUSY, &op->volume->flags)) { +- afs_busy(op->volume, op->ac.abort_code); ++ afs_busy(op->volume, abort_code); + clear_bit(AFS_VOLUME_OFFLINE, &op->volume->flags); + } + busy: +@@ -329,7 +332,7 @@ bool afs_select_fileserver(struct afs_operation *op) + * TODO: Retry a few times with sleeps. + */ + if (rcu_access_pointer(op->volume->servers) == op->server_list) { +- op->error = -ENOMEDIUM; ++ afs_op_accumulate_error(op, -ENOMEDIUM, abort_code); + goto failed; + } + +@@ -337,7 +340,7 @@ bool afs_select_fileserver(struct afs_operation *op) + + case UAEIO: + case VIO: +- op->error = -EREMOTEIO; ++ afs_op_accumulate_error(op, -EREMOTEIO, abort_code); + if (op->volume->type != AFSVL_RWVOL) + goto next_server; + goto failed; +@@ -361,7 +364,7 @@ bool afs_select_fileserver(struct afs_operation *op) + goto failed_but_online; + + default: +- op->error = afs_abort_to_error(op->ac.abort_code); ++ afs_op_accumulate_error(op, error, abort_code); + failed_but_online: + clear_bit(AFS_VOLUME_OFFLINE, &op->volume->flags); + clear_bit(AFS_VOLUME_BUSY, &op->volume->flags); +@@ -380,7 +383,7 @@ bool afs_select_fileserver(struct afs_operation *op) + case -EHOSTDOWN: + case -ECONNREFUSED: + _debug("no conn"); +- op->error = error; ++ afs_op_accumulate_error(op, error, 0); + goto iterate_address; + + case -ENETRESET: +@@ -506,6 
+509,7 @@ bool afs_select_fileserver(struct afs_operation *op) + op->index, op->ac.index, op->ac.alist->nr_addrs, + rxrpc_kernel_remote_addr(op->ac.alist->addrs[op->ac.index].peer)); + ++ op->call_responded = false; + _leave(" = t"); + return true; + +@@ -543,17 +547,14 @@ bool afs_select_fileserver(struct afs_operation *op) + if (op->flags & AFS_OPERATION_VBUSY) + goto restart_from_beginning; + +- e.error = -EDESTADDRREQ; +- e.responded = false; + for (i = 0; i < op->server_list->nr_servers; i++) { + struct afs_server *s = op->server_list->servers[i].server; + +- afs_prioritise_error(&e, READ_ONCE(s->probe.error), +- s->probe.abort_code); ++ error = READ_ONCE(s->probe.error); ++ if (error < 0) ++ afs_op_accumulate_error(op, error, s->probe.abort_code); + } + +- error = e.error; +- op->error = error; + failed: + op->flags |= AFS_OPERATION_STOP; + afs_end_cursor(&op->ac); +@@ -576,11 +577,13 @@ void afs_dump_edestaddrreq(const struct afs_operation *op) + rcu_read_lock(); + + pr_notice("EDESTADDR occurred\n"); +- pr_notice("FC: cbb=%x cbb2=%x fl=%x err=%hd\n", ++ pr_notice("OP: cbb=%x cbb2=%x fl=%x err=%hd\n", + op->file[0].cb_break_before, +- op->file[1].cb_break_before, op->flags, op->error); +- pr_notice("FC: ut=%lx ix=%d ni=%u\n", ++ op->file[1].cb_break_before, op->flags, op->cumul_error.error); ++ pr_notice("OP: ut=%lx ix=%d ni=%u\n", + op->untried, op->index, op->nr_iterations); ++ pr_notice("OP: call er=%d ac=%d r=%u\n", ++ op->call_error, op->call_abort_code, op->call_responded); + + if (op->server_list) { + const struct afs_server_list *sl = op->server_list; +@@ -605,8 +608,7 @@ void afs_dump_edestaddrreq(const struct afs_operation *op) + } + } + +- pr_notice("AC: t=%lx ax=%u ac=%d er=%d r=%u ni=%u\n", +- op->ac.tried, op->ac.index, op->ac.abort_code, op->ac.error, +- op->ac.responded, op->ac.nr_iterations); ++ pr_notice("AC: t=%lx ax=%u ni=%u\n", ++ op->ac.tried, op->ac.index, op->ac.nr_iterations); + rcu_read_unlock(); + } +diff --git a/fs/afs/rxrpc.c 
b/fs/afs/rxrpc.c +index dad8efadbc44..0b3e2f20b0e0 100644 +--- a/fs/afs/rxrpc.c ++++ b/fs/afs/rxrpc.c +@@ -408,8 +408,7 @@ void afs_make_call(struct afs_addr_cursor *ac, struct afs_call *call, gfp_t gfp) + rxrpc_kernel_recv_data(call->net->socket, rxcall, + &msg.msg_iter, &len, false, + &call->abort_code, &call->service_id); +- ac->abort_code = call->abort_code; +- ac->responded = true; ++ call->responded = true; + } + call->error = ret; + trace_afs_call_done(call); +@@ -429,7 +428,7 @@ void afs_make_call(struct afs_addr_cursor *ac, struct afs_call *call, gfp_t gfp) + afs_set_call_complete(call, ret, 0); + } + +- ac->error = ret; ++ call->error = ret; + call->state = AFS_CALL_COMPLETE; + _leave(" = %d", ret); + } +@@ -510,6 +509,7 @@ static void afs_deliver_to_call(struct afs_call *call) + ret = -EBADMSG; + switch (ret) { + case 0: ++ call->responded = true; + afs_queue_call_work(call); + if (state == AFS_CALL_CL_PROC_REPLY) { + if (call->op) +@@ -524,9 +524,11 @@ static void afs_deliver_to_call(struct afs_call *call) + goto out; + case -ECONNABORTED: + ASSERTCMP(state, ==, AFS_CALL_COMPLETE); ++ call->responded = true; + afs_log_error(call, call->abort_code); + goto done; + case -ENOTSUPP: ++ call->responded = true; + abort_code = RXGEN_OPCODE; + rxrpc_kernel_abort_call(call->net->socket, call->rxcall, + abort_code, ret, +@@ -573,7 +575,7 @@ static void afs_deliver_to_call(struct afs_call *call) + } + + /* +- * Wait synchronously for a call to complete and clean up the call struct. ++ * Wait synchronously for a call to complete. 
+ */ + void afs_wait_for_call_to_complete(struct afs_call *call, struct afs_addr_cursor *ac) + { +@@ -626,13 +628,8 @@ void afs_wait_for_call_to_complete(struct afs_call *call, struct afs_addr_cursor + } + } + +- spin_lock_bh(&call->state_lock); +- ac->abort_code = call->abort_code; +- ac->error = call->error; +- spin_unlock_bh(&call->state_lock); +- + if (call->error == 0 || call->error == -ECONNABORTED) +- ac->responded = true; ++ call->responded = true; + } + + /* +diff --git a/fs/afs/server.c b/fs/afs/server.c +index 2826e6eced71..f7791ef13618 100644 +--- a/fs/afs/server.c ++++ b/fs/afs/server.c +@@ -437,7 +437,6 @@ static void afs_give_up_callbacks(struct afs_net *net, struct afs_server *server + struct afs_addr_cursor ac = { + .alist = alist, + .index = alist->preferred, +- .error = 0, + }; + + afs_fs_give_up_all_callbacks(net, server, &ac, NULL); +diff --git a/fs/afs/vl_alias.c b/fs/afs/vl_alias.c +index 6fdf9f1bedc0..89cadd9a69e1 100644 +--- a/fs/afs/vl_alias.c ++++ b/fs/afs/vl_alias.c +@@ -236,7 +236,7 @@ static char *afs_vl_get_cell_name(struct afs_cell *cell, struct key *key) + + while (afs_select_vlserver(&vc)) { + if (!test_bit(AFS_VLSERVER_FL_IS_YFS, &vc.server->flags)) { +- vc.ac.error = -EOPNOTSUPP; ++ vc.call_error = -EOPNOTSUPP; + skipped = true; + continue; + } +diff --git a/fs/afs/vl_probe.c b/fs/afs/vl_probe.c +index 9551aef07cee..2f8a13c2bf0c 100644 +--- a/fs/afs/vl_probe.c ++++ b/fs/afs/vl_probe.c +@@ -169,10 +169,11 @@ static bool afs_do_probe_vlserver(struct afs_net *net, + call = afs_vl_get_capabilities(net, &ac, key, server, + server_index); + if (!IS_ERR(call)) { ++ afs_prioritise_error(_e, call->error, call->abort_code); + afs_put_call(call); + in_progress = true; + } else { +- afs_prioritise_error(_e, PTR_ERR(call), ac.abort_code); ++ afs_prioritise_error(_e, PTR_ERR(call), 0); + afs_done_one_vl_probe(server, false); + } + } +@@ -187,12 +188,10 @@ int afs_send_vl_probes(struct afs_net *net, struct key *key, + struct afs_vlserver_list 
*vllist) + { + struct afs_vlserver *server; +- struct afs_error e; ++ struct afs_error e = {}; + bool in_progress = false; + int i; + +- e.error = 0; +- e.responded = false; + for (i = 0; i < vllist->nr_servers; i++) { + server = vllist->servers[i].server; + if (test_bit(AFS_VLSERVER_FL_PROBED, &server->flags)) +diff --git a/fs/afs/vl_rotate.c b/fs/afs/vl_rotate.c +index f8f255c966ae..e2dc54082a05 100644 +--- a/fs/afs/vl_rotate.c ++++ b/fs/afs/vl_rotate.c +@@ -20,11 +20,11 @@ bool afs_begin_vlserver_operation(struct afs_vl_cursor *vc, struct afs_cell *cel + memset(vc, 0, sizeof(*vc)); + vc->cell = cell; + vc->key = key; +- vc->error = -EDESTADDRREQ; +- vc->ac.error = SHRT_MAX; ++ vc->cumul_error.error = -EDESTADDRREQ; ++ vc->nr_iterations = -1; + + if (signal_pending(current)) { +- vc->error = -EINTR; ++ vc->cumul_error.error = -EINTR; + vc->flags |= AFS_VL_CURSOR_STOP; + return false; + } +@@ -52,7 +52,7 @@ static bool afs_start_vl_iteration(struct afs_vl_cursor *vc) + &cell->dns_lookup_count, + smp_load_acquire(&cell->dns_lookup_count) + != dns_lookup_count) < 0) { +- vc->error = -ERESTARTSYS; ++ vc->cumul_error.error = -ERESTARTSYS; + return false; + } + } +@@ -60,12 +60,12 @@ static bool afs_start_vl_iteration(struct afs_vl_cursor *vc) + /* Status load is ordered after lookup counter load */ + if (cell->dns_status == DNS_LOOKUP_GOT_NOT_FOUND) { + pr_warn("No record of cell %s\n", cell->name); +- vc->error = -ENOENT; ++ vc->cumul_error.error = -ENOENT; + return false; + } + + if (cell->dns_source == DNS_RECORD_UNAVAILABLE) { +- vc->error = -EDESTADDRREQ; ++ vc->cumul_error.error = -EDESTADDRREQ; + return false; + } + } +@@ -91,52 +91,52 @@ bool afs_select_vlserver(struct afs_vl_cursor *vc) + { + struct afs_addr_list *alist; + struct afs_vlserver *vlserver; +- struct afs_error e; + unsigned int rtt; +- int error = vc->ac.error, i; ++ s32 abort_code = vc->call_abort_code; ++ int error = vc->call_error, i; ++ ++ vc->nr_iterations++; + + 
_enter("%lx[%d],%lx[%d],%d,%d", + vc->untried, vc->index, + vc->ac.tried, vc->ac.index, +- error, vc->ac.abort_code); ++ error, abort_code); + + if (vc->flags & AFS_VL_CURSOR_STOP) { + _leave(" = f [stopped]"); + return false; + } + +- vc->nr_iterations++; ++ if (vc->nr_iterations == 0) ++ goto start; + + /* Evaluate the result of the previous operation, if there was one. */ + switch (error) { +- case SHRT_MAX: +- goto start; +- + default: + case 0: + /* Success or local failure. Stop. */ +- vc->error = error; ++ vc->cumul_error.error = error; + vc->flags |= AFS_VL_CURSOR_STOP; +- _leave(" = f [okay/local %d]", vc->ac.error); ++ _leave(" = f [okay/local %d]", vc->cumul_error.error); + return false; + + case -ECONNABORTED: + /* The far side rejected the operation on some grounds. This + * might involve the server being busy or the volume having been moved. + */ +- switch (vc->ac.abort_code) { ++ switch (abort_code) { + case AFSVL_IO: + case AFSVL_BADVOLOPER: + case AFSVL_NOMEM: + /* The server went weird. 
*/ +- vc->error = -EREMOTEIO; ++ afs_prioritise_error(&vc->cumul_error, -EREMOTEIO, abort_code); + //write_lock(&vc->cell->vl_servers_lock); + //vc->server_list->weird_mask |= 1 << vc->index; + //write_unlock(&vc->cell->vl_servers_lock); + goto next_server; + + default: +- vc->error = afs_abort_to_error(vc->ac.abort_code); ++ afs_prioritise_error(&vc->cumul_error, error, abort_code); + goto failed; + } + +@@ -149,12 +149,12 @@ bool afs_select_vlserver(struct afs_vl_cursor *vc) + case -ETIMEDOUT: + case -ETIME: + _debug("no conn %d", error); +- vc->error = error; ++ afs_prioritise_error(&vc->cumul_error, error, 0); + goto iterate_address; + + case -ECONNRESET: + _debug("call reset"); +- vc->error = error; ++ afs_prioritise_error(&vc->cumul_error, error, 0); + vc->flags |= AFS_VL_CURSOR_RETRY; + goto next_server; + +@@ -178,15 +178,19 @@ bool afs_select_vlserver(struct afs_vl_cursor *vc) + goto failed; + + error = afs_send_vl_probes(vc->cell->net, vc->key, vc->server_list); +- if (error < 0) +- goto failed_set_error; ++ if (error < 0) { ++ afs_prioritise_error(&vc->cumul_error, error, 0); ++ goto failed; ++ } + + pick_server: + _debug("pick [%lx]", vc->untried); + + error = afs_wait_for_vl_probes(vc->server_list, vc->untried); +- if (error < 0) +- goto failed_set_error; ++ if (error < 0) { ++ afs_prioritise_error(&vc->cumul_error, error, 0); ++ goto failed; ++ } + + /* Pick the untried server with the lowest RTT. 
*/ + vc->index = vc->server_list->preferred; +@@ -249,6 +253,7 @@ bool afs_select_vlserver(struct afs_vl_cursor *vc) + + _debug("VL address %d/%d", vc->ac.index, vc->ac.alist->nr_addrs); + ++ vc->call_responded = false; + _leave(" = t %pISpc", rxrpc_kernel_remote_addr(vc->ac.alist->addrs[vc->ac.index].peer)); + return true; + +@@ -264,25 +269,19 @@ bool afs_select_vlserver(struct afs_vl_cursor *vc) + if (vc->flags & AFS_VL_CURSOR_RETRY) + goto restart_from_beginning; + +- e.error = -EDESTADDRREQ; +- e.responded = false; + for (i = 0; i < vc->server_list->nr_servers; i++) { + struct afs_vlserver *s = vc->server_list->servers[i].server; + + if (test_bit(AFS_VLSERVER_FL_RESPONDING, &s->flags)) +- e.responded = true; +- afs_prioritise_error(&e, READ_ONCE(s->probe.error), ++ vc->cumul_error.responded = true; ++ afs_prioritise_error(&vc->cumul_error, READ_ONCE(s->probe.error), + s->probe.abort_code); + } + +- error = e.error; +- +-failed_set_error: +- vc->error = error; + failed: + vc->flags |= AFS_VL_CURSOR_STOP; + afs_end_cursor(&vc->ac); +- _leave(" = f [failed %d]", vc->error); ++ _leave(" = f [failed %d]", vc->cumul_error.error); + return false; + } + +@@ -305,7 +304,10 @@ static void afs_vl_dump_edestaddrreq(const struct afs_vl_cursor *vc) + pr_notice("DNS: src=%u st=%u lc=%x\n", + cell->dns_source, cell->dns_status, cell->dns_lookup_count); + pr_notice("VC: ut=%lx ix=%u ni=%hu fl=%hx err=%hd\n", +- vc->untried, vc->index, vc->nr_iterations, vc->flags, vc->error); ++ vc->untried, vc->index, vc->nr_iterations, vc->flags, ++ vc->cumul_error.error); ++ pr_notice("VC: call er=%d ac=%d r=%u\n", ++ vc->call_error, vc->call_abort_code, vc->call_responded); + + if (vc->server_list) { + const struct afs_vlserver_list *sl = vc->server_list; +@@ -329,9 +331,8 @@ static void afs_vl_dump_edestaddrreq(const struct afs_vl_cursor *vc) + } + } + +- pr_notice("AC: t=%lx ax=%u ac=%d er=%d r=%u ni=%u\n", +- vc->ac.tried, vc->ac.index, vc->ac.abort_code, vc->ac.error, +- 
vc->ac.responded, vc->ac.nr_iterations); ++ pr_notice("AC: t=%lx ax=%u ni=%u\n", ++ vc->ac.tried, vc->ac.index, vc->ac.nr_iterations); + rcu_read_unlock(); + } + +@@ -342,17 +343,16 @@ int afs_end_vlserver_operation(struct afs_vl_cursor *vc) + { + struct afs_net *net = vc->cell->net; + +- if (vc->error == -EDESTADDRREQ || +- vc->error == -EADDRNOTAVAIL || +- vc->error == -ENETUNREACH || +- vc->error == -EHOSTUNREACH) ++ switch (vc->cumul_error.error) { ++ case -EDESTADDRREQ: ++ case -EADDRNOTAVAIL: ++ case -ENETUNREACH: ++ case -EHOSTUNREACH: + afs_vl_dump_edestaddrreq(vc); ++ break; ++ } + + afs_end_cursor(&vc->ac); + afs_put_vlserverlist(net, vc->server_list); +- +- if (vc->error == -ECONNABORTED) +- vc->error = afs_abort_to_error(vc->ac.abort_code); +- +- return vc->error; ++ return vc->cumul_error.error; + } +diff --git a/fs/afs/vlclient.c b/fs/afs/vlclient.c +index 650534892a20..db7e94584e87 100644 +--- a/fs/afs/vlclient.c ++++ b/fs/afs/vlclient.c +@@ -161,10 +161,13 @@ struct afs_vldb_entry *afs_vl_get_entry_by_name_u(struct afs_vl_cursor *vc, + trace_afs_make_vl_call(call); + afs_make_call(&vc->ac, call, GFP_KERNEL); + afs_wait_for_call_to_complete(call, &vc->ac); ++ vc->call_abort_code = call->abort_code; ++ vc->call_error = call->error; ++ vc->call_responded = call->responded; + afs_put_call(call); +- if (vc->ac.error) { ++ if (vc->call_error) { + kfree(entry); +- return ERR_PTR(vc->ac.error); ++ return ERR_PTR(vc->call_error); + } + return entry; + } +@@ -305,11 +308,14 @@ struct afs_addr_list *afs_vl_get_addrs_u(struct afs_vl_cursor *vc, + trace_afs_make_vl_call(call); + afs_make_call(&vc->ac, call, GFP_KERNEL); + afs_wait_for_call_to_complete(call, &vc->ac); +- alist = call->ret_alist; ++ vc->call_abort_code = call->abort_code; ++ vc->call_error = call->error; ++ vc->call_responded = call->responded; ++ alist = call->ret_alist; + afs_put_call(call); +- if (vc->ac.error) { ++ if (vc->call_error) { + afs_put_addrlist(alist); +- return 
ERR_PTR(vc->ac.error); ++ return ERR_PTR(vc->call_error); + } + return alist; + } +@@ -656,11 +662,14 @@ struct afs_addr_list *afs_yfsvl_get_endpoints(struct afs_vl_cursor *vc, + trace_afs_make_vl_call(call); + afs_make_call(&vc->ac, call, GFP_KERNEL); + afs_wait_for_call_to_complete(call, &vc->ac); +- alist = call->ret_alist; ++ vc->call_abort_code = call->abort_code; ++ vc->call_error = call->error; ++ vc->call_responded = call->responded; ++ alist = call->ret_alist; + afs_put_call(call); +- if (vc->ac.error) { ++ if (vc->call_error) { + afs_put_addrlist(alist); +- return ERR_PTR(vc->ac.error); ++ return ERR_PTR(vc->call_error); + } + return alist; + } +@@ -769,11 +778,14 @@ char *afs_yfsvl_get_cell_name(struct afs_vl_cursor *vc) + trace_afs_make_vl_call(call); + afs_make_call(&vc->ac, call, GFP_KERNEL); + afs_wait_for_call_to_complete(call, &vc->ac); +- cellname = call->ret_str; ++ vc->call_abort_code = call->abort_code; ++ vc->call_error = call->error; ++ vc->call_responded = call->responded; ++ cellname = call->ret_str; + afs_put_call(call); +- if (vc->ac.error) { ++ if (vc->call_error) { + kfree(cellname); +- return ERR_PTR(vc->ac.error); ++ return ERR_PTR(vc->call_error); + } + return cellname; + } +-- +2.43.0 + diff --git a/queue-6.7/afs-turn-the-afs_addr_list-address-array-into-an-arr.patch b/queue-6.7/afs-turn-the-afs_addr_list-address-array-into-an-arr.patch new file mode 100644 index 00000000000..607217d96de --- /dev/null +++ b/queue-6.7/afs-turn-the-afs_addr_list-address-array-into-an-arr.patch @@ -0,0 +1,269 @@ +From 8001a9917176e5da09c7619530be845008e837ed Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Wed, 18 Oct 2023 15:38:14 +0100 +Subject: afs: Turn the afs_addr_list address array into an array of structs + +From: David Howells + +[ Upstream commit 07f3502b33a260f873e35708d2fa693eb52225cb ] + +Turn the afs_addr_list address array into an array of structs, thereby +allowing per-address (such as RTT) info to be added. 
+ +Signed-off-by: David Howells +cc: Marc Dionne +cc: linux-afs@lists.infradead.org +Stable-dep-of: 17ba6f0bd14f ("afs: Fix error handling with lookup via FS.InlineBulkStatus") +Signed-off-by: Sasha Levin +--- + fs/afs/addr_list.c | 10 +++++----- + fs/afs/fs_probe.c | 6 +++--- + fs/afs/internal.h | 6 +++++- + fs/afs/proc.c | 4 ++-- + fs/afs/rotate.c | 2 +- + fs/afs/rxrpc.c | 4 ++-- + fs/afs/server.c | 4 ++-- + fs/afs/vl_alias.c | 4 ++-- + fs/afs/vl_probe.c | 6 +++--- + fs/afs/vl_rotate.c | 2 +- + 10 files changed, 26 insertions(+), 22 deletions(-) + +diff --git a/fs/afs/addr_list.c b/fs/afs/addr_list.c +index de1ae0bead3b..ac05a59e9d46 100644 +--- a/fs/afs/addr_list.c ++++ b/fs/afs/addr_list.c +@@ -45,7 +45,7 @@ struct afs_addr_list *afs_alloc_addrlist(unsigned int nr, + alist->max_addrs = nr; + + for (i = 0; i < nr; i++) { +- struct sockaddr_rxrpc *srx = &alist->addrs[i]; ++ struct sockaddr_rxrpc *srx = &alist->addrs[i].srx; + srx->srx_family = AF_RXRPC; + srx->srx_service = service; + srx->transport_type = SOCK_DGRAM; +@@ -281,7 +281,7 @@ void afs_merge_fs_addr4(struct afs_addr_list *alist, __be32 xdr, u16 port) + return; + + for (i = 0; i < alist->nr_ipv4; i++) { +- struct sockaddr_in *a = &alist->addrs[i].transport.sin; ++ struct sockaddr_in *a = &alist->addrs[i].srx.transport.sin; + u32 a_addr = ntohl(a->sin_addr.s_addr); + u16 a_port = ntohs(a->sin_port); + +@@ -298,7 +298,7 @@ void afs_merge_fs_addr4(struct afs_addr_list *alist, __be32 xdr, u16 port) + alist->addrs + i, + sizeof(alist->addrs[0]) * (alist->nr_addrs - i)); + +- srx = &alist->addrs[i]; ++ srx = &alist->addrs[i].srx; + srx->srx_family = AF_RXRPC; + srx->transport_type = SOCK_DGRAM; + srx->transport_len = sizeof(srx->transport.sin); +@@ -321,7 +321,7 @@ void afs_merge_fs_addr6(struct afs_addr_list *alist, __be32 *xdr, u16 port) + return; + + for (i = alist->nr_ipv4; i < alist->nr_addrs; i++) { +- struct sockaddr_in6 *a = &alist->addrs[i].transport.sin6; ++ struct sockaddr_in6 *a = 
&alist->addrs[i].srx.transport.sin6; + u16 a_port = ntohs(a->sin6_port); + + diff = memcmp(xdr, &a->sin6_addr, 16); +@@ -338,7 +338,7 @@ void afs_merge_fs_addr6(struct afs_addr_list *alist, __be32 *xdr, u16 port) + alist->addrs + i, + sizeof(alist->addrs[0]) * (alist->nr_addrs - i)); + +- srx = &alist->addrs[i]; ++ srx = &alist->addrs[i].srx; + srx->srx_family = AF_RXRPC; + srx->transport_type = SOCK_DGRAM; + srx->transport_len = sizeof(srx->transport.sin6); +diff --git a/fs/afs/fs_probe.c b/fs/afs/fs_probe.c +index daaf3810cc92..3dd24842f277 100644 +--- a/fs/afs/fs_probe.c ++++ b/fs/afs/fs_probe.c +@@ -153,12 +153,12 @@ void afs_fileserver_probe_result(struct afs_call *call) + if (call->service_id == YFS_FS_SERVICE) { + server->probe.is_yfs = true; + set_bit(AFS_SERVER_FL_IS_YFS, &server->flags); +- alist->addrs[index].srx_service = call->service_id; ++ alist->addrs[index].srx.srx_service = call->service_id; + } else { + server->probe.not_yfs = true; + if (!server->probe.is_yfs) { + clear_bit(AFS_SERVER_FL_IS_YFS, &server->flags); +- alist->addrs[index].srx_service = call->service_id; ++ alist->addrs[index].srx.srx_service = call->service_id; + } + cap0 = ntohl(call->tmp); + if (cap0 & AFS3_VICED_CAPABILITY_64BITFILES) +@@ -182,7 +182,7 @@ void afs_fileserver_probe_result(struct afs_call *call) + spin_unlock(&server->probe_lock); + + _debug("probe %pU [%u] %pISpc rtt=%u ret=%d", +- &server->uuid, index, &alist->addrs[index].transport, ++ &server->uuid, index, &alist->addrs[index].srx.transport, + rtt_us, ret); + + return afs_done_one_fs_probe(call->net, server); +diff --git a/fs/afs/internal.h b/fs/afs/internal.h +index 7385d62c8cf5..e2adb314ab6a 100644 +--- a/fs/afs/internal.h ++++ b/fs/afs/internal.h +@@ -87,7 +87,9 @@ struct afs_addr_list { + enum dns_lookup_status status:8; + unsigned long failed; /* Mask of addrs that failed locally/ICMP */ + unsigned long responded; /* Mask of addrs that responded */ +- struct sockaddr_rxrpc addrs[] __counted_by(max_addrs); 
++ struct { ++ struct sockaddr_rxrpc srx; ++ } addrs[] __counted_by(max_addrs); + #define AFS_MAX_ADDRESSES ((unsigned int)(sizeof(unsigned long) * 8)) + }; + +@@ -969,6 +971,8 @@ extern void afs_put_addrlist(struct afs_addr_list *); + extern struct afs_vlserver_list *afs_parse_text_addrs(struct afs_net *, + const char *, size_t, char, + unsigned short, unsigned short); ++bool afs_addr_list_same(const struct afs_addr_list *a, ++ const struct afs_addr_list *b); + extern struct afs_vlserver_list *afs_dns_query(struct afs_cell *, time64_t *); + extern bool afs_iterate_addresses(struct afs_addr_cursor *); + extern int afs_end_cursor(struct afs_addr_cursor *); +diff --git a/fs/afs/proc.c b/fs/afs/proc.c +index 2a0c83d71565..ab9cd986cfd9 100644 +--- a/fs/afs/proc.c ++++ b/fs/afs/proc.c +@@ -307,7 +307,7 @@ static int afs_proc_cell_vlservers_show(struct seq_file *m, void *v) + for (i = 0; i < alist->nr_addrs; i++) + seq_printf(m, " %c %pISpc\n", + alist->preferred == i ? '>' : '-', +- &alist->addrs[i].transport); ++ &alist->addrs[i].srx.transport); + } + seq_printf(m, " info: fl=%lx rtt=%d\n", vlserver->flags, vlserver->rtt); + seq_printf(m, " probe: fl=%x e=%d ac=%d out=%d\n", +@@ -399,7 +399,7 @@ static int afs_proc_servers_show(struct seq_file *m, void *v) + alist->version, alist->responded, alist->failed); + for (i = 0; i < alist->nr_addrs; i++) + seq_printf(m, " [%x] %pISpc%s\n", +- i, &alist->addrs[i].transport, ++ i, &alist->addrs[i].srx.transport, + alist->preferred == i ? 
"*" : ""); + return 0; + } +diff --git a/fs/afs/rotate.c b/fs/afs/rotate.c +index a3d127953ac6..46081e5da6f5 100644 +--- a/fs/afs/rotate.c ++++ b/fs/afs/rotate.c +@@ -488,7 +488,7 @@ bool afs_select_fileserver(struct afs_operation *op) + + _debug("address [%u] %u/%u %pISp", + op->index, op->ac.index, op->ac.alist->nr_addrs, +- &op->ac.alist->addrs[op->ac.index].transport); ++ &op->ac.alist->addrs[op->ac.index].srx.transport); + + _leave(" = t"); + return true; +diff --git a/fs/afs/rxrpc.c b/fs/afs/rxrpc.c +index d642d06a453b..181317126e43 100644 +--- a/fs/afs/rxrpc.c ++++ b/fs/afs/rxrpc.c +@@ -296,7 +296,7 @@ static void afs_notify_end_request_tx(struct sock *sock, + */ + void afs_make_call(struct afs_addr_cursor *ac, struct afs_call *call, gfp_t gfp) + { +- struct sockaddr_rxrpc *srx = &ac->alist->addrs[ac->index]; ++ struct sockaddr_rxrpc *srx = &ac->alist->addrs[ac->index].srx; + struct rxrpc_call *rxcall; + struct msghdr msg; + struct kvec iov[1]; +@@ -461,7 +461,7 @@ static void afs_log_error(struct afs_call *call, s32 remote_abort) + max = m + 1; + pr_notice("kAFS: Peer reported %s failure on %s [%pISp]\n", + msg, call->type->name, +- &call->alist->addrs[call->addr_ix].transport); ++ &call->alist->addrs[call->addr_ix].srx.transport); + } + } + +diff --git a/fs/afs/server.c b/fs/afs/server.c +index 0bd2f5ba6900..b8e2d211d4a1 100644 +--- a/fs/afs/server.c ++++ b/fs/afs/server.c +@@ -43,7 +43,7 @@ struct afs_server *afs_find_server(struct afs_net *net, + hlist_for_each_entry_rcu(server, &net->fs_addresses6, addr6_link) { + alist = rcu_dereference(server->addresses); + for (i = alist->nr_ipv4; i < alist->nr_addrs; i++) { +- b = &alist->addrs[i].transport.sin6; ++ b = &alist->addrs[i].srx.transport.sin6; + diff = ((u16 __force)a->sin6_port - + (u16 __force)b->sin6_port); + if (diff == 0) +@@ -59,7 +59,7 @@ struct afs_server *afs_find_server(struct afs_net *net, + hlist_for_each_entry_rcu(server, &net->fs_addresses4, addr4_link) { + alist = 
rcu_dereference(server->addresses); + for (i = 0; i < alist->nr_ipv4; i++) { +- b = &alist->addrs[i].transport.sin; ++ b = &alist->addrs[i].srx.transport.sin; + diff = ((u16 __force)a->sin_port - + (u16 __force)b->sin_port); + if (diff == 0) +diff --git a/fs/afs/vl_alias.c b/fs/afs/vl_alias.c +index f04a80e4f5c3..d3c0df70a1a5 100644 +--- a/fs/afs/vl_alias.c ++++ b/fs/afs/vl_alias.c +@@ -94,8 +94,8 @@ static int afs_compare_fs_alists(const struct afs_server *server_a, + lb = rcu_dereference(server_b->addresses); + + while (a < la->nr_addrs && b < lb->nr_addrs) { +- const struct sockaddr_rxrpc *srx_a = &la->addrs[a]; +- const struct sockaddr_rxrpc *srx_b = &lb->addrs[b]; ++ const struct sockaddr_rxrpc *srx_a = &la->addrs[a].srx; ++ const struct sockaddr_rxrpc *srx_b = &lb->addrs[b].srx; + int diff = afs_compare_addrs(srx_a, srx_b); + + if (diff < 0) { +diff --git a/fs/afs/vl_probe.c b/fs/afs/vl_probe.c +index 58452b86e672..bdd9372e3fb2 100644 +--- a/fs/afs/vl_probe.c ++++ b/fs/afs/vl_probe.c +@@ -106,12 +106,12 @@ void afs_vlserver_probe_result(struct afs_call *call) + if (call->service_id == YFS_VL_SERVICE) { + server->probe.flags |= AFS_VLSERVER_PROBE_IS_YFS; + set_bit(AFS_VLSERVER_FL_IS_YFS, &server->flags); +- alist->addrs[index].srx_service = call->service_id; ++ alist->addrs[index].srx.srx_service = call->service_id; + } else { + server->probe.flags |= AFS_VLSERVER_PROBE_NOT_YFS; + if (!(server->probe.flags & AFS_VLSERVER_PROBE_IS_YFS)) { + clear_bit(AFS_VLSERVER_FL_IS_YFS, &server->flags); +- alist->addrs[index].srx_service = call->service_id; ++ alist->addrs[index].srx.srx_service = call->service_id; + } + } + +@@ -131,7 +131,7 @@ void afs_vlserver_probe_result(struct afs_call *call) + spin_unlock(&server->probe_lock); + + _debug("probe [%u][%u] %pISpc rtt=%u ret=%d", +- server_index, index, &alist->addrs[index].transport, rtt_us, ret); ++ server_index, index, &alist->addrs[index].srx.transport, rtt_us, ret); + + afs_done_one_vl_probe(server, have_result); + 
} +diff --git a/fs/afs/vl_rotate.c b/fs/afs/vl_rotate.c +index eb415ce56360..e52b9d4c8a0a 100644 +--- a/fs/afs/vl_rotate.c ++++ b/fs/afs/vl_rotate.c +@@ -249,7 +249,7 @@ bool afs_select_vlserver(struct afs_vl_cursor *vc) + + _debug("VL address %d/%d", vc->ac.index, vc->ac.alist->nr_addrs); + +- _leave(" = t %pISpc", &vc->ac.alist->addrs[vc->ac.index].transport); ++ _leave(" = t %pISpc", &vc->ac.alist->addrs[vc->ac.index].srx.transport); + return true; + + next_server: +-- +2.43.0 + diff --git a/queue-6.7/afs-use-op-nr_iterations-1-to-indicate-to-begin-file.patch b/queue-6.7/afs-use-op-nr_iterations-1-to-indicate-to-begin-file.patch new file mode 100644 index 00000000000..d8ce058d951 --- /dev/null +++ b/queue-6.7/afs-use-op-nr_iterations-1-to-indicate-to-begin-file.patch @@ -0,0 +1,86 @@ +From c0d15d3cc0ef5a5944b6b80309a57145d4500e26 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Fri, 20 Oct 2023 16:04:52 +0100 +Subject: afs: Use op->nr_iterations=-1 to indicate to begin fileserver + iteration + +From: David Howells + +[ Upstream commit 075171fd22be33acf4ab354814bfa6de1c3412ce ] + +Set op->nr_iterations to -1 to indicate that we need to begin fileserver +iteration rather than setting error to SHRT_MAX. This makes it easier to +eliminate the address cursor. 
+ +Signed-off-by: David Howells +cc: Marc Dionne +cc: linux-afs@lists.infradead.org +Stable-dep-of: 17ba6f0bd14f ("afs: Fix error handling with lookup via FS.InlineBulkStatus") +Signed-off-by: Sasha Levin +--- + fs/afs/fs_operation.c | 2 +- + fs/afs/internal.h | 2 +- + fs/afs/rotate.c | 11 ++++++----- + 3 files changed, 8 insertions(+), 7 deletions(-) + +diff --git a/fs/afs/fs_operation.c b/fs/afs/fs_operation.c +index 7a3803ce3a22..3e31fae9a149 100644 +--- a/fs/afs/fs_operation.c ++++ b/fs/afs/fs_operation.c +@@ -41,7 +41,7 @@ struct afs_operation *afs_alloc_operation(struct key *key, struct afs_volume *vo + op->cb_v_break = volume->cb_v_break; + op->debug_id = atomic_inc_return(&afs_operation_debug_counter); + op->error = -EDESTADDRREQ; +- op->ac.error = SHRT_MAX; ++ op->nr_iterations = -1; + + _leave(" = [op=%08x]", op->debug_id); + return op; +diff --git a/fs/afs/internal.h b/fs/afs/internal.h +index ec08b4a7e499..88381935bd66 100644 +--- a/fs/afs/internal.h ++++ b/fs/afs/internal.h +@@ -859,7 +859,7 @@ struct afs_operation { + struct afs_call *call; + unsigned long untried; /* Bitmask of untried servers */ + short index; /* Current server */ +- unsigned short nr_iterations; /* Number of server iterations */ ++ short nr_iterations; /* Number of server iterations */ + + unsigned int flags; + #define AFS_OPERATION_STOP 0x0001 /* Set to cease iteration */ +diff --git a/fs/afs/rotate.c b/fs/afs/rotate.c +index a108cd55bb4e..4084e023ff43 100644 +--- a/fs/afs/rotate.c ++++ b/fs/afs/rotate.c +@@ -116,7 +116,10 @@ bool afs_select_fileserver(struct afs_operation *op) + unsigned int rtt; + int error = op->ac.error, i; + +- _enter("%lx[%d],%lx[%d],%d,%d", ++ op->nr_iterations++; ++ ++ _enter("OP=%x+%x,%llx,%lx[%d],%lx[%d],%d,%d", ++ op->debug_id, op->nr_iterations, op->volume->vid, + op->untried, op->index, + op->ac.tried, op->ac.index, + error, op->ac.abort_code); +@@ -126,13 +129,11 @@ bool afs_select_fileserver(struct afs_operation *op) + return false; + } + +- 
op->nr_iterations++; ++ if (op->nr_iterations == 0) ++ goto start; + + /* Evaluate the result of the previous operation, if there was one. */ + switch (error) { +- case SHRT_MAX: +- goto start; +- + case 0: + default: + /* Success or local failure. Stop. */ +-- +2.43.0 + diff --git a/queue-6.7/afs-wrap-most-op-error-accesses-with-inline-funcs.patch b/queue-6.7/afs-wrap-most-op-error-accesses-with-inline-funcs.patch new file mode 100644 index 00000000000..2987e8b17a6 --- /dev/null +++ b/queue-6.7/afs-wrap-most-op-error-accesses-with-inline-funcs.patch @@ -0,0 +1,591 @@ +From 2dc8dd6e4d297a768f1c515165a1918554b25b85 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Thu, 26 Oct 2023 09:43:23 +0100 +Subject: afs: Wrap most op->error accesses with inline funcs + +From: David Howells + +[ Upstream commit 2de5599f63babb416e09b1a6be429a47910dd47c ] + +Wrap most op->error accesses with inline funcs which will make it easier +for a subsequent patch to replace op->error with something else. Two +functions are added to this end: + + (1) afs_op_error() - Get the error code. + + (2) afs_op_set_error() - Set the error code. 
+ +Signed-off-by: David Howells +cc: Marc Dionne +cc: linux-afs@lists.infradead.org +Stable-dep-of: 17ba6f0bd14f ("afs: Fix error handling with lookup via FS.InlineBulkStatus") +Signed-off-by: Sasha Levin +--- + fs/afs/dir.c | 38 +++++++++++++++--------------- + fs/afs/file.c | 4 ++-- + fs/afs/fs_operation.c | 21 ++++++++++------- + fs/afs/fsclient.c | 2 +- + fs/afs/inode.c | 2 +- + fs/afs/internal.h | 20 ++++++++++++---- + fs/afs/rotate.c | 55 ++++++++++++++++++++++++------------------- + fs/afs/server.c | 6 ++--- + fs/afs/write.c | 6 ++--- + 9 files changed, 87 insertions(+), 67 deletions(-) + +diff --git a/fs/afs/dir.c b/fs/afs/dir.c +index 2df2e9ee130d..15763418a938 100644 +--- a/fs/afs/dir.c ++++ b/fs/afs/dir.c +@@ -886,14 +886,14 @@ static struct inode *afs_do_lookup(struct inode *dir, struct dentry *dentry, + * lookups contained therein are stored in the reply without aborting + * the whole operation. + */ +- op->error = -ENOTSUPP; ++ afs_op_set_error(op, -ENOTSUPP); + if (!cookie->one_only) { + op->ops = &afs_inline_bulk_status_operation; + afs_begin_vnode_operation(op); + afs_wait_for_operation(op); + } + +- if (op->error == -ENOTSUPP) { ++ if (afs_op_error(op) == -ENOTSUPP) { + /* We could try FS.BulkStatus next, but this aborts the entire + * op if any of the lookups fails - so, for the moment, revert + * to FS.FetchStatus for op->file[1]. 
+@@ -903,10 +903,10 @@ static struct inode *afs_do_lookup(struct inode *dir, struct dentry *dentry, + afs_begin_vnode_operation(op); + afs_wait_for_operation(op); + } +- inode = ERR_PTR(op->error); ++ inode = ERR_PTR(afs_op_error(op)); + + out_op: +- if (op->error == 0) { ++ if (!afs_op_error(op)) { + inode = &op->file[1].vnode->netfs.inode; + op->file[1].vnode = NULL; + } +@@ -1281,7 +1281,7 @@ static void afs_vnode_new_inode(struct afs_operation *op) + + _enter(""); + +- ASSERTCMP(op->error, ==, 0); ++ ASSERTCMP(afs_op_error(op), ==, 0); + + inode = afs_iget(op, vp); + if (IS_ERR(inode)) { +@@ -1294,7 +1294,7 @@ static void afs_vnode_new_inode(struct afs_operation *op) + + vnode = AFS_FS_I(inode); + set_bit(AFS_VNODE_NEW_CONTENT, &vnode->flags); +- if (!op->error) ++ if (!afs_op_error(op)) + afs_cache_permit(vnode, op->key, vnode->cb_break, &vp->scb); + d_instantiate(op->dentry, inode); + } +@@ -1328,7 +1328,7 @@ static void afs_create_put(struct afs_operation *op) + { + _enter("op=%08x", op->debug_id); + +- if (op->error) ++ if (afs_op_error(op)) + d_drop(op->dentry); + } + +@@ -1488,7 +1488,7 @@ static void afs_dir_remove_link(struct afs_operation *op) + struct dentry *dentry = op->dentry; + int ret; + +- if (op->error != 0 || ++ if (afs_op_error(op) || + (op->file[1].scb.have_status && op->file[1].scb.have_error)) + return; + if (d_really_is_positive(dentry)) +@@ -1512,10 +1512,10 @@ static void afs_dir_remove_link(struct afs_operation *op) + + ret = afs_validate(vnode, op->key); + if (ret != -ESTALE) +- op->error = ret; ++ afs_op_set_error(op, ret); + } + +- _debug("nlink %d [val %d]", vnode->netfs.inode.i_nlink, op->error); ++ _debug("nlink %d [val %d]", vnode->netfs.inode.i_nlink, afs_op_error(op)); + } + + static void afs_unlink_success(struct afs_operation *op) +@@ -1546,7 +1546,7 @@ static void afs_unlink_edit_dir(struct afs_operation *op) + static void afs_unlink_put(struct afs_operation *op) + { + _enter("op=%08x", op->debug_id); +- if 
(op->unlink.need_rehash && op->error < 0 && op->error != -ENOENT) ++ if (op->unlink.need_rehash && afs_op_error(op) < 0 && afs_op_error(op) != -ENOENT) + d_rehash(op->dentry); + } + +@@ -1587,7 +1587,7 @@ static int afs_unlink(struct inode *dir, struct dentry *dentry) + /* Try to make sure we have a callback promise on the victim. */ + ret = afs_validate(vnode, op->key); + if (ret < 0) { +- op->error = ret; ++ afs_op_set_error(op, ret); + goto error; + } + +@@ -1596,7 +1596,7 @@ static int afs_unlink(struct inode *dir, struct dentry *dentry) + spin_unlock(&dentry->d_lock); + /* Start asynchronous writeout of the inode */ + write_inode_now(d_inode(dentry), 0); +- op->error = afs_sillyrename(dvnode, vnode, dentry, op->key); ++ afs_op_set_error(op, afs_sillyrename(dvnode, vnode, dentry, op->key)); + goto error; + } + if (!d_unhashed(dentry)) { +@@ -1617,7 +1617,7 @@ static int afs_unlink(struct inode *dir, struct dentry *dentry) + /* If there was a conflict with a third party, check the status of the + * unlinked vnode. 
+ */ +- if (op->error == 0 && (op->flags & AFS_OPERATION_DIR_CONFLICT)) { ++ if (afs_op_error(op) == 0 && (op->flags & AFS_OPERATION_DIR_CONFLICT)) { + op->file[1].update_ctime = false; + op->fetch_status.which = 1; + op->ops = &afs_fetch_status_operation; +@@ -1699,7 +1699,7 @@ static void afs_link_success(struct afs_operation *op) + static void afs_link_put(struct afs_operation *op) + { + _enter("op=%08x", op->debug_id); +- if (op->error) ++ if (afs_op_error(op)) + d_drop(op->dentry); + } + +@@ -1897,7 +1897,7 @@ static void afs_rename_put(struct afs_operation *op) + if (op->rename.rehash) + d_rehash(op->rename.rehash); + dput(op->rename.tmp); +- if (op->error) ++ if (afs_op_error(op)) + d_rehash(op->dentry); + } + +@@ -1942,7 +1942,7 @@ static int afs_rename(struct mnt_idmap *idmap, struct inode *old_dir, + return PTR_ERR(op); + + ret = afs_validate(vnode, op->key); +- op->error = ret; ++ afs_op_set_error(op, ret); + if (ret < 0) + goto error; + +@@ -1979,7 +1979,7 @@ static int afs_rename(struct mnt_idmap *idmap, struct inode *old_dir, + op->rename.tmp = d_alloc(new_dentry->d_parent, + &new_dentry->d_name); + if (!op->rename.tmp) { +- op->error = -ENOMEM; ++ afs_op_nomem(op); + goto error; + } + +@@ -1987,7 +1987,7 @@ static int afs_rename(struct mnt_idmap *idmap, struct inode *old_dir, + AFS_FS_I(d_inode(new_dentry)), + new_dentry, op->key); + if (ret) { +- op->error = ret; ++ afs_op_set_error(op, ret); + goto error; + } + +diff --git a/fs/afs/file.c b/fs/afs/file.c +index d37dd201752b..0c81c39c32f5 100644 +--- a/fs/afs/file.c ++++ b/fs/afs/file.c +@@ -243,7 +243,7 @@ static void afs_fetch_data_notify(struct afs_operation *op) + { + struct afs_read *req = op->fetch.req; + struct netfs_io_subrequest *subreq = req->subreq; +- int error = op->error; ++ int error = afs_op_error(op); + + if (error == -ECONNABORTED) + error = afs_abort_to_error(op->ac.abort_code); +@@ -271,7 +271,7 @@ static void afs_fetch_data_success(struct afs_operation *op) + + static void 
afs_fetch_data_put(struct afs_operation *op) + { +- op->fetch.req->error = op->error; ++ op->fetch.req->error = afs_op_error(op); + afs_put_read(op->fetch.req); + } + +diff --git a/fs/afs/fs_operation.c b/fs/afs/fs_operation.c +index 3e31fae9a149..bfb9a7634bd9 100644 +--- a/fs/afs/fs_operation.c ++++ b/fs/afs/fs_operation.c +@@ -40,8 +40,8 @@ struct afs_operation *afs_alloc_operation(struct key *key, struct afs_volume *vo + op->net = volume->cell->net; + op->cb_v_break = volume->cb_v_break; + op->debug_id = atomic_inc_return(&afs_operation_debug_counter); +- op->error = -EDESTADDRREQ; + op->nr_iterations = -1; ++ afs_op_set_error(op, -EDESTADDRREQ); + + _leave(" = [op=%08x]", op->debug_id); + return op; +@@ -71,7 +71,7 @@ static bool afs_get_io_locks(struct afs_operation *op) + swap(vnode, vnode2); + + if (mutex_lock_interruptible(&vnode->io_lock) < 0) { +- op->error = -ERESTARTSYS; ++ afs_op_set_error(op, -ERESTARTSYS); + op->flags |= AFS_OPERATION_STOP; + _leave(" = f [I 0]"); + return false; +@@ -80,7 +80,7 @@ static bool afs_get_io_locks(struct afs_operation *op) + + if (vnode2) { + if (mutex_lock_interruptible_nested(&vnode2->io_lock, 1) < 0) { +- op->error = -ERESTARTSYS; ++ afs_op_set_error(op, -ERESTARTSYS); + op->flags |= AFS_OPERATION_STOP; + mutex_unlock(&vnode->io_lock); + op->flags &= ~AFS_OPERATION_LOCK_0; +@@ -159,11 +159,14 @@ static void afs_end_vnode_operation(struct afs_operation *op) + { + _enter(""); + +- if (op->error == -EDESTADDRREQ || +- op->error == -EADDRNOTAVAIL || +- op->error == -ENETUNREACH || +- op->error == -EHOSTUNREACH) ++ switch (afs_op_error(op)) { ++ case -EDESTADDRREQ: ++ case -EADDRNOTAVAIL: ++ case -ENETUNREACH: ++ case -EHOSTUNREACH: + afs_dump_edestaddrreq(op); ++ break; ++ } + + afs_drop_io_locks(op); + +@@ -209,7 +212,7 @@ void afs_wait_for_operation(struct afs_operation *op) + + afs_end_vnode_operation(op); + +- if (op->error == 0 && op->ops->edit_dir) { ++ if (!afs_op_error(op) && op->ops->edit_dir) { + 
_debug("edit_dir"); + op->ops->edit_dir(op); + } +@@ -221,7 +224,7 @@ void afs_wait_for_operation(struct afs_operation *op) + */ + int afs_put_operation(struct afs_operation *op) + { +- int i, ret = op->error; ++ int i, ret = afs_op_error(op); + + _enter("op=%08x,%d", op->debug_id, ret); + +diff --git a/fs/afs/fsclient.c b/fs/afs/fsclient.c +index 7d37f63ef0f0..6821ce0f9d63 100644 +--- a/fs/afs/fsclient.c ++++ b/fs/afs/fsclient.c +@@ -1899,7 +1899,7 @@ void afs_fs_inline_bulk_status(struct afs_operation *op) + int i; + + if (test_bit(AFS_SERVER_FL_NO_IBULK, &op->server->flags)) { +- op->error = -ENOTSUPP; ++ afs_op_set_error(op, -ENOTSUPP); + return; + } + +diff --git a/fs/afs/inode.c b/fs/afs/inode.c +index 78efc9719349..d6eed332507f 100644 +--- a/fs/afs/inode.c ++++ b/fs/afs/inode.c +@@ -331,7 +331,7 @@ static void afs_fetch_status_success(struct afs_operation *op) + + if (vnode->netfs.inode.i_state & I_NEW) { + ret = afs_inode_init_from_status(op, vp, vnode); +- op->error = ret; ++ afs_op_set_error(op, ret); + if (ret == 0) + afs_cache_permit(vnode, op->key, vp->cb_break_before, &vp->scb); + } else { +diff --git a/fs/afs/internal.h b/fs/afs/internal.h +index 88381935bd66..1a306df267b0 100644 +--- a/fs/afs/internal.h ++++ b/fs/afs/internal.h +@@ -1140,11 +1140,6 @@ extern bool afs_begin_vnode_operation(struct afs_operation *); + extern void afs_wait_for_operation(struct afs_operation *); + extern int afs_do_sync_operation(struct afs_operation *); + +-static inline void afs_op_nomem(struct afs_operation *op) +-{ +- op->error = -ENOMEM; +-} +- + static inline void afs_op_set_vnode(struct afs_operation *op, unsigned int n, + struct afs_vnode *vnode) + { +@@ -1238,6 +1233,21 @@ static inline void __afs_stat(atomic_t *s) + extern int afs_abort_to_error(u32); + extern void afs_prioritise_error(struct afs_error *, int, u32); + ++static inline void afs_op_nomem(struct afs_operation *op) ++{ ++ op->error = -ENOMEM; ++} ++ ++static inline int afs_op_error(const struct 
afs_operation *op) ++{ ++ return op->error; ++} ++ ++static inline int afs_op_set_error(struct afs_operation *op, int error) ++{ ++ return op->error = error; ++} ++ + /* + * mntpt.c + */ +diff --git a/fs/afs/rotate.c b/fs/afs/rotate.c +index 4084e023ff43..d64c1d90faed 100644 +--- a/fs/afs/rotate.c ++++ b/fs/afs/rotate.c +@@ -51,7 +51,7 @@ static bool afs_start_fs_iteration(struct afs_operation *op, + * and have to return an error. + */ + if (op->flags & AFS_OPERATION_CUR_ONLY) { +- op->error = -ESTALE; ++ afs_op_set_error(op, -ESTALE); + return false; + } + +@@ -93,7 +93,7 @@ static bool afs_sleep_and_retry(struct afs_operation *op) + if (!(op->flags & AFS_OPERATION_UNINTR)) { + msleep_interruptible(1000); + if (signal_pending(current)) { +- op->error = -ERESTARTSYS; ++ afs_op_set_error(op, -ERESTARTSYS); + return false; + } + } else { +@@ -137,7 +137,7 @@ bool afs_select_fileserver(struct afs_operation *op) + case 0: + default: + /* Success or local failure. Stop. */ +- op->error = error; ++ afs_op_set_error(op, error); + op->flags |= AFS_OPERATION_STOP; + _leave(" = f [okay/local %d]", error); + return false; +@@ -174,11 +174,13 @@ bool afs_select_fileserver(struct afs_operation *op) + + set_bit(AFS_VOLUME_NEEDS_UPDATE, &op->volume->flags); + error = afs_check_volume_status(op->volume, op); +- if (error < 0) +- goto failed_set_error; ++ if (error < 0) { ++ afs_op_set_error(op, error); ++ goto failed; ++ } + + if (test_bit(AFS_VOLUME_DELETED, &op->volume->flags)) { +- op->error = -ENOMEDIUM; ++ afs_op_set_error(op, -ENOMEDIUM); + goto failed; + } + +@@ -250,11 +252,11 @@ bool afs_select_fileserver(struct afs_operation *op) + clear_bit(AFS_VOLUME_BUSY, &op->volume->flags); + } + if (op->flags & AFS_OPERATION_NO_VSLEEP) { +- op->error = -EADV; ++ afs_op_set_error(op, -EADV); + goto failed; + } + if (op->flags & AFS_OPERATION_CUR_ONLY) { +- op->error = -ESTALE; ++ afs_op_set_error(op, -ESTALE); + goto failed; + } + goto busy; +@@ -275,7 +277,7 @@ bool 
afs_select_fileserver(struct afs_operation *op) + * lock we need to maintain. + */ + if (op->flags & AFS_OPERATION_NO_VSLEEP) { +- op->error = -EBUSY; ++ afs_op_set_error(op, -EBUSY); + goto failed; + } + if (!test_and_set_bit(AFS_VOLUME_BUSY, &op->volume->flags)) { +@@ -304,7 +306,7 @@ bool afs_select_fileserver(struct afs_operation *op) + * honour, just in case someone sets up a loop. + */ + if (op->flags & AFS_OPERATION_VMOVED) { +- op->error = -EREMOTEIO; ++ afs_op_set_error(op, -EREMOTEIO); + goto failed; + } + op->flags |= AFS_OPERATION_VMOVED; +@@ -312,8 +314,10 @@ bool afs_select_fileserver(struct afs_operation *op) + set_bit(AFS_VOLUME_WAIT, &op->volume->flags); + set_bit(AFS_VOLUME_NEEDS_UPDATE, &op->volume->flags); + error = afs_check_volume_status(op->volume, op); +- if (error < 0) +- goto failed_set_error; ++ if (error < 0) { ++ afs_op_set_error(op, error); ++ goto failed; ++ } + + /* If the server list didn't change, then the VLDB is + * out of sync with the fileservers. This is hopefully +@@ -344,7 +348,7 @@ bool afs_select_fileserver(struct afs_operation *op) + * Translate locally and return ENOSPC. + * No replicas to failover to. + */ +- op->error = -ENOSPC; ++ afs_op_set_error(op, -ENOSPC); + goto failed_but_online; + + case VOVERQUOTA: +@@ -353,7 +357,7 @@ bool afs_select_fileserver(struct afs_operation *op) + * Translate locally and return EDQUOT. + * No replicas to failover to. 
+ */ +- op->error = -EDQUOT; ++ afs_op_set_error(op, -EDQUOT); + goto failed_but_online; + + default: +@@ -366,7 +370,7 @@ bool afs_select_fileserver(struct afs_operation *op) + + case -ETIMEDOUT: + case -ETIME: +- if (op->error != -EDESTADDRREQ) ++ if (afs_op_error(op) != -EDESTADDRREQ) + goto iterate_address; + fallthrough; + case -ERFKILL: +@@ -385,7 +389,7 @@ bool afs_select_fileserver(struct afs_operation *op) + fallthrough; + case -ECONNRESET: + _debug("call reset"); +- op->error = error; ++ afs_op_set_error(op, error); + goto failed; + } + +@@ -401,8 +405,10 @@ bool afs_select_fileserver(struct afs_operation *op) + * volume may have moved or even have been deleted. + */ + error = afs_check_volume_status(op->volume, op); +- if (error < 0) +- goto failed_set_error; ++ if (error < 0) { ++ afs_op_set_error(op, error); ++ goto failed; ++ } + + if (!afs_start_fs_iteration(op, vnode)) + goto failed; +@@ -413,8 +419,10 @@ bool afs_select_fileserver(struct afs_operation *op) + _debug("pick [%lx]", op->untried); + + error = afs_wait_for_fs_probes(op->server_list, op->untried); +- if (error < 0) +- goto failed_set_error; ++ if (error < 0) { ++ afs_op_set_error(op, error); ++ goto failed; ++ } + + /* Pick the untried server with the lowest RTT. If we have outstanding + * callbacks, we stick with the server we're already using if we can. 
+@@ -515,7 +523,8 @@ bool afs_select_fileserver(struct afs_operation *op) + op->flags &= ~AFS_OPERATION_RETRY_SERVER; + goto retry_server; + case -ERESTARTSYS: +- goto failed_set_error; ++ afs_op_set_error(op, error); ++ goto failed; + case -ETIME: + case -EDESTADDRREQ: + goto next_server; +@@ -544,13 +553,11 @@ bool afs_select_fileserver(struct afs_operation *op) + } + + error = e.error; +- +-failed_set_error: + op->error = error; + failed: + op->flags |= AFS_OPERATION_STOP; + afs_end_cursor(&op->ac); +- _leave(" = f [failed %d]", op->error); ++ _leave(" = f [failed %d]", afs_op_error(op)); + return false; + } + +diff --git a/fs/afs/server.c b/fs/afs/server.c +index 5b5fa94005c9..2826e6eced71 100644 +--- a/fs/afs/server.c ++++ b/fs/afs/server.c +@@ -629,8 +629,8 @@ static noinline bool afs_update_server_record(struct afs_operation *op, + _leave(" = t [intr]"); + return true; + } +- op->error = PTR_ERR(alist); +- _leave(" = f [%d]", op->error); ++ afs_op_set_error(op, PTR_ERR(alist)); ++ _leave(" = f [%d]", afs_op_error(op)); + return false; + } + +@@ -684,7 +684,7 @@ bool afs_check_server_record(struct afs_operation *op, struct afs_server *server + (op->flags & AFS_OPERATION_UNINTR) ? 
+ TASK_UNINTERRUPTIBLE : TASK_INTERRUPTIBLE); + if (ret == -ERESTARTSYS) { +- op->error = ret; ++ afs_op_set_error(op, ret); + _leave(" = f [intr]"); + return false; + } +diff --git a/fs/afs/write.c b/fs/afs/write.c +index 4a168781936b..9f90d8970ce9 100644 +--- a/fs/afs/write.c ++++ b/fs/afs/write.c +@@ -366,7 +366,7 @@ static void afs_store_data_success(struct afs_operation *op) + + op->ctime = op->file[0].scb.status.mtime_client; + afs_vnode_commit_status(op, &op->file[0]); +- if (op->error == 0) { ++ if (!afs_op_error(op)) { + if (!op->store.laundering) + afs_pages_written_back(vnode, op->store.pos, op->store.size); + afs_stat_v(vnode, n_stores); +@@ -428,7 +428,7 @@ static int afs_store_data(struct afs_vnode *vnode, struct iov_iter *iter, loff_t + + afs_wait_for_operation(op); + +- switch (op->error) { ++ switch (afs_op_error(op)) { + case -EACCES: + case -EPERM: + case -ENOKEY: +@@ -447,7 +447,7 @@ static int afs_store_data(struct afs_vnode *vnode, struct iov_iter *iter, loff_t + } + + afs_put_wb_key(wbk); +- _leave(" = %d", op->error); ++ _leave(" = %d", afs_op_error(op)); + return afs_put_operation(op); + } + +-- +2.43.0 + diff --git a/queue-6.7/bnxt_en-prevent-kernel-warning-when-running-offline-.patch b/queue-6.7/bnxt_en-prevent-kernel-warning-when-running-offline-.patch new file mode 100644 index 00000000000..d018f571016 --- /dev/null +++ b/queue-6.7/bnxt_en-prevent-kernel-warning-when-running-offline-.patch @@ -0,0 +1,113 @@ +From 171db765ad289c8587da5dec137d4deb2f99c402 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Wed, 17 Jan 2024 15:45:14 -0800 +Subject: bnxt_en: Prevent kernel warning when running offline self test + +From: Michael Chan + +[ Upstream commit c20f482129a582455f02eb9a6dcb2a4215274599 ] + +We call bnxt_half_open_nic() to setup the chip partially to run +loopback tests. The rings and buffers are initialized normally +so that we can transmit and receive packets in loopback mode. 
+That means page pool buffers are allocated for the aggregation ring +just like the normal case. NAPI is not needed because we are just +polling for the loopback packets. + +When we're done with the loopback tests, we call bnxt_half_close_nic() +to clean up. When freeing the page pools, we hit a WARN_ON() +in page_pool_unlink_napi() because the NAPI state linked to the +page pool is uninitialized. + +The simplest way to avoid this warning is just to initialize the +NAPIs during half open and delete the NAPIs during half close. +Trying to skip the page pool initialization or skip linking of +NAPI during half open will be more complicated. + +This fix avoids this warning: + +WARNING: CPU: 4 PID: 46967 at net/core/page_pool.c:946 page_pool_unlink_napi+0x1f/0x30 +CPU: 4 PID: 46967 Comm: ethtool Tainted: G S W 6.7.0-rc5+ #22 +Hardware name: Dell Inc. PowerEdge R750/06V45N, BIOS 1.3.8 08/31/2021 +RIP: 0010:page_pool_unlink_napi+0x1f/0x30 +Code: 90 90 90 90 90 90 90 90 90 90 90 0f 1f 44 00 00 48 8b 47 18 48 85 c0 74 1b 48 8b 50 10 83 e2 01 74 08 8b 40 34 83 f8 ff 74 02 <0f> 0b 48 c7 47 18 00 00 00 00 c3 cc cc cc cc 66 90 90 90 90 90 90 +RSP: 0018:ffa000003d0dfbe8 EFLAGS: 00010246 +RAX: ff110003607ce640 RBX: ff110010baf5d000 RCX: 0000000000000008 +RDX: 0000000000000000 RSI: ff110001e5e522c0 RDI: ff110010baf5d000 +RBP: ff11000145539b40 R08: 0000000000000001 R09: ffffffffc063f641 +R10: ff110001361eddb8 R11: 000000000040000f R12: 0000000000000001 +R13: 000000000000001c R14: ff1100014553a080 R15: 0000000000003fc0 +FS: 00007f9301c4f740(0000) GS:ff1100103fd00000(0000) knlGS:0000000000000000 +CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 +CR2: 00007f91344fa8f0 CR3: 00000003527cc005 CR4: 0000000000771ef0 +DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 +DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 +PKRU: 55555554 +Call Trace: + + ? __warn+0x81/0x140 + ? page_pool_unlink_napi+0x1f/0x30 + ? report_bug+0x102/0x200 + ? handle_bug+0x44/0x70 + ? 
exc_invalid_op+0x13/0x60 + ? asm_exc_invalid_op+0x16/0x20 + ? bnxt_free_ring.isra.123+0xb1/0xd0 [bnxt_en] + ? page_pool_unlink_napi+0x1f/0x30 + page_pool_destroy+0x3e/0x150 + bnxt_free_mem+0x441/0x5e0 [bnxt_en] + bnxt_half_close_nic+0x2a/0x40 [bnxt_en] + bnxt_self_test+0x21d/0x450 [bnxt_en] + __dev_ethtool+0xeda/0x2e30 + ? native_queued_spin_lock_slowpath+0x17f/0x2b0 + ? __link_object+0xa1/0x160 + ? _raw_spin_unlock_irqrestore+0x23/0x40 + ? __create_object+0x5f/0x90 + ? __kmem_cache_alloc_node+0x317/0x3c0 + ? dev_ethtool+0x59/0x170 + dev_ethtool+0xa7/0x170 + dev_ioctl+0xc3/0x530 + sock_do_ioctl+0xa8/0xf0 + sock_ioctl+0x270/0x310 + __x64_sys_ioctl+0x8c/0xc0 + do_syscall_64+0x3e/0xf0 + entry_SYSCALL_64_after_hwframe+0x6e/0x76 + +Fixes: 294e39e0d034 ("bnxt: hook NAPIs to page pools") +Reviewed-by: Andy Gospodarek +Reviewed-by: Ajit Khaparde +Signed-off-by: Michael Chan +Link: https://lore.kernel.org/r/20240117234515.226944-5-michael.chan@broadcom.com +Signed-off-by: Jakub Kicinski +Signed-off-by: Sasha Levin +--- + drivers/net/ethernet/broadcom/bnxt/bnxt.c | 3 +++ + 1 file changed, 3 insertions(+) + +diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c +index 1019b4dc7bed..22c8bfb5ed9d 100644 +--- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c ++++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c +@@ -10627,10 +10627,12 @@ int bnxt_half_open_nic(struct bnxt *bp) + netdev_err(bp->dev, "bnxt_alloc_mem err: %x\n", rc); + goto half_open_err; + } ++ bnxt_init_napi(bp); + set_bit(BNXT_STATE_HALF_OPEN, &bp->state); + rc = bnxt_init_nic(bp, true); + if (rc) { + clear_bit(BNXT_STATE_HALF_OPEN, &bp->state); ++ bnxt_del_napi(bp); + netdev_err(bp->dev, "bnxt_init_nic err: %x\n", rc); + goto half_open_err; + } +@@ -10649,6 +10651,7 @@ int bnxt_half_open_nic(struct bnxt *bp) + void bnxt_half_close_nic(struct bnxt *bp) + { + bnxt_hwrm_resource_free(bp, false, true); ++ bnxt_del_napi(bp); + bnxt_free_skbs(bp); + bnxt_free_mem(bp, true); + 
clear_bit(BNXT_STATE_HALF_OPEN, &bp->state); +-- +2.43.0 + diff --git a/queue-6.7/bnxt_en-wait-for-flr-to-complete-during-probe.patch b/queue-6.7/bnxt_en-wait-for-flr-to-complete-during-probe.patch new file mode 100644 index 00000000000..55a85d91d2a --- /dev/null +++ b/queue-6.7/bnxt_en-wait-for-flr-to-complete-during-probe.patch @@ -0,0 +1,43 @@ +From c6cfa8547d19c5c8f5f9a9fe22bd0b1064af03a7 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Wed, 17 Jan 2024 15:45:11 -0800 +Subject: bnxt_en: Wait for FLR to complete during probe + +From: Michael Chan + +[ Upstream commit 3c1069fa42872f95cf3c6fedf80723d391e12d57 ] + +The first message to firmware may fail if the device is undergoing FLR. +The driver has some recovery logic for this failure scenario but we must +wait 100 msec for FLR to complete before proceeding. Otherwise the +recovery will always fail. + +Fixes: ba02629ff6cb ("bnxt_en: log firmware status on firmware init failure") +Reviewed-by: Damodharam Ammepalli +Signed-off-by: Michael Chan +Link: https://lore.kernel.org/r/20240117234515.226944-2-michael.chan@broadcom.com +Signed-off-by: Jakub Kicinski +Signed-off-by: Sasha Levin +--- + drivers/net/ethernet/broadcom/bnxt/bnxt.c | 5 +++++ + 1 file changed, 5 insertions(+) + +diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c +index e1f1e646cf48..1019b4dc7bed 100644 +--- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c ++++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c +@@ -12298,6 +12298,11 @@ static int bnxt_fw_init_one_p1(struct bnxt *bp) + + bp->fw_cap = 0; + rc = bnxt_hwrm_ver_get(bp); ++ /* FW may be unresponsive after FLR. FLR must complete within 100 msec ++ * so wait before continuing with recovery. 
++ */ ++ if (rc) ++ msleep(100); + bnxt_try_map_fw_health_reg(bp); + if (rc) { + rc = bnxt_try_recover_fw(bp); +-- +2.43.0 + diff --git a/queue-6.7/btrfs-scrub-avoid-use-after-free-when-chunk-length-i.patch b/queue-6.7/btrfs-scrub-avoid-use-after-free-when-chunk-length-i.patch new file mode 100644 index 00000000000..9e96756ac7b --- /dev/null +++ b/queue-6.7/btrfs-scrub-avoid-use-after-free-when-chunk-length-i.patch @@ -0,0 +1,161 @@ +From 7893c91364a20cba4d0c74f3b3455ab5e3175dec Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Wed, 17 Jan 2024 11:02:25 +1030 +Subject: btrfs: scrub: avoid use-after-free when chunk length is not 64K + aligned + +From: Qu Wenruo + +[ Upstream commit f546c4282673497a06ecb6190b50ae7f6c85b02f ] + +[BUG] +There is a bug report that, on a ext4-converted btrfs, scrub leads to +various problems, including: + +- "unable to find chunk map" errors + BTRFS info (device vdb): scrub: started on devid 1 + BTRFS critical (device vdb): unable to find chunk map for logical 2214744064 length 4096 + BTRFS critical (device vdb): unable to find chunk map for logical 2214744064 length 45056 + + This would lead to unrepariable errors. 
+ +- Use-after-free KASAN reports: + ================================================================== + BUG: KASAN: slab-use-after-free in __blk_rq_map_sg+0x18f/0x7c0 + Read of size 8 at addr ffff8881013c9040 by task btrfs/909 + CPU: 0 PID: 909 Comm: btrfs Not tainted 6.7.0-x64v3-dbg #11 c50636e9419a8354555555245df535e380563b2b + Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 2023.11-2 12/24/2023 + Call Trace: + + dump_stack_lvl+0x43/0x60 + print_report+0xcf/0x640 + kasan_report+0xa6/0xd0 + __blk_rq_map_sg+0x18f/0x7c0 + virtblk_prep_rq.isra.0+0x215/0x6a0 [virtio_blk 19a65eeee9ae6fcf02edfad39bb9ddee07dcdaff] + virtio_queue_rqs+0xc4/0x310 [virtio_blk 19a65eeee9ae6fcf02edfad39bb9ddee07dcdaff] + blk_mq_flush_plug_list.part.0+0x780/0x860 + __blk_flush_plug+0x1ba/0x220 + blk_finish_plug+0x3b/0x60 + submit_initial_group_read+0x10a/0x290 [btrfs e57987a360bed82fe8756dcd3e0de5406ccfe965] + flush_scrub_stripes+0x38e/0x430 [btrfs e57987a360bed82fe8756dcd3e0de5406ccfe965] + scrub_stripe+0x82a/0xae0 [btrfs e57987a360bed82fe8756dcd3e0de5406ccfe965] + scrub_chunk+0x178/0x200 [btrfs e57987a360bed82fe8756dcd3e0de5406ccfe965] + scrub_enumerate_chunks+0x4bc/0xa30 [btrfs e57987a360bed82fe8756dcd3e0de5406ccfe965] + btrfs_scrub_dev+0x398/0x810 [btrfs e57987a360bed82fe8756dcd3e0de5406ccfe965] + btrfs_ioctl+0x4b9/0x3020 [btrfs e57987a360bed82fe8756dcd3e0de5406ccfe965] + __x64_sys_ioctl+0xbd/0x100 + do_syscall_64+0x5d/0xe0 + entry_SYSCALL_64_after_hwframe+0x63/0x6b + RIP: 0033:0x7f47e5e0952b + +- Crash, mostly due to above use-after-free + +[CAUSE] +The converted fs has the following data chunk layout: + + item 2 key (FIRST_CHUNK_TREE CHUNK_ITEM 2214658048) itemoff 16025 itemsize 80 + length 86016 owner 2 stripe_len 65536 type DATA|single + +For above logical bytenr 2214744064, it's at the chunk end +(2214658048 + 86016 = 2214744064). + +This means btrfs_submit_bio() would split the bio, and trigger endio +function for both of the two halves. 
+ +However scrub_submit_initial_read() would only expect the endio function +to be called once, not any more. +This means the first endio function would already free the bbio::bio, +leaving the bvec freed, thus the 2nd endio call would lead to +use-after-free. + +[FIX] +- Make sure scrub_read_endio() only updates bits in its range + Since we may read less than 64K at the end of the chunk, we should not + touch the bits beyond chunk boundary. + +- Make sure scrub_submit_initial_read() only to read the chunk range + This is done by calculating the real number of sectors we need to + read, and add sector-by-sector to the bio. + +Thankfully the scrub read repair path won't need extra fixes: + +- scrub_stripe_submit_repair_read() + With above fixes, we won't update error bit for range beyond chunk, + thus scrub_stripe_submit_repair_read() should never submit any read + beyond the chunk. + +Reported-by: Rongrong +Fixes: e02ee89baa66 ("btrfs: scrub: switch scrub_simple_mirror() to scrub_stripe infrastructure") +Tested-by: Rongrong +Reviewed-by: Johannes Thumshirn +Signed-off-by: Qu Wenruo +Signed-off-by: David Sterba +Signed-off-by: Sasha Levin +--- + fs/btrfs/scrub.c | 29 ++++++++++++++++++++++------- + 1 file changed, 22 insertions(+), 7 deletions(-) + +diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c +index f62a408671cb..443d2519f0a9 100644 +--- a/fs/btrfs/scrub.c ++++ b/fs/btrfs/scrub.c +@@ -1099,12 +1099,22 @@ static void scrub_stripe_read_repair_worker(struct work_struct *work) + static void scrub_read_endio(struct btrfs_bio *bbio) + { + struct scrub_stripe *stripe = bbio->private; ++ struct bio_vec *bvec; ++ int sector_nr = calc_sector_number(stripe, bio_first_bvec_all(&bbio->bio)); ++ int num_sectors; ++ u32 bio_size = 0; ++ int i; ++ ++ ASSERT(sector_nr < stripe->nr_sectors); ++ bio_for_each_bvec_all(bvec, &bbio->bio, i) ++ bio_size += bvec->bv_len; ++ num_sectors = bio_size >> stripe->bg->fs_info->sectorsize_bits; + + if (bbio->bio.bi_status) { +- 
bitmap_set(&stripe->io_error_bitmap, 0, stripe->nr_sectors); +- bitmap_set(&stripe->error_bitmap, 0, stripe->nr_sectors); ++ bitmap_set(&stripe->io_error_bitmap, sector_nr, num_sectors); ++ bitmap_set(&stripe->error_bitmap, sector_nr, num_sectors); + } else { +- bitmap_clear(&stripe->io_error_bitmap, 0, stripe->nr_sectors); ++ bitmap_clear(&stripe->io_error_bitmap, sector_nr, num_sectors); + } + bio_put(&bbio->bio); + if (atomic_dec_and_test(&stripe->pending_io)) { +@@ -1705,6 +1715,9 @@ static void scrub_submit_initial_read(struct scrub_ctx *sctx, + { + struct btrfs_fs_info *fs_info = sctx->fs_info; + struct btrfs_bio *bbio; ++ unsigned int nr_sectors = min(BTRFS_STRIPE_LEN, stripe->bg->start + ++ stripe->bg->length - stripe->logical) >> ++ fs_info->sectorsize_bits; + int mirror = stripe->mirror_num; + + ASSERT(stripe->bg); +@@ -1719,14 +1732,16 @@ static void scrub_submit_initial_read(struct scrub_ctx *sctx, + bbio = btrfs_bio_alloc(SCRUB_STRIPE_PAGES, REQ_OP_READ, fs_info, + scrub_read_endio, stripe); + +- /* Read the whole stripe. */ + bbio->bio.bi_iter.bi_sector = stripe->logical >> SECTOR_SHIFT; +- for (int i = 0; i < BTRFS_STRIPE_LEN >> PAGE_SHIFT; i++) { ++ /* Read the whole range inside the chunk boundary. */ ++ for (unsigned int cur = 0; cur < nr_sectors; cur++) { ++ struct page *page = scrub_stripe_get_page(stripe, cur); ++ unsigned int pgoff = scrub_stripe_get_page_offset(stripe, cur); + int ret; + +- ret = bio_add_page(&bbio->bio, stripe->pages[i], PAGE_SIZE, 0); ++ ret = bio_add_page(&bbio->bio, page, fs_info->sectorsize, pgoff); + /* We should have allocated enough bio vectors. 
*/ +- ASSERT(ret == PAGE_SIZE); ++ ASSERT(ret == fs_info->sectorsize); + } + atomic_inc(&stripe->pending_io); + +-- +2.43.0 + diff --git a/queue-6.7/dpll-fix-broken-error-path-in-dpll_pin_alloc.patch b/queue-6.7/dpll-fix-broken-error-path-in-dpll_pin_alloc.patch new file mode 100644 index 00000000000..10ce30a4e43 --- /dev/null +++ b/queue-6.7/dpll-fix-broken-error-path-in-dpll_pin_alloc.patch @@ -0,0 +1,54 @@ +From 1a1ebca1fa42f6ee08f20960c224d7176929bbc5 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Fri, 19 Jan 2024 14:43:01 +0100 +Subject: dpll: fix broken error path in dpll_pin_alloc(..) + +From: Arkadiusz Kubalewski + +[ Upstream commit b6a11a7fc4d6337f7ea720b9287d1b9749c4eae0 ] + +If pin type is not expected, or pin properties failed to allocate +memory, the unwind error path shall not destroy pin's xarrays, which +were not yet initialized. +Add new goto label and use it to fix broken error path. + +Reviewed-by: Jiri Pirko +Signed-off-by: Arkadiusz Kubalewski +Signed-off-by: David S. 
Miller +Stable-dep-of: 830ead5fb0c5 ("dpll: fix pin dump crash for rebound module") +Signed-off-by: Sasha Levin +--- + drivers/dpll/dpll_core.c | 7 ++++--- + 1 file changed, 4 insertions(+), 3 deletions(-) + +diff --git a/drivers/dpll/dpll_core.c b/drivers/dpll/dpll_core.c +index 3568149b9562..36f5c0eaf604 100644 +--- a/drivers/dpll/dpll_core.c ++++ b/drivers/dpll/dpll_core.c +@@ -440,7 +440,7 @@ dpll_pin_alloc(u64 clock_id, u32 pin_idx, struct module *module, + if (WARN_ON(prop->type < DPLL_PIN_TYPE_MUX || + prop->type > DPLL_PIN_TYPE_MAX)) { + ret = -EINVAL; +- goto err; ++ goto err_pin_prop; + } + pin->prop = prop; + refcount_set(&pin->refcount, 1); +@@ -448,11 +448,12 @@ dpll_pin_alloc(u64 clock_id, u32 pin_idx, struct module *module, + xa_init_flags(&pin->parent_refs, XA_FLAGS_ALLOC); + ret = xa_alloc(&dpll_pin_xa, &pin->id, pin, xa_limit_16b, GFP_KERNEL); + if (ret) +- goto err; ++ goto err_xa_alloc; + return pin; +-err: ++err_xa_alloc: + xa_destroy(&pin->dpll_refs); + xa_destroy(&pin->parent_refs); ++err_pin_prop: + kfree(pin); + return ERR_PTR(ret); + } +-- +2.43.0 + diff --git a/queue-6.7/dpll-fix-pin-dump-crash-for-rebound-module.patch b/queue-6.7/dpll-fix-pin-dump-crash-for-rebound-module.patch new file mode 100644 index 00000000000..2e8616c7356 --- /dev/null +++ b/queue-6.7/dpll-fix-pin-dump-crash-for-rebound-module.patch @@ -0,0 +1,258 @@ +From c92ec4869e0b3bdd99cf0a23d92cc463e45348dd Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Fri, 19 Jan 2024 14:43:02 +0100 +Subject: dpll: fix pin dump crash for rebound module + +From: Arkadiusz Kubalewski + +[ Upstream commit 830ead5fb0c5855ce4d70ba2ed4a673b5f1e7d9b ] + +When a kernel module is unbound but the pin resources were not entirely +freed (other kernel module instance of the same PCI device have had kept +the reference to that pin), and kernel module is again bound, the pin +properties would not be updated (the properties are only assigned when +memory for the pin is allocated), prop pointer still 
points to the +kernel module memory of the kernel module which was deallocated on the +unbind. + +If the pin dump is invoked in this state, the result is a kernel crash. +Prevent the crash by storing persistent pin properties in dpll subsystem, +copy the content from the kernel module when pin is allocated, instead of +using memory of the kernel module. + +Fixes: 9431063ad323 ("dpll: core: Add DPLL framework base functions") +Fixes: 9d71b54b65b1 ("dpll: netlink: Add DPLL framework base functions") +Reviewed-by: Jan Glaza +Reviewed-by: Przemek Kitszel +Signed-off-by: Arkadiusz Kubalewski +Reviewed-by: Jiri Pirko +Signed-off-by: David S. Miller +Signed-off-by: Sasha Levin +--- + drivers/dpll/dpll_core.c | 55 +++++++++++++++++++++++++++++++++++-- + drivers/dpll/dpll_core.h | 4 +-- + drivers/dpll/dpll_netlink.c | 28 +++++++++---------- + 3 files changed, 69 insertions(+), 18 deletions(-) + +diff --git a/drivers/dpll/dpll_core.c b/drivers/dpll/dpll_core.c +index 36f5c0eaf604..5e3b9b5679f9 100644 +--- a/drivers/dpll/dpll_core.c ++++ b/drivers/dpll/dpll_core.c +@@ -424,6 +424,53 @@ void dpll_device_unregister(struct dpll_device *dpll, + } + EXPORT_SYMBOL_GPL(dpll_device_unregister); + ++static void dpll_pin_prop_free(struct dpll_pin_properties *prop) ++{ ++ kfree(prop->package_label); ++ kfree(prop->panel_label); ++ kfree(prop->board_label); ++ kfree(prop->freq_supported); ++} ++ ++static int dpll_pin_prop_dup(const struct dpll_pin_properties *src, ++ struct dpll_pin_properties *dst) ++{ ++ memcpy(dst, src, sizeof(*dst)); ++ if (src->freq_supported && src->freq_supported_num) { ++ size_t freq_size = src->freq_supported_num * ++ sizeof(*src->freq_supported); ++ dst->freq_supported = kmemdup(src->freq_supported, ++ freq_size, GFP_KERNEL); ++ if (!src->freq_supported) ++ return -ENOMEM; ++ } ++ if (src->board_label) { ++ dst->board_label = kstrdup(src->board_label, GFP_KERNEL); ++ if (!dst->board_label) ++ goto err_board_label; ++ } ++ if (src->panel_label) { ++ 
dst->panel_label = kstrdup(src->panel_label, GFP_KERNEL); ++ if (!dst->panel_label) ++ goto err_panel_label; ++ } ++ if (src->package_label) { ++ dst->package_label = kstrdup(src->package_label, GFP_KERNEL); ++ if (!dst->package_label) ++ goto err_package_label; ++ } ++ ++ return 0; ++ ++err_package_label: ++ kfree(dst->panel_label); ++err_panel_label: ++ kfree(dst->board_label); ++err_board_label: ++ kfree(dst->freq_supported); ++ return -ENOMEM; ++} ++ + static struct dpll_pin * + dpll_pin_alloc(u64 clock_id, u32 pin_idx, struct module *module, + const struct dpll_pin_properties *prop) +@@ -442,7 +489,9 @@ dpll_pin_alloc(u64 clock_id, u32 pin_idx, struct module *module, + ret = -EINVAL; + goto err_pin_prop; + } +- pin->prop = prop; ++ ret = dpll_pin_prop_dup(prop, &pin->prop); ++ if (ret) ++ goto err_pin_prop; + refcount_set(&pin->refcount, 1); + xa_init_flags(&pin->dpll_refs, XA_FLAGS_ALLOC); + xa_init_flags(&pin->parent_refs, XA_FLAGS_ALLOC); +@@ -453,6 +502,7 @@ dpll_pin_alloc(u64 clock_id, u32 pin_idx, struct module *module, + err_xa_alloc: + xa_destroy(&pin->dpll_refs); + xa_destroy(&pin->parent_refs); ++ dpll_pin_prop_free(&pin->prop); + err_pin_prop: + kfree(pin); + return ERR_PTR(ret); +@@ -513,6 +563,7 @@ void dpll_pin_put(struct dpll_pin *pin) + xa_destroy(&pin->dpll_refs); + xa_destroy(&pin->parent_refs); + xa_erase(&dpll_pin_xa, pin->id); ++ dpll_pin_prop_free(&pin->prop); + kfree(pin); + } + mutex_unlock(&dpll_lock); +@@ -635,7 +686,7 @@ int dpll_pin_on_pin_register(struct dpll_pin *parent, struct dpll_pin *pin, + unsigned long i, stop; + int ret; + +- if (WARN_ON(parent->prop->type != DPLL_PIN_TYPE_MUX)) ++ if (WARN_ON(parent->prop.type != DPLL_PIN_TYPE_MUX)) + return -EINVAL; + + if (WARN_ON(!ops) || +diff --git a/drivers/dpll/dpll_core.h b/drivers/dpll/dpll_core.h +index 5585873c5c1b..717f715015c7 100644 +--- a/drivers/dpll/dpll_core.h ++++ b/drivers/dpll/dpll_core.h +@@ -44,7 +44,7 @@ struct dpll_device { + * @module: module of creator + * 
@dpll_refs: hold referencees to dplls pin was registered with + * @parent_refs: hold references to parent pins pin was registered with +- * @prop: pointer to pin properties given by registerer ++ * @prop: pin properties copied from the registerer + * @rclk_dev_name: holds name of device when pin can recover clock from it + * @refcount: refcount + **/ +@@ -55,7 +55,7 @@ struct dpll_pin { + struct module *module; + struct xarray dpll_refs; + struct xarray parent_refs; +- const struct dpll_pin_properties *prop; ++ struct dpll_pin_properties prop; + refcount_t refcount; + }; + +diff --git a/drivers/dpll/dpll_netlink.c b/drivers/dpll/dpll_netlink.c +index ce7cf736f020..4c64611d32ac 100644 +--- a/drivers/dpll/dpll_netlink.c ++++ b/drivers/dpll/dpll_netlink.c +@@ -278,17 +278,17 @@ dpll_msg_add_pin_freq(struct sk_buff *msg, struct dpll_pin *pin, + if (nla_put_64bit(msg, DPLL_A_PIN_FREQUENCY, sizeof(freq), &freq, + DPLL_A_PIN_PAD)) + return -EMSGSIZE; +- for (fs = 0; fs < pin->prop->freq_supported_num; fs++) { ++ for (fs = 0; fs < pin->prop.freq_supported_num; fs++) { + nest = nla_nest_start(msg, DPLL_A_PIN_FREQUENCY_SUPPORTED); + if (!nest) + return -EMSGSIZE; +- freq = pin->prop->freq_supported[fs].min; ++ freq = pin->prop.freq_supported[fs].min; + if (nla_put_64bit(msg, DPLL_A_PIN_FREQUENCY_MIN, sizeof(freq), + &freq, DPLL_A_PIN_PAD)) { + nla_nest_cancel(msg, nest); + return -EMSGSIZE; + } +- freq = pin->prop->freq_supported[fs].max; ++ freq = pin->prop.freq_supported[fs].max; + if (nla_put_64bit(msg, DPLL_A_PIN_FREQUENCY_MAX, sizeof(freq), + &freq, DPLL_A_PIN_PAD)) { + nla_nest_cancel(msg, nest); +@@ -304,9 +304,9 @@ static bool dpll_pin_is_freq_supported(struct dpll_pin *pin, u32 freq) + { + int fs; + +- for (fs = 0; fs < pin->prop->freq_supported_num; fs++) +- if (freq >= pin->prop->freq_supported[fs].min && +- freq <= pin->prop->freq_supported[fs].max) ++ for (fs = 0; fs < pin->prop.freq_supported_num; fs++) ++ if (freq >= pin->prop.freq_supported[fs].min && ++ freq 
<= pin->prop.freq_supported[fs].max) + return true; + return false; + } +@@ -396,7 +396,7 @@ static int + dpll_cmd_pin_get_one(struct sk_buff *msg, struct dpll_pin *pin, + struct netlink_ext_ack *extack) + { +- const struct dpll_pin_properties *prop = pin->prop; ++ const struct dpll_pin_properties *prop = &pin->prop; + struct dpll_pin_ref *ref; + int ret; + +@@ -689,7 +689,7 @@ dpll_pin_on_pin_state_set(struct dpll_pin *pin, u32 parent_idx, + int ret; + + if (!(DPLL_PIN_CAPABILITIES_STATE_CAN_CHANGE & +- pin->prop->capabilities)) { ++ pin->prop.capabilities)) { + NL_SET_ERR_MSG(extack, "state changing is not allowed"); + return -EOPNOTSUPP; + } +@@ -725,7 +725,7 @@ dpll_pin_state_set(struct dpll_device *dpll, struct dpll_pin *pin, + int ret; + + if (!(DPLL_PIN_CAPABILITIES_STATE_CAN_CHANGE & +- pin->prop->capabilities)) { ++ pin->prop.capabilities)) { + NL_SET_ERR_MSG(extack, "state changing is not allowed"); + return -EOPNOTSUPP; + } +@@ -752,7 +752,7 @@ dpll_pin_prio_set(struct dpll_device *dpll, struct dpll_pin *pin, + int ret; + + if (!(DPLL_PIN_CAPABILITIES_PRIORITY_CAN_CHANGE & +- pin->prop->capabilities)) { ++ pin->prop.capabilities)) { + NL_SET_ERR_MSG(extack, "prio changing is not allowed"); + return -EOPNOTSUPP; + } +@@ -780,7 +780,7 @@ dpll_pin_direction_set(struct dpll_pin *pin, struct dpll_device *dpll, + int ret; + + if (!(DPLL_PIN_CAPABILITIES_DIRECTION_CAN_CHANGE & +- pin->prop->capabilities)) { ++ pin->prop.capabilities)) { + NL_SET_ERR_MSG(extack, "direction changing is not allowed"); + return -EOPNOTSUPP; + } +@@ -810,8 +810,8 @@ dpll_pin_phase_adj_set(struct dpll_pin *pin, struct nlattr *phase_adj_attr, + int ret; + + phase_adj = nla_get_s32(phase_adj_attr); +- if (phase_adj > pin->prop->phase_range.max || +- phase_adj < pin->prop->phase_range.min) { ++ if (phase_adj > pin->prop.phase_range.max || ++ phase_adj < pin->prop.phase_range.min) { + NL_SET_ERR_MSG_ATTR(extack, phase_adj_attr, + "phase adjust value not supported"); + return -EINVAL; +@@ 
-995,7 +995,7 @@ dpll_pin_find(u64 clock_id, struct nlattr *mod_name_attr, + unsigned long i; + + xa_for_each_marked(&dpll_pin_xa, i, pin, DPLL_REGISTERED) { +- prop = pin->prop; ++ prop = &pin->prop; + cid_match = clock_id ? pin->clock_id == clock_id : true; + mod_match = mod_name_attr && module_name(pin->module) ? + !nla_strcmp(mod_name_attr, +-- +2.43.0 + diff --git a/queue-6.7/dpll-fix-register-pin-with-unregistered-parent-pin.patch b/queue-6.7/dpll-fix-register-pin-with-unregistered-parent-pin.patch new file mode 100644 index 00000000000..1384b280009 --- /dev/null +++ b/queue-6.7/dpll-fix-register-pin-with-unregistered-parent-pin.patch @@ -0,0 +1,69 @@ +From c414d49fa449a866c343aa87835bb4d65c568c92 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Fri, 19 Jan 2024 14:43:04 +0100 +Subject: dpll: fix register pin with unregistered parent pin + +From: Arkadiusz Kubalewski + +[ Upstream commit 7dc5b18ff71bd6f948810ab8a08b6a6ff8b315c5 ] + +In case of multiple kernel module instances using the same dpll device: +if only one registers dpll device, then only that one can register +directly connected pins with a dpll device. When unregistered parent is +responsible for determining if the muxed pin can be registered with it +or not, the drivers need to be loaded in serialized order to work +correctly - first the driver instance which registers the direct pins +needs to be loaded, then the other instances could register muxed type +pins. + +Allow registration of a pin with a parent even if the parent was not +yet registered, thus allow ability for unserialized driver instance +load order. +Do not WARN_ON notification for unregistered pin, which can be invoked +for described case, instead just return error. + +Fixes: 9431063ad323 ("dpll: core: Add DPLL framework base functions") +Fixes: 9d71b54b65b1 ("dpll: netlink: Add DPLL framework base functions") +Reviewed-by: Jan Glaza +Reviewed-by: Jiri Pirko +Signed-off-by: Arkadiusz Kubalewski +Signed-off-by: David S. 
Miller +Signed-off-by: Sasha Levin +--- + drivers/dpll/dpll_core.c | 6 ------ + 1 file changed, 6 deletions(-) + +diff --git a/drivers/dpll/dpll_core.c b/drivers/dpll/dpll_core.c +index 5e3b9b5679f9..f8fbf0394288 100644 +--- a/drivers/dpll/dpll_core.c ++++ b/drivers/dpll/dpll_core.c +@@ -28,8 +28,6 @@ static u32 dpll_xa_id; + WARN_ON_ONCE(!xa_get_mark(&dpll_device_xa, (d)->id, DPLL_REGISTERED)) + #define ASSERT_DPLL_NOT_REGISTERED(d) \ + WARN_ON_ONCE(xa_get_mark(&dpll_device_xa, (d)->id, DPLL_REGISTERED)) +-#define ASSERT_PIN_REGISTERED(p) \ +- WARN_ON_ONCE(!xa_get_mark(&dpll_pin_xa, (p)->id, DPLL_REGISTERED)) + + struct dpll_device_registration { + struct list_head list; +@@ -614,8 +612,6 @@ dpll_pin_register(struct dpll_device *dpll, struct dpll_pin *pin, + WARN_ON(!ops->state_on_dpll_get) || + WARN_ON(!ops->direction_get)) + return -EINVAL; +- if (ASSERT_DPLL_REGISTERED(dpll)) +- return -EINVAL; + + mutex_lock(&dpll_lock); + if (WARN_ON(!(dpll->module == pin->module && +@@ -693,8 +689,6 @@ int dpll_pin_on_pin_register(struct dpll_pin *parent, struct dpll_pin *pin, + WARN_ON(!ops->state_on_pin_get) || + WARN_ON(!ops->direction_get)) + return -EINVAL; +- if (ASSERT_PIN_REGISTERED(parent)) +- return -EINVAL; + + mutex_lock(&dpll_lock); + ret = dpll_xa_ref_pin_add(&pin->parent_refs, parent, ops, priv); +-- +2.43.0 + diff --git a/queue-6.7/dpll-fix-userspace-availability-of-pins.patch b/queue-6.7/dpll-fix-userspace-availability-of-pins.patch new file mode 100644 index 00000000000..75c0643c06f --- /dev/null +++ b/queue-6.7/dpll-fix-userspace-availability-of-pins.patch @@ -0,0 +1,98 @@ +From be5f9be3b22c6869a4688312effac793c9550550 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Fri, 19 Jan 2024 14:43:03 +0100 +Subject: dpll: fix userspace availability of pins + +From: Arkadiusz Kubalewski + +[ Upstream commit db2ec3c94667eaeecc6a74d96594fab6baf80fdc ] + +If parent pin was unregistered but child pin was not, the userspace +would see the "zombie" pins - the ones that 
were registered with +a parent pin (dpll_pin_on_pin_register(..)). +Technically those are not available - as there is no dpll device in the +system. Do not dump those pins and prevent userspace from any +interaction with them. Provide a unified function to determine if the +pin is available and use it before acting/responding for user requests. + +Fixes: 9d71b54b65b1 ("dpll: netlink: Add DPLL framework base functions") +Reviewed-by: Jan Glaza +Reviewed-by: Jiri Pirko +Signed-off-by: Arkadiusz Kubalewski +Signed-off-by: David S. Miller +Signed-off-by: Sasha Levin +--- + drivers/dpll/dpll_netlink.c | 29 +++++++++++++++++++++++++++-- + 1 file changed, 27 insertions(+), 2 deletions(-) + +diff --git a/drivers/dpll/dpll_netlink.c b/drivers/dpll/dpll_netlink.c +index 4c64611d32ac..7cc99d627942 100644 +--- a/drivers/dpll/dpll_netlink.c ++++ b/drivers/dpll/dpll_netlink.c +@@ -525,6 +525,24 @@ __dpll_device_change_ntf(struct dpll_device *dpll) + return dpll_device_event_send(DPLL_CMD_DEVICE_CHANGE_NTF, dpll); + } + ++static bool dpll_pin_available(struct dpll_pin *pin) ++{ ++ struct dpll_pin_ref *par_ref; ++ unsigned long i; ++ ++ if (!xa_get_mark(&dpll_pin_xa, pin->id, DPLL_REGISTERED)) ++ return false; ++ xa_for_each(&pin->parent_refs, i, par_ref) ++ if (xa_get_mark(&dpll_pin_xa, par_ref->pin->id, ++ DPLL_REGISTERED)) ++ return true; ++ xa_for_each(&pin->dpll_refs, i, par_ref) ++ if (xa_get_mark(&dpll_device_xa, par_ref->dpll->id, ++ DPLL_REGISTERED)) ++ return true; ++ return false; ++} ++ + /** + * dpll_device_change_ntf - notify that the dpll device has been changed + * @dpll: registered dpll pointer +@@ -551,7 +569,7 @@ dpll_pin_event_send(enum dpll_cmd event, struct dpll_pin *pin) + int ret = -ENOMEM; + void *hdr; + +- if (WARN_ON(!xa_get_mark(&dpll_pin_xa, pin->id, DPLL_REGISTERED))) ++ if (!dpll_pin_available(pin)) + return -ENODEV; + + msg = genlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL); +@@ -1102,6 +1120,10 @@ int dpll_nl_pin_id_get_doit(struct sk_buff *skb, struct 
genl_info *info) + } + pin = dpll_pin_find_from_nlattr(info); + if (!IS_ERR(pin)) { ++ if (!dpll_pin_available(pin)) { ++ nlmsg_free(msg); ++ return -ENODEV; ++ } + ret = dpll_msg_add_pin_handle(msg, pin); + if (ret) { + nlmsg_free(msg); +@@ -1151,6 +1173,8 @@ int dpll_nl_pin_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb) + + xa_for_each_marked_start(&dpll_pin_xa, i, pin, DPLL_REGISTERED, + ctx->idx) { ++ if (!dpll_pin_available(pin)) ++ continue; + hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).portid, + cb->nlh->nlmsg_seq, + &dpll_nl_family, NLM_F_MULTI, +@@ -1413,7 +1437,8 @@ int dpll_pin_pre_doit(const struct genl_split_ops *ops, struct sk_buff *skb, + } + info->user_ptr[0] = xa_load(&dpll_pin_xa, + nla_get_u32(info->attrs[DPLL_A_PIN_ID])); +- if (!info->user_ptr[0]) { ++ if (!info->user_ptr[0] || ++ !dpll_pin_available(info->user_ptr[0])) { + NL_SET_ERR_MSG(info->extack, "pin not found"); + ret = -ENODEV; + goto unlock_dev; +-- +2.43.0 + diff --git a/queue-6.7/fjes-fix-memleaks-in-fjes_hw_setup.patch b/queue-6.7/fjes-fix-memleaks-in-fjes_hw_setup.patch new file mode 100644 index 00000000000..32f1ad30698 --- /dev/null +++ b/queue-6.7/fjes-fix-memleaks-in-fjes_hw_setup.patch @@ -0,0 +1,109 @@ +From 2b38a16abde53bfda995910e39ff9466933e189c Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Tue, 23 Jan 2024 01:24:42 +0800 +Subject: fjes: fix memleaks in fjes_hw_setup + +From: Zhipeng Lu + +[ Upstream commit f6cc4b6a3ae53df425771000e9c9540cce9b7bb1 ] + +In fjes_hw_setup, it allocates several memory and delay the deallocation +to the fjes_hw_exit in fjes_probe through the following call chain: + +fjes_probe + |-> fjes_hw_init + |-> fjes_hw_setup + |-> fjes_hw_exit + +However, when fjes_hw_setup fails, fjes_hw_exit won't be called and thus +all the resources allocated in fjes_hw_setup will be leaked. In this +patch, we free those resources in fjes_hw_setup and prevents such leaks. 
+ +Fixes: 2fcbca687702 ("fjes: platform_driver's .probe and .remove routine") +Signed-off-by: Zhipeng Lu +Reviewed-by: Simon Horman +Link: https://lore.kernel.org/r/20240122172445.3841883-1-alexious@zju.edu.cn +Signed-off-by: Jakub Kicinski +Signed-off-by: Sasha Levin +--- + drivers/net/fjes/fjes_hw.c | 37 ++++++++++++++++++++++++++++++------- + 1 file changed, 30 insertions(+), 7 deletions(-) + +diff --git a/drivers/net/fjes/fjes_hw.c b/drivers/net/fjes/fjes_hw.c +index 704e949484d0..b9b5554ea862 100644 +--- a/drivers/net/fjes/fjes_hw.c ++++ b/drivers/net/fjes/fjes_hw.c +@@ -221,21 +221,25 @@ static int fjes_hw_setup(struct fjes_hw *hw) + + mem_size = FJES_DEV_REQ_BUF_SIZE(hw->max_epid); + hw->hw_info.req_buf = kzalloc(mem_size, GFP_KERNEL); +- if (!(hw->hw_info.req_buf)) +- return -ENOMEM; ++ if (!(hw->hw_info.req_buf)) { ++ result = -ENOMEM; ++ goto free_ep_info; ++ } + + hw->hw_info.req_buf_size = mem_size; + + mem_size = FJES_DEV_RES_BUF_SIZE(hw->max_epid); + hw->hw_info.res_buf = kzalloc(mem_size, GFP_KERNEL); +- if (!(hw->hw_info.res_buf)) +- return -ENOMEM; ++ if (!(hw->hw_info.res_buf)) { ++ result = -ENOMEM; ++ goto free_req_buf; ++ } + + hw->hw_info.res_buf_size = mem_size; + + result = fjes_hw_alloc_shared_status_region(hw); + if (result) +- return result; ++ goto free_res_buf; + + hw->hw_info.buffer_share_bit = 0; + hw->hw_info.buffer_unshare_reserve_bit = 0; +@@ -246,11 +250,11 @@ static int fjes_hw_setup(struct fjes_hw *hw) + + result = fjes_hw_alloc_epbuf(&buf_pair->tx); + if (result) +- return result; ++ goto free_epbuf; + + result = fjes_hw_alloc_epbuf(&buf_pair->rx); + if (result) +- return result; ++ goto free_epbuf; + + spin_lock_irqsave(&hw->rx_status_lock, flags); + fjes_hw_setup_epbuf(&buf_pair->tx, mac, +@@ -273,6 +277,25 @@ static int fjes_hw_setup(struct fjes_hw *hw) + fjes_hw_init_command_registers(hw, &param); + + return 0; ++ ++free_epbuf: ++ for (epidx = 0; epidx < hw->max_epid ; epidx++) { ++ if (epidx == hw->my_epid) ++ continue; ++ 
fjes_hw_free_epbuf(&hw->ep_shm_info[epidx].tx); ++ fjes_hw_free_epbuf(&hw->ep_shm_info[epidx].rx); ++ } ++ fjes_hw_free_shared_status_region(hw); ++free_res_buf: ++ kfree(hw->hw_info.res_buf); ++ hw->hw_info.res_buf = NULL; ++free_req_buf: ++ kfree(hw->hw_info.req_buf); ++ hw->hw_info.req_buf = NULL; ++free_ep_info: ++ kfree(hw->ep_shm_info); ++ hw->ep_shm_info = NULL; ++ return result; + } + + static void fjes_hw_cleanup(struct fjes_hw *hw) +-- +2.43.0 + diff --git a/queue-6.7/i40e-handle-multi-buffer-packets-that-are-shrunk-by-.patch b/queue-6.7/i40e-handle-multi-buffer-packets-that-are-shrunk-by-.patch new file mode 100644 index 00000000000..5e7d8b084c1 --- /dev/null +++ b/queue-6.7/i40e-handle-multi-buffer-packets-that-are-shrunk-by-.patch @@ -0,0 +1,170 @@ +From 4186d2f90184f83aa949ca818e6eb18bc87b6253 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Wed, 24 Jan 2024 20:15:56 +0100 +Subject: i40e: handle multi-buffer packets that are shrunk by xdp prog + +From: Tirthendu Sarkar + +[ Upstream commit 83014323c642b8faa2d64a5f303b41c019322478 ] + +XDP programs can shrink packets by calling the bpf_xdp_adjust_tail() +helper function. For multi-buffer packets this may lead to reduction of +frag count stored in skb_shared_info area of the xdp_buff struct. This +results in issues with the current handling of XDP_PASS and XDP_DROP +cases. + +For XDP_PASS, currently skb is being built using frag count of +xdp_buffer before it was processed by XDP prog and thus will result in +an inconsistent skb when frag count gets reduced by XDP prog. To fix +this, get correct frag count while building the skb instead of using +pre-obtained frag count. + +For XDP_DROP, current page recycling logic will not reuse the page but +instead will adjust the pagecnt_bias so that the page can be freed. This +again results in inconsistent behavior as the page refcnt has already +been changed by the helper while freeing the frag(s) as part of +shrinking the packet. 
To fix this, only adjust pagecnt_bias for buffers +that are still part of the packet post-xdp prog run. + +Fixes: e213ced19bef ("i40e: add support for XDP multi-buffer Rx") +Reported-by: Maciej Fijalkowski +Signed-off-by: Tirthendu Sarkar +Link: https://lore.kernel.org/r/20240124191602.566724-6-maciej.fijalkowski@intel.com +Signed-off-by: Alexei Starovoitov +Signed-off-by: Sasha Levin +--- + drivers/net/ethernet/intel/i40e/i40e_txrx.c | 40 ++++++++++++--------- + 1 file changed, 23 insertions(+), 17 deletions(-) + +diff --git a/drivers/net/ethernet/intel/i40e/i40e_txrx.c b/drivers/net/ethernet/intel/i40e/i40e_txrx.c +index dd410b15000f..35e1bb6fe5e1 100644 +--- a/drivers/net/ethernet/intel/i40e/i40e_txrx.c ++++ b/drivers/net/ethernet/intel/i40e/i40e_txrx.c +@@ -2099,7 +2099,8 @@ static void i40e_put_rx_buffer(struct i40e_ring *rx_ring, + static void i40e_process_rx_buffs(struct i40e_ring *rx_ring, int xdp_res, + struct xdp_buff *xdp) + { +- u32 next = rx_ring->next_to_clean; ++ u32 nr_frags = xdp_get_shared_info_from_buff(xdp)->nr_frags; ++ u32 next = rx_ring->next_to_clean, i = 0; + struct i40e_rx_buffer *rx_buffer; + + xdp->flags = 0; +@@ -2112,10 +2113,10 @@ static void i40e_process_rx_buffs(struct i40e_ring *rx_ring, int xdp_res, + if (!rx_buffer->page) + continue; + +- if (xdp_res == I40E_XDP_CONSUMED) +- rx_buffer->pagecnt_bias++; +- else ++ if (xdp_res != I40E_XDP_CONSUMED) + i40e_rx_buffer_flip(rx_buffer, xdp->frame_sz); ++ else if (i++ <= nr_frags) ++ rx_buffer->pagecnt_bias++; + + /* EOP buffer will be put in i40e_clean_rx_irq() */ + if (next == rx_ring->next_to_process) +@@ -2129,20 +2130,20 @@ static void i40e_process_rx_buffs(struct i40e_ring *rx_ring, int xdp_res, + * i40e_construct_skb - Allocate skb and populate it + * @rx_ring: rx descriptor ring to transact packets on + * @xdp: xdp_buff pointing to the data +- * @nr_frags: number of buffers for the packet + * + * This function allocates an skb. 
It then populates it with the page + * data from the current receive descriptor, taking care to set up the + * skb correctly. + */ + static struct sk_buff *i40e_construct_skb(struct i40e_ring *rx_ring, +- struct xdp_buff *xdp, +- u32 nr_frags) ++ struct xdp_buff *xdp) + { + unsigned int size = xdp->data_end - xdp->data; + struct i40e_rx_buffer *rx_buffer; ++ struct skb_shared_info *sinfo; + unsigned int headlen; + struct sk_buff *skb; ++ u32 nr_frags = 0; + + /* prefetch first cache line of first page */ + net_prefetch(xdp->data); +@@ -2180,6 +2181,10 @@ static struct sk_buff *i40e_construct_skb(struct i40e_ring *rx_ring, + memcpy(__skb_put(skb, headlen), xdp->data, + ALIGN(headlen, sizeof(long))); + ++ if (unlikely(xdp_buff_has_frags(xdp))) { ++ sinfo = xdp_get_shared_info_from_buff(xdp); ++ nr_frags = sinfo->nr_frags; ++ } + rx_buffer = i40e_rx_bi(rx_ring, rx_ring->next_to_clean); + /* update all of the pointers */ + size -= headlen; +@@ -2199,9 +2204,8 @@ static struct sk_buff *i40e_construct_skb(struct i40e_ring *rx_ring, + } + + if (unlikely(xdp_buff_has_frags(xdp))) { +- struct skb_shared_info *sinfo, *skinfo = skb_shinfo(skb); ++ struct skb_shared_info *skinfo = skb_shinfo(skb); + +- sinfo = xdp_get_shared_info_from_buff(xdp); + memcpy(&skinfo->frags[skinfo->nr_frags], &sinfo->frags[0], + sizeof(skb_frag_t) * nr_frags); + +@@ -2224,17 +2228,17 @@ static struct sk_buff *i40e_construct_skb(struct i40e_ring *rx_ring, + * i40e_build_skb - Build skb around an existing buffer + * @rx_ring: Rx descriptor ring to transact packets on + * @xdp: xdp_buff pointing to the data +- * @nr_frags: number of buffers for the packet + * + * This function builds an skb around an existing Rx buffer, taking care + * to set up the skb correctly and avoid any memcpy overhead. 
+ */ + static struct sk_buff *i40e_build_skb(struct i40e_ring *rx_ring, +- struct xdp_buff *xdp, +- u32 nr_frags) ++ struct xdp_buff *xdp) + { + unsigned int metasize = xdp->data - xdp->data_meta; ++ struct skb_shared_info *sinfo; + struct sk_buff *skb; ++ u32 nr_frags; + + /* Prefetch first cache line of first page. If xdp->data_meta + * is unused, this points exactly as xdp->data, otherwise we +@@ -2243,6 +2247,11 @@ static struct sk_buff *i40e_build_skb(struct i40e_ring *rx_ring, + */ + net_prefetch(xdp->data_meta); + ++ if (unlikely(xdp_buff_has_frags(xdp))) { ++ sinfo = xdp_get_shared_info_from_buff(xdp); ++ nr_frags = sinfo->nr_frags; ++ } ++ + /* build an skb around the page buffer */ + skb = napi_build_skb(xdp->data_hard_start, xdp->frame_sz); + if (unlikely(!skb)) +@@ -2255,9 +2264,6 @@ static struct sk_buff *i40e_build_skb(struct i40e_ring *rx_ring, + skb_metadata_set(skb, metasize); + + if (unlikely(xdp_buff_has_frags(xdp))) { +- struct skb_shared_info *sinfo; +- +- sinfo = xdp_get_shared_info_from_buff(xdp); + xdp_update_skb_shared_info(skb, nr_frags, + sinfo->xdp_frags_size, + nr_frags * xdp->frame_sz, +@@ -2602,9 +2608,9 @@ static int i40e_clean_rx_irq(struct i40e_ring *rx_ring, int budget, + total_rx_bytes += size; + } else { + if (ring_uses_build_skb(rx_ring)) +- skb = i40e_build_skb(rx_ring, xdp, nfrags); ++ skb = i40e_build_skb(rx_ring, xdp); + else +- skb = i40e_construct_skb(rx_ring, xdp, nfrags); ++ skb = i40e_construct_skb(rx_ring, xdp); + + /* drop if we failed to retrieve a buffer */ + if (!skb) { +-- +2.43.0 + diff --git a/queue-6.7/i40e-set-xdp_rxq_info-frag_size.patch b/queue-6.7/i40e-set-xdp_rxq_info-frag_size.patch new file mode 100644 index 00000000000..f5622ab4323 --- /dev/null +++ b/queue-6.7/i40e-set-xdp_rxq_info-frag_size.patch @@ -0,0 +1,130 @@ +From 8a934672dca82b456b04642db14ad547d932075d Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Wed, 24 Jan 2024 20:16:01 +0100 +Subject: i40e: set xdp_rxq_info::frag_size + +From: Maciej 
Fijalkowski + +[ Upstream commit a045d2f2d03d23e7db6772dd83e0ba2705dfad93 ] + +i40e supports XDP multi-buffer so it is supposed to use +__xdp_rxq_info_reg() instead of xdp_rxq_info_reg() and set the +frag_size. It cannot be simply converted at existing callsite because +rx_buf_len could be un-initialized, so let us register xdp_rxq_info +within i40e_configure_rx_ring(), which happens to be called with already +initialized rx_buf_len value. + +Commit 5180ff1364bc ("i40e: use int for i40e_status") converted 'err' to +int, so two variables to deal with return codes are not needed within +i40e_configure_rx_ring(). Remove 'ret' and use 'err' to handle status +from xdp_rxq_info registration. + +Fixes: e213ced19bef ("i40e: add support for XDP multi-buffer Rx") +Signed-off-by: Maciej Fijalkowski +Link: https://lore.kernel.org/r/20240124191602.566724-11-maciej.fijalkowski@intel.com +Signed-off-by: Alexei Starovoitov +Signed-off-by: Sasha Levin +--- + drivers/net/ethernet/intel/i40e/i40e_main.c | 40 ++++++++++++--------- + drivers/net/ethernet/intel/i40e/i40e_txrx.c | 9 ----- + 2 files changed, 24 insertions(+), 25 deletions(-) + +diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c b/drivers/net/ethernet/intel/i40e/i40e_main.c +index d5519af34657..f97a63812141 100644 +--- a/drivers/net/ethernet/intel/i40e/i40e_main.c ++++ b/drivers/net/ethernet/intel/i40e/i40e_main.c +@@ -3588,40 +3588,48 @@ static int i40e_configure_rx_ring(struct i40e_ring *ring) + struct i40e_hmc_obj_rxq rx_ctx; + int err = 0; + bool ok; +- int ret; + + bitmap_zero(ring->state, __I40E_RING_STATE_NBITS); + + /* clear the context structure first */ + memset(&rx_ctx, 0, sizeof(rx_ctx)); + +- if (ring->vsi->type == I40E_VSI_MAIN) +- xdp_rxq_info_unreg_mem_model(&ring->xdp_rxq); ++ ring->rx_buf_len = vsi->rx_buf_len; ++ ++ /* XDP RX-queue info only needed for RX rings exposed to XDP */ ++ if (ring->vsi->type != I40E_VSI_MAIN) ++ goto skip; ++ ++ if (!xdp_rxq_info_is_reg(&ring->xdp_rxq)) { ++ err =
__xdp_rxq_info_reg(&ring->xdp_rxq, ring->netdev, ++ ring->queue_index, ++ ring->q_vector->napi.napi_id, ++ ring->rx_buf_len); ++ if (err) ++ return err; ++ } + + ring->xsk_pool = i40e_xsk_pool(ring); + if (ring->xsk_pool) { +- ring->rx_buf_len = +- xsk_pool_get_rx_frame_size(ring->xsk_pool); +- ret = xdp_rxq_info_reg_mem_model(&ring->xdp_rxq, ++ ring->rx_buf_len = xsk_pool_get_rx_frame_size(ring->xsk_pool); ++ err = xdp_rxq_info_reg_mem_model(&ring->xdp_rxq, + MEM_TYPE_XSK_BUFF_POOL, + NULL); +- if (ret) +- return ret; ++ if (err) ++ return err; + dev_info(&vsi->back->pdev->dev, + "Registered XDP mem model MEM_TYPE_XSK_BUFF_POOL on Rx ring %d\n", + ring->queue_index); + + } else { +- ring->rx_buf_len = vsi->rx_buf_len; +- if (ring->vsi->type == I40E_VSI_MAIN) { +- ret = xdp_rxq_info_reg_mem_model(&ring->xdp_rxq, +- MEM_TYPE_PAGE_SHARED, +- NULL); +- if (ret) +- return ret; +- } ++ err = xdp_rxq_info_reg_mem_model(&ring->xdp_rxq, ++ MEM_TYPE_PAGE_SHARED, ++ NULL); ++ if (err) ++ return err; + } + ++skip: + xdp_init_buff(&ring->xdp, i40e_rx_pg_size(ring) / 2, &ring->xdp_rxq); + + rx_ctx.dbuff = DIV_ROUND_UP(ring->rx_buf_len, +diff --git a/drivers/net/ethernet/intel/i40e/i40e_txrx.c b/drivers/net/ethernet/intel/i40e/i40e_txrx.c +index 35e1bb6fe5e1..071ef309a3a4 100644 +--- a/drivers/net/ethernet/intel/i40e/i40e_txrx.c ++++ b/drivers/net/ethernet/intel/i40e/i40e_txrx.c +@@ -1555,7 +1555,6 @@ void i40e_free_rx_resources(struct i40e_ring *rx_ring) + int i40e_setup_rx_descriptors(struct i40e_ring *rx_ring) + { + struct device *dev = rx_ring->dev; +- int err; + + u64_stats_init(&rx_ring->syncp); + +@@ -1576,14 +1575,6 @@ int i40e_setup_rx_descriptors(struct i40e_ring *rx_ring) + rx_ring->next_to_process = 0; + rx_ring->next_to_use = 0; + +- /* XDP RX-queue info only needed for RX rings exposed to XDP */ +- if (rx_ring->vsi->type == I40E_VSI_MAIN) { +- err = xdp_rxq_info_reg(&rx_ring->xdp_rxq, rx_ring->netdev, +- rx_ring->queue_index, rx_ring->q_vector->napi.napi_id); +- if 
(err < 0) +- return err; +- } +- + rx_ring->xdp_prog = rx_ring->vsi->xdp_prog; + + rx_ring->rx_bi = +-- +2.43.0 + diff --git a/queue-6.7/i40e-update-xdp_rxq_info-frag_size-for-zc-enabled-rx.patch b/queue-6.7/i40e-update-xdp_rxq_info-frag_size-for-zc-enabled-rx.patch new file mode 100644 index 00000000000..9e6beb7bfec --- /dev/null +++ b/queue-6.7/i40e-update-xdp_rxq_info-frag_size-for-zc-enabled-rx.patch @@ -0,0 +1,46 @@ +From 27105d0dd212b950eacaae0e22bf6cccdf54c566 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Wed, 24 Jan 2024 20:16:02 +0100 +Subject: i40e: update xdp_rxq_info::frag_size for ZC enabled Rx queue + +From: Maciej Fijalkowski + +[ Upstream commit 0cbb08707c932b3f004bc1a8ec6200ef572c1f5f ] + +Now that i40e driver correctly sets up frag_size in xdp_rxq_info, let us +make it work for ZC multi-buffer as well. i40e_ring::rx_buf_len for ZC +is being set via xsk_pool_get_rx_frame_size() and this needs to be +propagated up to xdp_rxq_info. + +Fixes: 1c9ba9c14658 ("i40e: xsk: add RX multi-buffer support") +Acked-by: Magnus Karlsson +Signed-off-by: Maciej Fijalkowski +Link: https://lore.kernel.org/r/20240124191602.566724-12-maciej.fijalkowski@intel.com +Signed-off-by: Alexei Starovoitov +Signed-off-by: Sasha Levin +--- + drivers/net/ethernet/intel/i40e/i40e_main.c | 7 +++++++ + 1 file changed, 7 insertions(+) + +diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c b/drivers/net/ethernet/intel/i40e/i40e_main.c +index f97a63812141..2bd7b29fb251 100644 +--- a/drivers/net/ethernet/intel/i40e/i40e_main.c ++++ b/drivers/net/ethernet/intel/i40e/i40e_main.c +@@ -3611,7 +3611,14 @@ static int i40e_configure_rx_ring(struct i40e_ring *ring) + + ring->xsk_pool = i40e_xsk_pool(ring); + if (ring->xsk_pool) { ++ xdp_rxq_info_unreg(&ring->xdp_rxq); + ring->rx_buf_len = xsk_pool_get_rx_frame_size(ring->xsk_pool); ++ err = __xdp_rxq_info_reg(&ring->xdp_rxq, ring->netdev, ++ ring->queue_index, ++ ring->q_vector->napi.napi_id, ++ ring->rx_buf_len); ++ if (err) ++ return 
err; + err = xdp_rxq_info_reg_mem_model(&ring->xdp_rxq, + MEM_TYPE_XSK_BUFF_POOL, + NULL); +-- +2.43.0 + diff --git a/queue-6.7/ice-remove-redundant-xdp_rxq_info-registration.patch b/queue-6.7/ice-remove-redundant-xdp_rxq_info-registration.patch new file mode 100644 index 00000000000..b7db021fcbf --- /dev/null +++ b/queue-6.7/ice-remove-redundant-xdp_rxq_info-registration.patch @@ -0,0 +1,58 @@ +From 6567d90f1860790c7a73d15c47a2d8cfa8de7aae Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Wed, 24 Jan 2024 20:15:57 +0100 +Subject: ice: remove redundant xdp_rxq_info registration + +From: Maciej Fijalkowski + +[ Upstream commit 2ee788c06493d02ee85855414cca39825e768aaf ] + +xdp_rxq_info struct can be registered by drivers via two functions - +xdp_rxq_info_reg() and __xdp_rxq_info_reg(). The latter one allows +drivers that support XDP multi-buffer to set up xdp_rxq_info::frag_size +which in turn will make it possible to grow the packet via +bpf_xdp_adjust_tail() BPF helper. + +Currently, ice registers xdp_rxq_info in two spots: +1) ice_setup_rx_ring() // via xdp_rxq_info_reg(), BUG +2) ice_vsi_cfg_rxq() // via __xdp_rxq_info_reg(), OK + +Cited commit under fixes tag took care of setting up frag_size and +updated registration scheme in 2) but it did not help as +1) is called before 2) and as shown above it uses old registration +function. This means that 2) sees that xdp_rxq_info is already +registered and never calls __xdp_rxq_info_reg() which leaves us with +xdp_rxq_info::frag_size being set to 0. + +To fix this misbehavior, simply remove xdp_rxq_info_reg() call from +ice_setup_rx_ring(). 
+ +Fixes: 2fba7dc5157b ("ice: Add support for XDP multi-buffer on Rx side") +Acked-by: Magnus Karlsson +Signed-off-by: Maciej Fijalkowski +Link: https://lore.kernel.org/r/20240124191602.566724-7-maciej.fijalkowski@intel.com +Signed-off-by: Alexei Starovoitov +Signed-off-by: Sasha Levin +--- + drivers/net/ethernet/intel/ice/ice_txrx.c | 5 ----- + 1 file changed, 5 deletions(-) + +diff --git a/drivers/net/ethernet/intel/ice/ice_txrx.c b/drivers/net/ethernet/intel/ice/ice_txrx.c +index 6878448ba112..9170a3e8f088 100644 +--- a/drivers/net/ethernet/intel/ice/ice_txrx.c ++++ b/drivers/net/ethernet/intel/ice/ice_txrx.c +@@ -513,11 +513,6 @@ int ice_setup_rx_ring(struct ice_rx_ring *rx_ring) + if (ice_is_xdp_ena_vsi(rx_ring->vsi)) + WRITE_ONCE(rx_ring->xdp_prog, rx_ring->vsi->xdp_prog); + +- if (rx_ring->vsi->type == ICE_VSI_PF && +- !xdp_rxq_info_is_reg(&rx_ring->xdp_rxq)) +- if (xdp_rxq_info_reg(&rx_ring->xdp_rxq, rx_ring->netdev, +- rx_ring->q_index, rx_ring->q_vector->napi.napi_id)) +- goto err; + return 0; + + err: +-- +2.43.0 + diff --git a/queue-6.7/ice-update-xdp_rxq_info-frag_size-for-zc-enabled-rx-.patch b/queue-6.7/ice-update-xdp_rxq_info-frag_size-for-zc-enabled-rx-.patch new file mode 100644 index 00000000000..971cebf3c44 --- /dev/null +++ b/queue-6.7/ice-update-xdp_rxq_info-frag_size-for-zc-enabled-rx-.patch @@ -0,0 +1,91 @@ +From ba8440c493d603f075c11edb241c244ce6a007fa Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Wed, 24 Jan 2024 20:15:59 +0100 +Subject: ice: update xdp_rxq_info::frag_size for ZC enabled Rx queue + +From: Maciej Fijalkowski + +[ Upstream commit 3de38c87174225487fc93befeea7d380db80aef6 ] + +Now that ice driver correctly sets up frag_size in xdp_rxq_info, let us +make it work for ZC multi-buffer as well. ice_rx_ring::rx_buf_len for ZC +is being set via xsk_pool_get_rx_frame_size() and this needs to be +propagated up to xdp_rxq_info. 
+ +Use a bigger hammer and instead of unregistering only xdp_rxq_info's +memory model, unregister it altogether and register it again and have +xdp_rxq_info with correct frag_size value. + +Fixes: 1bbc04de607b ("ice: xsk: add RX multi-buffer support") +Signed-off-by: Maciej Fijalkowski +Link: https://lore.kernel.org/r/20240124191602.566724-9-maciej.fijalkowski@intel.com +Signed-off-by: Alexei Starovoitov +Signed-off-by: Sasha Levin +--- + drivers/net/ethernet/intel/ice/ice_base.c | 37 ++++++++++++++--------- + 1 file changed, 23 insertions(+), 14 deletions(-) + +diff --git a/drivers/net/ethernet/intel/ice/ice_base.c b/drivers/net/ethernet/intel/ice/ice_base.c +index 7fa43827a3f0..4f3e65b47cdc 100644 +--- a/drivers/net/ethernet/intel/ice/ice_base.c ++++ b/drivers/net/ethernet/intel/ice/ice_base.c +@@ -534,19 +534,27 @@ int ice_vsi_cfg_rxq(struct ice_rx_ring *ring) + ring->rx_buf_len = ring->vsi->rx_buf_len; + + if (ring->vsi->type == ICE_VSI_PF) { +- if (!xdp_rxq_info_is_reg(&ring->xdp_rxq)) +- /* coverity[check_return] */ +- __xdp_rxq_info_reg(&ring->xdp_rxq, ring->netdev, +- ring->q_index, +- ring->q_vector->napi.napi_id, +- ring->vsi->rx_buf_len); ++ if (!xdp_rxq_info_is_reg(&ring->xdp_rxq)) { ++ err = __xdp_rxq_info_reg(&ring->xdp_rxq, ring->netdev, ++ ring->q_index, ++ ring->q_vector->napi.napi_id, ++ ring->rx_buf_len); ++ if (err) ++ return err; ++ } + + ring->xsk_pool = ice_xsk_pool(ring); + if (ring->xsk_pool) { +- xdp_rxq_info_unreg_mem_model(&ring->xdp_rxq); ++ xdp_rxq_info_unreg(&ring->xdp_rxq); + + ring->rx_buf_len = + xsk_pool_get_rx_frame_size(ring->xsk_pool); ++ err = __xdp_rxq_info_reg(&ring->xdp_rxq, ring->netdev, ++ ring->q_index, ++ ring->q_vector->napi.napi_id, ++ ring->rx_buf_len); ++ if (err) ++ return err; + err = xdp_rxq_info_reg_mem_model(&ring->xdp_rxq, + MEM_TYPE_XSK_BUFF_POOL, + NULL); +@@ -557,13 +565,14 @@ int ice_vsi_cfg_rxq(struct ice_rx_ring *ring) + dev_info(dev, "Registered XDP mem model MEM_TYPE_XSK_BUFF_POOL on Rx ring %d\n", + 
ring->q_index); + } else { +- if (!xdp_rxq_info_is_reg(&ring->xdp_rxq)) +- /* coverity[check_return] */ +- __xdp_rxq_info_reg(&ring->xdp_rxq, +- ring->netdev, +- ring->q_index, +- ring->q_vector->napi.napi_id, +- ring->vsi->rx_buf_len); ++ if (!xdp_rxq_info_is_reg(&ring->xdp_rxq)) { ++ err = __xdp_rxq_info_reg(&ring->xdp_rxq, ring->netdev, ++ ring->q_index, ++ ring->q_vector->napi.napi_id, ++ ring->rx_buf_len); ++ if (err) ++ return err; ++ } + + err = xdp_rxq_info_reg_mem_model(&ring->xdp_rxq, + MEM_TYPE_PAGE_SHARED, +-- +2.43.0 + diff --git a/queue-6.7/ice-work-on-pre-xdp-prog-frag-count.patch b/queue-6.7/ice-work-on-pre-xdp-prog-frag-count.patch new file mode 100644 index 00000000000..4d842efb99d --- /dev/null +++ b/queue-6.7/ice-work-on-pre-xdp-prog-frag-count.patch @@ -0,0 +1,170 @@ +From 62da34a963fc7911cae3aa180d1e61801a97258d Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Wed, 24 Jan 2024 20:15:55 +0100 +Subject: ice: work on pre-XDP prog frag count + +From: Maciej Fijalkowski + +[ Upstream commit ad2047cf5d9313200e308612aed516548873d124 ] + +Fix an OOM panic in XDP_DRV mode when a XDP program shrinks a +multi-buffer packet by 4k bytes and then redirects it to an AF_XDP +socket. + +Since support for handling multi-buffer frames was added to XDP, usage +of bpf_xdp_adjust_tail() helper within XDP program can free the page +that given fragment occupies and in turn decrease the fragment count +within skb_shared_info that is embedded in xdp_buff struct. In current +ice driver codebase, it can become problematic when page recycling logic +decides not to reuse the page. In such case, __page_frag_cache_drain() +is used with ice_rx_buf::pagecnt_bias that was not adjusted after +refcount of page was changed by XDP prog which in turn does not drain +the refcount to 0 and page is never freed. + +To address this, let us store the count of frags before the XDP program +was executed on Rx ring struct. 
This will be used to compare with +current frag count from skb_shared_info embedded in xdp_buff. A smaller +value in the latter indicates that XDP prog freed frag(s). Then, for +given delta decrement pagecnt_bias for XDP_DROP verdict. + +While at it, let us also handle the EOP frag within +ice_set_rx_bufs_act() to make our life easier, so all of the adjustments +needed to be applied against freed frags are performed in the single +place. + +Fixes: 2fba7dc5157b ("ice: Add support for XDP multi-buffer on Rx side") +Acked-by: Magnus Karlsson +Signed-off-by: Maciej Fijalkowski +Link: https://lore.kernel.org/r/20240124191602.566724-5-maciej.fijalkowski@intel.com +Signed-off-by: Alexei Starovoitov +Signed-off-by: Sasha Levin +--- + drivers/net/ethernet/intel/ice/ice_txrx.c | 14 ++++++--- + drivers/net/ethernet/intel/ice/ice_txrx.h | 1 + + drivers/net/ethernet/intel/ice/ice_txrx_lib.h | 31 +++++++++++++------ + 3 files changed, 32 insertions(+), 14 deletions(-) + +diff --git a/drivers/net/ethernet/intel/ice/ice_txrx.c b/drivers/net/ethernet/intel/ice/ice_txrx.c +index 9e97ea863068..6878448ba112 100644 +--- a/drivers/net/ethernet/intel/ice/ice_txrx.c ++++ b/drivers/net/ethernet/intel/ice/ice_txrx.c +@@ -600,9 +600,7 @@ ice_run_xdp(struct ice_rx_ring *rx_ring, struct xdp_buff *xdp, + ret = ICE_XDP_CONSUMED; + } + exit: +- rx_buf->act = ret; +- if (unlikely(xdp_buff_has_frags(xdp))) +- ice_set_rx_bufs_act(xdp, rx_ring, ret); ++ ice_set_rx_bufs_act(xdp, rx_ring, ret); + } + + /** +@@ -890,14 +888,17 @@ ice_add_xdp_frag(struct ice_rx_ring *rx_ring, struct xdp_buff *xdp, + } + + if (unlikely(sinfo->nr_frags == MAX_SKB_FRAGS)) { +- if (unlikely(xdp_buff_has_frags(xdp))) +- ice_set_rx_bufs_act(xdp, rx_ring, ICE_XDP_CONSUMED); ++ ice_set_rx_bufs_act(xdp, rx_ring, ICE_XDP_CONSUMED); + return -ENOMEM; + } + + __skb_fill_page_desc_noacc(sinfo, sinfo->nr_frags++, rx_buf->page, + rx_buf->page_offset, size); + sinfo->xdp_frags_size += size; ++ /* remember frag count before XDP prog 
execution; bpf_xdp_adjust_tail() ++ * can pop off frags but driver has to handle it on its own ++ */ ++ rx_ring->nr_frags = sinfo->nr_frags; + + if (page_is_pfmemalloc(rx_buf->page)) + xdp_buff_set_frag_pfmemalloc(xdp); +@@ -1249,6 +1250,7 @@ int ice_clean_rx_irq(struct ice_rx_ring *rx_ring, int budget) + + xdp->data = NULL; + rx_ring->first_desc = ntc; ++ rx_ring->nr_frags = 0; + continue; + construct_skb: + if (likely(ice_ring_uses_build_skb(rx_ring))) +@@ -1264,10 +1266,12 @@ int ice_clean_rx_irq(struct ice_rx_ring *rx_ring, int budget) + ICE_XDP_CONSUMED); + xdp->data = NULL; + rx_ring->first_desc = ntc; ++ rx_ring->nr_frags = 0; + break; + } + xdp->data = NULL; + rx_ring->first_desc = ntc; ++ rx_ring->nr_frags = 0; + + stat_err_bits = BIT(ICE_RX_FLEX_DESC_STATUS0_RXE_S); + if (unlikely(ice_test_staterr(rx_desc->wb.status_error0, +diff --git a/drivers/net/ethernet/intel/ice/ice_txrx.h b/drivers/net/ethernet/intel/ice/ice_txrx.h +index daf7b9dbb143..b28b9826bbcd 100644 +--- a/drivers/net/ethernet/intel/ice/ice_txrx.h ++++ b/drivers/net/ethernet/intel/ice/ice_txrx.h +@@ -333,6 +333,7 @@ struct ice_rx_ring { + struct ice_channel *ch; + struct ice_tx_ring *xdp_ring; + struct xsk_buff_pool *xsk_pool; ++ u32 nr_frags; + dma_addr_t dma; /* physical address of ring */ + u64 cached_phctime; + u16 rx_buf_len; +diff --git a/drivers/net/ethernet/intel/ice/ice_txrx_lib.h b/drivers/net/ethernet/intel/ice/ice_txrx_lib.h +index 115969ecdf7b..b0e56675f98b 100644 +--- a/drivers/net/ethernet/intel/ice/ice_txrx_lib.h ++++ b/drivers/net/ethernet/intel/ice/ice_txrx_lib.h +@@ -12,26 +12,39 @@ + * act: action to store onto Rx buffers related to XDP buffer parts + * + * Set action that should be taken before putting Rx buffer from first frag +- * to one before last. Last one is handled by caller of this function as it +- * is the EOP frag that is currently being processed. This function is +- * supposed to be called only when XDP buffer contains frags. ++ * to the last. 
+ */ + static inline void + ice_set_rx_bufs_act(struct xdp_buff *xdp, const struct ice_rx_ring *rx_ring, + const unsigned int act) + { +- const struct skb_shared_info *sinfo = xdp_get_shared_info_from_buff(xdp); +- u32 first = rx_ring->first_desc; +- u32 nr_frags = sinfo->nr_frags; ++ u32 sinfo_frags = xdp_get_shared_info_from_buff(xdp)->nr_frags; ++ u32 nr_frags = rx_ring->nr_frags + 1; ++ u32 idx = rx_ring->first_desc; + u32 cnt = rx_ring->count; + struct ice_rx_buf *buf; + + for (int i = 0; i < nr_frags; i++) { +- buf = &rx_ring->rx_buf[first]; ++ buf = &rx_ring->rx_buf[idx]; + buf->act = act; + +- if (++first == cnt) +- first = 0; ++ if (++idx == cnt) ++ idx = 0; ++ } ++ ++ /* adjust pagecnt_bias on frags freed by XDP prog */ ++ if (sinfo_frags < rx_ring->nr_frags && act == ICE_XDP_CONSUMED) { ++ u32 delta = rx_ring->nr_frags - sinfo_frags; ++ ++ while (delta) { ++ if (idx == 0) ++ idx = cnt - 1; ++ else ++ idx--; ++ buf = &rx_ring->rx_buf[idx]; ++ buf->pagecnt_bias--; ++ delta--; ++ } + } + } + +-- +2.43.0 + diff --git a/queue-6.7/idpf-distinguish-vports-by-the-dev_port-attribute.patch b/queue-6.7/idpf-distinguish-vports-by-the-dev_port-attribute.patch new file mode 100644 index 00000000000..fd14c56579b --- /dev/null +++ b/queue-6.7/idpf-distinguish-vports-by-the-dev_port-attribute.patch @@ -0,0 +1,52 @@ +From 5d5966086cb8bd78aab1a1b25b336edea51fe324 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Thu, 18 Jan 2024 21:50:40 +0100 +Subject: idpf: distinguish vports by the dev_port attribute + +From: Michal Schmidt + +[ Upstream commit 359724fa3ab79fbe9f42c6263cddc2afae32eef3 ] + +idpf registers multiple netdevs (virtual ports) for one PCI function, +but it does not provide a way for userspace to distinguish them with +sysfs attributes. Per Documentation/ABI/testing/sysfs-class-net, it is +a bug not to set dev_port for independent ports on the same PCI bus, +device and function. 
+ +Without dev_port set, systemd-udevd's default naming policy attempts +to assign the same name ("ens2f0") to all four idpf netdevs on my test +system and obviously fails, leaving three of them with the initial +eth name. + +With this patch, systemd-udevd is able to assign unique names to the +netdevs (e.g. "ens2f0", "ens2f0d1", "ens2f0d2", "ens2f0d3"). + +The Intel-provided out-of-tree idpf driver already sets dev_port. In +this patch I chose to do it in the same place in the idpf_cfg_netdev +function. + +Fixes: 0fe45467a104 ("idpf: add create vport and netdev configuration") +Signed-off-by: Michal Schmidt +Reviewed-by: Jesse Brandeburg +Signed-off-by: David S. Miller +Signed-off-by: Sasha Levin +--- + drivers/net/ethernet/intel/idpf/idpf_lib.c | 2 ++ + 1 file changed, 2 insertions(+) + +diff --git a/drivers/net/ethernet/intel/idpf/idpf_lib.c b/drivers/net/ethernet/intel/idpf/idpf_lib.c +index 19809b0ddcd9..0241e498cc20 100644 +--- a/drivers/net/ethernet/intel/idpf/idpf_lib.c ++++ b/drivers/net/ethernet/intel/idpf/idpf_lib.c +@@ -783,6 +783,8 @@ static int idpf_cfg_netdev(struct idpf_vport *vport) + /* setup watchdog timeout value to be 5 second */ + netdev->watchdog_timeo = 5 * HZ; + ++ netdev->dev_port = idx; ++ + /* configure default MTU size */ + netdev->min_mtu = ETH_MIN_MTU; + netdev->max_mtu = vport->max_mtu; +-- +2.43.0 + diff --git a/queue-6.7/intel-xsk-initialize-skb_frag_t-bv_offset-in-zc-driv.patch b/queue-6.7/intel-xsk-initialize-skb_frag_t-bv_offset-in-zc-driv.patch new file mode 100644 index 00000000000..c441dda5a91 --- /dev/null +++ b/queue-6.7/intel-xsk-initialize-skb_frag_t-bv_offset-in-zc-driv.patch @@ -0,0 +1,60 @@ +From 0166c869022f93a949db8088d9cffb95e3db16bf Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Wed, 24 Jan 2024 20:15:58 +0100 +Subject: intel: xsk: initialize skb_frag_t::bv_offset in ZC drivers + +From: Maciej Fijalkowski + +[ Upstream commit 290779905d09d5fdf6caa4f58ddefc3f4db0c0a9 ] + +Ice and i40e ZC drivers currently set 
offset of a frag within +skb_shared_info to 0, which is incorrect. xdp_buffs that come from +xsk_buff_pool always have 256 bytes of a headroom, so they need to be +taken into account to retrieve xdp_buff::data via skb_frag_address(). +Otherwise, bpf_xdp_frags_increase_tail() would be starting its job from +xdp_buff::data_hard_start which would result in overwriting existing +payload. + +Fixes: 1c9ba9c14658 ("i40e: xsk: add RX multi-buffer support") +Fixes: 1bbc04de607b ("ice: xsk: add RX multi-buffer support") +Acked-by: Magnus Karlsson +Signed-off-by: Maciej Fijalkowski +Link: https://lore.kernel.org/r/20240124191602.566724-8-maciej.fijalkowski@intel.com +Signed-off-by: Alexei Starovoitov +Signed-off-by: Sasha Levin +--- + drivers/net/ethernet/intel/i40e/i40e_xsk.c | 3 ++- + drivers/net/ethernet/intel/ice/ice_xsk.c | 3 ++- + 2 files changed, 4 insertions(+), 2 deletions(-) + +diff --git a/drivers/net/ethernet/intel/i40e/i40e_xsk.c b/drivers/net/ethernet/intel/i40e/i40e_xsk.c +index fede0bb3e047..65f38a57b3df 100644 +--- a/drivers/net/ethernet/intel/i40e/i40e_xsk.c ++++ b/drivers/net/ethernet/intel/i40e/i40e_xsk.c +@@ -414,7 +414,8 @@ i40e_add_xsk_frag(struct i40e_ring *rx_ring, struct xdp_buff *first, + } + + __skb_fill_page_desc_noacc(sinfo, sinfo->nr_frags++, +- virt_to_page(xdp->data_hard_start), 0, size); ++ virt_to_page(xdp->data_hard_start), ++ XDP_PACKET_HEADROOM, size); + sinfo->xdp_frags_size += size; + xsk_buff_add_frag(xdp); + +diff --git a/drivers/net/ethernet/intel/ice/ice_xsk.c b/drivers/net/ethernet/intel/ice/ice_xsk.c +index 951f84bfdf2b..f3663b3f6390 100644 +--- a/drivers/net/ethernet/intel/ice/ice_xsk.c ++++ b/drivers/net/ethernet/intel/ice/ice_xsk.c +@@ -820,7 +820,8 @@ ice_add_xsk_frag(struct ice_rx_ring *rx_ring, struct xdp_buff *first, + } + + __skb_fill_page_desc_noacc(sinfo, sinfo->nr_frags++, +- virt_to_page(xdp->data_hard_start), 0, size); ++ virt_to_page(xdp->data_hard_start), ++ XDP_PACKET_HEADROOM, size); + sinfo->xdp_frags_size += 
size; + xsk_buff_add_frag(xdp); + +-- +2.43.0 + diff --git a/queue-6.7/ipv6-init-the-accept_queue-s-spinlocks-in-inet6_crea.patch b/queue-6.7/ipv6-init-the-accept_queue-s-spinlocks-in-inet6_crea.patch new file mode 100644 index 00000000000..d173cb42353 --- /dev/null +++ b/queue-6.7/ipv6-init-the-accept_queue-s-spinlocks-in-inet6_crea.patch @@ -0,0 +1,70 @@ +From eae4daf6f79b22f71f6a35ab4233136066e16449 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Mon, 22 Jan 2024 18:20:01 +0800 +Subject: ipv6: init the accept_queue's spinlocks in inet6_create + +From: Zhengchao Shao + +[ Upstream commit 435e202d645c197dcfd39d7372eb2a56529b6640 ] + +In commit 198bc90e0e73("tcp: make sure init the accept_queue's spinlocks +once"), the spinlocks of accept_queue are initialized only when socket is +created in the inet4 scenario. The locks are not initialized when socket +is created in the inet6 scenario. The kernel reports the following error: +INFO: trying to register non-static key. +The code is fine but needs lockdep annotation, or maybe +you didn't initialize this object before use? +turning off the locking correctness validator. 
+Hardware name: Red Hat KVM, BIOS 0.5.1 01/01/2011 +Call Trace: + + dump_stack_lvl (lib/dump_stack.c:107) + register_lock_class (kernel/locking/lockdep.c:1289) + __lock_acquire (kernel/locking/lockdep.c:5015) + lock_acquire.part.0 (kernel/locking/lockdep.c:5756) + _raw_spin_lock_bh (kernel/locking/spinlock.c:178) + inet_csk_listen_stop (net/ipv4/inet_connection_sock.c:1386) + tcp_disconnect (net/ipv4/tcp.c:2981) + inet_shutdown (net/ipv4/af_inet.c:935) + __sys_shutdown (./include/linux/file.h:32 net/socket.c:2438) + __x64_sys_shutdown (net/socket.c:2445) + do_syscall_64 (arch/x86/entry/common.c:52) + entry_SYSCALL_64_after_hwframe (arch/x86/entry/entry_64.S:129) +RIP: 0033:0x7f52ecd05a3d +Code: 5b 41 5c c3 66 0f 1f 84 00 00 00 00 00 f3 0f 1e fa 48 89 f8 48 89 f7 +48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff +ff 73 01 c3 48 8b 0d ab a3 0e 00 f7 d8 64 89 01 48 +RSP: 002b:00007f52ecf5dde8 EFLAGS: 00000293 ORIG_RAX: 0000000000000030 +RAX: ffffffffffffffda RBX: 00007f52ecf5e640 RCX: 00007f52ecd05a3d +RDX: 00007f52ecc8b188 RSI: 0000000000000000 RDI: 0000000000000004 +RBP: 00007f52ecf5de20 R08: 00007ffdae45c69f R09: 0000000000000000 +R10: 0000000000000000 R11: 0000000000000293 R12: 00007f52ecf5e640 +R13: 0000000000000000 R14: 00007f52ecc8b060 R15: 00007ffdae45c6e0 + +Fixes: 198bc90e0e73 ("tcp: make sure init the accept_queue's spinlocks once") +Signed-off-by: Zhengchao Shao +Reviewed-by: Eric Dumazet +Link: https://lore.kernel.org/r/20240122102001.2851701-1-shaozhengchao@huawei.com +Signed-off-by: Paolo Abeni +Signed-off-by: Sasha Levin +--- + net/ipv6/af_inet6.c | 3 +++ + 1 file changed, 3 insertions(+) + +diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c +index 13a1833a4df5..959bfd9f6344 100644 +--- a/net/ipv6/af_inet6.c ++++ b/net/ipv6/af_inet6.c +@@ -199,6 +199,9 @@ static int inet6_create(struct net *net, struct socket *sock, int protocol, + if (INET_PROTOSW_REUSE & answer_flags) + sk->sk_reuse = SK_CAN_REUSE; + ++ if 
(INET_PROTOSW_ICSK & answer_flags) ++ inet_init_csk_locks(sk); ++ + inet = inet_sk(sk); + inet_assign_bit(IS_ICSK, sk, INET_PROTOSW_ICSK & answer_flags); + +-- +2.43.0 + diff --git a/queue-6.7/llc-drop-support-for-eth_p_tr_802_2.patch b/queue-6.7/llc-drop-support-for-eth_p_tr_802_2.patch new file mode 100644 index 00000000000..0020174b8e3 --- /dev/null +++ b/queue-6.7/llc-drop-support-for-eth_p_tr_802_2.patch @@ -0,0 +1,130 @@ +From 3c2fb71fcd92c98c689bdd8a2a2c278559759d4d Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Thu, 18 Jan 2024 17:55:15 -0800 +Subject: llc: Drop support for ETH_P_TR_802_2. + +From: Kuniyuki Iwashima + +[ Upstream commit e3f9bed9bee261e3347131764e42aeedf1ffea61 ] + +syzbot reported an uninit-value bug below. [0] + +llc supports ETH_P_802_2 (0x0004) and used to support ETH_P_TR_802_2 +(0x0011), and syzbot abused the latter to trigger the bug. + + write$tun(r0, &(0x7f0000000040)={@val={0x0, 0x11}, @val, @mpls={[], @llc={@snap={0xaa, 0x1, ')', "90e5dd"}}}}, 0x16) + +llc_conn_handler() initialises local variables {saddr,daddr}.mac +based on skb in llc_pdu_decode_sa()/llc_pdu_decode_da() and passes +them to __llc_lookup(). + +However, the initialisation is done only when skb->protocol is +htons(ETH_P_802_2), otherwise, __llc_lookup_established() and +__llc_lookup_listener() will read garbage. + +The missing initialisation existed prior to commit 211ed865108e +("net: delete all instances of special processing for token ring"). + +It removed the part to kick out the token ring stuff but forgot to +close the door allowing ETH_P_TR_802_2 packets to sneak into llc_rcv(). + +Let's remove llc_tr_packet_type and complete the deprecation. 
+ +[0]: +BUG: KMSAN: uninit-value in __llc_lookup_established+0xe9d/0xf90 + __llc_lookup_established+0xe9d/0xf90 + __llc_lookup net/llc/llc_conn.c:611 [inline] + llc_conn_handler+0x4bd/0x1360 net/llc/llc_conn.c:791 + llc_rcv+0xfbb/0x14a0 net/llc/llc_input.c:206 + __netif_receive_skb_one_core net/core/dev.c:5527 [inline] + __netif_receive_skb+0x1a6/0x5a0 net/core/dev.c:5641 + netif_receive_skb_internal net/core/dev.c:5727 [inline] + netif_receive_skb+0x58/0x660 net/core/dev.c:5786 + tun_rx_batched+0x3ee/0x980 drivers/net/tun.c:1555 + tun_get_user+0x53af/0x66d0 drivers/net/tun.c:2002 + tun_chr_write_iter+0x3af/0x5d0 drivers/net/tun.c:2048 + call_write_iter include/linux/fs.h:2020 [inline] + new_sync_write fs/read_write.c:491 [inline] + vfs_write+0x8ef/0x1490 fs/read_write.c:584 + ksys_write+0x20f/0x4c0 fs/read_write.c:637 + __do_sys_write fs/read_write.c:649 [inline] + __se_sys_write fs/read_write.c:646 [inline] + __x64_sys_write+0x93/0xd0 fs/read_write.c:646 + do_syscall_x64 arch/x86/entry/common.c:51 [inline] + do_syscall_64+0x44/0x110 arch/x86/entry/common.c:82 + entry_SYSCALL_64_after_hwframe+0x63/0x6b + +Local variable daddr created at: + llc_conn_handler+0x53/0x1360 net/llc/llc_conn.c:783 + llc_rcv+0xfbb/0x14a0 net/llc/llc_input.c:206 + +CPU: 1 PID: 5004 Comm: syz-executor994 Not tainted 6.6.0-syzkaller-14500-g1c41041124bd #0 +Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 10/09/2023 + +Fixes: 211ed865108e ("net: delete all instances of special processing for token ring") +Reported-by: syzbot+b5ad66046b913bc04c6f@syzkaller.appspotmail.com +Closes: https://syzkaller.appspot.com/bug?extid=b5ad66046b913bc04c6f +Signed-off-by: Kuniyuki Iwashima +Reviewed-by: Eric Dumazet +Link: https://lore.kernel.org/r/20240119015515.61898-1-kuniyu@amazon.com +Signed-off-by: Jakub Kicinski +Signed-off-by: Sasha Levin +--- + include/net/llc_pdu.h | 6 ++---- + net/llc/llc_core.c | 7 ------- + 2 files changed, 2 insertions(+), 11 deletions(-) + +diff 
--git a/include/net/llc_pdu.h b/include/net/llc_pdu.h +index 7e73f8e5e497..1d55ba7c45be 100644 +--- a/include/net/llc_pdu.h ++++ b/include/net/llc_pdu.h +@@ -262,8 +262,7 @@ static inline void llc_pdu_header_init(struct sk_buff *skb, u8 type, + */ + static inline void llc_pdu_decode_sa(struct sk_buff *skb, u8 *sa) + { +- if (skb->protocol == htons(ETH_P_802_2)) +- memcpy(sa, eth_hdr(skb)->h_source, ETH_ALEN); ++ memcpy(sa, eth_hdr(skb)->h_source, ETH_ALEN); + } + + /** +@@ -275,8 +274,7 @@ static inline void llc_pdu_decode_sa(struct sk_buff *skb, u8 *sa) + */ + static inline void llc_pdu_decode_da(struct sk_buff *skb, u8 *da) + { +- if (skb->protocol == htons(ETH_P_802_2)) +- memcpy(da, eth_hdr(skb)->h_dest, ETH_ALEN); ++ memcpy(da, eth_hdr(skb)->h_dest, ETH_ALEN); + } + + /** +diff --git a/net/llc/llc_core.c b/net/llc/llc_core.c +index 6e387aadffce..4f16d9c88350 100644 +--- a/net/llc/llc_core.c ++++ b/net/llc/llc_core.c +@@ -135,22 +135,15 @@ static struct packet_type llc_packet_type __read_mostly = { + .func = llc_rcv, + }; + +-static struct packet_type llc_tr_packet_type __read_mostly = { +- .type = cpu_to_be16(ETH_P_TR_802_2), +- .func = llc_rcv, +-}; +- + static int __init llc_init(void) + { + dev_add_pack(&llc_packet_type); +- dev_add_pack(&llc_tr_packet_type); + return 0; + } + + static void __exit llc_exit(void) + { + dev_remove_pack(&llc_packet_type); +- dev_remove_pack(&llc_tr_packet_type); + } + + module_init(llc_init); +-- +2.43.0 + diff --git a/queue-6.7/llc-make-llc_ui_sendmsg-more-robust-against-bonding-.patch b/queue-6.7/llc-make-llc_ui_sendmsg-more-robust-against-bonding-.patch new file mode 100644 index 00000000000..368bdf1dc23 --- /dev/null +++ b/queue-6.7/llc-make-llc_ui_sendmsg-more-robust-against-bonding-.patch @@ -0,0 +1,154 @@ +From 4655ed34031dacfc8ec060c94def23c23f158ea5 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Thu, 18 Jan 2024 18:36:25 +0000 +Subject: llc: make llc_ui_sendmsg() more robust against bonding changes + +From: Eric 
Dumazet + +[ Upstream commit dad555c816a50c6a6a8a86be1f9177673918c647 ] + +syzbot was able to trick llc_ui_sendmsg(), allocating an skb with no +headroom, but subsequently trying to push 14 bytes of Ethernet header [1] + +Like some others, llc_ui_sendmsg() releases the socket lock before +calling sock_alloc_send_skb(). +Then it acquires it again, but does not redo all the sanity checks +that were performed. + +This fix: + +- Uses LL_RESERVED_SPACE() to reserve space. +- Check all conditions again after socket lock is held again. +- Do not account Ethernet header for mtu limitation. + +[1] + +skbuff: skb_under_panic: text:ffff800088baa334 len:1514 put:14 head:ffff0000c9c37000 data:ffff0000c9c36ff2 tail:0x5dc end:0x6c0 dev:bond0 + + kernel BUG at net/core/skbuff.c:193 ! +Internal error: Oops - BUG: 00000000f2000800 [#1] PREEMPT SMP +Modules linked in: +CPU: 0 PID: 6875 Comm: syz-executor.0 Not tainted 6.7.0-rc8-syzkaller-00101-g0802e17d9aca-dirty #0 +Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 11/17/2023 +pstate: 60400005 (nZCv daif +PAN -UAO -TCO -DIT -SSBS BTYPE=--) + pc : skb_panic net/core/skbuff.c:189 [inline] + pc : skb_under_panic+0x13c/0x140 net/core/skbuff.c:203 + lr : skb_panic net/core/skbuff.c:189 [inline] + lr : skb_under_panic+0x13c/0x140 net/core/skbuff.c:203 +sp : ffff800096f97000 +x29: ffff800096f97010 x28: ffff80008cc8d668 x27: dfff800000000000 +x26: ffff0000cb970c90 x25: 00000000000005dc x24: ffff0000c9c36ff2 +x23: ffff0000c9c37000 x22: 00000000000005ea x21: 00000000000006c0 +x20: 000000000000000e x19: ffff800088baa334 x18: 1fffe000368261ce +x17: ffff80008e4ed000 x16: ffff80008a8310f8 x15: 0000000000000001 +x14: 1ffff00012df2d58 x13: 0000000000000000 x12: 0000000000000000 +x11: 0000000000000001 x10: 0000000000ff0100 x9 : e28a51f1087e8400 +x8 : e28a51f1087e8400 x7 : ffff80008028f8d0 x6 : 0000000000000000 +x5 : 0000000000000001 x4 : 0000000000000001 x3 : ffff800082b78714 +x2 : 0000000000000001 x1 : 0000000100000000 
x0 : 0000000000000089 +Call trace: + skb_panic net/core/skbuff.c:189 [inline] + skb_under_panic+0x13c/0x140 net/core/skbuff.c:203 + skb_push+0xf0/0x108 net/core/skbuff.c:2451 + eth_header+0x44/0x1f8 net/ethernet/eth.c:83 + dev_hard_header include/linux/netdevice.h:3188 [inline] + llc_mac_hdr_init+0x110/0x17c net/llc/llc_output.c:33 + llc_sap_action_send_xid_c+0x170/0x344 net/llc/llc_s_ac.c:85 + llc_exec_sap_trans_actions net/llc/llc_sap.c:153 [inline] + llc_sap_next_state net/llc/llc_sap.c:182 [inline] + llc_sap_state_process+0x1ec/0x774 net/llc/llc_sap.c:209 + llc_build_and_send_xid_pkt+0x12c/0x1c0 net/llc/llc_sap.c:270 + llc_ui_sendmsg+0x7bc/0xb1c net/llc/af_llc.c:997 + sock_sendmsg_nosec net/socket.c:730 [inline] + __sock_sendmsg net/socket.c:745 [inline] + sock_sendmsg+0x194/0x274 net/socket.c:767 + splice_to_socket+0x7cc/0xd58 fs/splice.c:881 + do_splice_from fs/splice.c:933 [inline] + direct_splice_actor+0xe4/0x1c0 fs/splice.c:1142 + splice_direct_to_actor+0x2a0/0x7e4 fs/splice.c:1088 + do_splice_direct+0x20c/0x348 fs/splice.c:1194 + do_sendfile+0x4bc/0xc70 fs/read_write.c:1254 + __do_sys_sendfile64 fs/read_write.c:1322 [inline] + __se_sys_sendfile64 fs/read_write.c:1308 [inline] + __arm64_sys_sendfile64+0x160/0x3b4 fs/read_write.c:1308 + __invoke_syscall arch/arm64/kernel/syscall.c:37 [inline] + invoke_syscall+0x98/0x2b8 arch/arm64/kernel/syscall.c:51 + el0_svc_common+0x130/0x23c arch/arm64/kernel/syscall.c:136 + do_el0_svc+0x48/0x58 arch/arm64/kernel/syscall.c:155 + el0_svc+0x54/0x158 arch/arm64/kernel/entry-common.c:678 + el0t_64_sync_handler+0x84/0xfc arch/arm64/kernel/entry-common.c:696 + el0t_64_sync+0x190/0x194 arch/arm64/kernel/entry.S:595 +Code: aa1803e6 aa1903e7 a90023f5 94792f6a (d4210000) + +Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") +Reported-and-tested-by: syzbot+2a7024e9502df538e8ef@syzkaller.appspotmail.com +Signed-off-by: Eric Dumazet +Reviewed-by: Kuniyuki Iwashima +Link: https://lore.kernel.org/r/20240118183625.4007013-1-edumazet@google.com 
+Signed-off-by: Jakub Kicinski +Signed-off-by: Sasha Levin +--- + net/llc/af_llc.c | 24 ++++++++++++++++-------- + 1 file changed, 16 insertions(+), 8 deletions(-) + +diff --git a/net/llc/af_llc.c b/net/llc/af_llc.c +index 9b06c380866b..20551cfb7da6 100644 +--- a/net/llc/af_llc.c ++++ b/net/llc/af_llc.c +@@ -928,14 +928,15 @@ static int llc_ui_recvmsg(struct socket *sock, struct msghdr *msg, size_t len, + */ + static int llc_ui_sendmsg(struct socket *sock, struct msghdr *msg, size_t len) + { ++ DECLARE_SOCKADDR(struct sockaddr_llc *, addr, msg->msg_name); + struct sock *sk = sock->sk; + struct llc_sock *llc = llc_sk(sk); +- DECLARE_SOCKADDR(struct sockaddr_llc *, addr, msg->msg_name); + int flags = msg->msg_flags; + int noblock = flags & MSG_DONTWAIT; ++ int rc = -EINVAL, copied = 0, hdrlen, hh_len; + struct sk_buff *skb = NULL; ++ struct net_device *dev; + size_t size = 0; +- int rc = -EINVAL, copied = 0, hdrlen; + + dprintk("%s: sending from %02X to %02X\n", __func__, + llc->laddr.lsap, llc->daddr.lsap); +@@ -955,22 +956,29 @@ static int llc_ui_sendmsg(struct socket *sock, struct msghdr *msg, size_t len) + if (rc) + goto out; + } +- hdrlen = llc->dev->hard_header_len + llc_ui_header_len(sk, addr); ++ dev = llc->dev; ++ hh_len = LL_RESERVED_SPACE(dev); ++ hdrlen = llc_ui_header_len(sk, addr); + size = hdrlen + len; +- if (size > llc->dev->mtu) +- size = llc->dev->mtu; ++ size = min_t(size_t, size, READ_ONCE(dev->mtu)); + copied = size - hdrlen; + rc = -EINVAL; + if (copied < 0) + goto out; + release_sock(sk); +- skb = sock_alloc_send_skb(sk, size, noblock, &rc); ++ skb = sock_alloc_send_skb(sk, hh_len + size, noblock, &rc); + lock_sock(sk); + if (!skb) + goto out; +- skb->dev = llc->dev; ++ if (sock_flag(sk, SOCK_ZAPPED) || ++ llc->dev != dev || ++ hdrlen != llc_ui_header_len(sk, addr) || ++ hh_len != LL_RESERVED_SPACE(dev) || ++ size > READ_ONCE(dev->mtu)) ++ goto out; ++ skb->dev = dev; + skb->protocol = llc_proto_type(addr->sllc_arphrd); +- skb_reserve(skb, 
hdrlen); ++ skb_reserve(skb, hh_len + hdrlen); + rc = memcpy_from_msg(skb_put(skb, copied), msg, copied); + if (rc) + goto out; +-- +2.43.0 + diff --git a/queue-6.7/net-fec-fix-the-unhandled-context-fault-from-smmu.patch b/queue-6.7/net-fec-fix-the-unhandled-context-fault-from-smmu.patch new file mode 100644 index 00000000000..e53e81ce2e4 --- /dev/null +++ b/queue-6.7/net-fec-fix-the-unhandled-context-fault-from-smmu.patch @@ -0,0 +1,58 @@ +From 673b7bbd4cec69e76f7c6790c24d1041bcf45ca4 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Tue, 23 Jan 2024 10:51:41 -0600 +Subject: net: fec: fix the unhandled context fault from smmu + +From: Shenwei Wang + +[ Upstream commit 5e344807735023cd3a67c37a1852b849caa42620 ] + +When repeatedly changing the interface link speed using the command below: + +ethtool -s eth0 speed 100 duplex full +ethtool -s eth0 speed 1000 duplex full + +The following errors may sometimes be reported by the ARM SMMU driver: + +[ 5395.035364] fec 5b040000.ethernet eth0: Link is Down +[ 5395.039255] arm-smmu 51400000.iommu: Unhandled context fault: +fsr=0x402, iova=0x00000000, fsynr=0x100001, cbfrsynra=0x852, cb=2 +[ 5398.108460] fec 5b040000.ethernet eth0: Link is Up - 100Mbps/Full - +flow control off + +It is identified that the FEC driver does not properly stop the TX queue +during the link speed transitions, and this results in the invalid virtual +I/O address translations from the SMMU and causes the context faults. 
+ +Fixes: dbc64a8ea231 ("net: fec: move calls to quiesce/resume packet processing out of fec_restart()") +Signed-off-by: Shenwei Wang +Link: https://lore.kernel.org/r/20240123165141.2008104-1-shenwei.wang@nxp.com +Signed-off-by: Paolo Abeni +Signed-off-by: Sasha Levin +--- + drivers/net/ethernet/freescale/fec_main.c | 2 ++ + 1 file changed, 2 insertions(+) + +diff --git a/drivers/net/ethernet/freescale/fec_main.c b/drivers/net/ethernet/freescale/fec_main.c +index e08c7b572497..c107680985e4 100644 +--- a/drivers/net/ethernet/freescale/fec_main.c ++++ b/drivers/net/ethernet/freescale/fec_main.c +@@ -2036,6 +2036,7 @@ static void fec_enet_adjust_link(struct net_device *ndev) + + /* if any of the above changed restart the FEC */ + if (status_change) { ++ netif_stop_queue(ndev); + napi_disable(&fep->napi); + netif_tx_lock_bh(ndev); + fec_restart(ndev); +@@ -2045,6 +2046,7 @@ static void fec_enet_adjust_link(struct net_device *ndev) + } + } else { + if (fep->link) { ++ netif_stop_queue(ndev); + napi_disable(&fep->napi); + netif_tx_lock_bh(ndev); + fec_stop(ndev); +-- +2.43.0 + diff --git a/queue-6.7/net-fix-removing-a-namespace-with-conflicting-altnam.patch b/queue-6.7/net-fix-removing-a-namespace-with-conflicting-altnam.patch new file mode 100644 index 00000000000..79ae3be3b78 --- /dev/null +++ b/queue-6.7/net-fix-removing-a-namespace-with-conflicting-altnam.patch @@ -0,0 +1,81 @@ +From 4638a989e1c2afb54acc2b7dc2890487a0a9c764 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Thu, 18 Jan 2024 16:58:59 -0800 +Subject: net: fix removing a namespace with conflicting altnames +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +From: Jakub Kicinski + +[ Upstream commit d09486a04f5da0a812c26217213b89a3b1acf836 ] + +Mark reports a BUG() when a net namespace is removed. + + kernel BUG at net/core/dev.c:11520! + +Physical interfaces moved outside of init_net get "refunded" +to init_net when that namespace disappears. 
The main interface +name may get overwritten in the process if it would have +conflicted. We need to also discard all conflicting altnames. +Recent fixes addressed ensuring that altnames get moved +with the main interface, which surfaced this problem. + +Reported-by: Марк Коренберг +Link: https://lore.kernel.org/all/CAEmTpZFZ4Sv3KwqFOY2WKDHeZYdi0O7N5H1nTvcGp=SAEavtDg@mail.gmail.com/ +Fixes: 7663d522099e ("net: check for altname conflicts when changing netdev's netns") +Signed-off-by: Jakub Kicinski +Reviewed-by: Eric Dumazet +Reviewed-by: Jiri Pirko +Reviewed-by: Xin Long +Signed-off-by: David S. Miller +Signed-off-by: Sasha Levin +--- + net/core/dev.c | 9 +++++++++ + net/core/dev.h | 3 +++ + 2 files changed, 12 insertions(+) + +diff --git a/net/core/dev.c b/net/core/dev.c +index ad20bebe153f..add22ca0dff9 100644 +--- a/net/core/dev.c ++++ b/net/core/dev.c +@@ -11509,6 +11509,7 @@ static struct pernet_operations __net_initdata netdev_net_ops = { + + static void __net_exit default_device_exit_net(struct net *net) + { ++ struct netdev_name_node *name_node, *tmp; + struct net_device *dev, *aux; + /* + * Push all migratable network devices back to the +@@ -11531,6 +11532,14 @@ static void __net_exit default_device_exit_net(struct net *net) + snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex); + if (netdev_name_in_use(&init_net, fb_name)) + snprintf(fb_name, IFNAMSIZ, "dev%%d"); ++ ++ netdev_for_each_altname_safe(dev, name_node, tmp) ++ if (netdev_name_in_use(&init_net, name_node->name)) { ++ netdev_name_node_del(name_node); ++ synchronize_rcu(); ++ __netdev_name_node_alt_destroy(name_node); ++ } ++ + err = dev_change_net_namespace(dev, &init_net, fb_name); + if (err) { + pr_emerg("%s: failed to move %s to init_net: %d\n", +diff --git a/net/core/dev.h b/net/core/dev.h +index 5aa45f0fd4ae..3f5eb92396b6 100644 +--- a/net/core/dev.h ++++ b/net/core/dev.h +@@ -64,6 +64,9 @@ int dev_change_name(struct net_device *dev, const char *newname); + + #define 
netdev_for_each_altname(dev, namenode) \ + list_for_each_entry((namenode), &(dev)->name_node->list, list) ++#define netdev_for_each_altname_safe(dev, namenode, next) \ ++ list_for_each_entry_safe((namenode), (next), &(dev)->name_node->list, \ ++ list) + + int netdev_name_node_alt_create(struct net_device *dev, const char *name); + int netdev_name_node_alt_destroy(struct net_device *dev, const char *name); +-- +2.43.0 + diff --git a/queue-6.7/net-micrel-fix-ptp-frame-parsing-for-lan8814.patch b/queue-6.7/net-micrel-fix-ptp-frame-parsing-for-lan8814.patch new file mode 100644 index 00000000000..86ea396a5ce --- /dev/null +++ b/queue-6.7/net-micrel-fix-ptp-frame-parsing-for-lan8814.patch @@ -0,0 +1,61 @@ +From 846480519c20c1853b3e360271f8c0182d65e2a0 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Fri, 19 Jan 2024 11:47:50 +0100 +Subject: net: micrel: Fix PTP frame parsing for lan8814 + +From: Horatiu Vultur + +[ Upstream commit aaf632f7ab6dec57bc9329a438f94504fe8034b9 ] + +The HW has the capability to check each frame if it is a PTP frame, +which domain it is, which ptp frame type it is, different ip address in +the frame. And if one of these checks fail then the frame is not +timestamp. Most of these checks were disabled except checking the field +minorVersionPTP inside the PTP header. Meaning that once a partner sends +a frame compliant to 8021AS which has minorVersionPTP set to 1, then the +frame was not timestamp because the HW expected by default a value of 0 +in minorVersionPTP. This is exactly the same issue as on lan8841. +Fix this issue by removing this check so the userspace can decide on this. + +Fixes: ece19502834d ("net: phy: micrel: 1588 support for LAN8814 phy") +Signed-off-by: Horatiu Vultur +Reviewed-by: Maxime Chevallier +Reviewed-by: Divya Koppera +Signed-off-by: David S. 
Miller +Signed-off-by: Sasha Levin +--- + drivers/net/phy/micrel.c | 11 +++++++++++ + 1 file changed, 11 insertions(+) + +diff --git a/drivers/net/phy/micrel.c b/drivers/net/phy/micrel.c +index ce5ad4a82481..858175ca58cd 100644 +--- a/drivers/net/phy/micrel.c ++++ b/drivers/net/phy/micrel.c +@@ -120,6 +120,11 @@ + */ + #define LAN8814_1PPM_FORMAT 17179 + ++#define PTP_RX_VERSION 0x0248 ++#define PTP_TX_VERSION 0x0288 ++#define PTP_MAX_VERSION(x) (((x) & GENMASK(7, 0)) << 8) ++#define PTP_MIN_VERSION(x) ((x) & GENMASK(7, 0)) ++ + #define PTP_RX_MOD 0x024F + #define PTP_RX_MOD_BAD_UDPV4_CHKSUM_FORCE_FCS_DIS_ BIT(3) + #define PTP_RX_TIMESTAMP_EN 0x024D +@@ -3147,6 +3152,12 @@ static void lan8814_ptp_init(struct phy_device *phydev) + lanphy_write_page_reg(phydev, 5, PTP_TX_PARSE_IP_ADDR_EN, 0); + lanphy_write_page_reg(phydev, 5, PTP_RX_PARSE_IP_ADDR_EN, 0); + ++ /* Disable checking for minorVersionPTP field */ ++ lanphy_write_page_reg(phydev, 5, PTP_RX_VERSION, ++ PTP_MAX_VERSION(0xff) | PTP_MIN_VERSION(0x0)); ++ lanphy_write_page_reg(phydev, 5, PTP_TX_VERSION, ++ PTP_MAX_VERSION(0xff) | PTP_MIN_VERSION(0x0)); ++ + skb_queue_head_init(&ptp_priv->tx_queue); + skb_queue_head_init(&ptp_priv->rx_queue); + INIT_LIST_HEAD(&ptp_priv->rx_ts_list); +-- +2.43.0 + diff --git a/queue-6.7/net-mlx5-bridge-fix-multicast-packets-sent-to-uplink.patch b/queue-6.7/net-mlx5-bridge-fix-multicast-packets-sent-to-uplink.patch new file mode 100644 index 00000000000..58fcd4e6ced --- /dev/null +++ b/queue-6.7/net-mlx5-bridge-fix-multicast-packets-sent-to-uplink.patch @@ -0,0 +1,94 @@ +From 72948aec8a138908291cc00d2584c02eb45d3574 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Sat, 30 Dec 2023 22:40:37 +0200 +Subject: net/mlx5: Bridge, fix multicast packets sent to uplink + +From: Moshe Shemesh + +[ Upstream commit ec7cc38ef9f83553102e84c82536971a81630739 ] + +To enable multicast packets which are offloaded in bridge multicast +offload mode to be sent also to uplink, FTE bit 
uplink_hairpin_en should +be set. Add this bit to FTE for the bridge multicast offload rules. + +Fixes: 18c2916cee12 ("net/mlx5: Bridge, snoop igmp/mld packets") +Signed-off-by: Moshe Shemesh +Reviewed-by: Gal Pressman +Signed-off-by: Saeed Mahameed +Signed-off-by: Sasha Levin +--- + drivers/net/ethernet/mellanox/mlx5/core/esw/bridge_mcast.c | 3 +++ + drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c | 2 ++ + include/linux/mlx5/fs.h | 1 + + include/linux/mlx5/mlx5_ifc.h | 2 +- + 4 files changed, 7 insertions(+), 1 deletion(-) + +diff --git a/drivers/net/ethernet/mellanox/mlx5/core/esw/bridge_mcast.c b/drivers/net/ethernet/mellanox/mlx5/core/esw/bridge_mcast.c +index a7ed87e9d842..22dd30cf8033 100644 +--- a/drivers/net/ethernet/mellanox/mlx5/core/esw/bridge_mcast.c ++++ b/drivers/net/ethernet/mellanox/mlx5/core/esw/bridge_mcast.c +@@ -83,6 +83,7 @@ mlx5_esw_bridge_mdb_flow_create(u16 esw_owner_vhca_id, struct mlx5_esw_bridge_md + i++; + } + ++ rule_spec->flow_context.flags |= FLOW_CONTEXT_UPLINK_HAIRPIN_EN; + rule_spec->match_criteria_enable = MLX5_MATCH_OUTER_HEADERS; + dmac_v = MLX5_ADDR_OF(fte_match_param, rule_spec->match_value, outer_headers.dmac_47_16); + ether_addr_copy(dmac_v, entry->key.addr); +@@ -587,6 +588,7 @@ mlx5_esw_bridge_mcast_vlan_flow_create(u16 vlan_proto, struct mlx5_esw_bridge_po + if (!rule_spec) + return ERR_PTR(-ENOMEM); + ++ rule_spec->flow_context.flags |= FLOW_CONTEXT_UPLINK_HAIRPIN_EN; + rule_spec->match_criteria_enable = MLX5_MATCH_OUTER_HEADERS; + + flow_act.action |= MLX5_FLOW_CONTEXT_ACTION_PACKET_REFORMAT; +@@ -662,6 +664,7 @@ mlx5_esw_bridge_mcast_fwd_flow_create(struct mlx5_esw_bridge_port *port) + dest.vport.flags = MLX5_FLOW_DEST_VPORT_VHCA_ID; + dest.vport.vhca_id = port->esw_owner_vhca_id; + } ++ rule_spec->flow_context.flags |= FLOW_CONTEXT_UPLINK_HAIRPIN_EN; + handle = mlx5_add_flow_rules(port->mcast.ft, rule_spec, &flow_act, &dest, 1); + + kvfree(rule_spec); +diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c 
b/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c +index a4b925331661..b29299c49ab3 100644 +--- a/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c ++++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c +@@ -566,6 +566,8 @@ static int mlx5_cmd_set_fte(struct mlx5_core_dev *dev, + fte->flow_context.flow_tag); + MLX5_SET(flow_context, in_flow_context, flow_source, + fte->flow_context.flow_source); ++ MLX5_SET(flow_context, in_flow_context, uplink_hairpin_en, ++ !!(fte->flow_context.flags & FLOW_CONTEXT_UPLINK_HAIRPIN_EN)); + + MLX5_SET(flow_context, in_flow_context, extended_destination, + extended_dest); +diff --git a/include/linux/mlx5/fs.h b/include/linux/mlx5/fs.h +index 6f7725238abc..3fb428ce7d1c 100644 +--- a/include/linux/mlx5/fs.h ++++ b/include/linux/mlx5/fs.h +@@ -132,6 +132,7 @@ struct mlx5_flow_handle; + + enum { + FLOW_CONTEXT_HAS_TAG = BIT(0), ++ FLOW_CONTEXT_UPLINK_HAIRPIN_EN = BIT(1), + }; + + struct mlx5_flow_context { +diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h +index 3f7b664d625b..fb8d26a15df4 100644 +--- a/include/linux/mlx5/mlx5_ifc.h ++++ b/include/linux/mlx5/mlx5_ifc.h +@@ -3557,7 +3557,7 @@ struct mlx5_ifc_flow_context_bits { + u8 action[0x10]; + + u8 extended_destination[0x1]; +- u8 reserved_at_81[0x1]; ++ u8 uplink_hairpin_en[0x1]; + u8 flow_source[0x2]; + u8 encrypt_decrypt_type[0x4]; + u8 destination_list_size[0x18]; +-- +2.43.0 + diff --git a/queue-6.7/net-mlx5-dr-can-t-go-to-uplink-vport-on-rx-rule.patch b/queue-6.7/net-mlx5-dr-can-t-go-to-uplink-vport-on-rx-rule.patch new file mode 100644 index 00000000000..deb11050804 --- /dev/null +++ b/queue-6.7/net-mlx5-dr-can-t-go-to-uplink-vport-on-rx-rule.patch @@ -0,0 +1,51 @@ +From 1dd205824d3c300e4463cf69b20f524b19dbc73d Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Sun, 17 Dec 2023 13:20:36 +0200 +Subject: net/mlx5: DR, Can't go to uplink vport on RX rule + +From: Yevgeny Kliteynik + +[ Upstream commit 5b2a2523eeea5f03d39a9d1ff1bad2e9f8eb98d2 ] + 
+Go-To-Vport action on RX is not allowed when the vport is uplink. +In such case, the packet should be dropped. + +Fixes: 9db810ed2d37 ("net/mlx5: DR, Expose steering action functionality") +Signed-off-by: Yevgeny Kliteynik +Reviewed-by: Erez Shitrit +Signed-off-by: Saeed Mahameed +Signed-off-by: Sasha Levin +--- + .../mellanox/mlx5/core/steering/dr_action.c | 16 +++++++++++----- + 1 file changed, 11 insertions(+), 5 deletions(-) + +diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_action.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_action.c +index 74fc318b5027..d2b65a0ce47b 100644 +--- a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_action.c ++++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_action.c +@@ -874,11 +874,17 @@ int mlx5dr_actions_build_ste_arr(struct mlx5dr_matcher *matcher, + action->sampler->tx_icm_addr; + break; + case DR_ACTION_TYP_VPORT: +- attr.hit_gvmi = action->vport->caps->vhca_gvmi; +- dest_action = action; +- attr.final_icm_addr = rx_rule ? +- action->vport->caps->icm_address_rx : +- action->vport->caps->icm_address_tx; ++ if (unlikely(rx_rule && action->vport->caps->num == MLX5_VPORT_UPLINK)) { ++ /* can't go to uplink on RX rule - dropping instead */ ++ attr.final_icm_addr = nic_dmn->drop_icm_addr; ++ attr.hit_gvmi = nic_dmn->drop_icm_addr >> 48; ++ } else { ++ attr.hit_gvmi = action->vport->caps->vhca_gvmi; ++ dest_action = action; ++ attr.final_icm_addr = rx_rule ? 
++ action->vport->caps->icm_address_rx : ++ action->vport->caps->icm_address_tx; ++ } + break; + case DR_ACTION_TYP_POP_VLAN: + if (!rx_rule && !(dmn->ste_ctx->actions_caps & +-- +2.43.0 + diff --git a/queue-6.7/net-mlx5-dr-use-the-right-gvmi-number-for-drop-actio.patch b/queue-6.7/net-mlx5-dr-use-the-right-gvmi-number-for-drop-actio.patch new file mode 100644 index 00000000000..f3340cf175c --- /dev/null +++ b/queue-6.7/net-mlx5-dr-use-the-right-gvmi-number-for-drop-actio.patch @@ -0,0 +1,39 @@ +From f996a3313ebd84e0efdb7d61b590eec93fc7c6a2 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Sun, 17 Dec 2023 11:24:08 +0200 +Subject: net/mlx5: DR, Use the right GVMI number for drop action + +From: Yevgeny Kliteynik + +[ Upstream commit 5665954293f13642f9c052ead83c1e9d8cff186f ] + +When FW provides ICM addresses for drop RX/TX, the provided capability +is 64 bits that contain its GVMI as well as the ICM address itself. +In case of TX DROP this GVMI is different from the GVMI that the +domain is operating on. + +This patch fixes the action to use these GVMI IDs, as provided by FW. 
+ +Fixes: 9db810ed2d37 ("net/mlx5: DR, Expose steering action functionality") +Signed-off-by: Yevgeny Kliteynik +Signed-off-by: Saeed Mahameed +Signed-off-by: Sasha Levin +--- + drivers/net/ethernet/mellanox/mlx5/core/steering/dr_action.c | 1 + + 1 file changed, 1 insertion(+) + +diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_action.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_action.c +index e3ec559369fa..74fc318b5027 100644 +--- a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_action.c ++++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_action.c +@@ -788,6 +788,7 @@ int mlx5dr_actions_build_ste_arr(struct mlx5dr_matcher *matcher, + switch (action_type) { + case DR_ACTION_TYP_DROP: + attr.final_icm_addr = nic_dmn->drop_icm_addr; ++ attr.hit_gvmi = nic_dmn->drop_icm_addr >> 48; + break; + case DR_ACTION_TYP_FT: + dest_action = action; +-- +2.43.0 + diff --git a/queue-6.7/net-mlx5-fix-a-warn-upon-a-callback-command-failure.patch b/queue-6.7/net-mlx5-fix-a-warn-upon-a-callback-command-failure.patch new file mode 100644 index 00000000000..199824f919e --- /dev/null +++ b/queue-6.7/net-mlx5-fix-a-warn-upon-a-callback-command-failure.patch @@ -0,0 +1,149 @@ +From 35519d6efbd840bc026139266c7df4248df4cc79 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Sun, 31 Dec 2023 15:19:50 +0200 +Subject: net/mlx5: Fix a WARN upon a callback command failure + +From: Yishai Hadas + +[ Upstream commit cc8091587779cfaddb6b29c9e9edb9079a282cad ] + +The below WARN [1] is reported once a callback command failed. + +As a callback runs under an interrupt context, needs to use the IRQ +save/restore variant. 
+ +[1] +DEBUG_LOCKS_WARN_ON(lockdep_hardirq_context()) +WARNING: CPU: 15 PID: 0 at kernel/locking/lockdep.c:4353 + lockdep_hardirqs_on_prepare+0x11b/0x180 +Modules linked in: vhost_net vhost tap mlx5_vfio_pci +vfio_pci vfio_pci_core vfio_iommu_type1 vfio mlx5_vdpa vringh +vhost_iotlb vdpa nfnetlink_cttimeout openvswitch nsh ip6table_mangle +ip6table_nat ip6table_filter ip6_tables iptable_mangle +xt_conntrackxt_MASQUERADE nf_conntrack_netlink nfnetlink +xt_addrtype iptable_nat nf_nat br_netfilter rpcsec_gss_krb5 +auth_rpcgss oid_registry overlay rpcrdma rdma_ucm ib_iser libiscsi +scsi_transport_iscsi rdma_cm iw_cm ib_umad ib_ipoib ib_cm +mlx5_ib ib_uverbs ib_core fuse mlx5_core +CPU: 15 PID: 0 Comm: swapper/15 Tainted: G W 6.7.0-rc4+ #1587 +Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS +rel-1.13.0-0-gf21b5a4aeb02-prebuilt.qemu.org 04/01/2014 +RIP: 0010:lockdep_hardirqs_on_prepare+0x11b/0x180 +Code: 00 5b c3 c3 e8 e6 0d 58 00 85 c0 74 d6 8b 15 f0 c3 + 76 01 85 d2 75 cc 48 c7 c6 04 a5 3b 82 48 c7 c7 f1 + e9 39 82 e8 95 12 f9 ff <0f> 0b 5b c3 e8 bc 0d 58 00 + 85 c0 74 ac 8b 3d c6 c3 76 01 85 ff 75 +RSP: 0018:ffffc900003ecd18 EFLAGS: 00010086 +RAX: 0000000000000000 RBX: 0000000000000000 RCX: 0000000000000027 +RDX: 0000000000000000 RSI: ffff88885fbdb880 RDI: ffff88885fbdb888 +RBP: 00000000ffffff87 R08: 0000000000000000 R09: 0000000000000001 +R10: 0000000000000000 R11: 284e4f5f4e524157 R12: 00000000002c9aa1 +R13: ffff88810aace980 R14: ffff88810aace9b8 R15: 0000000000000003 +FS: 0000000000000000(0000) GS:ffff88885fbc0000(0000) +knlGS:0000000000000000 +CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 +CR2: 00007f731436f4c8 CR3: 000000010aae6001 CR4: 0000000000372eb0 +DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 +DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 +Call Trace: + +? __warn+0x81/0x170 +? lockdep_hardirqs_on_prepare+0x11b/0x180 +? report_bug+0xf8/0x1c0 +? handle_bug+0x3f/0x70 +? exc_invalid_op+0x13/0x60 +? 
asm_exc_invalid_op+0x16/0x20 +? lockdep_hardirqs_on_prepare+0x11b/0x180 +? lockdep_hardirqs_on_prepare+0x11b/0x180 +trace_hardirqs_on+0x4a/0xa0 +raw_spin_unlock_irq+0x24/0x30 +cmd_status_err+0xc0/0x1a0 [mlx5_core] +cmd_status_err+0x1a0/0x1a0 [mlx5_core] +mlx5_cmd_exec_cb_handler+0x24/0x40 [mlx5_core] +mlx5_cmd_comp_handler+0x129/0x4b0 [mlx5_core] +cmd_comp_notifier+0x1a/0x20 [mlx5_core] +notifier_call_chain+0x3e/0xe0 +atomic_notifier_call_chain+0x5f/0x130 +mlx5_eq_async_int+0xe7/0x200 [mlx5_core] +notifier_call_chain+0x3e/0xe0 +atomic_notifier_call_chain+0x5f/0x130 +irq_int_handler+0x11/0x20 [mlx5_core] +__handle_irq_event_percpu+0x99/0x220 +? tick_irq_enter+0x5d/0x80 +handle_irq_event_percpu+0xf/0x40 +handle_irq_event+0x3a/0x60 +handle_edge_irq+0xa2/0x1c0 +__common_interrupt+0x55/0x140 +common_interrupt+0x7d/0xa0 + + +asm_common_interrupt+0x22/0x40 +RIP: 0010:default_idle+0x13/0x20 +Code: c0 08 00 00 00 4d 29 c8 4c 01 c7 4c 29 c2 e9 72 ff +ff ff cc cc cc cc 8b 05 ea 08 25 01 85 c0 7e 07 0f 00 2d 7f b0 26 00 fb +f4 c3 90 66 2e 0f 1f 84 00 00 00 00 00 65 48 8b 04 25 80 d0 02 00 +RSP: 0018:ffffc9000010fec8 EFLAGS: 00000242 +RAX: 0000000000000001 RBX: 000000000000000f RCX: 4000000000000000 +RDX: 0000000000000000 RSI: 0000000000000000 RDI: ffffffff811c410c +RBP: ffffffff829478c0 R08: 0000000000000001 R09: 0000000000000001 +R10: 0000000000000000 R11: 0000000000000000 R12: 0000000000000000 +R13: 0000000000000000 R14: 0000000000000000 R15: 0000000000000000 +? 
do_idle+0x1ec/0x210 +default_idle_call+0x6c/0x90 +do_idle+0x1ec/0x210 +cpu_startup_entry+0x26/0x30 +start_secondary+0x11b/0x150 +secondary_startup_64_no_verify+0x165/0x16b + +irq event stamp: 833284 +hardirqs last enabled at (833283): [] +do_idle+0x1ec/0x210 +hardirqs last disabled at (833284): [] +common_interrupt+0xf/0xa0 +softirqs last enabled at (833224): [] +__do_softirq+0x2bf/0x40e +softirqs last disabled at (833177): [] +irq_exit_rcu+0x7f/0xa0 + +Fixes: 34f46ae0d4b3 ("net/mlx5: Add command failures data to debugfs") +Signed-off-by: Yishai Hadas +Reviewed-by: Moshe Shemesh +Signed-off-by: Saeed Mahameed +Signed-off-by: Sasha Levin +--- + drivers/net/ethernet/mellanox/mlx5/core/cmd.c | 5 +++-- + 1 file changed, 3 insertions(+), 2 deletions(-) + +diff --git a/drivers/net/ethernet/mellanox/mlx5/core/cmd.c b/drivers/net/ethernet/mellanox/mlx5/core/cmd.c +index a7b1f9686c09..4957412ff1f6 100644 +--- a/drivers/net/ethernet/mellanox/mlx5/core/cmd.c ++++ b/drivers/net/ethernet/mellanox/mlx5/core/cmd.c +@@ -1923,6 +1923,7 @@ static void cmd_status_log(struct mlx5_core_dev *dev, u16 opcode, u8 status, + { + const char *namep = mlx5_command_str(opcode); + struct mlx5_cmd_stats *stats; ++ unsigned long flags; + + if (!err || !(strcmp(namep, "unknown command opcode"))) + return; +@@ -1930,7 +1931,7 @@ static void cmd_status_log(struct mlx5_core_dev *dev, u16 opcode, u8 status, + stats = xa_load(&dev->cmd.stats, opcode); + if (!stats) + return; +- spin_lock_irq(&stats->lock); ++ spin_lock_irqsave(&stats->lock, flags); + stats->failed++; + if (err < 0) + stats->last_failed_errno = -err; +@@ -1939,7 +1940,7 @@ static void cmd_status_log(struct mlx5_core_dev *dev, u16 opcode, u8 status, + stats->last_failed_mbox_status = status; + stats->last_failed_syndrome = syndrome; + } +- spin_unlock_irq(&stats->lock); ++ spin_unlock_irqrestore(&stats->lock, flags); + } + + /* preserve -EREMOTEIO for outbox.status != OK, otherwise return err as is */ +-- +2.43.0 + diff --git 
a/queue-6.7/net-mlx5-use-mlx5-device-constant-for-selecting-cq-p.patch b/queue-6.7/net-mlx5-use-mlx5-device-constant-for-selecting-cq-p.patch new file mode 100644 index 00000000000..5b8dd962a2e --- /dev/null +++ b/queue-6.7/net-mlx5-use-mlx5-device-constant-for-selecting-cq-p.patch @@ -0,0 +1,39 @@ +From fee9cf89467830f61dd17acaae0daeed512142b9 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Tue, 28 Nov 2023 14:01:54 -0800 +Subject: net/mlx5: Use mlx5 device constant for selecting CQ period mode for + ASO + +From: Rahul Rameshbabu + +[ Upstream commit 20cbf8cbb827094197f3b17db60d71449415db1e ] + +mlx5 devices have specific constants for choosing the CQ period mode. These +constants do not have to match the constants used by the kernel software +API for DIM period mode selection. + +Fixes: cdd04f4d4d71 ("net/mlx5: Add support to create SQ and CQ for ASO") +Signed-off-by: Rahul Rameshbabu +Reviewed-by: Jianbo Liu +Signed-off-by: Saeed Mahameed +Signed-off-by: Sasha Levin +--- + drivers/net/ethernet/mellanox/mlx5/core/lib/aso.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lib/aso.c b/drivers/net/ethernet/mellanox/mlx5/core/lib/aso.c +index 40c7be124041..58bd749b5e4d 100644 +--- a/drivers/net/ethernet/mellanox/mlx5/core/lib/aso.c ++++ b/drivers/net/ethernet/mellanox/mlx5/core/lib/aso.c +@@ -98,7 +98,7 @@ static int create_aso_cq(struct mlx5_aso_cq *cq, void *cqc_data) + mlx5_fill_page_frag_array(&cq->wq_ctrl.buf, + (__be64 *)MLX5_ADDR_OF(create_cq_in, in, pas)); + +- MLX5_SET(cqc, cqc, cq_period_mode, DIM_CQ_PERIOD_MODE_START_FROM_EQE); ++ MLX5_SET(cqc, cqc, cq_period_mode, MLX5_CQ_PERIOD_MODE_START_FROM_EQE); + MLX5_SET(cqc, cqc, c_eqn_or_apu_element, eqn); + MLX5_SET(cqc, cqc, uar_page, mdev->priv.uar->index); + MLX5_SET(cqc, cqc, log_page_size, cq->wq_ctrl.buf.page_shift - +-- +2.43.0 + diff --git a/queue-6.7/net-mlx5e-allow-software-parsing-when-ipsec-crypto-i.patch 
b/queue-6.7/net-mlx5e-allow-software-parsing-when-ipsec-crypto-i.patch new file mode 100644 index 00000000000..1ef83f29a43 --- /dev/null +++ b/queue-6.7/net-mlx5e-allow-software-parsing-when-ipsec-crypto-i.patch @@ -0,0 +1,39 @@ +From 67398c780b74733917ea136778b926b754cfccd1 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Tue, 12 Dec 2023 13:52:55 +0200 +Subject: net/mlx5e: Allow software parsing when IPsec crypto is enabled + +From: Leon Romanovsky + +[ Upstream commit 20f5468a7988dedd94a57ba8acd65ebda6a59723 ] + +All ConnectX devices have software parsing capability enabled, but it is +more correct to set allow_swp only if capability exists, which for IPsec +means that crypto offload is supported. + +Fixes: 2451da081a34 ("net/mlx5: Unify device IPsec capabilities check") +Signed-off-by: Leon Romanovsky +Signed-off-by: Saeed Mahameed +Signed-off-by: Sasha Levin +--- + drivers/net/ethernet/mellanox/mlx5/core/en/params.c | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/params.c b/drivers/net/ethernet/mellanox/mlx5/core/en/params.c +index e097f336e1c4..30507b7c2fb1 100644 +--- a/drivers/net/ethernet/mellanox/mlx5/core/en/params.c ++++ b/drivers/net/ethernet/mellanox/mlx5/core/en/params.c +@@ -1062,8 +1062,8 @@ void mlx5e_build_sq_param(struct mlx5_core_dev *mdev, + void *wq = MLX5_ADDR_OF(sqc, sqc, wq); + bool allow_swp; + +- allow_swp = +- mlx5_geneve_tx_allowed(mdev) || !!mlx5_ipsec_device_caps(mdev); ++ allow_swp = mlx5_geneve_tx_allowed(mdev) || ++ (mlx5_ipsec_device_caps(mdev) & MLX5_IPSEC_CAP_CRYPTO); + mlx5e_build_sq_param_common(mdev, param); + MLX5_SET(wq, wq, log_wq_sz, params->log_sq_size); + MLX5_SET(sqc, sqc, allow_swp, allow_swp); +-- +2.43.0 + diff --git a/queue-6.7/net-mlx5e-fix-a-double-free-in-arfs_create_groups.patch b/queue-6.7/net-mlx5e-fix-a-double-free-in-arfs_create_groups.patch new file mode 100644 index 00000000000..03470c1aa78 --- /dev/null +++ 
b/queue-6.7/net-mlx5e-fix-a-double-free-in-arfs_create_groups.patch @@ -0,0 +1,100 @@ +From fcb41c119b911abd8b24a93d58e8f373a36e3784 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Wed, 17 Jan 2024 15:17:36 +0800 +Subject: net/mlx5e: fix a double-free in arfs_create_groups + +From: Zhipeng Lu + +[ Upstream commit 3c6d5189246f590e4e1f167991558bdb72a4738b ] + +When `in` allocated by kvzalloc fails, arfs_create_groups will free +ft->g and return an error. However, arfs_create_table, the only caller of +arfs_create_groups, will hold this error and call to +mlx5e_destroy_flow_table, in which the ft->g will be freed again. + +Fixes: 1cabe6b0965e ("net/mlx5e: Create aRFS flow tables") +Signed-off-by: Zhipeng Lu +Reviewed-by: Simon Horman +Signed-off-by: Saeed Mahameed +Signed-off-by: Sasha Levin +--- + .../net/ethernet/mellanox/mlx5/core/en_arfs.c | 26 +++++++++++-------- + 1 file changed, 15 insertions(+), 11 deletions(-) + +diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_arfs.c b/drivers/net/ethernet/mellanox/mlx5/core/en_arfs.c +index bb7f86c993e5..e66f486faafe 100644 +--- a/drivers/net/ethernet/mellanox/mlx5/core/en_arfs.c ++++ b/drivers/net/ethernet/mellanox/mlx5/core/en_arfs.c +@@ -254,11 +254,13 @@ static int arfs_create_groups(struct mlx5e_flow_table *ft, + + ft->g = kcalloc(MLX5E_ARFS_NUM_GROUPS, + sizeof(*ft->g), GFP_KERNEL); +- in = kvzalloc(inlen, GFP_KERNEL); +- if (!in || !ft->g) { +- kfree(ft->g); +- kvfree(in); ++ if (!ft->g) + return -ENOMEM; ++ ++ in = kvzalloc(inlen, GFP_KERNEL); ++ if (!in) { ++ err = -ENOMEM; ++ goto err_free_g; + } + + mc = MLX5_ADDR_OF(create_flow_group_in, in, match_criteria); +@@ -278,7 +280,7 @@ static int arfs_create_groups(struct mlx5e_flow_table *ft, + break; + default: + err = -EINVAL; +- goto out; ++ goto err_free_in; + } + + switch (type) { +@@ -300,7 +302,7 @@ static int arfs_create_groups(struct mlx5e_flow_table *ft, + break; + default: + err = -EINVAL; +- goto out; ++ goto err_free_in; + } + + MLX5_SET_CFG(in, 
match_criteria_enable, MLX5_MATCH_OUTER_HEADERS); +@@ -309,7 +311,7 @@ static int arfs_create_groups(struct mlx5e_flow_table *ft, + MLX5_SET_CFG(in, end_flow_index, ix - 1); + ft->g[ft->num_groups] = mlx5_create_flow_group(ft->t, in); + if (IS_ERR(ft->g[ft->num_groups])) +- goto err; ++ goto err_clean_group; + ft->num_groups++; + + memset(in, 0, inlen); +@@ -318,18 +320,20 @@ static int arfs_create_groups(struct mlx5e_flow_table *ft, + MLX5_SET_CFG(in, end_flow_index, ix - 1); + ft->g[ft->num_groups] = mlx5_create_flow_group(ft->t, in); + if (IS_ERR(ft->g[ft->num_groups])) +- goto err; ++ goto err_clean_group; + ft->num_groups++; + + kvfree(in); + return 0; + +-err: ++err_clean_group: + err = PTR_ERR(ft->g[ft->num_groups]); + ft->g[ft->num_groups] = NULL; +-out: ++err_free_in: + kvfree(in); +- ++err_free_g: ++ kfree(ft->g); ++ ft->g = NULL; + return err; + } + +-- +2.43.0 + diff --git a/queue-6.7/net-mlx5e-fix-a-potential-double-free-in-fs_any_crea.patch b/queue-6.7/net-mlx5e-fix-a-potential-double-free-in-fs_any_crea.patch new file mode 100644 index 00000000000..1488dab148c --- /dev/null +++ b/queue-6.7/net-mlx5e-fix-a-potential-double-free-in-fs_any_crea.patch @@ -0,0 +1,40 @@ +From a4f214befe9daf31c7e22b11d18352c1e952afd2 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Tue, 28 Nov 2023 17:29:01 +0800 +Subject: net/mlx5e: fix a potential double-free in fs_any_create_groups + +From: Dinghao Liu + +[ Upstream commit aef855df7e1bbd5aa4484851561211500b22707e ] + +When kcalloc() for ft->g succeeds but kvzalloc() for in fails, +fs_any_create_groups() will free ft->g. However, its caller +fs_any_create_table() will free ft->g again through calling +mlx5e_destroy_flow_table(), which will lead to a double-free. +Fix this by setting ft->g to NULL in fs_any_create_groups(). 
+ +Fixes: 0f575c20bf06 ("net/mlx5e: Introduce Flow Steering ANY API") +Signed-off-by: Dinghao Liu +Reviewed-by: Tariq Toukan +Reviewed-by: Simon Horman +Signed-off-by: Saeed Mahameed +Signed-off-by: Sasha Levin +--- + drivers/net/ethernet/mellanox/mlx5/core/en/fs_tt_redirect.c | 1 + + 1 file changed, 1 insertion(+) + +diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/fs_tt_redirect.c b/drivers/net/ethernet/mellanox/mlx5/core/en/fs_tt_redirect.c +index e1283531e0b8..671adbad0a40 100644 +--- a/drivers/net/ethernet/mellanox/mlx5/core/en/fs_tt_redirect.c ++++ b/drivers/net/ethernet/mellanox/mlx5/core/en/fs_tt_redirect.c +@@ -436,6 +436,7 @@ static int fs_any_create_groups(struct mlx5e_flow_table *ft) + in = kvzalloc(inlen, GFP_KERNEL); + if (!in || !ft->g) { + kfree(ft->g); ++ ft->g = NULL; + kvfree(in); + return -ENOMEM; + } +-- +2.43.0 + diff --git a/queue-6.7/net-mlx5e-fix-inconsistent-hairpin-rqt-sizes.patch b/queue-6.7/net-mlx5e-fix-inconsistent-hairpin-rqt-sizes.patch new file mode 100644 index 00000000000..837a32a0787 --- /dev/null +++ b/queue-6.7/net-mlx5e-fix-inconsistent-hairpin-rqt-sizes.patch @@ -0,0 +1,44 @@ +From 23db336dc559a45fa3dfa9a231d8d09a0fb1c25a Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Sun, 5 Nov 2023 17:09:46 +0200 +Subject: net/mlx5e: Fix inconsistent hairpin RQT sizes + +From: Tariq Toukan + +[ Upstream commit c20767fd45e82d64352db82d4fc8d281a43e4783 ] + +The processing of traffic in hairpin queues occurs in HW/FW and does not +involve the cpus, hence the upper bound on max num channels does not +apply to them. Using this bound for the hairpin RQT max_table_size is +wrong. It could be too small, and cause the error below [1]. As the +RQT size provided on init does not get modified later, use the same +value for both actual and max table sizes. 
+ +[1] +mlx5_core 0000:08:00.1: mlx5_cmd_out_err:805:(pid 1200): CREATE_RQT(0x916) op_mod(0x0) failed, status bad parameter(0x3), syndrome (0x538faf), err(-22) + +Fixes: 74a8dadac17e ("net/mlx5e: Preparations for supporting larger number of channels") +Signed-off-by: Tariq Toukan +Reviewed-by: Gal Pressman +Signed-off-by: Saeed Mahameed +Signed-off-by: Sasha Levin +--- + drivers/net/ethernet/mellanox/mlx5/core/en_tc.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c +index 96af9e2ab1d8..b61d82f08e65 100644 +--- a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c ++++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c +@@ -761,7 +761,7 @@ static int mlx5e_hairpin_create_indirect_rqt(struct mlx5e_hairpin *hp) + + err = mlx5e_rss_params_indir_init(&indir, mdev, + mlx5e_rqt_size(mdev, hp->num_channels), +- mlx5e_rqt_size(mdev, priv->max_nch)); ++ mlx5e_rqt_size(mdev, hp->num_channels)); + if (err) + return err; + +-- +2.43.0 + diff --git a/queue-6.7/net-mlx5e-fix-operation-precedence-bug-in-port-times.patch b/queue-6.7/net-mlx5e-fix-operation-precedence-bug-in-port-times.patch new file mode 100644 index 00000000000..b79cc389e7b --- /dev/null +++ b/queue-6.7/net-mlx5e-fix-operation-precedence-bug-in-port-times.patch @@ -0,0 +1,41 @@ +From fa8499340268fb830497b9a417e690b2a2ee1e41 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Wed, 22 Nov 2023 18:32:11 -0800 +Subject: net/mlx5e: Fix operation precedence bug in port timestamping + napi_poll context + +From: Rahul Rameshbabu + +[ Upstream commit 3876638b2c7ebb2c9d181de1191db0de8cac143a ] + +Indirection (*) is of lower precedence than postfix increment (++). Logic +in napi_poll context would cause an out-of-bound read by first incrementing +the pointer address by byte address space and then dereferencing the value. +Rather, the intended logic was to dereference first and then increment the +underlying value.
+ +Fixes: 92214be5979c ("net/mlx5e: Update doorbell for port timestamping CQ before the software counter") +Signed-off-by: Rahul Rameshbabu +Reviewed-by: Tariq Toukan +Signed-off-by: Saeed Mahameed +Signed-off-by: Sasha Levin +--- + drivers/net/ethernet/mellanox/mlx5/core/en/ptp.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/ptp.c b/drivers/net/ethernet/mellanox/mlx5/core/en/ptp.c +index af3928eddafd..803035d4e597 100644 +--- a/drivers/net/ethernet/mellanox/mlx5/core/en/ptp.c ++++ b/drivers/net/ethernet/mellanox/mlx5/core/en/ptp.c +@@ -213,7 +213,7 @@ static void mlx5e_ptp_handle_ts_cqe(struct mlx5e_ptpsq *ptpsq, + mlx5e_ptpsq_mark_ts_cqes_undelivered(ptpsq, hwtstamp); + out: + napi_consume_skb(skb, budget); +- md_buff[*md_buff_sz++] = metadata_id; ++ md_buff[(*md_buff_sz)++] = metadata_id; + if (unlikely(mlx5e_ptp_metadata_map_unhealthy(&ptpsq->metadata_map)) && + !test_and_set_bit(MLX5E_SQ_STATE_RECOVERING, &sq->state)) + queue_work(ptpsq->txqsq.priv->wq, &ptpsq->report_unhealthy_work); +-- +2.43.0 + diff --git a/queue-6.7/net-mlx5e-fix-peer-flow-lists-handling.patch b/queue-6.7/net-mlx5e-fix-peer-flow-lists-handling.patch new file mode 100644 index 00000000000..afb040b80b9 --- /dev/null +++ b/queue-6.7/net-mlx5e-fix-peer-flow-lists-handling.patch @@ -0,0 +1,126 @@ +From 77963fd716035e94aa889dcddb5f193a5ac0f276 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Fri, 10 Nov 2023 11:10:22 +0100 +Subject: net/mlx5e: Fix peer flow lists handling + +From: Vlad Buslov + +[ Upstream commit d76fdd31f953ac5046555171620f2562715e9b71 ] + +The cited change refactored mlx5e_tc_del_fdb_peer_flow() to only clear DUP +flag when list of peer flows has become empty. 
However, if any concurrent +user holds a reference to a peer flow (for example, the neighbor update +workqueue task is updating peer flow's parent encap entry concurrently), +then the flow will not be removed from the peer list and, consecutively, +DUP flag will remain set. Since mlx5e_tc_del_fdb_peers_flow() calls +mlx5e_tc_del_fdb_peer_flow() for every possible peer index the algorithm +will try to remove the flow from eswitch instances that it has never peered +with causing either NULL pointer dereference when trying to remove the flow +peer list head of peer_index that was never initialized or a warning if the +list debug config is enabled[0]. + +Fix the issue by always removing the peer flow from the list even when not +releasing the last reference to it. + +[0]: + +[ 3102.985806] ------------[ cut here ]------------ +[ 3102.986223] list_del corruption, ffff888139110698->next is NULL +[ 3102.986757] WARNING: CPU: 2 PID: 22109 at lib/list_debug.c:53 __list_del_entry_valid_or_report+0x4f/0xc0 +[ 3102.987561] Modules linked in: act_ct nf_flow_table bonding act_tunnel_key act_mirred act_skbedit vxlan cls_matchall nfnetlink_cttimeout act_gact cls_flower sch_ingress mlx5_vdpa vringh vhost_iotlb vdpa openvswitch nsh xt_MASQUERADE nf_conntrack_netlink nfnetlink iptable_nat xt_addrtype xt_conntrack nf_nat br_netfilter rpcsec_gss_krb5 auth_rpcg +ss oid_registry overlay rpcrdma rdma_ucm ib_iser libiscsi scsi_transport_iscsi ib_umad rdma_cm ib_ipoib iw_cm ib_cm mlx5_ib ib_uverbs ib_core mlx5_core [last unloaded: bonding] +[ 3102.991113] CPU: 2 PID: 22109 Comm: revalidator28 Not tainted 6.6.0-rc6+ #3 +[ 3102.991695] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS rel-1.13.0-0-gf21b5a4aeb02-prebuilt.qemu.org 04/01/2014 +[ 3102.992605] RIP: 0010:__list_del_entry_valid_or_report+0x4f/0xc0 +[ 3102.993122] Code: 39 c2 74 56 48 8b 32 48 39 fe 75 62 48 8b 51 08 48 39 f2 75 73 b8 01 00 00 00 c3 48 89 fe 48 c7 c7 48 fd 0a 82 e8 41 0b ad ff <0f> 0b 31 c0 c3 48 89 fe 48 c7 
c7 70 fd 0a 82 e8 2d 0b ad ff 0f 0b +[ 3102.994615] RSP: 0018:ffff8881383e7710 EFLAGS: 00010286 +[ 3102.995078] RAX: 0000000000000000 RBX: 0000000000000002 RCX: 0000000000000000 +[ 3102.995670] RDX: 0000000000000001 RSI: ffff88885f89b640 RDI: ffff88885f89b640 +[ 3102.997188] DEL flow 00000000be367878 on port 0 +[ 3102.998594] RBP: dead000000000122 R08: 0000000000000000 R09: c0000000ffffdfff +[ 3102.999604] R10: 0000000000000008 R11: ffff8881383e7598 R12: dead000000000100 +[ 3103.000198] R13: 0000000000000002 R14: ffff888139110000 R15: ffff888101901240 +[ 3103.000790] FS: 00007f424cde4700(0000) GS:ffff88885f880000(0000) knlGS:0000000000000000 +[ 3103.001486] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 +[ 3103.001986] CR2: 00007fd42e8dcb70 CR3: 000000011e68a003 CR4: 0000000000370ea0 +[ 3103.002596] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000 +[ 3103.003190] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400 +[ 3103.003787] Call Trace: +[ 3103.004055] +[ 3103.004297] ? __warn+0x7d/0x130 +[ 3103.004623] ? __list_del_entry_valid_or_report+0x4f/0xc0 +[ 3103.005094] ? report_bug+0xf1/0x1c0 +[ 3103.005439] ? console_unlock+0x4a/0xd0 +[ 3103.005806] ? handle_bug+0x3f/0x70 +[ 3103.006149] ? exc_invalid_op+0x13/0x60 +[ 3103.006531] ? asm_exc_invalid_op+0x16/0x20 +[ 3103.007430] ? __list_del_entry_valid_or_report+0x4f/0xc0 +[ 3103.007910] mlx5e_tc_del_fdb_peers_flow+0xcf/0x240 [mlx5_core] +[ 3103.008463] mlx5e_tc_del_flow+0x46/0x270 [mlx5_core] +[ 3103.008944] mlx5e_flow_put+0x26/0x50 [mlx5_core] +[ 3103.009401] mlx5e_delete_flower+0x25f/0x380 [mlx5_core] +[ 3103.009901] tc_setup_cb_destroy+0xab/0x180 +[ 3103.010292] fl_hw_destroy_filter+0x99/0xc0 [cls_flower] +[ 3103.010779] __fl_delete+0x2d4/0x2f0 [cls_flower] +[ 3103.011207] fl_delete+0x36/0x80 [cls_flower] +[ 3103.011614] tc_del_tfilter+0x56f/0x750 +[ 3103.011982] rtnetlink_rcv_msg+0xff/0x3a0 +[ 3103.012362] ? netlink_ack+0x1c7/0x4e0 +[ 3103.012719] ? 
rtnl_calcit.isra.44+0x130/0x130 +[ 3103.013134] netlink_rcv_skb+0x54/0x100 +[ 3103.013533] netlink_unicast+0x1ca/0x2b0 +[ 3103.013902] netlink_sendmsg+0x361/0x4d0 +[ 3103.014269] __sock_sendmsg+0x38/0x60 +[ 3103.014643] ____sys_sendmsg+0x1f2/0x200 +[ 3103.015018] ? copy_msghdr_from_user+0x72/0xa0 +[ 3103.015265] ___sys_sendmsg+0x87/0xd0 +[ 3103.016608] ? copy_msghdr_from_user+0x72/0xa0 +[ 3103.017014] ? ___sys_recvmsg+0x9b/0xd0 +[ 3103.017381] ? ttwu_do_activate.isra.137+0x58/0x180 +[ 3103.017821] ? wake_up_q+0x49/0x90 +[ 3103.018157] ? futex_wake+0x137/0x160 +[ 3103.018521] ? __sys_sendmsg+0x51/0x90 +[ 3103.018882] __sys_sendmsg+0x51/0x90 +[ 3103.019230] ? exit_to_user_mode_prepare+0x56/0x130 +[ 3103.019670] do_syscall_64+0x3c/0x80 +[ 3103.020017] entry_SYSCALL_64_after_hwframe+0x46/0xb0 +[ 3103.020469] RIP: 0033:0x7f4254811ef4 +[ 3103.020816] Code: 89 f3 48 83 ec 10 48 89 7c 24 08 48 89 14 24 e8 42 eb ff ff 48 8b 14 24 41 89 c0 48 89 de 48 8b 7c 24 08 b8 2e 00 00 00 0f 05 <48> 3d 00 f0 ff ff 77 30 44 89 c7 48 89 04 24 e8 78 eb ff ff 48 8b +[ 3103.022290] RSP: 002b:00007f424cdd9480 EFLAGS: 00000293 ORIG_RAX: 000000000000002e +[ 3103.022970] RAX: ffffffffffffffda RBX: 00007f424cdd9510 RCX: 00007f4254811ef4 +[ 3103.023564] RDX: 0000000000000000 RSI: 00007f424cdd9510 RDI: 0000000000000012 +[ 3103.024158] RBP: 00007f424cdda238 R08: 0000000000000000 R09: 00007f41d801a4b0 +[ 3103.024748] R10: 0000000000000000 R11: 0000000000000293 R12: 0000000000000001 +[ 3103.025341] R13: 00007f424cdd9510 R14: 00007f424cdda240 R15: 00007f424cdd99a0 +[ 3103.025931] +[ 3103.026182] ---[ end trace 0000000000000000 ]--- +[ 3103.027033] ------------[ cut here ]------------ + +Fixes: 9be6c21fdcf8 ("net/mlx5e: Handle offloads flows per peer") +Signed-off-by: Vlad Buslov +Reviewed-by: Mark Bloch +Signed-off-by: Saeed Mahameed +Signed-off-by: Sasha Levin +--- + drivers/net/ethernet/mellanox/mlx5/core/en_tc.c | 3 ++- + 1 file changed, 2 insertions(+), 1 deletion(-) + +diff --git 
a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c +index b61d82f08e65..404dd1d9b28b 100644 +--- a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c ++++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c +@@ -2014,9 +2014,10 @@ static void mlx5e_tc_del_fdb_peer_flow(struct mlx5e_tc_flow *flow, + list_for_each_entry_safe(peer_flow, tmp, &flow->peer_flows, peer_flows) { + if (peer_index != mlx5_get_dev_index(peer_flow->priv->mdev)) + continue; ++ ++ list_del(&peer_flow->peer_flows); + if (refcount_dec_and_test(&peer_flow->refcnt)) { + mlx5e_tc_del_fdb_flow(peer_flow->priv, peer_flow); +- list_del(&peer_flow->peer_flows); + kfree(peer_flow); + } + } +-- +2.43.0 + diff --git a/queue-6.7/net-mlx5e-ignore-ipsec-replay-window-values-on-sende.patch b/queue-6.7/net-mlx5e-ignore-ipsec-replay-window-values-on-sende.patch new file mode 100644 index 00000000000..add5ca7d25b --- /dev/null +++ b/queue-6.7/net-mlx5e-ignore-ipsec-replay-window-values-on-sende.patch @@ -0,0 +1,68 @@ +From 87127fb62eda0f2490d2a3d442cd9fdbdbf2732a Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Sun, 26 Nov 2023 11:08:10 +0200 +Subject: net/mlx5e: Ignore IPsec replay window values on sender side + +From: Leon Romanovsky + +[ Upstream commit 315a597f9bcfe7fe9980985031413457bee95510 ] + +XFRM stack doesn't prevent users from configuring replay window +in TX side and strongswan sets replay_window to be 1. It causes +failures in validation logic when trying to offload the SA. + +Replay window is not relevant in TX side and should be ignored.
+ +Fixes: cded6d80129b ("net/mlx5e: Store replay window in XFRM attributes") +Signed-off-by: Aya Levin +Signed-off-by: Leon Romanovsky +Signed-off-by: Saeed Mahameed +Signed-off-by: Sasha Levin +--- + .../net/ethernet/mellanox/mlx5/core/en_accel/ipsec.c | 10 ++++++++-- + 1 file changed, 8 insertions(+), 2 deletions(-) + +diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec.c b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec.c +index 161c5190c236..05612d9c6080 100644 +--- a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec.c ++++ b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec.c +@@ -336,12 +336,17 @@ void mlx5e_ipsec_build_accel_xfrm_attrs(struct mlx5e_ipsec_sa_entry *sa_entry, + /* iv len */ + aes_gcm->icv_len = x->aead->alg_icv_len; + ++ attrs->dir = x->xso.dir; ++ + /* esn */ + if (x->props.flags & XFRM_STATE_ESN) { + attrs->replay_esn.trigger = true; + attrs->replay_esn.esn = sa_entry->esn_state.esn; + attrs->replay_esn.esn_msb = sa_entry->esn_state.esn_msb; + attrs->replay_esn.overlap = sa_entry->esn_state.overlap; ++ if (attrs->dir == XFRM_DEV_OFFLOAD_OUT) ++ goto skip_replay_window; ++ + switch (x->replay_esn->replay_window) { + case 32: + attrs->replay_esn.replay_window = +@@ -365,7 +370,7 @@ void mlx5e_ipsec_build_accel_xfrm_attrs(struct mlx5e_ipsec_sa_entry *sa_entry, + } + } + +- attrs->dir = x->xso.dir; ++skip_replay_window: + /* spi */ + attrs->spi = be32_to_cpu(x->id.spi); + +@@ -501,7 +506,8 @@ static int mlx5e_xfrm_validate_state(struct mlx5_core_dev *mdev, + return -EINVAL; + } + +- if (x->replay_esn && x->replay_esn->replay_window != 32 && ++ if (x->replay_esn && x->xso.dir == XFRM_DEV_OFFLOAD_IN && ++ x->replay_esn->replay_window != 32 && + x->replay_esn->replay_window != 64 && + x->replay_esn->replay_window != 128 && + x->replay_esn->replay_window != 256) { +-- +2.43.0 + diff --git a/queue-6.7/net-mvpp2-clear-bm-pool-before-initialization.patch b/queue-6.7/net-mvpp2-clear-bm-pool-before-initialization.patch 
new file mode 100644 index 00000000000..08f104fa2a6 --- /dev/null +++ b/queue-6.7/net-mvpp2-clear-bm-pool-before-initialization.patch @@ -0,0 +1,77 @@ +From ab880a9dc6e8c24858ed3edae56d7d323782c2f6 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Thu, 18 Jan 2024 19:59:14 -0800 +Subject: net: mvpp2: clear BM pool before initialization + +From: Jenishkumar Maheshbhai Patel + +[ Upstream commit 9f538b415db862e74b8c5d3abbccfc1b2b6caa38 ] + +Register value persist after booting the kernel using +kexec which results in kernel panic. Thus clear the +BM pool registers before initialisation to fix the issue. + +Fixes: 3f518509dedc ("ethernet: Add new driver for Marvell Armada 375 network unit") +Signed-off-by: Jenishkumar Maheshbhai Patel +Reviewed-by: Maxime Chevallier +Link: https://lore.kernel.org/r/20240119035914.2595665-1-jpatel2@marvell.com +Signed-off-by: Jakub Kicinski +Signed-off-by: Sasha Levin +--- + .../net/ethernet/marvell/mvpp2/mvpp2_main.c | 27 ++++++++++++++++++- + 1 file changed, 26 insertions(+), 1 deletion(-) + +diff --git a/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c b/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c +index 93137606869e..065f07392c96 100644 +--- a/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c ++++ b/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c +@@ -614,12 +614,38 @@ static void mvpp23_bm_set_8pool_mode(struct mvpp2 *priv) + mvpp2_write(priv, MVPP22_BM_POOL_BASE_ADDR_HIGH_REG, val); + } + ++/* Cleanup pool before actual initialization in the OS */ ++static void mvpp2_bm_pool_cleanup(struct mvpp2 *priv, int pool_id) ++{ ++ unsigned int thread = mvpp2_cpu_to_thread(priv, get_cpu()); ++ u32 val; ++ int i; ++ ++ /* Drain the BM from all possible residues left by firmware */ ++ for (i = 0; i < MVPP2_BM_POOL_SIZE_MAX; i++) ++ mvpp2_thread_read(priv, thread, MVPP2_BM_PHY_ALLOC_REG(pool_id)); ++ ++ put_cpu(); ++ ++ /* Stop the BM pool */ ++ val = mvpp2_read(priv, MVPP2_BM_POOL_CTRL_REG(pool_id)); ++ val |= MVPP2_BM_STOP_MASK; ++ 
mvpp2_write(priv, MVPP2_BM_POOL_CTRL_REG(pool_id), val); ++} ++ + static int mvpp2_bm_init(struct device *dev, struct mvpp2 *priv) + { + enum dma_data_direction dma_dir = DMA_FROM_DEVICE; + int i, err, poolnum = MVPP2_BM_POOLS_NUM; + struct mvpp2_port *port; + ++ if (priv->percpu_pools) ++ poolnum = mvpp2_get_nrxqs(priv) * 2; ++ ++ /* Clean up the pool state in case it contains stale state */ ++ for (i = 0; i < poolnum; i++) ++ mvpp2_bm_pool_cleanup(priv, i); ++ + if (priv->percpu_pools) { + for (i = 0; i < priv->port_count; i++) { + port = priv->port_list[i]; +@@ -629,7 +655,6 @@ static int mvpp2_bm_init(struct device *dev, struct mvpp2 *priv) + } + } + +- poolnum = mvpp2_get_nrxqs(priv) * 2; + for (i = 0; i < poolnum; i++) { + /* the pool in use */ + int pn = i / (poolnum / 2); +-- +2.43.0 + diff --git a/queue-6.7/net-rds-fix-ubsan-array-index-out-of-bounds-in-rds_c.patch b/queue-6.7/net-rds-fix-ubsan-array-index-out-of-bounds-in-rds_c.patch new file mode 100644 index 00000000000..a219e8fb1c3 --- /dev/null +++ b/queue-6.7/net-rds-fix-ubsan-array-index-out-of-bounds-in-rds_c.patch @@ -0,0 +1,71 @@ +From 3a2f87de96b7fc7b6543d41e9fefa4c13dfa7cfc Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Fri, 19 Jan 2024 17:48:39 -0800 +Subject: net/rds: Fix UBSAN: array-index-out-of-bounds in rds_cmsg_recv + +From: Sharath Srinivasan + +[ Upstream commit 13e788deb7348cc88df34bed736c3b3b9927ea52 ] + +Syzcaller UBSAN crash occurs in rds_cmsg_recv(), +which reads inc->i_rx_lat_trace[j + 1] with index 4 (3 + 1), +but with array size of 4 (RDS_RX_MAX_TRACES). +Here 'j' is assigned from rs->rs_rx_trace[i] and in-turn from +trace.rx_trace_pos[i] in rds_recv_track_latency(), +with both arrays sized 3 (RDS_MSG_RX_DGRAM_TRACE_MAX). So fix the +off-by-one bounds check in rds_recv_track_latency() to prevent +a potential crash in rds_cmsg_recv(). 
+ +Found by syzcaller: +================================================================= +UBSAN: array-index-out-of-bounds in net/rds/recv.c:585:39 +index 4 is out of range for type 'u64 [4]' +CPU: 1 PID: 8058 Comm: syz-executor228 Not tainted 6.6.0-gd2f51b3516da #1 +Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), +BIOS 1.15.0-1 04/01/2014 +Call Trace: + + __dump_stack lib/dump_stack.c:88 [inline] + dump_stack_lvl+0x136/0x150 lib/dump_stack.c:106 + ubsan_epilogue lib/ubsan.c:217 [inline] + __ubsan_handle_out_of_bounds+0xd5/0x130 lib/ubsan.c:348 + rds_cmsg_recv+0x60d/0x700 net/rds/recv.c:585 + rds_recvmsg+0x3fb/0x1610 net/rds/recv.c:716 + sock_recvmsg_nosec net/socket.c:1044 [inline] + sock_recvmsg+0xe2/0x160 net/socket.c:1066 + __sys_recvfrom+0x1b6/0x2f0 net/socket.c:2246 + __do_sys_recvfrom net/socket.c:2264 [inline] + __se_sys_recvfrom net/socket.c:2260 [inline] + __x64_sys_recvfrom+0xe0/0x1b0 net/socket.c:2260 + do_syscall_x64 arch/x86/entry/common.c:51 [inline] + do_syscall_64+0x40/0x110 arch/x86/entry/common.c:82 + entry_SYSCALL_64_after_hwframe+0x63/0x6b +================================================================== + +Fixes: 3289025aedc0 ("RDS: add receive message trace used by application") +Reported-by: Chenyuan Yang +Closes: https://lore.kernel.org/linux-rdma/CALGdzuoVdq-wtQ4Az9iottBqC5cv9ZhcE5q8N7LfYFvkRsOVcw@mail.gmail.com/ +Signed-off-by: Sharath Srinivasan +Reviewed-by: Simon Horman +Signed-off-by: David S. 
Miller +Signed-off-by: Sasha Levin +--- + net/rds/af_rds.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/net/rds/af_rds.c b/net/rds/af_rds.c +index 01c4cdfef45d..8435a20968ef 100644 +--- a/net/rds/af_rds.c ++++ b/net/rds/af_rds.c +@@ -419,7 +419,7 @@ static int rds_recv_track_latency(struct rds_sock *rs, sockptr_t optval, + + rs->rs_rx_traces = trace.rx_traces; + for (i = 0; i < rs->rs_rx_traces; i++) { +- if (trace.rx_trace_pos[i] > RDS_MSG_RX_DGRAM_TRACE_MAX) { ++ if (trace.rx_trace_pos[i] >= RDS_MSG_RX_DGRAM_TRACE_MAX) { + rs->rs_rx_traces = 0; + return -EFAULT; + } +-- +2.43.0 + diff --git a/queue-6.7/net-sched-flower-fix-chain-template-offload.patch b/queue-6.7/net-sched-flower-fix-chain-template-offload.patch new file mode 100644 index 00000000000..5cd636abe05 --- /dev/null +++ b/queue-6.7/net-sched-flower-fix-chain-template-offload.patch @@ -0,0 +1,190 @@ +From 95f0d3dbe719a42cc4c5614e70cc0a6a71b2f833 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Mon, 22 Jan 2024 15:28:43 +0200 +Subject: net/sched: flower: Fix chain template offload + +From: Ido Schimmel + +[ Upstream commit 32f2a0afa95fae0d1ceec2ff06e0e816939964b8 ] + +When a qdisc is deleted from a net device the stack instructs the +underlying driver to remove its flow offload callback from the +associated filter block using the 'FLOW_BLOCK_UNBIND' command. The stack +then continues to replay the removal of the filters in the block for +this driver by iterating over the chains in the block and invoking the +'reoffload' operation of the classifier being used. In turn, the +classifier in its 'reoffload' operation prepares and emits a +'FLOW_CLS_DESTROY' command for each filter. + +However, the stack does not do the same for chain templates and the +underlying driver never receives a 'FLOW_CLS_TMPLT_DESTROY' command when +a qdisc is deleted. This results in a memory leak [1] which can be +reproduced using [2]. 
+ +Fix by introducing a 'tmplt_reoffload' operation and have the stack +invoke it with the appropriate arguments as part of the replay. +Implement the operation in the sole classifier that supports chain +templates (flower) by emitting the 'FLOW_CLS_TMPLT_{CREATE,DESTROY}' +command based on whether a flow offload callback is being bound to a +filter block or being unbound from one. + +As far as I can tell, the issue happens since cited commit which +reordered tcf_block_offload_unbind() before tcf_block_flush_all_chains() +in __tcf_block_put(). The order cannot be reversed as the filter block +is expected to be freed after flushing all the chains. + +[1] +unreferenced object 0xffff888107e28800 (size 2048): + comm "tc", pid 1079, jiffies 4294958525 (age 3074.287s) + hex dump (first 32 bytes): + b1 a6 7c 11 81 88 ff ff e0 5b b3 10 81 88 ff ff ..|......[...... + 01 00 00 00 00 00 00 00 e0 aa b0 84 ff ff ff ff ................ + backtrace: + [] __kmem_cache_alloc_node+0x1e8/0x320 + [] __kmalloc+0x4e/0x90 + [] mlxsw_sp_acl_ruleset_get+0x34d/0x7a0 + [] mlxsw_sp_flower_tmplt_create+0x145/0x180 + [] mlxsw_sp_flow_block_cb+0x1ea/0x280 + [] tc_setup_cb_call+0x183/0x340 + [] fl_tmplt_create+0x3da/0x4c0 + [] tc_ctl_chain+0xa15/0x1170 + [] rtnetlink_rcv_msg+0x3cc/0xed0 + [] netlink_rcv_skb+0x170/0x440 + [] netlink_unicast+0x540/0x820 + [] netlink_sendmsg+0x8d8/0xda0 + [] ____sys_sendmsg+0x30f/0xa80 + [] ___sys_sendmsg+0x13a/0x1e0 + [] __sys_sendmsg+0x11c/0x1f0 + [] do_syscall_64+0x40/0xe0 +unreferenced object 0xffff88816d2c0400 (size 1024): + comm "tc", pid 1079, jiffies 4294958525 (age 3074.287s) + hex dump (first 32 bytes): + 40 00 00 00 00 00 00 00 57 f6 38 be 00 00 00 00 @.......W.8..... + 10 04 2c 6d 81 88 ff ff 10 04 2c 6d 81 88 ff ff ..,m......,m.... 
+ backtrace: + [] __kmem_cache_alloc_node+0x1e8/0x320 + [] __kmalloc_node+0x51/0x90 + [] kvmalloc_node+0xa6/0x1f0 + [] bucket_table_alloc.isra.0+0x83/0x460 + [] rhashtable_init+0x43b/0x7c0 + [] mlxsw_sp_acl_ruleset_get+0x428/0x7a0 + [] mlxsw_sp_flower_tmplt_create+0x145/0x180 + [] mlxsw_sp_flow_block_cb+0x1ea/0x280 + [] tc_setup_cb_call+0x183/0x340 + [] fl_tmplt_create+0x3da/0x4c0 + [] tc_ctl_chain+0xa15/0x1170 + [] rtnetlink_rcv_msg+0x3cc/0xed0 + [] netlink_rcv_skb+0x170/0x440 + [] netlink_unicast+0x540/0x820 + [] netlink_sendmsg+0x8d8/0xda0 + [] ____sys_sendmsg+0x30f/0xa80 + +[2] + # tc qdisc add dev swp1 clsact + # tc chain add dev swp1 ingress proto ip chain 1 flower dst_ip 0.0.0.0/32 + # tc qdisc del dev swp1 clsact + # devlink dev reload pci/0000:06:00.0 + +Fixes: bbf73830cd48 ("net: sched: traverse chains in block with tcf_get_next_chain()") +Signed-off-by: Ido Schimmel +Signed-off-by: David S. Miller +Signed-off-by: Sasha Levin +--- + include/net/sch_generic.h | 4 ++++ + net/sched/cls_api.c | 9 ++++++++- + net/sched/cls_flower.c | 23 +++++++++++++++++++++++ + 3 files changed, 35 insertions(+), 1 deletion(-) + +diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h +index dcb9160e6467..959a7725c27b 100644 +--- a/include/net/sch_generic.h ++++ b/include/net/sch_generic.h +@@ -375,6 +375,10 @@ struct tcf_proto_ops { + struct nlattr **tca, + struct netlink_ext_ack *extack); + void (*tmplt_destroy)(void *tmplt_priv); ++ void (*tmplt_reoffload)(struct tcf_chain *chain, ++ bool add, ++ flow_setup_cb_t *cb, ++ void *cb_priv); + struct tcf_exts * (*get_exts)(const struct tcf_proto *tp, + u32 handle); + +diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c +index 1976bd163986..02c594baa1d9 100644 +--- a/net/sched/cls_api.c ++++ b/net/sched/cls_api.c +@@ -1536,6 +1536,9 @@ tcf_block_playback_offloads(struct tcf_block *block, flow_setup_cb_t *cb, + chain_prev = chain, + chain = __tcf_get_next_chain(block, chain), + tcf_chain_put(chain_prev)) { ++ if 
(chain->tmplt_ops && add) ++ chain->tmplt_ops->tmplt_reoffload(chain, true, cb, ++ cb_priv); + for (tp = __tcf_get_next_proto(chain, NULL); tp; + tp_prev = tp, + tp = __tcf_get_next_proto(chain, tp), +@@ -1551,6 +1554,9 @@ tcf_block_playback_offloads(struct tcf_block *block, flow_setup_cb_t *cb, + goto err_playback_remove; + } + } ++ if (chain->tmplt_ops && !add) ++ chain->tmplt_ops->tmplt_reoffload(chain, false, cb, ++ cb_priv); + } + + return 0; +@@ -2971,7 +2977,8 @@ static int tc_chain_tmplt_add(struct tcf_chain *chain, struct net *net, + ops = tcf_proto_lookup_ops(name, true, extack); + if (IS_ERR(ops)) + return PTR_ERR(ops); +- if (!ops->tmplt_create || !ops->tmplt_destroy || !ops->tmplt_dump) { ++ if (!ops->tmplt_create || !ops->tmplt_destroy || !ops->tmplt_dump || ++ !ops->tmplt_reoffload) { + NL_SET_ERR_MSG(extack, "Chain templates are not supported with specified classifier"); + module_put(ops->owner); + return -EOPNOTSUPP; +diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c +index e5314a31f75a..efb9d2811b73 100644 +--- a/net/sched/cls_flower.c ++++ b/net/sched/cls_flower.c +@@ -2721,6 +2721,28 @@ static void fl_tmplt_destroy(void *tmplt_priv) + kfree(tmplt); + } + ++static void fl_tmplt_reoffload(struct tcf_chain *chain, bool add, ++ flow_setup_cb_t *cb, void *cb_priv) ++{ ++ struct fl_flow_tmplt *tmplt = chain->tmplt_priv; ++ struct flow_cls_offload cls_flower = {}; ++ ++ cls_flower.rule = flow_rule_alloc(0); ++ if (!cls_flower.rule) ++ return; ++ ++ cls_flower.common.chain_index = chain->index; ++ cls_flower.command = add ? 
FLOW_CLS_TMPLT_CREATE : ++ FLOW_CLS_TMPLT_DESTROY; ++ cls_flower.cookie = (unsigned long) tmplt; ++ cls_flower.rule->match.dissector = &tmplt->dissector; ++ cls_flower.rule->match.mask = &tmplt->mask; ++ cls_flower.rule->match.key = &tmplt->dummy_key; ++ ++ cb(TC_SETUP_CLSFLOWER, &cls_flower, cb_priv); ++ kfree(cls_flower.rule); ++} ++ + static int fl_dump_key_val(struct sk_buff *skb, + void *val, int val_type, + void *mask, int mask_type, int len) +@@ -3628,6 +3650,7 @@ static struct tcf_proto_ops cls_fl_ops __read_mostly = { + .bind_class = fl_bind_class, + .tmplt_create = fl_tmplt_create, + .tmplt_destroy = fl_tmplt_destroy, ++ .tmplt_reoffload = fl_tmplt_reoffload, + .tmplt_dump = fl_tmplt_dump, + .get_exts = fl_get_exts, + .owner = THIS_MODULE, +-- +2.43.0 + diff --git a/queue-6.7/net-smc-fix-illegal-rmb_desc-access-in-smc-d-connect.patch b/queue-6.7/net-smc-fix-illegal-rmb_desc-access-in-smc-d-connect.patch new file mode 100644 index 00000000000..67c9b27430c --- /dev/null +++ b/queue-6.7/net-smc-fix-illegal-rmb_desc-access-in-smc-d-connect.patch @@ -0,0 +1,87 @@ +From 1777c87b90b1b1898a582ff004304306f589756b Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Thu, 18 Jan 2024 12:32:10 +0800 +Subject: net/smc: fix illegal rmb_desc access in SMC-D connection dump + +From: Wen Gu + +[ Upstream commit dbc153fd3c142909e564bb256da087e13fbf239c ] + +A crash was found when dumping SMC-D connections. It can be reproduced +by following steps: + +- run nginx/wrk test: + smc_run nginx + smc_run wrk -t 16 -c 1000 -d -H 'Connection: Close' + +- continuously dump SMC-D connections in parallel: + watch -n 1 'smcss -D' + + BUG: kernel NULL pointer dereference, address: 0000000000000030 + CPU: 2 PID: 7204 Comm: smcss Kdump: loaded Tainted: G E 6.7.0+ #55 + RIP: 0010:__smc_diag_dump.constprop.0+0x5e5/0x620 [smc_diag] + Call Trace: + + ? __die+0x24/0x70 + ? page_fault_oops+0x66/0x150 + ? exc_page_fault+0x69/0x140 + ? asm_exc_page_fault+0x26/0x30 + ? 
__smc_diag_dump.constprop.0+0x5e5/0x620 [smc_diag] + ? __kmalloc_node_track_caller+0x35d/0x430 + ? __alloc_skb+0x77/0x170 + smc_diag_dump_proto+0xd0/0xf0 [smc_diag] + smc_diag_dump+0x26/0x60 [smc_diag] + netlink_dump+0x19f/0x320 + __netlink_dump_start+0x1dc/0x300 + smc_diag_handler_dump+0x6a/0x80 [smc_diag] + ? __pfx_smc_diag_dump+0x10/0x10 [smc_diag] + sock_diag_rcv_msg+0x121/0x140 + ? __pfx_sock_diag_rcv_msg+0x10/0x10 + netlink_rcv_skb+0x5a/0x110 + sock_diag_rcv+0x28/0x40 + netlink_unicast+0x22a/0x330 + netlink_sendmsg+0x1f8/0x420 + __sock_sendmsg+0xb0/0xc0 + ____sys_sendmsg+0x24e/0x300 + ? copy_msghdr_from_user+0x62/0x80 + ___sys_sendmsg+0x7c/0xd0 + ? __do_fault+0x34/0x160 + ? do_read_fault+0x5f/0x100 + ? do_fault+0xb0/0x110 + ? __handle_mm_fault+0x2b0/0x6c0 + __sys_sendmsg+0x4d/0x80 + do_syscall_64+0x69/0x180 + entry_SYSCALL_64_after_hwframe+0x6e/0x76 + +It is possible that the connection is in process of being established +when we dump it. Assumed that the connection has been registered in a +link group by smc_conn_create() but the rmb_desc has not yet been +initialized by smc_buf_create(), thus causing the illegal access to +conn->rmb_desc. So fix it by checking before dump. + +Fixes: 4b1b7d3b30a6 ("net/smc: add SMC-D diag support") +Signed-off-by: Wen Gu +Reviewed-by: Dust Li +Reviewed-by: Wenjia Zhang +Signed-off-by: David S. 
Miller +Signed-off-by: Sasha Levin +--- + net/smc/smc_diag.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/net/smc/smc_diag.c b/net/smc/smc_diag.c +index 5cc376834c57..fb9e5cc1285e 100644 +--- a/net/smc/smc_diag.c ++++ b/net/smc/smc_diag.c +@@ -163,7 +163,7 @@ static int __smc_diag_dump(struct sock *sk, struct sk_buff *skb, + } + if (smc_conn_lgr_valid(&smc->conn) && smc->conn.lgr->is_smcd && + (req->diag_ext & (1 << (SMC_DIAG_DMBINFO - 1))) && +- !list_empty(&smc->conn.lgr->list)) { ++ !list_empty(&smc->conn.lgr->list) && smc->conn.rmb_desc) { + struct smc_connection *conn = &smc->conn; + struct smcd_diag_dmbinfo dinfo; + struct smcd_dev *smcd = conn->lgr->smcd; +-- +2.43.0 + diff --git a/queue-6.7/net-stmmac-wait-a-bit-for-the-reset-to-take-effect.patch b/queue-6.7/net-stmmac-wait-a-bit-for-the-reset-to-take-effect.patch new file mode 100644 index 00000000000..6214814c447 --- /dev/null +++ b/queue-6.7/net-stmmac-wait-a-bit-for-the-reset-to-take-effect.patch @@ -0,0 +1,63 @@ +From 174002e959ef2d8df58a9f06047e3f6d941a7e96 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Mon, 22 Jan 2024 19:19:09 +0100 +Subject: net: stmmac: Wait a bit for the reset to take effect + +From: Bernd Edlinger + +[ Upstream commit a5f5eee282a0aae80227697e1d9c811b1726d31d ] + +otherwise the synopsys_id value may be read out wrong, +because the GMAC_VERSION register might still be in reset +state, for at least 1 us after the reset is de-asserted. + +Add a wait for 10 us before continuing to be on the safe side. + +> From what have you got that delay value? + +Just trial and error, with very old linux versions and old gcc versions +the synopsys_id was read out correctly most of the time (but not always), +with recent linux versions and recent gcc versions it was read out +wrongly most of the time, but again not always.
+I don't have access to the VHDL code in question, so I cannot +tell why it takes so long to get the correct values, I also do not +have more than a few hardware samples, so I cannot tell how long +this timeout must be in worst case. +Experimentally I can tell that the register is read several times +as zero immediately after the reset is de-asserted, also adding several +no-ops is not enough, adding a printk is enough, also udelay(1) seems to +be enough but I tried that not very often, and I have not access to many +hardware samples to be 100% sure about the necessary delay. +And since the udelay here is only executed once per device instance, +it seems acceptable to delay the boot for 10 us. + +BTW: my hardware's synopsys id is 0x37. + +Fixes: c5e4ddbdfa11 ("net: stmmac: Add support for optional reset control") +Signed-off-by: Bernd Edlinger +Reviewed-by: Jiri Pirko +Reviewed-by: Serge Semin +Link: https://lore.kernel.org/r/AS8P193MB1285A810BD78C111E7F6AA34E4752@AS8P193MB1285.EURP193.PROD.OUTLOOK.COM +Signed-off-by: Jakub Kicinski +Signed-off-by: Sasha Levin +--- + drivers/net/ethernet/stmicro/stmmac/stmmac_main.c | 3 +++ + 1 file changed, 3 insertions(+) + +diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c +index 49b81daf7411..d094c3c1e2ee 100644 +--- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c ++++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c +@@ -7467,6 +7467,9 @@ int stmmac_dvr_probe(struct device *device, + dev_err(priv->device, "unable to bring out of ahb reset: %pe\n", + ERR_PTR(ret)); + ++ /* Wait a bit for the reset to take effect */ ++ udelay(10); ++ + /* Init MAC and get the capabilities */ + ret = stmmac_hw_init(priv); + if (ret) +-- +2.43.0 + diff --git a/queue-6.7/netfilter-nf_tables-restrict-anonymous-set-and-map-n.patch b/queue-6.7/netfilter-nf_tables-restrict-anonymous-set-and-map-n.patch new file mode 100644 index 00000000000..784ce129c01 --- /dev/null +++ 
b/queue-6.7/netfilter-nf_tables-restrict-anonymous-set-and-map-n.patch @@ -0,0 +1,60 @@ +From 541e41a88a00522cc6ce415e5481902002c27b4a Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Fri, 19 Jan 2024 13:34:32 +0100 +Subject: netfilter: nf_tables: restrict anonymous set and map names to 16 + bytes + +From: Florian Westphal + +[ Upstream commit b462579b2b86a8f5230543cadd3a4836be27baf7 ] + +nftables has two types of sets/maps, one where userspace defines the +name, and anonymous sets/maps, where userspace defines a template name. + +For the latter, kernel requires presence of exactly one "%d". +nftables uses "__set%d" and "__map%d" for this. The kernel will +expand the format specifier and replaces it with the smallest unused +number. + +As-is, userspace could define a template name that allows to move +the set name past the 256 bytes upperlimit (post-expansion). + +I don't see how this could be a problem, but I would prefer if userspace +cannot do this, so add a limit of 16 bytes for the '%d' template name. + +16 bytes is the old total upper limit for set names that existed when +nf_tables was merged initially. 
+ +Fixes: 387454901bd6 ("netfilter: nf_tables: Allow set names of up to 255 chars") +Signed-off-by: Florian Westphal +Signed-off-by: Pablo Neira Ayuso +Signed-off-by: Sasha Levin +--- + net/netfilter/nf_tables_api.c | 4 ++++ + 1 file changed, 4 insertions(+) + +diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c +index f032c29f1da6..5282e8377782 100644 +--- a/net/netfilter/nf_tables_api.c ++++ b/net/netfilter/nf_tables_api.c +@@ -24,6 +24,7 @@ + #include + + #define NFT_MODULE_AUTOLOAD_LIMIT (MODULE_NAME_LEN - sizeof("nft-expr-255-")) ++#define NFT_SET_MAX_ANONLEN 16 + + unsigned int nf_tables_net_id __read_mostly; + +@@ -4411,6 +4412,9 @@ static int nf_tables_set_alloc_name(struct nft_ctx *ctx, struct nft_set *set, + if (p[1] != 'd' || strchr(p + 2, '%')) + return -EINVAL; + ++ if (strnlen(name, NFT_SET_MAX_ANONLEN) >= NFT_SET_MAX_ANONLEN) ++ return -EINVAL; ++ + inuse = (unsigned long *)get_zeroed_page(GFP_KERNEL); + if (inuse == NULL) + return -ENOMEM; +-- +2.43.0 + diff --git a/queue-6.7/netfilter-nf_tables-validate-nfproto_-family.patch b/queue-6.7/netfilter-nf_tables-validate-nfproto_-family.patch new file mode 100644 index 00000000000..08633ac3997 --- /dev/null +++ b/queue-6.7/netfilter-nf_tables-validate-nfproto_-family.patch @@ -0,0 +1,196 @@ +From 3f0829fa1a89fd1aab6d15af0a0d0f7eb428dd1d Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Tue, 23 Jan 2024 16:38:25 +0100 +Subject: netfilter: nf_tables: validate NFPROTO_* family + +From: Pablo Neira Ayuso + +[ Upstream commit d0009effa8862c20a13af4cb7475d9771b905693 ] + +Several expressions explicitly refer to NF_INET_* hook definitions +from expr->ops->validate, however, family is not validated. + +Bail out with EOPNOTSUPP in case they are used from unsupported +families. 
+ +Fixes: 0ca743a55991 ("netfilter: nf_tables: add compatibility layer for x_tables") +Fixes: a3c90f7a2323 ("netfilter: nf_tables: flow offload expression") +Fixes: 2fa841938c64 ("netfilter: nf_tables: introduce routing expression") +Fixes: 554ced0a6e29 ("netfilter: nf_tables: add support for native socket matching") +Fixes: ad49d86e07a4 ("netfilter: nf_tables: Add synproxy support") +Fixes: 4ed8eb6570a4 ("netfilter: nf_tables: Add native tproxy support") +Fixes: 6c47260250fc ("netfilter: nf_tables: add xfrm expression") +Signed-off-by: Pablo Neira Ayuso +Signed-off-by: Sasha Levin +--- + net/netfilter/nft_compat.c | 12 ++++++++++++ + net/netfilter/nft_flow_offload.c | 5 +++++ + net/netfilter/nft_nat.c | 5 +++++ + net/netfilter/nft_rt.c | 5 +++++ + net/netfilter/nft_socket.c | 5 +++++ + net/netfilter/nft_synproxy.c | 7 +++++-- + net/netfilter/nft_tproxy.c | 5 +++++ + net/netfilter/nft_xfrm.c | 5 +++++ + 8 files changed, 47 insertions(+), 2 deletions(-) + +diff --git a/net/netfilter/nft_compat.c b/net/netfilter/nft_compat.c +index 5284cd2ad532..f0eeda97bfcd 100644 +--- a/net/netfilter/nft_compat.c ++++ b/net/netfilter/nft_compat.c +@@ -350,6 +350,12 @@ static int nft_target_validate(const struct nft_ctx *ctx, + unsigned int hook_mask = 0; + int ret; + ++ if (ctx->family != NFPROTO_IPV4 && ++ ctx->family != NFPROTO_IPV6 && ++ ctx->family != NFPROTO_BRIDGE && ++ ctx->family != NFPROTO_ARP) ++ return -EOPNOTSUPP; ++ + if (nft_is_base_chain(ctx->chain)) { + const struct nft_base_chain *basechain = + nft_base_chain(ctx->chain); +@@ -595,6 +601,12 @@ static int nft_match_validate(const struct nft_ctx *ctx, + unsigned int hook_mask = 0; + int ret; + ++ if (ctx->family != NFPROTO_IPV4 && ++ ctx->family != NFPROTO_IPV6 && ++ ctx->family != NFPROTO_BRIDGE && ++ ctx->family != NFPROTO_ARP) ++ return -EOPNOTSUPP; ++ + if (nft_is_base_chain(ctx->chain)) { + const struct nft_base_chain *basechain = + nft_base_chain(ctx->chain); +diff --git a/net/netfilter/nft_flow_offload.c 
b/net/netfilter/nft_flow_offload.c +index ab3362c483b4..397351fa4d5f 100644 +--- a/net/netfilter/nft_flow_offload.c ++++ b/net/netfilter/nft_flow_offload.c +@@ -384,6 +384,11 @@ static int nft_flow_offload_validate(const struct nft_ctx *ctx, + { + unsigned int hook_mask = (1 << NF_INET_FORWARD); + ++ if (ctx->family != NFPROTO_IPV4 && ++ ctx->family != NFPROTO_IPV6 && ++ ctx->family != NFPROTO_INET) ++ return -EOPNOTSUPP; ++ + return nft_chain_validate_hooks(ctx->chain, hook_mask); + } + +diff --git a/net/netfilter/nft_nat.c b/net/netfilter/nft_nat.c +index 583885ce7232..808f5802c270 100644 +--- a/net/netfilter/nft_nat.c ++++ b/net/netfilter/nft_nat.c +@@ -143,6 +143,11 @@ static int nft_nat_validate(const struct nft_ctx *ctx, + struct nft_nat *priv = nft_expr_priv(expr); + int err; + ++ if (ctx->family != NFPROTO_IPV4 && ++ ctx->family != NFPROTO_IPV6 && ++ ctx->family != NFPROTO_INET) ++ return -EOPNOTSUPP; ++ + err = nft_chain_validate_dependency(ctx->chain, NFT_CHAIN_T_NAT); + if (err < 0) + return err; +diff --git a/net/netfilter/nft_rt.c b/net/netfilter/nft_rt.c +index 35a2c28caa60..24d977138572 100644 +--- a/net/netfilter/nft_rt.c ++++ b/net/netfilter/nft_rt.c +@@ -166,6 +166,11 @@ static int nft_rt_validate(const struct nft_ctx *ctx, const struct nft_expr *exp + const struct nft_rt *priv = nft_expr_priv(expr); + unsigned int hooks; + ++ if (ctx->family != NFPROTO_IPV4 && ++ ctx->family != NFPROTO_IPV6 && ++ ctx->family != NFPROTO_INET) ++ return -EOPNOTSUPP; ++ + switch (priv->key) { + case NFT_RT_NEXTHOP4: + case NFT_RT_NEXTHOP6: +diff --git a/net/netfilter/nft_socket.c b/net/netfilter/nft_socket.c +index 9ed85be79452..f30163e2ca62 100644 +--- a/net/netfilter/nft_socket.c ++++ b/net/netfilter/nft_socket.c +@@ -242,6 +242,11 @@ static int nft_socket_validate(const struct nft_ctx *ctx, + const struct nft_expr *expr, + const struct nft_data **data) + { ++ if (ctx->family != NFPROTO_IPV4 && ++ ctx->family != NFPROTO_IPV6 && ++ ctx->family != NFPROTO_INET) ++ 
return -EOPNOTSUPP; ++ + return nft_chain_validate_hooks(ctx->chain, + (1 << NF_INET_PRE_ROUTING) | + (1 << NF_INET_LOCAL_IN) | +diff --git a/net/netfilter/nft_synproxy.c b/net/netfilter/nft_synproxy.c +index 13da882669a4..1d737f89dfc1 100644 +--- a/net/netfilter/nft_synproxy.c ++++ b/net/netfilter/nft_synproxy.c +@@ -186,7 +186,6 @@ static int nft_synproxy_do_init(const struct nft_ctx *ctx, + break; + #endif + case NFPROTO_INET: +- case NFPROTO_BRIDGE: + err = nf_synproxy_ipv4_init(snet, ctx->net); + if (err) + goto nf_ct_failure; +@@ -219,7 +218,6 @@ static void nft_synproxy_do_destroy(const struct nft_ctx *ctx) + break; + #endif + case NFPROTO_INET: +- case NFPROTO_BRIDGE: + nf_synproxy_ipv4_fini(snet, ctx->net); + nf_synproxy_ipv6_fini(snet, ctx->net); + break; +@@ -253,6 +251,11 @@ static int nft_synproxy_validate(const struct nft_ctx *ctx, + const struct nft_expr *expr, + const struct nft_data **data) + { ++ if (ctx->family != NFPROTO_IPV4 && ++ ctx->family != NFPROTO_IPV6 && ++ ctx->family != NFPROTO_INET) ++ return -EOPNOTSUPP; ++ + return nft_chain_validate_hooks(ctx->chain, (1 << NF_INET_LOCAL_IN) | + (1 << NF_INET_FORWARD)); + } +diff --git a/net/netfilter/nft_tproxy.c b/net/netfilter/nft_tproxy.c +index ae15cd693f0e..71412adb73d4 100644 +--- a/net/netfilter/nft_tproxy.c ++++ b/net/netfilter/nft_tproxy.c +@@ -316,6 +316,11 @@ static int nft_tproxy_validate(const struct nft_ctx *ctx, + const struct nft_expr *expr, + const struct nft_data **data) + { ++ if (ctx->family != NFPROTO_IPV4 && ++ ctx->family != NFPROTO_IPV6 && ++ ctx->family != NFPROTO_INET) ++ return -EOPNOTSUPP; ++ + return nft_chain_validate_hooks(ctx->chain, 1 << NF_INET_PRE_ROUTING); + } + +diff --git a/net/netfilter/nft_xfrm.c b/net/netfilter/nft_xfrm.c +index 452f8587adda..1c866757db55 100644 +--- a/net/netfilter/nft_xfrm.c ++++ b/net/netfilter/nft_xfrm.c +@@ -235,6 +235,11 @@ static int nft_xfrm_validate(const struct nft_ctx *ctx, const struct nft_expr *e + const struct nft_xfrm *priv = 
nft_expr_priv(expr); + unsigned int hooks; + ++ if (ctx->family != NFPROTO_IPV4 && ++ ctx->family != NFPROTO_IPV6 && ++ ctx->family != NFPROTO_INET) ++ return -EOPNOTSUPP; ++ + switch (priv->dir) { + case XFRM_POLICY_IN: + hooks = (1 << NF_INET_FORWARD) | +-- +2.43.0 + diff --git a/queue-6.7/netfilter-nft_limit-reject-configurations-that-cause.patch b/queue-6.7/netfilter-nft_limit-reject-configurations-that-cause.patch new file mode 100644 index 00000000000..049bed3fa63 --- /dev/null +++ b/queue-6.7/netfilter-nft_limit-reject-configurations-that-cause.patch @@ -0,0 +1,83 @@ +From 03c58469dab167d91526495d6be164f559516dbe Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Fri, 19 Jan 2024 13:11:32 +0100 +Subject: netfilter: nft_limit: reject configurations that cause integer + overflow + +From: Florian Westphal + +[ Upstream commit c9d9eb9c53d37cdebbad56b91e40baf42d5a97aa ] + +Reject bogus configs where internal token counter wraps around. +This only occurs with very very large requests, such as 17gbyte/s. + +It's better to reject this rather than having incorrect ratelimit.
+ +Fixes: d2168e849ebf ("netfilter: nft_limit: add per-byte limiting") +Signed-off-by: Florian Westphal +Signed-off-by: Pablo Neira Ayuso +Signed-off-by: Sasha Levin +--- + net/netfilter/nft_limit.c | 23 ++++++++++++++++------- + 1 file changed, 16 insertions(+), 7 deletions(-) + +diff --git a/net/netfilter/nft_limit.c b/net/netfilter/nft_limit.c +index 79039afde34e..cefa25e0dbb0 100644 +--- a/net/netfilter/nft_limit.c ++++ b/net/netfilter/nft_limit.c +@@ -58,17 +58,19 @@ static inline bool nft_limit_eval(struct nft_limit_priv *priv, u64 cost) + static int nft_limit_init(struct nft_limit_priv *priv, + const struct nlattr * const tb[], bool pkts) + { ++ u64 unit, tokens, rate_with_burst; + bool invert = false; +- u64 unit, tokens; + + if (tb[NFTA_LIMIT_RATE] == NULL || + tb[NFTA_LIMIT_UNIT] == NULL) + return -EINVAL; + + priv->rate = be64_to_cpu(nla_get_be64(tb[NFTA_LIMIT_RATE])); ++ if (priv->rate == 0) ++ return -EINVAL; ++ + unit = be64_to_cpu(nla_get_be64(tb[NFTA_LIMIT_UNIT])); +- priv->nsecs = unit * NSEC_PER_SEC; +- if (priv->rate == 0 || priv->nsecs < unit) ++ if (check_mul_overflow(unit, NSEC_PER_SEC, &priv->nsecs)) + return -EOVERFLOW; + + if (tb[NFTA_LIMIT_BURST]) +@@ -77,18 +79,25 @@ static int nft_limit_init(struct nft_limit_priv *priv, + if (pkts && priv->burst == 0) + priv->burst = NFT_LIMIT_PKT_BURST_DEFAULT; + +- if (priv->rate + priv->burst < priv->rate) ++ if (check_add_overflow(priv->rate, priv->burst, &rate_with_burst)) + return -EOVERFLOW; + + if (pkts) { +- tokens = div64_u64(priv->nsecs, priv->rate) * priv->burst; ++ u64 tmp = div64_u64(priv->nsecs, priv->rate); ++ ++ if (check_mul_overflow(tmp, priv->burst, &tokens)) ++ return -EOVERFLOW; + } else { ++ u64 tmp; ++ + /* The token bucket size limits the number of tokens can be + * accumulated. tokens_max specifies the bucket size. + * tokens_max = unit * (rate + burst) / rate. 
+ */ +- tokens = div64_u64(priv->nsecs * (priv->rate + priv->burst), +- priv->rate); ++ if (check_mul_overflow(priv->nsecs, rate_with_burst, &tmp)) ++ return -EOVERFLOW; ++ ++ tokens = div64_u64(tmp, priv->rate); + } + + if (tb[NFTA_LIMIT_FLAGS]) { +-- +2.43.0 + diff --git a/queue-6.7/netfs-fscache-prevent-oops-in-fscache_put_cache.patch b/queue-6.7/netfs-fscache-prevent-oops-in-fscache_put_cache.patch new file mode 100644 index 00000000000..14758dd4dfe --- /dev/null +++ b/queue-6.7/netfs-fscache-prevent-oops-in-fscache_put_cache.patch @@ -0,0 +1,44 @@ +From 9b0508de9d77bca0679a689ebf30f1cb59ba3392 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Fri, 12 Jan 2024 09:59:41 +0300 +Subject: netfs, fscache: Prevent Oops in fscache_put_cache() + +From: Dan Carpenter + +[ Upstream commit 3be0b3ed1d76c6703b9ee482b55f7e01c369cc68 ] + +This function dereferences "cache" and then checks if it's +IS_ERR_OR_NULL(). Check first, then dereference. + +Fixes: 9549332df4ed ("fscache: Implement cache registration") +Signed-off-by: Dan Carpenter +Signed-off-by: David Howells +Link: https://lore.kernel.org/r/e84bc740-3502-4f16-982a-a40d5676615c@moroto.mountain/ # v2 +Signed-off-by: Sasha Levin +--- + fs/fscache/cache.c | 3 ++- + 1 file changed, 2 insertions(+), 1 deletion(-) + +diff --git a/fs/fscache/cache.c b/fs/fscache/cache.c +index d645f8b302a2..9397ed39b0b4 100644 +--- a/fs/fscache/cache.c ++++ b/fs/fscache/cache.c +@@ -179,13 +179,14 @@ EXPORT_SYMBOL(fscache_acquire_cache); + void fscache_put_cache(struct fscache_cache *cache, + enum fscache_cache_trace where) + { +- unsigned int debug_id = cache->debug_id; ++ unsigned int debug_id; + bool zero; + int ref; + + if (IS_ERR_OR_NULL(cache)) + return; + ++ debug_id = cache->debug_id; + zero = __refcount_dec_and_test(&cache->ref, &ref); + trace_fscache_cache(debug_id, ref - 1, where); + +-- +2.43.0 + diff --git a/queue-6.7/netlink-fix-potential-sleeping-issue-in-mqueue_flush.patch 
b/queue-6.7/netlink-fix-potential-sleeping-issue-in-mqueue_flush.patch new file mode 100644 index 00000000000..96c47b14f48 --- /dev/null +++ b/queue-6.7/netlink-fix-potential-sleeping-issue-in-mqueue_flush.patch @@ -0,0 +1,76 @@ +From e0e707fa22ae61e59a539bf4dbd7beb2f21590eb Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Mon, 22 Jan 2024 09:18:07 +0800 +Subject: netlink: fix potential sleeping issue in mqueue_flush_file + +From: Zhengchao Shao + +[ Upstream commit 234ec0b6034b16869d45128b8cd2dc6ffe596f04 ] + +I analyze the potential sleeping issue of the following processes: +Thread A Thread B +... netlink_create //ref = 1 +do_mq_notify ... + sock = netlink_getsockbyfilp ... //ref = 2 + info->notify_sock = sock; ... +... netlink_sendmsg +... skb = netlink_alloc_large_skb //skb->head is vmalloced +... netlink_unicast +... sk = netlink_getsockbyportid //ref = 3 +... netlink_sendskb +... __netlink_sendskb +... skb_queue_tail //put skb to sk_receive_queue +... sock_put //ref = 2 +... ... +... netlink_release +... deferred_put_nlk_sk //ref = 1 +mqueue_flush_file + spin_lock + remove_notification + netlink_sendskb + sock_put //ref = 0 + sk_free + ... + __sk_destruct + netlink_sock_destruct + skb_queue_purge //get skb from sk_receive_queue + ... + __skb_queue_purge_reason + kfree_skb_reason + __kfree_skb + ... + skb_release_all + skb_release_head_state + netlink_skb_destructor + vfree(skb->head) //sleeping while holding spinlock + +In netlink_sendmsg, if the memory pointed to by skb->head is allocated by +vmalloc, and is put to sk_receive_queue queue, also the skb is not freed. +When the mqueue executes flush, the sleeping bug will occur. Use +vfree_atomic instead of vfree in netlink_skb_destructor to solve the issue. 
+ +Fixes: c05cdb1b864f ("netlink: allow large data transfers from user-space") +Signed-off-by: Zhengchao Shao +Link: https://lore.kernel.org/r/20240122011807.2110357-1-shaozhengchao@huawei.com +Signed-off-by: Paolo Abeni +Signed-off-by: Sasha Levin +--- + net/netlink/af_netlink.c | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c +index eb086b06d60d..d9107b545d36 100644 +--- a/net/netlink/af_netlink.c ++++ b/net/netlink/af_netlink.c +@@ -374,7 +374,7 @@ static void netlink_skb_destructor(struct sk_buff *skb) + if (is_vmalloc_addr(skb->head)) { + if (!skb->cloned || + !atomic_dec_return(&(skb_shinfo(skb)->dataref))) +- vfree(skb->head); ++ vfree_atomic(skb->head); + + skb->head = NULL; + } +-- +2.43.0 + diff --git a/queue-6.7/rcu-defer-rcu-kthreads-wakeup-when-cpu-is-dying.patch b/queue-6.7/rcu-defer-rcu-kthreads-wakeup-when-cpu-is-dying.patch new file mode 100644 index 00000000000..40144ee180b --- /dev/null +++ b/queue-6.7/rcu-defer-rcu-kthreads-wakeup-when-cpu-is-dying.patch @@ -0,0 +1,141 @@ +From 3ee2c71d0f6a323db1850cf4f2af474918609467 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Tue, 19 Dec 2023 00:19:15 +0100 +Subject: rcu: Defer RCU kthreads wakeup when CPU is dying + +From: Frederic Weisbecker + +[ Upstream commit e787644caf7628ad3269c1fbd321c3255cf51710 ] + +When the CPU goes idle for the last time during the CPU down hotplug +process, RCU reports a final quiescent state for the current CPU. If +this quiescent state propagates up to the top, some tasks may then be +woken up to complete the grace period: the main grace period kthread +and/or the expedited main workqueue (or kworker). + +If those kthreads have a SCHED_FIFO policy, the wake up can indirectly +arm the RT bandwidth timer to the local offline CPU. Since this happens +after hrtimers have been migrated at CPUHP_AP_HRTIMERS_DYING stage, the +timer gets ignored.
Therefore if the RCU kthreads are waiting for RT +bandwidth to be available, they may never be actually scheduled. + +This triggers TREE03 rcutorture hangs: + + rcu: INFO: rcu_preempt self-detected stall on CPU + rcu: 4-...!: (1 GPs behind) idle=9874/1/0x4000000000000000 softirq=0/0 fqs=20 rcuc=21071 jiffies(starved) + rcu: (t=21035 jiffies g=938281 q=40787 ncpus=6) + rcu: rcu_preempt kthread starved for 20964 jiffies! g938281 f0x0 RCU_GP_WAIT_FQS(5) ->state=0x0 ->cpu=0 + rcu: Unless rcu_preempt kthread gets sufficient CPU time, OOM is now expected behavior. + rcu: RCU grace-period kthread stack dump: + task:rcu_preempt state:R running task stack:14896 pid:14 tgid:14 ppid:2 flags:0x00004000 + Call Trace: + + __schedule+0x2eb/0xa80 + schedule+0x1f/0x90 + schedule_timeout+0x163/0x270 + ? __pfx_process_timeout+0x10/0x10 + rcu_gp_fqs_loop+0x37c/0x5b0 + ? __pfx_rcu_gp_kthread+0x10/0x10 + rcu_gp_kthread+0x17c/0x200 + kthread+0xde/0x110 + ? __pfx_kthread+0x10/0x10 + ret_from_fork+0x2b/0x40 + ? __pfx_kthread+0x10/0x10 + ret_from_fork_asm+0x1b/0x30 + + +The situation can't be solved with just unpinning the timer. The hrtimer +infrastructure and the nohz heuristics involved in finding the best +remote target for an unpinned timer would then also need to handle +enqueues from an offline CPU in the most horrendous way. + +So fix this on the RCU side instead and defer the wake up to an online +CPU if it's too late for the local one. + +Reported-by: Paul E. McKenney +Fixes: 5c0930ccaad5 ("hrtimers: Push pending hrtimers away from outgoing CPU earlier") +Signed-off-by: Frederic Weisbecker +Signed-off-by: Paul E. 
McKenney +Signed-off-by: Neeraj Upadhyay (AMD) +Signed-off-by: Sasha Levin +--- + kernel/rcu/tree.c | 34 +++++++++++++++++++++++++++++++++- + kernel/rcu/tree_exp.h | 3 +-- + 2 files changed, 34 insertions(+), 3 deletions(-) + +diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c +index 3ac3c846105f..157f3ca2a9b5 100644 +--- a/kernel/rcu/tree.c ++++ b/kernel/rcu/tree.c +@@ -1013,6 +1013,38 @@ static bool rcu_future_gp_cleanup(struct rcu_node *rnp) + return needmore; + } + ++static void swake_up_one_online_ipi(void *arg) ++{ ++ struct swait_queue_head *wqh = arg; ++ ++ swake_up_one(wqh); ++} ++ ++static void swake_up_one_online(struct swait_queue_head *wqh) ++{ ++ int cpu = get_cpu(); ++ ++ /* ++ * If called from rcutree_report_cpu_starting(), wake up ++ * is dangerous that late in the CPU-down hotplug process. The ++ * scheduler might queue an ignored hrtimer. Defer the wake up ++ * to an online CPU instead. ++ */ ++ if (unlikely(cpu_is_offline(cpu))) { ++ int target; ++ ++ target = cpumask_any_and(housekeeping_cpumask(HK_TYPE_RCU), ++ cpu_online_mask); ++ ++ smp_call_function_single(target, swake_up_one_online_ipi, ++ wqh, 0); ++ put_cpu(); ++ } else { ++ put_cpu(); ++ swake_up_one(wqh); ++ } ++} ++ + /* + * Awaken the grace-period kthread. 
Don't do a self-awaken (unless in an + * interrupt or softirq handler, in which case we just might immediately +@@ -1037,7 +1069,7 @@ static void rcu_gp_kthread_wake(void) + return; + WRITE_ONCE(rcu_state.gp_wake_time, jiffies); + WRITE_ONCE(rcu_state.gp_wake_seq, READ_ONCE(rcu_state.gp_seq)); +- swake_up_one(&rcu_state.gp_wq); ++ swake_up_one_online(&rcu_state.gp_wq); + } + + /* +diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h +index 6d7cea5d591f..2ac440bc7e10 100644 +--- a/kernel/rcu/tree_exp.h ++++ b/kernel/rcu/tree_exp.h +@@ -173,7 +173,6 @@ static bool sync_rcu_exp_done_unlocked(struct rcu_node *rnp) + return ret; + } + +- + /* + * Report the exit from RCU read-side critical section for the last task + * that queued itself during or before the current expedited preemptible-RCU +@@ -201,7 +200,7 @@ static void __rcu_report_exp_rnp(struct rcu_node *rnp, + raw_spin_unlock_irqrestore_rcu_node(rnp, flags); + if (wake) { + smp_mb(); /* EGP done before wake_up(). */ +- swake_up_one(&rcu_state.expedited_wq); ++ swake_up_one_online(&rcu_state.expedited_wq); + } + break; + } +-- +2.43.0 + diff --git a/queue-6.7/rxrpc-afs-allow-afs-to-pin-rxrpc_peer-objects.patch b/queue-6.7/rxrpc-afs-allow-afs-to-pin-rxrpc_peer-objects.patch new file mode 100644 index 00000000000..b645231b453 --- /dev/null +++ b/queue-6.7/rxrpc-afs-allow-afs-to-pin-rxrpc_peer-objects.patch @@ -0,0 +1,1276 @@ +From 5f7f956733145671f4d20cd9caf937583e2b50ad Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Thu, 19 Oct 2023 12:55:11 +0100 +Subject: rxrpc, afs: Allow afs to pin rxrpc_peer objects + +From: David Howells + +[ Upstream commit 72904d7b9bfbf2dd146254edea93958bc35bbbfe ] + +Change rxrpc's API such that: + + (1) A new function, rxrpc_kernel_lookup_peer(), is provided to look up an + rxrpc_peer record for a remote address and a corresponding function, + rxrpc_kernel_put_peer(), is provided to dispose of it again. 
+ + (2) When setting up a call, the rxrpc_peer object used during a call is + now passed in rather than being set up by rxrpc_connect_call(). For + afs, this meant passing it to rxrpc_kernel_begin_call() rather than + the full address (the service ID then has to be passed in as a + separate parameter). + + (3) A new function, rxrpc_kernel_remote_addr(), is added so that afs can + get a pointer to the transport address for display purposes, and + another, rxrpc_kernel_remote_srx(), to gain a pointer to the full + rxrpc address. + + (4) The function to retrieve the RTT from a call, rxrpc_kernel_get_srtt(), + is then altered to take a peer. This now returns the RTT or -1 if + there are insufficient samples. + + (5) Rename rxrpc_kernel_get_peer() to rxrpc_kernel_call_get_peer(). + + (6) Provide a new function, rxrpc_kernel_get_peer(), to get a ref on a + peer the caller already has. + +This allows the afs filesystem to pin the rxrpc_peer records that it is +using, allowing faster lookups and pointer comparisons rather than +comparing sockaddr_rxrpc contents. It also makes it easier to get hold of +the RTT. The following changes are made to afs: + + (1) The addr_list struct's addrs[] elements now hold a peer struct pointer + and a service ID rather than a sockaddr_rxrpc. + + (2) When displaying the transport address, rxrpc_kernel_remote_addr() is + used. + + (3) The port arg is removed from afs_alloc_addrlist() since it's always + overridden. + + (4) afs_merge_fs_addr4() and afs_merge_fs_addr6() do peer lookup and may + now return an error that must be handled. + + (5) afs_find_server() now takes a peer pointer to specify the address. + + (6) afs_find_server(), afs_compare_fs_alists() and afs_merge_fs_addr[46]() + now do peer pointer comparison rather than address comparison.
+ +Signed-off-by: David Howells +cc: Marc Dionne +cc: linux-afs@lists.infradead.org +Stable-dep-of: 17ba6f0bd14f ("afs: Fix error handling with lookup via FS.InlineBulkStatus") +Signed-off-by: Sasha Levin +--- + fs/afs/addr_list.c | 125 ++++++++++++++++++----------------- + fs/afs/cmservice.c | 5 +- + fs/afs/fs_probe.c | 11 +-- + fs/afs/internal.h | 26 ++++---- + fs/afs/proc.c | 9 +-- + fs/afs/rotate.c | 6 +- + fs/afs/rxrpc.c | 10 +-- + fs/afs/server.c | 41 ++---------- + fs/afs/vl_alias.c | 55 +-------------- + fs/afs/vl_list.c | 15 +++-- + fs/afs/vl_probe.c | 12 ++-- + fs/afs/vl_rotate.c | 6 +- + fs/afs/vlclient.c | 22 ++++-- + include/net/af_rxrpc.h | 15 +++-- + include/trace/events/rxrpc.h | 3 + + net/rxrpc/af_rxrpc.c | 62 ++++++++++++++--- + net/rxrpc/ar-internal.h | 2 +- + net/rxrpc/call_object.c | 17 ++--- + net/rxrpc/peer_object.c | 58 ++++++++++------ + net/rxrpc/sendmsg.c | 11 ++- + 20 files changed, 273 insertions(+), 238 deletions(-) + +diff --git a/fs/afs/addr_list.c b/fs/afs/addr_list.c +index ac05a59e9d46..519821f5aedc 100644 +--- a/fs/afs/addr_list.c ++++ b/fs/afs/addr_list.c +@@ -13,26 +13,33 @@ + #include "internal.h" + #include "afs_fs.h" + ++static void afs_free_addrlist(struct rcu_head *rcu) ++{ ++ struct afs_addr_list *alist = container_of(rcu, struct afs_addr_list, rcu); ++ unsigned int i; ++ ++ for (i = 0; i < alist->nr_addrs; i++) ++ rxrpc_kernel_put_peer(alist->addrs[i].peer); ++} ++ + /* + * Release an address list. + */ + void afs_put_addrlist(struct afs_addr_list *alist) + { + if (alist && refcount_dec_and_test(&alist->usage)) +- kfree_rcu(alist, rcu); ++ call_rcu(&alist->rcu, afs_free_addrlist); + } + + /* + * Allocate an address list. 
+ */ +-struct afs_addr_list *afs_alloc_addrlist(unsigned int nr, +- unsigned short service, +- unsigned short port) ++struct afs_addr_list *afs_alloc_addrlist(unsigned int nr, u16 service_id) + { + struct afs_addr_list *alist; + unsigned int i; + +- _enter("%u,%u,%u", nr, service, port); ++ _enter("%u,%u", nr, service_id); + + if (nr > AFS_MAX_ADDRESSES) + nr = AFS_MAX_ADDRESSES; +@@ -44,16 +51,8 @@ struct afs_addr_list *afs_alloc_addrlist(unsigned int nr, + refcount_set(&alist->usage, 1); + alist->max_addrs = nr; + +- for (i = 0; i < nr; i++) { +- struct sockaddr_rxrpc *srx = &alist->addrs[i].srx; +- srx->srx_family = AF_RXRPC; +- srx->srx_service = service; +- srx->transport_type = SOCK_DGRAM; +- srx->transport_len = sizeof(srx->transport.sin6); +- srx->transport.sin6.sin6_family = AF_INET6; +- srx->transport.sin6.sin6_port = htons(port); +- } +- ++ for (i = 0; i < nr; i++) ++ alist->addrs[i].service_id = service_id; + return alist; + } + +@@ -126,7 +125,7 @@ struct afs_vlserver_list *afs_parse_text_addrs(struct afs_net *net, + if (!vllist->servers[0].server) + goto error_vl; + +- alist = afs_alloc_addrlist(nr, service, AFS_VL_PORT); ++ alist = afs_alloc_addrlist(nr, service); + if (!alist) + goto error; + +@@ -197,9 +196,11 @@ struct afs_vlserver_list *afs_parse_text_addrs(struct afs_net *net, + } + + if (family == AF_INET) +- afs_merge_fs_addr4(alist, x[0], xport); ++ ret = afs_merge_fs_addr4(net, alist, x[0], xport); + else +- afs_merge_fs_addr6(alist, x, xport); ++ ret = afs_merge_fs_addr6(net, alist, x, xport); ++ if (ret < 0) ++ goto error; + + } while (p < end); + +@@ -271,25 +272,33 @@ struct afs_vlserver_list *afs_dns_query(struct afs_cell *cell, time64_t *_expiry + /* + * Merge an IPv4 entry into a fileserver address list. 
+ */ +-void afs_merge_fs_addr4(struct afs_addr_list *alist, __be32 xdr, u16 port) ++int afs_merge_fs_addr4(struct afs_net *net, struct afs_addr_list *alist, ++ __be32 xdr, u16 port) + { +- struct sockaddr_rxrpc *srx; +- u32 addr = ntohl(xdr); ++ struct sockaddr_rxrpc srx; ++ struct rxrpc_peer *peer; + int i; + + if (alist->nr_addrs >= alist->max_addrs) +- return; ++ return 0; + +- for (i = 0; i < alist->nr_ipv4; i++) { +- struct sockaddr_in *a = &alist->addrs[i].srx.transport.sin; +- u32 a_addr = ntohl(a->sin_addr.s_addr); +- u16 a_port = ntohs(a->sin_port); ++ srx.srx_family = AF_RXRPC; ++ srx.transport_type = SOCK_DGRAM; ++ srx.transport_len = sizeof(srx.transport.sin); ++ srx.transport.sin.sin_family = AF_INET; ++ srx.transport.sin.sin_port = htons(port); ++ srx.transport.sin.sin_addr.s_addr = xdr; + +- if (addr == a_addr && port == a_port) +- return; +- if (addr == a_addr && port < a_port) +- break; +- if (addr < a_addr) ++ peer = rxrpc_kernel_lookup_peer(net->socket, &srx, GFP_KERNEL); ++ if (!peer) ++ return -ENOMEM; ++ ++ for (i = 0; i < alist->nr_ipv4; i++) { ++ if (peer == alist->addrs[i].peer) { ++ rxrpc_kernel_put_peer(peer); ++ return 0; ++ } ++ if (peer <= alist->addrs[i].peer) + break; + } + +@@ -298,38 +307,42 @@ void afs_merge_fs_addr4(struct afs_addr_list *alist, __be32 xdr, u16 port) + alist->addrs + i, + sizeof(alist->addrs[0]) * (alist->nr_addrs - i)); + +- srx = &alist->addrs[i].srx; +- srx->srx_family = AF_RXRPC; +- srx->transport_type = SOCK_DGRAM; +- srx->transport_len = sizeof(srx->transport.sin); +- srx->transport.sin.sin_family = AF_INET; +- srx->transport.sin.sin_port = htons(port); +- srx->transport.sin.sin_addr.s_addr = xdr; ++ alist->addrs[i].peer = peer; + alist->nr_ipv4++; + alist->nr_addrs++; ++ return 0; + } + + /* + * Merge an IPv6 entry into a fileserver address list. 
+ */ +-void afs_merge_fs_addr6(struct afs_addr_list *alist, __be32 *xdr, u16 port) ++int afs_merge_fs_addr6(struct afs_net *net, struct afs_addr_list *alist, ++ __be32 *xdr, u16 port) + { +- struct sockaddr_rxrpc *srx; +- int i, diff; ++ struct sockaddr_rxrpc srx; ++ struct rxrpc_peer *peer; ++ int i; + + if (alist->nr_addrs >= alist->max_addrs) +- return; ++ return 0; + +- for (i = alist->nr_ipv4; i < alist->nr_addrs; i++) { +- struct sockaddr_in6 *a = &alist->addrs[i].srx.transport.sin6; +- u16 a_port = ntohs(a->sin6_port); ++ srx.srx_family = AF_RXRPC; ++ srx.transport_type = SOCK_DGRAM; ++ srx.transport_len = sizeof(srx.transport.sin6); ++ srx.transport.sin6.sin6_family = AF_INET6; ++ srx.transport.sin6.sin6_port = htons(port); ++ memcpy(&srx.transport.sin6.sin6_addr, xdr, 16); + +- diff = memcmp(xdr, &a->sin6_addr, 16); +- if (diff == 0 && port == a_port) +- return; +- if (diff == 0 && port < a_port) +- break; +- if (diff < 0) ++ peer = rxrpc_kernel_lookup_peer(net->socket, &srx, GFP_KERNEL); ++ if (!peer) ++ return -ENOMEM; ++ ++ for (i = alist->nr_ipv4; i < alist->nr_addrs; i++) { ++ if (peer == alist->addrs[i].peer) { ++ rxrpc_kernel_put_peer(peer); ++ return 0; ++ } ++ if (peer <= alist->addrs[i].peer) + break; + } + +@@ -337,15 +350,9 @@ void afs_merge_fs_addr6(struct afs_addr_list *alist, __be32 *xdr, u16 port) + memmove(alist->addrs + i + 1, + alist->addrs + i, + sizeof(alist->addrs[0]) * (alist->nr_addrs - i)); +- +- srx = &alist->addrs[i].srx; +- srx->srx_family = AF_RXRPC; +- srx->transport_type = SOCK_DGRAM; +- srx->transport_len = sizeof(srx->transport.sin6); +- srx->transport.sin6.sin6_family = AF_INET6; +- srx->transport.sin6.sin6_port = htons(port); +- memcpy(&srx->transport.sin6.sin6_addr, xdr, 16); ++ alist->addrs[i].peer = peer; + alist->nr_addrs++; ++ return 0; + } + + /* +diff --git a/fs/afs/cmservice.c b/fs/afs/cmservice.c +index d4ddb20d6732..99a3f20bc786 100644 +--- a/fs/afs/cmservice.c ++++ b/fs/afs/cmservice.c +@@ -146,10 +146,11 @@ 
static int afs_find_cm_server_by_peer(struct afs_call *call) + { + struct sockaddr_rxrpc srx; + struct afs_server *server; ++ struct rxrpc_peer *peer; + +- rxrpc_kernel_get_peer(call->net->socket, call->rxcall, &srx); ++ peer = rxrpc_kernel_get_call_peer(call->net->socket, call->rxcall); + +- server = afs_find_server(call->net, &srx); ++ server = afs_find_server(call->net, peer); + if (!server) { + trace_afs_cm_no_server(call, &srx); + return 0; +diff --git a/fs/afs/fs_probe.c b/fs/afs/fs_probe.c +index 3dd24842f277..58d28b82571e 100644 +--- a/fs/afs/fs_probe.c ++++ b/fs/afs/fs_probe.c +@@ -101,6 +101,7 @@ static void afs_fs_probe_not_done(struct afs_net *net, + void afs_fileserver_probe_result(struct afs_call *call) + { + struct afs_addr_list *alist = call->alist; ++ struct afs_address *addr = &alist->addrs[call->addr_ix]; + struct afs_server *server = call->server; + unsigned int index = call->addr_ix; + unsigned int rtt_us = 0, cap0; +@@ -153,12 +154,12 @@ void afs_fileserver_probe_result(struct afs_call *call) + if (call->service_id == YFS_FS_SERVICE) { + server->probe.is_yfs = true; + set_bit(AFS_SERVER_FL_IS_YFS, &server->flags); +- alist->addrs[index].srx.srx_service = call->service_id; ++ addr->service_id = call->service_id; + } else { + server->probe.not_yfs = true; + if (!server->probe.is_yfs) { + clear_bit(AFS_SERVER_FL_IS_YFS, &server->flags); +- alist->addrs[index].srx.srx_service = call->service_id; ++ addr->service_id = call->service_id; + } + cap0 = ntohl(call->tmp); + if (cap0 & AFS3_VICED_CAPABILITY_64BITFILES) +@@ -167,7 +168,7 @@ void afs_fileserver_probe_result(struct afs_call *call) + clear_bit(AFS_SERVER_FL_HAS_FS64, &server->flags); + } + +- rxrpc_kernel_get_srtt(call->net->socket, call->rxcall, &rtt_us); ++ rtt_us = rxrpc_kernel_get_srtt(addr->peer); + if (rtt_us < server->probe.rtt) { + server->probe.rtt = rtt_us; + server->rtt = rtt_us; +@@ -181,8 +182,8 @@ void afs_fileserver_probe_result(struct afs_call *call) + out: + 
spin_unlock(&server->probe_lock); + +- _debug("probe %pU [%u] %pISpc rtt=%u ret=%d", +- &server->uuid, index, &alist->addrs[index].srx.transport, ++ _debug("probe %pU [%u] %pISpc rtt=%d ret=%d", ++ &server->uuid, index, rxrpc_kernel_remote_addr(alist->addrs[index].peer), + rtt_us, ret); + + return afs_done_one_fs_probe(call->net, server); +diff --git a/fs/afs/internal.h b/fs/afs/internal.h +index e2adb314ab6a..ec08b4a7e499 100644 +--- a/fs/afs/internal.h ++++ b/fs/afs/internal.h +@@ -72,6 +72,11 @@ enum afs_call_state { + AFS_CALL_COMPLETE, /* Completed or failed */ + }; + ++struct afs_address { ++ struct rxrpc_peer *peer; ++ u16 service_id; ++}; ++ + /* + * List of server addresses. + */ +@@ -87,9 +92,7 @@ struct afs_addr_list { + enum dns_lookup_status status:8; + unsigned long failed; /* Mask of addrs that failed locally/ICMP */ + unsigned long responded; /* Mask of addrs that responded */ +- struct { +- struct sockaddr_rxrpc srx; +- } addrs[] __counted_by(max_addrs); ++ struct afs_address addrs[] __counted_by(max_addrs); + #define AFS_MAX_ADDRESSES ((unsigned int)(sizeof(unsigned long) * 8)) + }; + +@@ -420,7 +423,7 @@ struct afs_vlserver { + atomic_t probe_outstanding; + spinlock_t probe_lock; + struct { +- unsigned int rtt; /* RTT in uS */ ++ unsigned int rtt; /* Best RTT in uS (or UINT_MAX) */ + u32 abort_code; + short error; + unsigned short flags; +@@ -537,7 +540,7 @@ struct afs_server { + atomic_t probe_outstanding; + spinlock_t probe_lock; + struct { +- unsigned int rtt; /* RTT in uS */ ++ unsigned int rtt; /* Best RTT in uS (or UINT_MAX) */ + u32 abort_code; + short error; + bool responded:1; +@@ -964,9 +967,7 @@ static inline struct afs_addr_list *afs_get_addrlist(struct afs_addr_list *alist + refcount_inc(&alist->usage); + return alist; + } +-extern struct afs_addr_list *afs_alloc_addrlist(unsigned int, +- unsigned short, +- unsigned short); ++extern struct afs_addr_list *afs_alloc_addrlist(unsigned int nr, u16 service_id); + extern void 
afs_put_addrlist(struct afs_addr_list *); + extern struct afs_vlserver_list *afs_parse_text_addrs(struct afs_net *, + const char *, size_t, char, +@@ -977,8 +978,10 @@ extern struct afs_vlserver_list *afs_dns_query(struct afs_cell *, time64_t *); + extern bool afs_iterate_addresses(struct afs_addr_cursor *); + extern int afs_end_cursor(struct afs_addr_cursor *); + +-extern void afs_merge_fs_addr4(struct afs_addr_list *, __be32, u16); +-extern void afs_merge_fs_addr6(struct afs_addr_list *, __be32 *, u16); ++extern int afs_merge_fs_addr4(struct afs_net *net, struct afs_addr_list *addr, ++ __be32 xdr, u16 port); ++extern int afs_merge_fs_addr6(struct afs_net *net, struct afs_addr_list *addr, ++ __be32 *xdr, u16 port); + + /* + * callback.c +@@ -1405,8 +1408,7 @@ extern void __exit afs_clean_up_permit_cache(void); + */ + extern spinlock_t afs_server_peer_lock; + +-extern struct afs_server *afs_find_server(struct afs_net *, +- const struct sockaddr_rxrpc *); ++extern struct afs_server *afs_find_server(struct afs_net *, const struct rxrpc_peer *); + extern struct afs_server *afs_find_server_by_uuid(struct afs_net *, const uuid_t *); + extern struct afs_server *afs_lookup_server(struct afs_cell *, struct key *, const uuid_t *, u32); + extern struct afs_server *afs_get_server(struct afs_server *, enum afs_server_trace); +diff --git a/fs/afs/proc.c b/fs/afs/proc.c +index ab9cd986cfd9..8a65a06908d2 100644 +--- a/fs/afs/proc.c ++++ b/fs/afs/proc.c +@@ -307,7 +307,7 @@ static int afs_proc_cell_vlservers_show(struct seq_file *m, void *v) + for (i = 0; i < alist->nr_addrs; i++) + seq_printf(m, " %c %pISpc\n", + alist->preferred == i ? 
'>' : '-', +- &alist->addrs[i].srx.transport); ++ rxrpc_kernel_remote_addr(alist->addrs[i].peer)); + } + seq_printf(m, " info: fl=%lx rtt=%d\n", vlserver->flags, vlserver->rtt); + seq_printf(m, " probe: fl=%x e=%d ac=%d out=%d\n", +@@ -398,9 +398,10 @@ static int afs_proc_servers_show(struct seq_file *m, void *v) + seq_printf(m, " - ALIST v=%u rsp=%lx f=%lx\n", + alist->version, alist->responded, alist->failed); + for (i = 0; i < alist->nr_addrs; i++) +- seq_printf(m, " [%x] %pISpc%s\n", +- i, &alist->addrs[i].srx.transport, +- alist->preferred == i ? "*" : ""); ++ seq_printf(m, " [%x] %pISpc%s rtt=%d\n", ++ i, rxrpc_kernel_remote_addr(alist->addrs[i].peer), ++ alist->preferred == i ? "*" : "", ++ rxrpc_kernel_get_srtt(alist->addrs[i].peer)); + return 0; + } + +diff --git a/fs/afs/rotate.c b/fs/afs/rotate.c +index 46081e5da6f5..59aed7a6dd11 100644 +--- a/fs/afs/rotate.c ++++ b/fs/afs/rotate.c +@@ -113,7 +113,7 @@ bool afs_select_fileserver(struct afs_operation *op) + struct afs_server *server; + struct afs_vnode *vnode = op->file[0].vnode; + struct afs_error e; +- u32 rtt; ++ unsigned int rtt; + int error = op->ac.error, i; + + _enter("%lx[%d],%lx[%d],%d,%d", +@@ -420,7 +420,7 @@ bool afs_select_fileserver(struct afs_operation *op) + } + + op->index = -1; +- rtt = U32_MAX; ++ rtt = UINT_MAX; + for (i = 0; i < op->server_list->nr_servers; i++) { + struct afs_server *s = op->server_list->servers[i].server; + +@@ -488,7 +488,7 @@ bool afs_select_fileserver(struct afs_operation *op) + + _debug("address [%u] %u/%u %pISp", + op->index, op->ac.index, op->ac.alist->nr_addrs, +- &op->ac.alist->addrs[op->ac.index].srx.transport); ++ rxrpc_kernel_remote_addr(op->ac.alist->addrs[op->ac.index].peer)); + + _leave(" = t"); + return true; +diff --git a/fs/afs/rxrpc.c b/fs/afs/rxrpc.c +index 181317126e43..2603db03b7ff 100644 +--- a/fs/afs/rxrpc.c ++++ b/fs/afs/rxrpc.c +@@ -296,7 +296,8 @@ static void afs_notify_end_request_tx(struct sock *sock, + */ + void afs_make_call(struct 
afs_addr_cursor *ac, struct afs_call *call, gfp_t gfp) + { +- struct sockaddr_rxrpc *srx = &ac->alist->addrs[ac->index].srx; ++ struct afs_address *addr = &ac->alist->addrs[ac->index]; ++ struct rxrpc_peer *peer = addr->peer; + struct rxrpc_call *rxcall; + struct msghdr msg; + struct kvec iov[1]; +@@ -304,7 +305,7 @@ void afs_make_call(struct afs_addr_cursor *ac, struct afs_call *call, gfp_t gfp) + s64 tx_total_len; + int ret; + +- _enter(",{%pISp},", &srx->transport); ++ _enter(",{%pISp},", rxrpc_kernel_remote_addr(addr->peer)); + + ASSERT(call->type != NULL); + ASSERT(call->type->name != NULL); +@@ -333,7 +334,7 @@ void afs_make_call(struct afs_addr_cursor *ac, struct afs_call *call, gfp_t gfp) + } + + /* create a call */ +- rxcall = rxrpc_kernel_begin_call(call->net->socket, srx, call->key, ++ rxcall = rxrpc_kernel_begin_call(call->net->socket, peer, call->key, + (unsigned long)call, + tx_total_len, + call->max_lifespan, +@@ -341,6 +342,7 @@ void afs_make_call(struct afs_addr_cursor *ac, struct afs_call *call, gfp_t gfp) + (call->async ? + afs_wake_up_async_call : + afs_wake_up_call_waiter), ++ addr->service_id, + call->upgrade, + (call->intr ? RXRPC_PREINTERRUPTIBLE : + RXRPC_UNINTERRUPTIBLE), +@@ -461,7 +463,7 @@ static void afs_log_error(struct afs_call *call, s32 remote_abort) + max = m + 1; + pr_notice("kAFS: Peer reported %s failure on %s [%pISp]\n", + msg, call->type->name, +- &call->alist->addrs[call->addr_ix].srx.transport); ++ rxrpc_kernel_remote_addr(call->alist->addrs[call->addr_ix].peer)); + } + } + +diff --git a/fs/afs/server.c b/fs/afs/server.c +index b8e2d211d4a1..5b5fa94005c9 100644 +--- a/fs/afs/server.c ++++ b/fs/afs/server.c +@@ -21,13 +21,12 @@ static void __afs_put_server(struct afs_net *, struct afs_server *); + /* + * Find a server by one of its addresses. 
+ */ +-struct afs_server *afs_find_server(struct afs_net *net, +- const struct sockaddr_rxrpc *srx) ++struct afs_server *afs_find_server(struct afs_net *net, const struct rxrpc_peer *peer) + { + const struct afs_addr_list *alist; + struct afs_server *server = NULL; + unsigned int i; +- int seq = 1, diff; ++ int seq = 1; + + rcu_read_lock(); + +@@ -38,37 +37,11 @@ struct afs_server *afs_find_server(struct afs_net *net, + seq++; /* 2 on the 1st/lockless path, otherwise odd */ + read_seqbegin_or_lock(&net->fs_addr_lock, &seq); + +- if (srx->transport.family == AF_INET6) { +- const struct sockaddr_in6 *a = &srx->transport.sin6, *b; +- hlist_for_each_entry_rcu(server, &net->fs_addresses6, addr6_link) { +- alist = rcu_dereference(server->addresses); +- for (i = alist->nr_ipv4; i < alist->nr_addrs; i++) { +- b = &alist->addrs[i].srx.transport.sin6; +- diff = ((u16 __force)a->sin6_port - +- (u16 __force)b->sin6_port); +- if (diff == 0) +- diff = memcmp(&a->sin6_addr, +- &b->sin6_addr, +- sizeof(struct in6_addr)); +- if (diff == 0) +- goto found; +- } +- } +- } else { +- const struct sockaddr_in *a = &srx->transport.sin, *b; +- hlist_for_each_entry_rcu(server, &net->fs_addresses4, addr4_link) { +- alist = rcu_dereference(server->addresses); +- for (i = 0; i < alist->nr_ipv4; i++) { +- b = &alist->addrs[i].srx.transport.sin; +- diff = ((u16 __force)a->sin_port - +- (u16 __force)b->sin_port); +- if (diff == 0) +- diff = ((u32 __force)a->sin_addr.s_addr - +- (u32 __force)b->sin_addr.s_addr); +- if (diff == 0) +- goto found; +- } +- } ++ hlist_for_each_entry_rcu(server, &net->fs_addresses6, addr6_link) { ++ alist = rcu_dereference(server->addresses); ++ for (i = 0; i < alist->nr_addrs; i++) ++ if (alist->addrs[i].peer == peer) ++ goto found; + } + + server = NULL; +diff --git a/fs/afs/vl_alias.c b/fs/afs/vl_alias.c +index d3c0df70a1a5..6fdf9f1bedc0 100644 +--- a/fs/afs/vl_alias.c ++++ b/fs/afs/vl_alias.c +@@ -32,55 +32,6 @@ static struct afs_volume *afs_sample_volume(struct 
afs_cell *cell, struct key *k + return volume; + } + +-/* +- * Compare two addresses. +- */ +-static int afs_compare_addrs(const struct sockaddr_rxrpc *srx_a, +- const struct sockaddr_rxrpc *srx_b) +-{ +- short port_a, port_b; +- int addr_a, addr_b, diff; +- +- diff = (short)srx_a->transport_type - (short)srx_b->transport_type; +- if (diff) +- goto out; +- +- switch (srx_a->transport_type) { +- case AF_INET: { +- const struct sockaddr_in *a = &srx_a->transport.sin; +- const struct sockaddr_in *b = &srx_b->transport.sin; +- addr_a = ntohl(a->sin_addr.s_addr); +- addr_b = ntohl(b->sin_addr.s_addr); +- diff = addr_a - addr_b; +- if (diff == 0) { +- port_a = ntohs(a->sin_port); +- port_b = ntohs(b->sin_port); +- diff = port_a - port_b; +- } +- break; +- } +- +- case AF_INET6: { +- const struct sockaddr_in6 *a = &srx_a->transport.sin6; +- const struct sockaddr_in6 *b = &srx_b->transport.sin6; +- diff = memcmp(&a->sin6_addr, &b->sin6_addr, 16); +- if (diff == 0) { +- port_a = ntohs(a->sin6_port); +- port_b = ntohs(b->sin6_port); +- diff = port_a - port_b; +- } +- break; +- } +- +- default: +- WARN_ON(1); +- diff = 1; +- } +- +-out: +- return diff; +-} +- + /* + * Compare the address lists of a pair of fileservers. 
+ */ +@@ -94,9 +45,9 @@ static int afs_compare_fs_alists(const struct afs_server *server_a, + lb = rcu_dereference(server_b->addresses); + + while (a < la->nr_addrs && b < lb->nr_addrs) { +- const struct sockaddr_rxrpc *srx_a = &la->addrs[a].srx; +- const struct sockaddr_rxrpc *srx_b = &lb->addrs[b].srx; +- int diff = afs_compare_addrs(srx_a, srx_b); ++ unsigned long pa = (unsigned long)la->addrs[a].peer; ++ unsigned long pb = (unsigned long)lb->addrs[b].peer; ++ long diff = pa - pb; + + if (diff < 0) { + a++; +diff --git a/fs/afs/vl_list.c b/fs/afs/vl_list.c +index acc48216136a..ba89140eee9e 100644 +--- a/fs/afs/vl_list.c ++++ b/fs/afs/vl_list.c +@@ -83,14 +83,15 @@ static u16 afs_extract_le16(const u8 **_b) + /* + * Build a VL server address list from a DNS queried server list. + */ +-static struct afs_addr_list *afs_extract_vl_addrs(const u8 **_b, const u8 *end, ++static struct afs_addr_list *afs_extract_vl_addrs(struct afs_net *net, ++ const u8 **_b, const u8 *end, + u8 nr_addrs, u16 port) + { + struct afs_addr_list *alist; + const u8 *b = *_b; + int ret = -EINVAL; + +- alist = afs_alloc_addrlist(nr_addrs, VL_SERVICE, port); ++ alist = afs_alloc_addrlist(nr_addrs, VL_SERVICE); + if (!alist) + return ERR_PTR(-ENOMEM); + if (nr_addrs == 0) +@@ -109,7 +110,9 @@ static struct afs_addr_list *afs_extract_vl_addrs(const u8 **_b, const u8 *end, + goto error; + } + memcpy(x, b, 4); +- afs_merge_fs_addr4(alist, x[0], port); ++ ret = afs_merge_fs_addr4(net, alist, x[0], port); ++ if (ret < 0) ++ goto error; + b += 4; + break; + +@@ -119,7 +122,9 @@ static struct afs_addr_list *afs_extract_vl_addrs(const u8 **_b, const u8 *end, + goto error; + } + memcpy(x, b, 16); +- afs_merge_fs_addr6(alist, x, port); ++ ret = afs_merge_fs_addr6(net, alist, x, port); ++ if (ret < 0) ++ goto error; + b += 16; + break; + +@@ -247,7 +252,7 @@ struct afs_vlserver_list *afs_extract_vlserver_list(struct afs_cell *cell, + /* Extract the addresses - note that we can't skip this as we + * have to 
advance the payload pointer. + */ +- addrs = afs_extract_vl_addrs(&b, end, bs.nr_addrs, bs.port); ++ addrs = afs_extract_vl_addrs(cell->net, &b, end, bs.nr_addrs, bs.port); + if (IS_ERR(addrs)) { + ret = PTR_ERR(addrs); + goto error_2; +diff --git a/fs/afs/vl_probe.c b/fs/afs/vl_probe.c +index bdd9372e3fb2..9551aef07cee 100644 +--- a/fs/afs/vl_probe.c ++++ b/fs/afs/vl_probe.c +@@ -48,6 +48,7 @@ void afs_vlserver_probe_result(struct afs_call *call) + { + struct afs_addr_list *alist = call->alist; + struct afs_vlserver *server = call->vlserver; ++ struct afs_address *addr = &alist->addrs[call->addr_ix]; + unsigned int server_index = call->server_index; + unsigned int rtt_us = 0; + unsigned int index = call->addr_ix; +@@ -106,16 +107,16 @@ void afs_vlserver_probe_result(struct afs_call *call) + if (call->service_id == YFS_VL_SERVICE) { + server->probe.flags |= AFS_VLSERVER_PROBE_IS_YFS; + set_bit(AFS_VLSERVER_FL_IS_YFS, &server->flags); +- alist->addrs[index].srx.srx_service = call->service_id; ++ addr->service_id = call->service_id; + } else { + server->probe.flags |= AFS_VLSERVER_PROBE_NOT_YFS; + if (!(server->probe.flags & AFS_VLSERVER_PROBE_IS_YFS)) { + clear_bit(AFS_VLSERVER_FL_IS_YFS, &server->flags); +- alist->addrs[index].srx.srx_service = call->service_id; ++ addr->service_id = call->service_id; + } + } + +- rxrpc_kernel_get_srtt(call->net->socket, call->rxcall, &rtt_us); ++ rtt_us = rxrpc_kernel_get_srtt(addr->peer); + if (rtt_us < server->probe.rtt) { + server->probe.rtt = rtt_us; + server->rtt = rtt_us; +@@ -130,8 +131,9 @@ void afs_vlserver_probe_result(struct afs_call *call) + out: + spin_unlock(&server->probe_lock); + +- _debug("probe [%u][%u] %pISpc rtt=%u ret=%d", +- server_index, index, &alist->addrs[index].srx.transport, rtt_us, ret); ++ _debug("probe [%u][%u] %pISpc rtt=%d ret=%d", ++ server_index, index, rxrpc_kernel_remote_addr(addr->peer), ++ rtt_us, ret); + + afs_done_one_vl_probe(server, have_result); + } +diff --git a/fs/afs/vl_rotate.c 
b/fs/afs/vl_rotate.c +index e52b9d4c8a0a..f8f255c966ae 100644 +--- a/fs/afs/vl_rotate.c ++++ b/fs/afs/vl_rotate.c +@@ -92,7 +92,7 @@ bool afs_select_vlserver(struct afs_vl_cursor *vc) + struct afs_addr_list *alist; + struct afs_vlserver *vlserver; + struct afs_error e; +- u32 rtt; ++ unsigned int rtt; + int error = vc->ac.error, i; + + _enter("%lx[%d],%lx[%d],%d,%d", +@@ -194,7 +194,7 @@ bool afs_select_vlserver(struct afs_vl_cursor *vc) + goto selected_server; + + vc->index = -1; +- rtt = U32_MAX; ++ rtt = UINT_MAX; + for (i = 0; i < vc->server_list->nr_servers; i++) { + struct afs_vlserver *s = vc->server_list->servers[i].server; + +@@ -249,7 +249,7 @@ bool afs_select_vlserver(struct afs_vl_cursor *vc) + + _debug("VL address %d/%d", vc->ac.index, vc->ac.alist->nr_addrs); + +- _leave(" = t %pISpc", &vc->ac.alist->addrs[vc->ac.index].srx.transport); ++ _leave(" = t %pISpc", rxrpc_kernel_remote_addr(vc->ac.alist->addrs[vc->ac.index].peer)); + return true; + + next_server: +diff --git a/fs/afs/vlclient.c b/fs/afs/vlclient.c +index 00fca3c66ba6..41e7932d75c6 100644 +--- a/fs/afs/vlclient.c ++++ b/fs/afs/vlclient.c +@@ -208,7 +208,7 @@ static int afs_deliver_vl_get_addrs_u(struct afs_call *call) + count = ntohl(*bp); + + nentries = min(nentries, count); +- alist = afs_alloc_addrlist(nentries, FS_SERVICE, AFS_FS_PORT); ++ alist = afs_alloc_addrlist(nentries, FS_SERVICE); + if (!alist) + return -ENOMEM; + alist->version = uniquifier; +@@ -230,9 +230,13 @@ static int afs_deliver_vl_get_addrs_u(struct afs_call *call) + alist = call->ret_alist; + bp = call->buffer; + count = min(call->count, 4U); +- for (i = 0; i < count; i++) +- if (alist->nr_addrs < call->count2) +- afs_merge_fs_addr4(alist, *bp++, AFS_FS_PORT); ++ for (i = 0; i < count; i++) { ++ if (alist->nr_addrs < call->count2) { ++ ret = afs_merge_fs_addr4(call->net, alist, *bp++, AFS_FS_PORT); ++ if (ret < 0) ++ return ret; ++ } ++ } + + call->count -= count; + if (call->count > 0) +@@ -450,7 +454,7 @@ static int 
afs_deliver_yfsvl_get_endpoints(struct afs_call *call) + if (call->count > YFS_MAXENDPOINTS) + return afs_protocol_error(call, afs_eproto_yvl_fsendpt_num); + +- alist = afs_alloc_addrlist(call->count, FS_SERVICE, AFS_FS_PORT); ++ alist = afs_alloc_addrlist(call->count, FS_SERVICE); + if (!alist) + return -ENOMEM; + alist->version = uniquifier; +@@ -488,14 +492,18 @@ static int afs_deliver_yfsvl_get_endpoints(struct afs_call *call) + if (ntohl(bp[0]) != sizeof(__be32) * 2) + return afs_protocol_error( + call, afs_eproto_yvl_fsendpt4_len); +- afs_merge_fs_addr4(alist, bp[1], ntohl(bp[2])); ++ ret = afs_merge_fs_addr4(call->net, alist, bp[1], ntohl(bp[2])); ++ if (ret < 0) ++ return ret; + bp += 3; + break; + case YFS_ENDPOINT_IPV6: + if (ntohl(bp[0]) != sizeof(__be32) * 5) + return afs_protocol_error( + call, afs_eproto_yvl_fsendpt6_len); +- afs_merge_fs_addr6(alist, bp + 1, ntohl(bp[5])); ++ ret = afs_merge_fs_addr6(call->net, alist, bp + 1, ntohl(bp[5])); ++ if (ret < 0) ++ return ret; + bp += 6; + break; + default: +diff --git a/include/net/af_rxrpc.h b/include/net/af_rxrpc.h +index 5531dd08061e..0754c463224a 100644 +--- a/include/net/af_rxrpc.h ++++ b/include/net/af_rxrpc.h +@@ -15,6 +15,7 @@ struct key; + struct sock; + struct socket; + struct rxrpc_call; ++struct rxrpc_peer; + enum rxrpc_abort_reason; + + enum rxrpc_interruptibility { +@@ -41,13 +42,14 @@ void rxrpc_kernel_new_call_notification(struct socket *, + rxrpc_notify_new_call_t, + rxrpc_discard_new_call_t); + struct rxrpc_call *rxrpc_kernel_begin_call(struct socket *sock, +- struct sockaddr_rxrpc *srx, ++ struct rxrpc_peer *peer, + struct key *key, + unsigned long user_call_ID, + s64 tx_total_len, + u32 hard_timeout, + gfp_t gfp, + rxrpc_notify_rx_t notify_rx, ++ u16 service_id, + bool upgrade, + enum rxrpc_interruptibility interruptibility, + unsigned int debug_id); +@@ -60,9 +62,14 @@ bool rxrpc_kernel_abort_call(struct socket *, struct rxrpc_call *, + u32, int, enum rxrpc_abort_reason); + void 
rxrpc_kernel_shutdown_call(struct socket *sock, struct rxrpc_call *call); + void rxrpc_kernel_put_call(struct socket *sock, struct rxrpc_call *call); +-void rxrpc_kernel_get_peer(struct socket *, struct rxrpc_call *, +- struct sockaddr_rxrpc *); +-bool rxrpc_kernel_get_srtt(struct socket *, struct rxrpc_call *, u32 *); ++struct rxrpc_peer *rxrpc_kernel_lookup_peer(struct socket *sock, ++ struct sockaddr_rxrpc *srx, gfp_t gfp); ++void rxrpc_kernel_put_peer(struct rxrpc_peer *peer); ++struct rxrpc_peer *rxrpc_kernel_get_peer(struct rxrpc_peer *peer); ++struct rxrpc_peer *rxrpc_kernel_get_call_peer(struct socket *sock, struct rxrpc_call *call); ++const struct sockaddr_rxrpc *rxrpc_kernel_remote_srx(const struct rxrpc_peer *peer); ++const struct sockaddr *rxrpc_kernel_remote_addr(const struct rxrpc_peer *peer); ++unsigned int rxrpc_kernel_get_srtt(const struct rxrpc_peer *); + int rxrpc_kernel_charge_accept(struct socket *, rxrpc_notify_rx_t, + rxrpc_user_attach_call_t, unsigned long, gfp_t, + unsigned int); +diff --git a/include/trace/events/rxrpc.h b/include/trace/events/rxrpc.h +index f7e537f64db4..4c1ef7b3705c 100644 +--- a/include/trace/events/rxrpc.h ++++ b/include/trace/events/rxrpc.h +@@ -178,7 +178,9 @@ + #define rxrpc_peer_traces \ + EM(rxrpc_peer_free, "FREE ") \ + EM(rxrpc_peer_get_accept, "GET accept ") \ ++ EM(rxrpc_peer_get_application, "GET app ") \ + EM(rxrpc_peer_get_bundle, "GET bundle ") \ ++ EM(rxrpc_peer_get_call, "GET call ") \ + EM(rxrpc_peer_get_client_conn, "GET cln-conn") \ + EM(rxrpc_peer_get_input, "GET input ") \ + EM(rxrpc_peer_get_input_error, "GET inpt-err") \ +@@ -187,6 +189,7 @@ + EM(rxrpc_peer_get_service_conn, "GET srv-conn") \ + EM(rxrpc_peer_new_client, "NEW client ") \ + EM(rxrpc_peer_new_prealloc, "NEW prealloc") \ ++ EM(rxrpc_peer_put_application, "PUT app ") \ + EM(rxrpc_peer_put_bundle, "PUT bundle ") \ + EM(rxrpc_peer_put_call, "PUT call ") \ + EM(rxrpc_peer_put_conn, "PUT conn ") \ +diff --git a/net/rxrpc/af_rxrpc.c 
b/net/rxrpc/af_rxrpc.c +index fa8aec78f63d..465bfe5eb061 100644 +--- a/net/rxrpc/af_rxrpc.c ++++ b/net/rxrpc/af_rxrpc.c +@@ -258,16 +258,62 @@ static int rxrpc_listen(struct socket *sock, int backlog) + return ret; + } + ++/** ++ * rxrpc_kernel_lookup_peer - Obtain remote transport endpoint for an address ++ * @sock: The socket through which it will be accessed ++ * @srx: The network address ++ * @gfp: Allocation flags ++ * ++ * Lookup or create a remote transport endpoint record for the specified ++ * address and return it with a ref held. ++ */ ++struct rxrpc_peer *rxrpc_kernel_lookup_peer(struct socket *sock, ++ struct sockaddr_rxrpc *srx, gfp_t gfp) ++{ ++ struct rxrpc_sock *rx = rxrpc_sk(sock->sk); ++ int ret; ++ ++ ret = rxrpc_validate_address(rx, srx, sizeof(*srx)); ++ if (ret < 0) ++ return ERR_PTR(ret); ++ ++ return rxrpc_lookup_peer(rx->local, srx, gfp); ++} ++EXPORT_SYMBOL(rxrpc_kernel_lookup_peer); ++ ++/** ++ * rxrpc_kernel_get_peer - Get a reference on a peer ++ * @peer: The peer to get a reference on. ++ * ++ * Get a record for the remote peer in a call. ++ */ ++struct rxrpc_peer *rxrpc_kernel_get_peer(struct rxrpc_peer *peer) ++{ ++ return peer ? 
rxrpc_get_peer(peer, rxrpc_peer_get_application) : NULL; ++} ++EXPORT_SYMBOL(rxrpc_kernel_get_peer); ++ ++/** ++ * rxrpc_kernel_put_peer - Allow a kernel app to drop a peer reference ++ * @peer: The peer to drop a ref on ++ */ ++void rxrpc_kernel_put_peer(struct rxrpc_peer *peer) ++{ ++ rxrpc_put_peer(peer, rxrpc_peer_put_application); ++} ++EXPORT_SYMBOL(rxrpc_kernel_put_peer); ++ + /** + * rxrpc_kernel_begin_call - Allow a kernel service to begin a call + * @sock: The socket on which to make the call +- * @srx: The address of the peer to contact ++ * @peer: The peer to contact + * @key: The security context to use (defaults to socket setting) + * @user_call_ID: The ID to use + * @tx_total_len: Total length of data to transmit during the call (or -1) + * @hard_timeout: The maximum lifespan of the call in sec + * @gfp: The allocation constraints + * @notify_rx: Where to send notifications instead of socket queue ++ * @service_id: The ID of the service to contact + * @upgrade: Request service upgrade for call + * @interruptibility: The call is interruptible, or can be canceled. + * @debug_id: The debug ID for tracing to be assigned to the call +@@ -280,13 +326,14 @@ static int rxrpc_listen(struct socket *sock, int backlog) + * supplying @srx and @key. 
+ */ + struct rxrpc_call *rxrpc_kernel_begin_call(struct socket *sock, +- struct sockaddr_rxrpc *srx, ++ struct rxrpc_peer *peer, + struct key *key, + unsigned long user_call_ID, + s64 tx_total_len, + u32 hard_timeout, + gfp_t gfp, + rxrpc_notify_rx_t notify_rx, ++ u16 service_id, + bool upgrade, + enum rxrpc_interruptibility interruptibility, + unsigned int debug_id) +@@ -295,13 +342,11 @@ struct rxrpc_call *rxrpc_kernel_begin_call(struct socket *sock, + struct rxrpc_call_params p; + struct rxrpc_call *call; + struct rxrpc_sock *rx = rxrpc_sk(sock->sk); +- int ret; + + _enter(",,%x,%lx", key_serial(key), user_call_ID); + +- ret = rxrpc_validate_address(rx, srx, sizeof(*srx)); +- if (ret < 0) +- return ERR_PTR(ret); ++ if (WARN_ON_ONCE(peer->local != rx->local)) ++ return ERR_PTR(-EIO); + + lock_sock(&rx->sk); + +@@ -319,12 +364,13 @@ struct rxrpc_call *rxrpc_kernel_begin_call(struct socket *sock, + + memset(&cp, 0, sizeof(cp)); + cp.local = rx->local; ++ cp.peer = peer; + cp.key = key; + cp.security_level = rx->min_sec_level; + cp.exclusive = false; + cp.upgrade = upgrade; +- cp.service_id = srx->srx_service; +- call = rxrpc_new_client_call(rx, &cp, srx, &p, gfp, debug_id); ++ cp.service_id = service_id; ++ call = rxrpc_new_client_call(rx, &cp, &p, gfp, debug_id); + /* The socket has been unlocked. 
*/ + if (!IS_ERR(call)) { + call->notify_rx = notify_rx; +diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h +index e8b43408136a..5d5b19f20d1e 100644 +--- a/net/rxrpc/ar-internal.h ++++ b/net/rxrpc/ar-internal.h +@@ -364,6 +364,7 @@ struct rxrpc_conn_proto { + + struct rxrpc_conn_parameters { + struct rxrpc_local *local; /* Representation of local endpoint */ ++ struct rxrpc_peer *peer; /* Representation of remote endpoint */ + struct key *key; /* Security details */ + bool exclusive; /* T if conn is exclusive */ + bool upgrade; /* T if service ID can be upgraded */ +@@ -867,7 +868,6 @@ struct rxrpc_call *rxrpc_find_call_by_user_ID(struct rxrpc_sock *, unsigned long + struct rxrpc_call *rxrpc_alloc_call(struct rxrpc_sock *, gfp_t, unsigned int); + struct rxrpc_call *rxrpc_new_client_call(struct rxrpc_sock *, + struct rxrpc_conn_parameters *, +- struct sockaddr_rxrpc *, + struct rxrpc_call_params *, gfp_t, + unsigned int); + void rxrpc_start_call_timer(struct rxrpc_call *call); +diff --git a/net/rxrpc/call_object.c b/net/rxrpc/call_object.c +index f10b37c14772..0943e54370ba 100644 +--- a/net/rxrpc/call_object.c ++++ b/net/rxrpc/call_object.c +@@ -193,7 +193,6 @@ struct rxrpc_call *rxrpc_alloc_call(struct rxrpc_sock *rx, gfp_t gfp, + * Allocate a new client call. 
+ */ + static struct rxrpc_call *rxrpc_alloc_client_call(struct rxrpc_sock *rx, +- struct sockaddr_rxrpc *srx, + struct rxrpc_conn_parameters *cp, + struct rxrpc_call_params *p, + gfp_t gfp, +@@ -211,10 +210,12 @@ static struct rxrpc_call *rxrpc_alloc_client_call(struct rxrpc_sock *rx, + now = ktime_get_real(); + call->acks_latest_ts = now; + call->cong_tstamp = now; +- call->dest_srx = *srx; ++ call->dest_srx = cp->peer->srx; ++ call->dest_srx.srx_service = cp->service_id; + call->interruptibility = p->interruptibility; + call->tx_total_len = p->tx_total_len; + call->key = key_get(cp->key); ++ call->peer = rxrpc_get_peer(cp->peer, rxrpc_peer_get_call); + call->local = rxrpc_get_local(cp->local, rxrpc_local_get_call); + call->security_level = cp->security_level; + if (p->kernel) +@@ -306,10 +307,6 @@ static int rxrpc_connect_call(struct rxrpc_call *call, gfp_t gfp) + + _enter("{%d,%lx},", call->debug_id, call->user_call_ID); + +- call->peer = rxrpc_lookup_peer(local, &call->dest_srx, gfp); +- if (!call->peer) +- goto error; +- + ret = rxrpc_look_up_bundle(call, gfp); + if (ret < 0) + goto error; +@@ -334,7 +331,6 @@ static int rxrpc_connect_call(struct rxrpc_call *call, gfp_t gfp) + */ + struct rxrpc_call *rxrpc_new_client_call(struct rxrpc_sock *rx, + struct rxrpc_conn_parameters *cp, +- struct sockaddr_rxrpc *srx, + struct rxrpc_call_params *p, + gfp_t gfp, + unsigned int debug_id) +@@ -349,13 +345,18 @@ struct rxrpc_call *rxrpc_new_client_call(struct rxrpc_sock *rx, + + _enter("%p,%lx", rx, p->user_call_ID); + ++ if (WARN_ON_ONCE(!cp->peer)) { ++ release_sock(&rx->sk); ++ return ERR_PTR(-EIO); ++ } ++ + limiter = rxrpc_get_call_slot(p, gfp); + if (!limiter) { + release_sock(&rx->sk); + return ERR_PTR(-ERESTARTSYS); + } + +- call = rxrpc_alloc_client_call(rx, srx, cp, p, gfp, debug_id); ++ call = rxrpc_alloc_client_call(rx, cp, p, gfp, debug_id); + if (IS_ERR(call)) { + release_sock(&rx->sk); + up(limiter); +diff --git a/net/rxrpc/peer_object.c 
b/net/rxrpc/peer_object.c +index 8d7a715a0bb1..49dcda67a0d5 100644 +--- a/net/rxrpc/peer_object.c ++++ b/net/rxrpc/peer_object.c +@@ -22,6 +22,8 @@ + #include + #include "ar-internal.h" + ++static const struct sockaddr_rxrpc rxrpc_null_addr; ++ + /* + * Hash a peer key. + */ +@@ -457,39 +459,53 @@ void rxrpc_destroy_all_peers(struct rxrpc_net *rxnet) + } + + /** +- * rxrpc_kernel_get_peer - Get the peer address of a call ++ * rxrpc_kernel_get_call_peer - Get the peer address of a call + * @sock: The socket on which the call is in progress. + * @call: The call to query +- * @_srx: Where to place the result + * +- * Get the address of the remote peer in a call. ++ * Get a record for the remote peer in a call. + */ +-void rxrpc_kernel_get_peer(struct socket *sock, struct rxrpc_call *call, +- struct sockaddr_rxrpc *_srx) ++struct rxrpc_peer *rxrpc_kernel_get_call_peer(struct socket *sock, struct rxrpc_call *call) + { +- *_srx = call->peer->srx; ++ return call->peer; + } +-EXPORT_SYMBOL(rxrpc_kernel_get_peer); ++EXPORT_SYMBOL(rxrpc_kernel_get_call_peer); + + /** + * rxrpc_kernel_get_srtt - Get a call's peer smoothed RTT +- * @sock: The socket on which the call is in progress. +- * @call: The call to query +- * @_srtt: Where to store the SRTT value. ++ * @peer: The peer to query + * +- * Get the call's peer smoothed RTT in uS. ++ * Get the call's peer smoothed RTT in uS or UINT_MAX if we have no samples. + */ +-bool rxrpc_kernel_get_srtt(struct socket *sock, struct rxrpc_call *call, +- u32 *_srtt) ++unsigned int rxrpc_kernel_get_srtt(const struct rxrpc_peer *peer) + { +- struct rxrpc_peer *peer = call->peer; ++ return peer->rtt_count > 0 ? peer->srtt_us >> 3 : UINT_MAX; ++} ++EXPORT_SYMBOL(rxrpc_kernel_get_srtt); + +- if (peer->rtt_count == 0) { +- *_srtt = 1000000; /* 1S */ +- return false; +- } ++/** ++ * rxrpc_kernel_remote_srx - Get the address of a peer ++ * @peer: The peer to query ++ * ++ * Get a pointer to the address from a peer record. 
The caller is responsible ++ * for making sure that the address is not deallocated. ++ */ ++const struct sockaddr_rxrpc *rxrpc_kernel_remote_srx(const struct rxrpc_peer *peer) ++{ ++ return peer ? &peer->srx : &rxrpc_null_addr; ++} ++EXPORT_SYMBOL(rxrpc_kernel_remote_srx); + +- *_srtt = call->peer->srtt_us >> 3; +- return true; ++/** ++ * rxrpc_kernel_remote_addr - Get the peer transport address of a call ++ * @peer: The peer to query ++ * ++ * Get a pointer to the transport address from a peer record. The caller is ++ * responsible for making sure that the address is not deallocated. ++ */ ++const struct sockaddr *rxrpc_kernel_remote_addr(const struct rxrpc_peer *peer) ++{ ++ return (const struct sockaddr *) ++ (peer ? &peer->srx.transport : &rxrpc_null_addr.transport); + } +-EXPORT_SYMBOL(rxrpc_kernel_get_srtt); ++EXPORT_SYMBOL(rxrpc_kernel_remote_addr); +diff --git a/net/rxrpc/sendmsg.c b/net/rxrpc/sendmsg.c +index 8e0b94714e84..5677d5690a02 100644 +--- a/net/rxrpc/sendmsg.c ++++ b/net/rxrpc/sendmsg.c +@@ -572,6 +572,7 @@ rxrpc_new_client_call_for_sendmsg(struct rxrpc_sock *rx, struct msghdr *msg, + __acquires(&call->user_mutex) + { + struct rxrpc_conn_parameters cp; ++ struct rxrpc_peer *peer; + struct rxrpc_call *call; + struct key *key; + +@@ -584,21 +585,29 @@ rxrpc_new_client_call_for_sendmsg(struct rxrpc_sock *rx, struct msghdr *msg, + return ERR_PTR(-EDESTADDRREQ); + } + ++ peer = rxrpc_lookup_peer(rx->local, srx, GFP_KERNEL); ++ if (!peer) { ++ release_sock(&rx->sk); ++ return ERR_PTR(-ENOMEM); ++ } ++ + key = rx->key; + if (key && !rx->key->payload.data[0]) + key = NULL; + + memset(&cp, 0, sizeof(cp)); + cp.local = rx->local; ++ cp.peer = peer; + cp.key = rx->key; + cp.security_level = rx->min_sec_level; + cp.exclusive = rx->exclusive | p->exclusive; + cp.upgrade = p->upgrade; + cp.service_id = srx->srx_service; +- call = rxrpc_new_client_call(rx, &cp, srx, &p->call, GFP_KERNEL, ++ call = rxrpc_new_client_call(rx, &cp, &p->call, GFP_KERNEL, + 
atomic_inc_return(&rxrpc_debug_id)); + /* The socket is now unlocked */ + ++ rxrpc_put_peer(peer, rxrpc_peer_put_application); + _leave(" = %p\n", call); + return call; + } +-- +2.43.0 + diff --git a/queue-6.7/selftest-don-t-reuse-port-for-so_incoming_cpu-test.patch b/queue-6.7/selftest-don-t-reuse-port-for-so_incoming_cpu-test.patch new file mode 100644 index 00000000000..9a55f9d5a88 --- /dev/null +++ b/queue-6.7/selftest-don-t-reuse-port-for-so_incoming_cpu-test.patch @@ -0,0 +1,231 @@ +From 7f04c082cab699672ebde045251234ff678693af Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Fri, 19 Jan 2024 19:16:42 -0800 +Subject: selftest: Don't reuse port for SO_INCOMING_CPU test. + +From: Kuniyuki Iwashima + +[ Upstream commit 97de5a15edf2d22184f5ff588656030bbb7fa358 ] + +Jakub reported that ASSERT_EQ(cpu, i) in so_incoming_cpu.c seems to +fire somewhat randomly. + + # # RUN so_incoming_cpu.before_reuseport.test3 ... + # # so_incoming_cpu.c:191:test3:Expected cpu (32) == i (0) + # # test3: Test terminated by assertion + # # FAIL so_incoming_cpu.before_reuseport.test3 + # not ok 3 so_incoming_cpu.before_reuseport.test3 + +When the test failed, not-yet-accepted CLOSE_WAIT sockets received +SYN with a "challenging" SEQ number, which was sent from an unexpected +CPU that did not create the receiver. + +The test basically does: + + 1. for each cpu: + 1-1. create a server + 1-2. set SO_INCOMING_CPU + + 2. for each cpu: + 2-1. set cpu affinity + 2-2. create some clients + 2-3. let clients connect() to the server on the same cpu + 2-4. close() clients + + 3. for each server: + 3-1. accept() all child sockets + 3-2. check if all children have the same SO_INCOMING_CPU with the server + +The root cause was the close() in 2-4. and net.ipv4.tcp_tw_reuse. + +In a loop of 2., close() changed the client state to FIN_WAIT_2, and +the peer transitioned to CLOSE_WAIT. 
+ +In another loop of 2., connect() happened to select the same port of +the FIN_WAIT_2 socket, and it was reused as the default value of +net.ipv4.tcp_tw_reuse is 2. + +As a result, the new client sent SYN to the CLOSE_WAIT socket from +a different CPU, and the receiver's sk_incoming_cpu was overwritten +with unexpected CPU ID. + +Also, the SYN had a different SEQ number, so the CLOSE_WAIT socket +responded with Challenge ACK. The new client properly returned RST +and effectively killed the CLOSE_WAIT socket. + +This way, all clients were created successfully, but the error was +detected later by 3-2., ASSERT_EQ(cpu, i). + +To avoid the failure, let's make sure that (i) the number of clients +is less than the number of available ports and (ii) such reuse never +happens. + +Fixes: 6df96146b202 ("selftest: Add test for SO_INCOMING_CPU.") +Reported-by: Jakub Kicinski +Signed-off-by: Kuniyuki Iwashima +Tested-by: Jakub Kicinski +Link: https://lore.kernel.org/r/20240120031642.67014-1-kuniyu@amazon.com +Signed-off-by: Paolo Abeni +Signed-off-by: Sasha Levin +--- + tools/testing/selftests/net/so_incoming_cpu.c | 68 ++++++++++++++----- + 1 file changed, 50 insertions(+), 18 deletions(-) + +diff --git a/tools/testing/selftests/net/so_incoming_cpu.c b/tools/testing/selftests/net/so_incoming_cpu.c +index a14818164102..e9fa14e10732 100644 +--- a/tools/testing/selftests/net/so_incoming_cpu.c ++++ b/tools/testing/selftests/net/so_incoming_cpu.c +@@ -3,19 +3,16 @@ + #define _GNU_SOURCE + #include + ++#include ++ + #include + #include + #include + + #include "../kselftest_harness.h" + +-#define CLIENT_PER_SERVER 32 /* More sockets, more reliable */ +-#define NR_SERVER self->nproc +-#define NR_CLIENT (CLIENT_PER_SERVER * NR_SERVER) +- + FIXTURE(so_incoming_cpu) + { +- int nproc; + int *servers; + union { + struct sockaddr addr; +@@ -56,12 +53,47 @@ FIXTURE_VARIANT_ADD(so_incoming_cpu, after_all_listen) + .when_to_set = AFTER_ALL_LISTEN, + }; + ++static void write_sysctl(struct 
__test_metadata *_metadata, ++ char *filename, char *string) ++{ ++ int fd, len, ret; ++ ++ fd = open(filename, O_WRONLY); ++ ASSERT_NE(fd, -1); ++ ++ len = strlen(string); ++ ret = write(fd, string, len); ++ ASSERT_EQ(ret, len); ++} ++ ++static void setup_netns(struct __test_metadata *_metadata) ++{ ++ ASSERT_EQ(unshare(CLONE_NEWNET), 0); ++ ASSERT_EQ(system("ip link set lo up"), 0); ++ ++ write_sysctl(_metadata, "/proc/sys/net/ipv4/ip_local_port_range", "10000 60001"); ++ write_sysctl(_metadata, "/proc/sys/net/ipv4/tcp_tw_reuse", "0"); ++} ++ ++#define NR_PORT (60001 - 10000 - 1) ++#define NR_CLIENT_PER_SERVER_DEFAULT 32 ++static int nr_client_per_server, nr_server, nr_client; ++ + FIXTURE_SETUP(so_incoming_cpu) + { +- self->nproc = get_nprocs(); +- ASSERT_LE(2, self->nproc); ++ setup_netns(_metadata); ++ ++ nr_server = get_nprocs(); ++ ASSERT_LE(2, nr_server); ++ ++ if (NR_CLIENT_PER_SERVER_DEFAULT * nr_server < NR_PORT) ++ nr_client_per_server = NR_CLIENT_PER_SERVER_DEFAULT; ++ else ++ nr_client_per_server = NR_PORT / nr_server; ++ ++ nr_client = nr_client_per_server * nr_server; + +- self->servers = malloc(sizeof(int) * NR_SERVER); ++ self->servers = malloc(sizeof(int) * nr_server); + ASSERT_NE(self->servers, NULL); + + self->in_addr.sin_family = AF_INET; +@@ -74,7 +106,7 @@ FIXTURE_TEARDOWN(so_incoming_cpu) + { + int i; + +- for (i = 0; i < NR_SERVER; i++) ++ for (i = 0; i < nr_server; i++) + close(self->servers[i]); + + free(self->servers); +@@ -110,10 +142,10 @@ int create_server(struct __test_metadata *_metadata, + if (variant->when_to_set == BEFORE_LISTEN) + set_so_incoming_cpu(_metadata, fd, cpu); + +- /* We don't use CLIENT_PER_SERVER here not to block ++ /* We don't use nr_client_per_server here not to block + * this test at connect() if SO_INCOMING_CPU is broken. 
+ */ +- ret = listen(fd, NR_CLIENT); ++ ret = listen(fd, nr_client); + ASSERT_EQ(ret, 0); + + if (variant->when_to_set == AFTER_LISTEN) +@@ -128,7 +160,7 @@ void create_servers(struct __test_metadata *_metadata, + { + int i, ret; + +- for (i = 0; i < NR_SERVER; i++) { ++ for (i = 0; i < nr_server; i++) { + self->servers[i] = create_server(_metadata, self, variant, i); + + if (i == 0) { +@@ -138,7 +170,7 @@ void create_servers(struct __test_metadata *_metadata, + } + + if (variant->when_to_set == AFTER_ALL_LISTEN) { +- for (i = 0; i < NR_SERVER; i++) ++ for (i = 0; i < nr_server; i++) + set_so_incoming_cpu(_metadata, self->servers[i], i); + } + } +@@ -149,7 +181,7 @@ void create_clients(struct __test_metadata *_metadata, + cpu_set_t cpu_set; + int i, j, fd, ret; + +- for (i = 0; i < NR_SERVER; i++) { ++ for (i = 0; i < nr_server; i++) { + CPU_ZERO(&cpu_set); + + CPU_SET(i, &cpu_set); +@@ -162,7 +194,7 @@ void create_clients(struct __test_metadata *_metadata, + ret = sched_setaffinity(0, sizeof(cpu_set), &cpu_set); + ASSERT_EQ(ret, 0); + +- for (j = 0; j < CLIENT_PER_SERVER; j++) { ++ for (j = 0; j < nr_client_per_server; j++) { + fd = socket(AF_INET, SOCK_STREAM, 0); + ASSERT_NE(fd, -1); + +@@ -180,8 +212,8 @@ void verify_incoming_cpu(struct __test_metadata *_metadata, + int i, j, fd, cpu, ret, total = 0; + socklen_t len = sizeof(int); + +- for (i = 0; i < NR_SERVER; i++) { +- for (j = 0; j < CLIENT_PER_SERVER; j++) { ++ for (i = 0; i < nr_server; i++) { ++ for (j = 0; j < nr_client_per_server; j++) { + /* If we see -EAGAIN here, SO_INCOMING_CPU is broken */ + fd = accept(self->servers[i], &self->addr, &self->addrlen); + ASSERT_NE(fd, -1); +@@ -195,7 +227,7 @@ void verify_incoming_cpu(struct __test_metadata *_metadata, + } + } + +- ASSERT_EQ(total, NR_CLIENT); ++ ASSERT_EQ(total, nr_client); + TH_LOG("SO_INCOMING_CPU is very likely to be " + "working correctly with %d sockets.", total); + } +-- +2.43.0 + diff --git 
a/queue-6.7/selftests-bonding-do-not-test-arp-ns-target-with-mod.patch b/queue-6.7/selftests-bonding-do-not-test-arp-ns-target-with-mod.patch new file mode 100644 index 00000000000..84772220d08 --- /dev/null +++ b/queue-6.7/selftests-bonding-do-not-test-arp-ns-target-with-mod.patch @@ -0,0 +1,63 @@ +From e1c2375e884d125e1bb994d6ca1b503b345d0b2c Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Tue, 23 Jan 2024 15:59:17 +0800 +Subject: selftests: bonding: do not test arp/ns target with mode + balance-alb/tlb + +From: Hangbin Liu + +[ Upstream commit a2933a8759a62269754e54733d993b19de870e84 ] + +The prio_arp/ns tests hard code the mode to active-backup. At the same +time, the balance-alb/tlb modes do not support arp/ns target. So remove +the prio_arp/ns tests from the loop and only test active-backup mode. + +Fixes: 481b56e0391e ("selftests: bonding: re-format bond option tests") +Reported-by: Jay Vosburgh +Closes: https://lore.kernel.org/netdev/17415.1705965957@famine/ +Signed-off-by: Hangbin Liu +Acked-by: Jay Vosburgh +Link: https://lore.kernel.org/r/20240123075917.1576360-1-liuhangbin@gmail.com +Signed-off-by: Paolo Abeni +Signed-off-by: Sasha Levin +--- + .../testing/selftests/drivers/net/bonding/bond_options.sh | 8 ++++---- + 1 file changed, 4 insertions(+), 4 deletions(-) + +diff --git a/tools/testing/selftests/drivers/net/bonding/bond_options.sh b/tools/testing/selftests/drivers/net/bonding/bond_options.sh +index c54d1697f439..d508486cc0bd 100755 +--- a/tools/testing/selftests/drivers/net/bonding/bond_options.sh ++++ b/tools/testing/selftests/drivers/net/bonding/bond_options.sh +@@ -162,7 +162,7 @@ prio_arp() + local mode=$1 + + for primary_reselect in 0 1 2; do +- prio_test "mode active-backup arp_interval 100 arp_ip_target ${g_ip4} primary eth1 primary_reselect $primary_reselect" ++ prio_test "mode $mode arp_interval 100 arp_ip_target ${g_ip4} primary eth1 primary_reselect $primary_reselect" + log_test "prio" "$mode arp_ip_target primary_reselect 
$primary_reselect" + done + } +@@ -178,7 +178,7 @@ prio_ns() + fi + + for primary_reselect in 0 1 2; do +- prio_test "mode active-backup arp_interval 100 ns_ip6_target ${g_ip6} primary eth1 primary_reselect $primary_reselect" ++ prio_test "mode $mode arp_interval 100 ns_ip6_target ${g_ip6} primary eth1 primary_reselect $primary_reselect" + log_test "prio" "$mode ns_ip6_target primary_reselect $primary_reselect" + done + } +@@ -194,9 +194,9 @@ prio() + + for mode in $modes; do + prio_miimon $mode +- prio_arp $mode +- prio_ns $mode + done ++ prio_arp "active-backup" ++ prio_ns "active-backup" + } + + arp_validate_test() +-- +2.43.0 + diff --git a/queue-6.7/selftests-bonding-increase-timeout-to-1200s.patch b/queue-6.7/selftests-bonding-increase-timeout-to-1200s.patch new file mode 100644 index 00000000000..4eaf579e614 --- /dev/null +++ b/queue-6.7/selftests-bonding-increase-timeout-to-1200s.patch @@ -0,0 +1,56 @@ +From 7ea5a6e9ccc49ca9a909ce3a6a447474f910df84 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Wed, 17 Jan 2024 19:12:32 -0500 +Subject: selftests: bonding: Increase timeout to 1200s + +From: Benjamin Poirier + +[ Upstream commit b01f15a7571b7aa222458bc9bf26ab59bd84e384 ] + +When tests are run by runner.sh, bond_options.sh gets killed before +it can complete: + +make -C tools/testing/selftests run_tests TARGETS="drivers/net/bonding" + [...] 
+ # timeout set to 120 + # selftests: drivers/net/bonding: bond_options.sh + # TEST: prio (active-backup miimon primary_reselect 0) [ OK ] + # TEST: prio (active-backup miimon primary_reselect 1) [ OK ] + # TEST: prio (active-backup miimon primary_reselect 2) [ OK ] + # TEST: prio (active-backup arp_ip_target primary_reselect 0) [ OK ] + # TEST: prio (active-backup arp_ip_target primary_reselect 1) [ OK ] + # TEST: prio (active-backup arp_ip_target primary_reselect 2) [ OK ] + # + not ok 7 selftests: drivers/net/bonding: bond_options.sh # TIMEOUT 120 seconds + +This test includes many sleep statements, at least some of which are +related to timers in the operation of the bonding driver itself. Increase +the test timeout to allow the test to complete. + +I ran the test in slightly different VMs (including one without HW +virtualization support) and got runtimes of 13m39.760s, 13m31.238s, and +13m2.956s. Use a ~1.5x "safety factor" and set the timeout to 1200s. + +Fixes: 42a8d4aaea84 ("selftests: bonding: add bonding prio option test") +Reported-by: Jakub Kicinski +Closes: https://lore.kernel.org/netdev/20240116104402.1203850a@kernel.org/#t +Suggested-by: Jakub Kicinski +Signed-off-by: Benjamin Poirier +Reviewed-by: Hangbin Liu +Link: https://lore.kernel.org/r/20240118001233.304759-1-bpoirier@nvidia.com +Signed-off-by: Jakub Kicinski +Signed-off-by: Sasha Levin +--- + tools/testing/selftests/drivers/net/bonding/settings | 2 +- + 1 file changed, 1 insertion(+), 1 deletion(-) + +diff --git a/tools/testing/selftests/drivers/net/bonding/settings b/tools/testing/selftests/drivers/net/bonding/settings +index 6091b45d226b..79b65bdf05db 100644 +--- a/tools/testing/selftests/drivers/net/bonding/settings ++++ b/tools/testing/selftests/drivers/net/bonding/settings +@@ -1 +1 @@ +-timeout=120 ++timeout=1200 +-- +2.43.0 + diff --git a/queue-6.7/selftests-fill-in-some-missing-configs-for-net.patch b/queue-6.7/selftests-fill-in-some-missing-configs-for-net.patch new file mode 100644 
index 00000000000..7993fe31313 --- /dev/null +++ b/queue-6.7/selftests-fill-in-some-missing-configs-for-net.patch @@ -0,0 +1,117 @@ +From 0e8d9de5247c4b0e9e3d8c0b504d84ca340c109d Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Mon, 22 Jan 2024 12:35:28 -0800 +Subject: selftests: fill in some missing configs for net + +From: Jakub Kicinski + +[ Upstream commit 04fe7c5029cbdbcdb28917f09a958d939a8f19f7 ] + +We are missing a lot of config options from net selftests, +it seems: + +tun/tap: CONFIG_TUN, CONFIG_MACVLAN, CONFIG_MACVTAP +fib_tests: CONFIG_NET_SCH_FQ_CODEL +l2tp: CONFIG_L2TP, CONFIG_L2TP_V3, CONFIG_L2TP_IP, CONFIG_L2TP_ETH +sctp-vrf: CONFIG_INET_DIAG +txtimestamp: CONFIG_NET_CLS_U32 +vxlan_mdb: CONFIG_BRIDGE_VLAN_FILTERING +gre_gso: CONFIG_NET_IPGRE_DEMUX, CONFIG_IP_GRE, CONFIG_IPV6_GRE +srv6_end_dt*_l3vpn: CONFIG_IPV6_SEG6_LWTUNNEL +ip_local_port_range: CONFIG_MPTCP +fib_test: CONFIG_NET_CLS_BASIC +rtnetlink: CONFIG_MACSEC, CONFIG_NET_SCH_HTB, CONFIG_XFRM_INTERFACE + CONFIG_NET_IPGRE, CONFIG_BONDING +fib_nexthops: CONFIG_MPLS, CONFIG_MPLS_ROUTING +vxlan_mdb: CONFIG_NET_ACT_GACT +tls: CONFIG_TLS, CONFIG_CRYPTO_CHACHA20POLY1305 +psample: CONFIG_PSAMPLE +fcnal: CONFIG_TCP_MD5SIG + +Try to add them in a semi-alphabetical order. 
+ +Fixes: 62199e3f1658 ("selftests: net: Add VXLAN MDB test") +Fixes: c12e0d5f267d ("self-tests: introduce self-tests for RPS default mask") +Fixes: 122db5e3634b ("selftests/net: add MPTCP coverage for IP_LOCAL_PORT_RANGE") +Link: https://lore.kernel.org/r/20240122203528.672004-1-kuba@kernel.org +Signed-off-by: Jakub Kicinski +Signed-off-by: Sasha Levin +--- + tools/testing/selftests/net/config | 28 ++++++++++++++++++++++++++++ + 1 file changed, 28 insertions(+) + +diff --git a/tools/testing/selftests/net/config b/tools/testing/selftests/net/config +index 8da562a9ae87..19ff75051660 100644 +--- a/tools/testing/selftests/net/config ++++ b/tools/testing/selftests/net/config +@@ -1,5 +1,6 @@ + CONFIG_USER_NS=y + CONFIG_NET_NS=y ++CONFIG_BONDING=m + CONFIG_BPF_SYSCALL=y + CONFIG_TEST_BPF=m + CONFIG_NUMA=y +@@ -14,9 +15,13 @@ CONFIG_VETH=y + CONFIG_NET_IPVTI=y + CONFIG_IPV6_VTI=y + CONFIG_DUMMY=y ++CONFIG_BRIDGE_VLAN_FILTERING=y + CONFIG_BRIDGE=y ++CONFIG_CRYPTO_CHACHA20POLY1305=m + CONFIG_VLAN_8021Q=y + CONFIG_IFB=y ++CONFIG_INET_DIAG=y ++CONFIG_IP_GRE=m + CONFIG_NETFILTER=y + CONFIG_NETFILTER_ADVANCED=y + CONFIG_NF_CONNTRACK=m +@@ -25,15 +30,36 @@ CONFIG_IP6_NF_IPTABLES=m + CONFIG_IP_NF_IPTABLES=m + CONFIG_IP6_NF_NAT=m + CONFIG_IP_NF_NAT=m ++CONFIG_IPV6_GRE=m ++CONFIG_IPV6_SEG6_LWTUNNEL=y ++CONFIG_L2TP_ETH=m ++CONFIG_L2TP_IP=m ++CONFIG_L2TP=m ++CONFIG_L2TP_V3=y ++CONFIG_MACSEC=m ++CONFIG_MACVLAN=y ++CONFIG_MACVTAP=y ++CONFIG_MPLS=y ++CONFIG_MPTCP=y + CONFIG_NF_TABLES=m + CONFIG_NF_TABLES_IPV6=y + CONFIG_NF_TABLES_IPV4=y + CONFIG_NFT_NAT=m ++CONFIG_NET_ACT_GACT=m ++CONFIG_NET_CLS_BASIC=m ++CONFIG_NET_CLS_U32=m ++CONFIG_NET_IPGRE_DEMUX=m ++CONFIG_NET_IPGRE=m ++CONFIG_NET_SCH_FQ_CODEL=m ++CONFIG_NET_SCH_HTB=m + CONFIG_NET_SCH_FQ=m + CONFIG_NET_SCH_ETF=m + CONFIG_NET_SCH_NETEM=y ++CONFIG_PSAMPLE=m ++CONFIG_TCP_MD5SIG=y + CONFIG_TEST_BLACKHOLE_DEV=m + CONFIG_KALLSYMS=y ++CONFIG_TLS=m + CONFIG_TRACEPOINTS=y + CONFIG_NET_DROP_MONITOR=m + CONFIG_NETDEVSIM=m +@@ -48,7 +74,9 @@ 
CONFIG_BAREUDP=m + CONFIG_IPV6_IOAM6_LWTUNNEL=y + CONFIG_CRYPTO_SM4_GENERIC=y + CONFIG_AMT=m ++CONFIG_TUN=y + CONFIG_VXLAN=m + CONFIG_IP_SCTP=m + CONFIG_NETFILTER_XT_MATCH_POLICY=m + CONFIG_CRYPTO_ARIA=y ++CONFIG_XFRM_INTERFACE=m +-- +2.43.0 + diff --git a/queue-6.7/selftests-net-fix-rps_default_mask-with-32-cpus.patch b/queue-6.7/selftests-net-fix-rps_default_mask-with-32-cpus.patch new file mode 100644 index 00000000000..74b4ee1c241 --- /dev/null +++ b/queue-6.7/selftests-net-fix-rps_default_mask-with-32-cpus.patch @@ -0,0 +1,51 @@ +From 59a87296fd28cff283ab62c5fcad3bbbbda290a4 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Mon, 22 Jan 2024 11:58:15 -0800 +Subject: selftests: net: fix rps_default_mask with >32 CPUs + +From: Jakub Kicinski + +[ Upstream commit 0719b5338a0cbe80d1637a5fb03d8141b5bfc7a1 ] + +If there are more than 32 CPUs the bitmask will start to contain +commas, leading to: + +./rps_default_mask.sh: line 36: [: 00000000,00000000: integer expression expected + +Remove the commas, bash doesn't interpret leading zeroes as oct +so that should be good enough. Switch to bash, Simon reports that +not all shells support this type of substitution. 
+ +Fixes: c12e0d5f267d ("self-tests: introduce self-tests for RPS default mask") +Reviewed-by: Simon Horman +Link: https://lore.kernel.org/r/20240122195815.638997-1-kuba@kernel.org +Signed-off-by: Jakub Kicinski +Signed-off-by: Sasha Levin +--- + tools/testing/selftests/net/rps_default_mask.sh | 6 +++++- + 1 file changed, 5 insertions(+), 1 deletion(-) + +diff --git a/tools/testing/selftests/net/rps_default_mask.sh b/tools/testing/selftests/net/rps_default_mask.sh +index a26c5624429f..4287a8529890 100755 +--- a/tools/testing/selftests/net/rps_default_mask.sh ++++ b/tools/testing/selftests/net/rps_default_mask.sh +@@ -1,4 +1,4 @@ +-#!/bin/sh ++#!/bin/bash + # SPDX-License-Identifier: GPL-2.0 + + readonly ksft_skip=4 +@@ -33,6 +33,10 @@ chk_rps() { + + rps_mask=$($cmd /sys/class/net/$dev_name/queues/rx-0/rps_cpus) + printf "%-60s" "$msg" ++ ++ # In case there is more than 32 CPUs we need to remove commas from masks ++ rps_mask=${rps_mask//,} ++ expected_rps_mask=${expected_rps_mask//,} + if [ $rps_mask -eq $expected_rps_mask ]; then + echo "[ ok ]" + else +-- +2.43.0 + diff --git a/queue-6.7/selftests-netdevsim-fix-the-udp_tunnel_nic-test.patch b/queue-6.7/selftests-netdevsim-fix-the-udp_tunnel_nic-test.patch new file mode 100644 index 00000000000..87a65a9c469 --- /dev/null +++ b/queue-6.7/selftests-netdevsim-fix-the-udp_tunnel_nic-test.patch @@ -0,0 +1,102 @@ +From dfca3552b6e84cfec030272e59570cf545b3148a Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Mon, 22 Jan 2024 22:05:29 -0800 +Subject: selftests: netdevsim: fix the udp_tunnel_nic test + +From: Jakub Kicinski + +[ Upstream commit 0879020a7817e7ce636372c016b4528f541c9f4d ] + +This test is missing a whole bunch of checks for interface +renaming and one ifup. Presumably it was only used on a system +with renaming disabled and NetworkManager running. 
+ +Fixes: 91f430b2c49d ("selftests: net: add a test for UDP tunnel info infra") +Acked-by: Paolo Abeni +Reviewed-by: Simon Horman +Link: https://lore.kernel.org/r/20240123060529.1033912-1-kuba@kernel.org +Signed-off-by: Jakub Kicinski +Signed-off-by: Sasha Levin +--- + .../selftests/drivers/net/netdevsim/udp_tunnel_nic.sh | 9 +++++++++ + 1 file changed, 9 insertions(+) + +diff --git a/tools/testing/selftests/drivers/net/netdevsim/udp_tunnel_nic.sh b/tools/testing/selftests/drivers/net/netdevsim/udp_tunnel_nic.sh +index 1b08e042cf94..185b02d2d4cd 100755 +--- a/tools/testing/selftests/drivers/net/netdevsim/udp_tunnel_nic.sh ++++ b/tools/testing/selftests/drivers/net/netdevsim/udp_tunnel_nic.sh +@@ -269,6 +269,7 @@ for port in 0 1; do + echo 1 > $NSIM_DEV_SYS/new_port + fi + NSIM_NETDEV=`get_netdev_name old_netdevs` ++ ifconfig $NSIM_NETDEV up + + msg="new NIC device created" + exp0=( 0 0 0 0 ) +@@ -430,6 +431,7 @@ for port in 0 1; do + fi + + echo $port > $NSIM_DEV_SYS/new_port ++ NSIM_NETDEV=`get_netdev_name old_netdevs` + ifconfig $NSIM_NETDEV up + + overflow_table0 "overflow NIC table" +@@ -487,6 +489,7 @@ for port in 0 1; do + fi + + echo $port > $NSIM_DEV_SYS/new_port ++ NSIM_NETDEV=`get_netdev_name old_netdevs` + ifconfig $NSIM_NETDEV up + + overflow_table0 "overflow NIC table" +@@ -543,6 +546,7 @@ for port in 0 1; do + fi + + echo $port > $NSIM_DEV_SYS/new_port ++ NSIM_NETDEV=`get_netdev_name old_netdevs` + ifconfig $NSIM_NETDEV up + + overflow_table0 "destroy NIC" +@@ -572,6 +576,7 @@ for port in 0 1; do + fi + + echo $port > $NSIM_DEV_SYS/new_port ++ NSIM_NETDEV=`get_netdev_name old_netdevs` + ifconfig $NSIM_NETDEV up + + msg="create VxLANs v6" +@@ -632,6 +637,7 @@ for port in 0 1; do + fi + + echo $port > $NSIM_DEV_SYS/new_port ++ NSIM_NETDEV=`get_netdev_name old_netdevs` + ifconfig $NSIM_NETDEV up + + echo 110 > $NSIM_DEV_DFS/ports/$port/udp_ports_inject_error +@@ -687,6 +693,7 @@ for port in 0 1; do + fi + + echo $port > $NSIM_DEV_SYS/new_port ++ 
NSIM_NETDEV=`get_netdev_name old_netdevs` + ifconfig $NSIM_NETDEV up + + msg="create VxLANs v6" +@@ -746,6 +753,7 @@ for port in 0 1; do + fi + + echo $port > $NSIM_DEV_SYS/new_port ++ NSIM_NETDEV=`get_netdev_name old_netdevs` + ifconfig $NSIM_NETDEV up + + msg="create VxLANs v6" +@@ -876,6 +884,7 @@ msg="re-add a port" + + echo 2 > $NSIM_DEV_SYS/del_port + echo 2 > $NSIM_DEV_SYS/new_port ++NSIM_NETDEV=`get_netdev_name old_netdevs` + check_tables + + msg="replace VxLAN in overflow table" +-- +2.43.0 + diff --git a/queue-6.7/series b/queue-6.7/series index f8de7866666..e3a392fc5e0 100644 --- a/queue-6.7/series +++ b/queue-6.7/series @@ -142,3 +142,80 @@ ksmbd-fix-potential-circular-locking-issue-in-smb2_set_ea.patch ksmbd-don-t-increment-epoch-if-current-state-and-request-state-are-same.patch ksmbd-send-lease-break-notification-on-file_rename_information.patch ksmbd-add-missing-set_freezable-for-freezable-kthread.patch +sunrpc-use-request-size-to-initialize-bio_vec-in-svc.patch +wifi-mac80211-fix-potential-sta-link-leak.patch +btrfs-scrub-avoid-use-after-free-when-chunk-length-i.patch +net-smc-fix-illegal-rmb_desc-access-in-smc-d-connect.patch +selftests-bonding-increase-timeout-to-1200s.patch +tcp-make-sure-init-the-accept_queue-s-spinlocks-once.patch +bnxt_en-wait-for-flr-to-complete-during-probe.patch +bnxt_en-prevent-kernel-warning-when-running-offline-.patch +vlan-skip-nested-type-that-is-not-ifla_vlan_qos_mapp.patch +llc-make-llc_ui_sendmsg-more-robust-against-bonding-.patch +llc-drop-support-for-eth_p_tr_802_2.patch +udp-fix-busy-polling.patch +idpf-distinguish-vports-by-the-dev_port-attribute.patch +net-fix-removing-a-namespace-with-conflicting-altnam.patch +tun-fix-missing-dropped-counter-in-tun_xdp_act.patch +tun-add-missing-rx-stats-accounting-in-tun_xdp_act.patch +dpll-fix-broken-error-path-in-dpll_pin_alloc.patch +dpll-fix-pin-dump-crash-for-rebound-module.patch +dpll-fix-userspace-availability-of-pins.patch 
+dpll-fix-register-pin-with-unregistered-parent-pin.patch +net-micrel-fix-ptp-frame-parsing-for-lan8814.patch +net-rds-fix-ubsan-array-index-out-of-bounds-in-rds_c.patch +netfs-fscache-prevent-oops-in-fscache_put_cache.patch +tracing-ensure-visibility-when-inserting-an-element-.patch +afs-hide-silly-rename-files-from-userspace.patch +afs-fix-the-usage-of-read_seqbegin_or_lock-in-afs_fi.patch +afs-add-comments-on-abort-handling.patch +afs-turn-the-afs_addr_list-address-array-into-an-arr.patch +rxrpc-afs-allow-afs-to-pin-rxrpc_peer-objects.patch +afs-handle-the-vio-and-uaeio-aborts-explicitly.patch +afs-use-op-nr_iterations-1-to-indicate-to-begin-file.patch +afs-wrap-most-op-error-accesses-with-inline-funcs.patch +afs-don-t-put-afs_call-in-afs_wait_for_call_to_compl.patch +afs-simplify-error-handling.patch +afs-fix-error-handling-with-lookup-via-fs.inlinebulk.patch +tcp-add-memory-barrier-to-tcp_push.patch +selftest-don-t-reuse-port-for-so_incoming_cpu-test.patch +netlink-fix-potential-sleeping-issue-in-mqueue_flush.patch +ipv6-init-the-accept_queue-s-spinlocks-in-inet6_crea.patch +selftests-fill-in-some-missing-configs-for-net.patch +net-sched-flower-fix-chain-template-offload.patch +net-mlx5e-fix-operation-precedence-bug-in-port-times.patch +net-mlx5e-fix-inconsistent-hairpin-rqt-sizes.patch +net-mlx5e-fix-peer-flow-lists-handling.patch +net-mlx5-fix-a-warn-upon-a-callback-command-failure.patch +net-mlx5-bridge-fix-multicast-packets-sent-to-uplink.patch +net-mlx5-dr-use-the-right-gvmi-number-for-drop-actio.patch +net-mlx5-dr-can-t-go-to-uplink-vport-on-rx-rule.patch +net-mlx5-use-mlx5-device-constant-for-selecting-cq-p.patch +net-mlx5e-allow-software-parsing-when-ipsec-crypto-i.patch +net-mlx5e-ignore-ipsec-replay-window-values-on-sende.patch +net-mlx5e-fix-a-double-free-in-arfs_create_groups.patch +net-mlx5e-fix-a-potential-double-free-in-fs_any_crea.patch +rcu-defer-rcu-kthreads-wakeup-when-cpu-is-dying.patch 
+netfilter-nft_limit-reject-configurations-that-cause.patch +netfilter-nf_tables-restrict-anonymous-set-and-map-n.patch +netfilter-nf_tables-validate-nfproto_-family.patch +net-stmmac-wait-a-bit-for-the-reset-to-take-effect.patch +net-mvpp2-clear-bm-pool-before-initialization.patch +selftests-net-fix-rps_default_mask-with-32-cpus.patch +selftests-netdevsim-fix-the-udp_tunnel_nic-test.patch +xsk-recycle-buffer-in-case-rx-queue-was-full.patch +xsk-make-xsk_buff_pool-responsible-for-clearing-xdp_.patch +xsk-fix-usage-of-multi-buffer-bpf-helpers-for-zc-xdp.patch +ice-work-on-pre-xdp-prog-frag-count.patch +i40e-handle-multi-buffer-packets-that-are-shrunk-by-.patch +ice-remove-redundant-xdp_rxq_info-registration.patch +intel-xsk-initialize-skb_frag_t-bv_offset-in-zc-driv.patch +ice-update-xdp_rxq_info-frag_size-for-zc-enabled-rx-.patch +xdp-reflect-tail-increase-for-mem_type_xsk_buff_pool.patch +i40e-set-xdp_rxq_info-frag_size.patch +i40e-update-xdp_rxq_info-frag_size-for-zc-enabled-rx.patch +fjes-fix-memleaks-in-fjes_hw_setup.patch +selftests-bonding-do-not-test-arp-ns-target-with-mod.patch +net-fec-fix-the-unhandled-context-fault-from-smmu.patch +tsnep-remove-fcs-for-xdp-data-path.patch +tsnep-fix-xdp_ring_need_wakeup-for-empty-fill-ring.patch diff --git a/queue-6.7/sunrpc-use-request-size-to-initialize-bio_vec-in-svc.patch b/queue-6.7/sunrpc-use-request-size-to-initialize-bio_vec-in-svc.patch new file mode 100644 index 00000000000..15c2a211b9f --- /dev/null +++ b/queue-6.7/sunrpc-use-request-size-to-initialize-bio_vec-in-svc.patch @@ -0,0 +1,42 @@ +From 58c1e7163139f42ce19e100c31cfa906196959e8 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Wed, 17 Jan 2024 22:06:28 +0100 +Subject: SUNRPC: use request size to initialize bio_vec in svc_udp_sendto() + +From: Lucas Stach + +[ Upstream commit 1d9cabe2817edd215779dc9c2fe5e7ab9aac0704 ] + +Use the proper size when setting up the bio_vec, as otherwise only +zero-length UDP packets will be sent. 
+ +Fixes: baabf59c2414 ("SUNRPC: Convert svc_udp_sendto() to use the per-socket bio_vec array") +Signed-off-by: Lucas Stach +Signed-off-by: Chuck Lever +Signed-off-by: Sasha Levin +--- + net/sunrpc/svcsock.c | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c +index 998687421fa6..e0ce4276274b 100644 +--- a/net/sunrpc/svcsock.c ++++ b/net/sunrpc/svcsock.c +@@ -717,12 +717,12 @@ static int svc_udp_sendto(struct svc_rqst *rqstp) + ARRAY_SIZE(rqstp->rq_bvec), xdr); + + iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, rqstp->rq_bvec, +- count, 0); ++ count, rqstp->rq_res.len); + err = sock_sendmsg(svsk->sk_sock, &msg); + if (err == -ECONNREFUSED) { + /* ICMP error on earlier request. */ + iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, rqstp->rq_bvec, +- count, 0); ++ count, rqstp->rq_res.len); + err = sock_sendmsg(svsk->sk_sock, &msg); + } + +-- +2.43.0 + diff --git a/queue-6.7/tcp-add-memory-barrier-to-tcp_push.patch b/queue-6.7/tcp-add-memory-barrier-to-tcp_push.patch new file mode 100644 index 00000000000..844e1e231e0 --- /dev/null +++ b/queue-6.7/tcp-add-memory-barrier-to-tcp_push.patch @@ -0,0 +1,101 @@ +From 38e4f18aaf076cdae4088f54afc329320ce2f0f3 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Fri, 19 Jan 2024 11:01:33 -0800 +Subject: tcp: Add memory barrier to tcp_push() + +From: Salvatore Dipietro + +[ Upstream commit 7267e8dcad6b2f9fce05a6a06335d7040acbc2b6 ] + +On CPUs with weak memory models, reads and updates performed by tcp_push +to the sk variables can get reordered leaving the socket throttled when +it should not. The tasklet running tcp_wfree() may also not observe the +memory updates in time and will skip flushing any packets throttled by +tcp_push(), delaying the sending. This can pathologically cause 40ms +extra latency due to bad interactions with delayed acks. 
+ +Adding a memory barrier in tcp_push removes the bug, similarly to the +previous commit bf06200e732d ("tcp: tsq: fix nonagle handling"). +smp_mb__after_atomic() is used to not incur in unnecessary overhead +on x86 since not affected. + +Patch has been tested using an AWS c7g.2xlarge instance with Ubuntu +22.04 and Apache Tomcat 9.0.83 running the basic servlet below: + +import java.io.IOException; +import java.io.OutputStreamWriter; +import java.io.PrintWriter; +import javax.servlet.ServletException; +import javax.servlet.http.HttpServlet; +import javax.servlet.http.HttpServletRequest; +import javax.servlet.http.HttpServletResponse; + +public class HelloWorldServlet extends HttpServlet { + @Override + protected void doGet(HttpServletRequest request, HttpServletResponse response) + throws ServletException, IOException { + response.setContentType("text/html;charset=utf-8"); + OutputStreamWriter osw = new OutputStreamWriter(response.getOutputStream(),"UTF-8"); + String s = "a".repeat(3096); + osw.write(s,0,s.length()); + osw.flush(); + } +} + +Load was applied using wrk2 (https://github.com/kinvolk/wrk2) from an AWS +c6i.8xlarge instance. Before the patch an additional 40ms latency from P99.99+ +values is observed while, with the patch, the extra latency disappears. + +No patch and tcp_autocorking=1 +./wrk -t32 -c128 -d40s --latency -R10000 http://172.31.60.173:8080/hello/hello + ... + 50.000% 0.91ms + 75.000% 1.13ms + 90.000% 1.46ms + 99.000% 1.74ms + 99.900% 1.89ms + 99.990% 41.95ms <<< 40+ ms extra latency + 99.999% 48.32ms +100.000% 48.96ms + +With patch and tcp_autocorking=1 +./wrk -t32 -c128 -d40s --latency -R10000 http://172.31.60.173:8080/hello/hello + ... 
+ 50.000% 0.90ms + 75.000% 1.13ms + 90.000% 1.45ms + 99.000% 1.72ms + 99.900% 1.83ms + 99.990% 2.11ms <<< no 40+ ms extra latency + 99.999% 2.53ms +100.000% 2.62ms + +Patch has been also tested on x86 (m7i.2xlarge instance) which it is not +affected by this issue and the patch doesn't introduce any additional +delay. + +Fixes: 7aa5470c2c09 ("tcp: tsq: move tsq_flags close to sk_wmem_alloc") +Signed-off-by: Salvatore Dipietro +Reviewed-by: Eric Dumazet +Link: https://lore.kernel.org/r/20240119190133.43698-1-dipiets@amazon.com +Signed-off-by: Paolo Abeni +Signed-off-by: Sasha Levin +--- + net/ipv4/tcp.c | 1 + + 1 file changed, 1 insertion(+) + +diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c +index ff6838ca2e58..7bce79beca2b 100644 +--- a/net/ipv4/tcp.c ++++ b/net/ipv4/tcp.c +@@ -722,6 +722,7 @@ void tcp_push(struct sock *sk, int flags, int mss_now, + if (!test_bit(TSQ_THROTTLED, &sk->sk_tsq_flags)) { + NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPAUTOCORKING); + set_bit(TSQ_THROTTLED, &sk->sk_tsq_flags); ++ smp_mb__after_atomic(); + } + /* It is possible TX completion already happened + * before we set TSQ_THROTTLED. +-- +2.43.0 + diff --git a/queue-6.7/tcp-make-sure-init-the-accept_queue-s-spinlocks-once.patch b/queue-6.7/tcp-make-sure-init-the-accept_queue-s-spinlocks-once.patch new file mode 100644 index 00000000000..adcce9e8a03 --- /dev/null +++ b/queue-6.7/tcp-make-sure-init-the-accept_queue-s-spinlocks-once.patch @@ -0,0 +1,170 @@ +From a96f3a5cb5848f1ff49b6839fc043d33bc94ec3b Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Thu, 18 Jan 2024 09:20:19 +0800 +Subject: tcp: make sure init the accept_queue's spinlocks once + +From: Zhengchao Shao + +[ Upstream commit 198bc90e0e734e5f98c3d2833e8390cac3df61b2 ] + +When I run syz's reproduction C program locally, it causes the following +issue: +pvqspinlock: lock 0xffff9d181cd5c660 has corrupted value 0x0! 
+WARNING: CPU: 19 PID: 21160 at __pv_queued_spin_unlock_slowpath (kernel/locking/qspinlock_paravirt.h:508) +Hardware name: Red Hat KVM, BIOS 0.5.1 01/01/2011 +RIP: 0010:__pv_queued_spin_unlock_slowpath (kernel/locking/qspinlock_paravirt.h:508) +Code: 73 56 3a ff 90 c3 cc cc cc cc 8b 05 bb 1f 48 01 85 c0 74 05 c3 cc cc cc cc 8b 17 48 89 fe 48 c7 c7 +30 20 ce 8f e8 ad 56 42 ff <0f> 0b c3 cc cc cc cc 0f 0b 0f 1f 40 00 90 90 90 90 90 90 90 90 90 +RSP: 0018:ffffa8d200604cb8 EFLAGS: 00010282 +RAX: 0000000000000000 RBX: 0000000000000000 RCX: ffff9d1ef60e0908 +RDX: 00000000ffffffd8 RSI: 0000000000000027 RDI: ffff9d1ef60e0900 +RBP: ffff9d181cd5c280 R08: 0000000000000000 R09: 00000000ffff7fff +R10: ffffa8d200604b68 R11: ffffffff907dcdc8 R12: 0000000000000000 +R13: ffff9d181cd5c660 R14: ffff9d1813a3f330 R15: 0000000000001000 +FS: 00007fa110184640(0000) GS:ffff9d1ef60c0000(0000) knlGS:0000000000000000 +CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 +CR2: 0000000020000000 CR3: 000000011f65e000 CR4: 00000000000006f0 +Call Trace: + + _raw_spin_unlock (kernel/locking/spinlock.c:186) + inet_csk_reqsk_queue_add (net/ipv4/inet_connection_sock.c:1321) + inet_csk_complete_hashdance (net/ipv4/inet_connection_sock.c:1358) + tcp_check_req (net/ipv4/tcp_minisocks.c:868) + tcp_v4_rcv (net/ipv4/tcp_ipv4.c:2260) + ip_protocol_deliver_rcu (net/ipv4/ip_input.c:205) + ip_local_deliver_finish (net/ipv4/ip_input.c:234) + __netif_receive_skb_one_core (net/core/dev.c:5529) + process_backlog (./include/linux/rcupdate.h:779) + __napi_poll (net/core/dev.c:6533) + net_rx_action (net/core/dev.c:6604) + __do_softirq (./arch/x86/include/asm/jump_label.h:27) + do_softirq (kernel/softirq.c:454 kernel/softirq.c:441) + + + __local_bh_enable_ip (kernel/softirq.c:381) + __dev_queue_xmit (net/core/dev.c:4374) + ip_finish_output2 (./include/net/neighbour.h:540 net/ipv4/ip_output.c:235) + __ip_queue_xmit (net/ipv4/ip_output.c:535) + __tcp_transmit_skb (net/ipv4/tcp_output.c:1462) + tcp_rcv_synsent_state_process 
(net/ipv4/tcp_input.c:6469) + tcp_rcv_state_process (net/ipv4/tcp_input.c:6657) + tcp_v4_do_rcv (net/ipv4/tcp_ipv4.c:1929) + __release_sock (./include/net/sock.h:1121 net/core/sock.c:2968) + release_sock (net/core/sock.c:3536) + inet_wait_for_connect (net/ipv4/af_inet.c:609) + __inet_stream_connect (net/ipv4/af_inet.c:702) + inet_stream_connect (net/ipv4/af_inet.c:748) + __sys_connect (./include/linux/file.h:45 net/socket.c:2064) + __x64_sys_connect (net/socket.c:2073 net/socket.c:2070 net/socket.c:2070) + do_syscall_64 (arch/x86/entry/common.c:51 arch/x86/entry/common.c:82) + entry_SYSCALL_64_after_hwframe (arch/x86/entry/entry_64.S:129) + RIP: 0033:0x7fa10ff05a3d + Code: 5b 41 5c c3 66 0f 1f 84 00 00 00 00 00 f3 0f 1e fa 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89 + c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 8b 0d ab a3 0e 00 f7 d8 64 89 01 48 + RSP: 002b:00007fa110183de8 EFLAGS: 00000202 ORIG_RAX: 000000000000002a + RAX: ffffffffffffffda RBX: 0000000020000054 RCX: 00007fa10ff05a3d + RDX: 000000000000001c RSI: 0000000020000040 RDI: 0000000000000003 + RBP: 00007fa110183e20 R08: 0000000000000000 R09: 0000000000000000 + R10: 0000000000000000 R11: 0000000000000202 R12: 00007fa110184640 + R13: 0000000000000000 R14: 00007fa10fe8b060 R15: 00007fff73e23b20 + + +The issue triggering process is analyzed as follows: +Thread A Thread B +tcp_v4_rcv //receive ack TCP packet inet_shutdown + tcp_check_req tcp_disconnect //disconnect sock + ... tcp_set_state(sk, TCP_CLOSE) + inet_csk_complete_hashdance ... + inet_csk_reqsk_queue_add inet_listen //start listen + spin_lock(&queue->rskq_lock) inet_csk_listen_start + ... reqsk_queue_alloc + ... spin_lock_init + spin_unlock(&queue->rskq_lock) //warning + +When the socket receives the ACK packet during the three-way handshake, +it will hold spinlock. And then the user actively shutdowns the socket +and listens to the socket immediately, the spinlock will be initialized. 
+When the socket is going to release the spinlock, a warning is generated. +Also the same issue to fastopenq.lock. + +Move init spinlock to inet_create and inet_accept to make sure init the +accept_queue's spinlocks once. + +Fixes: fff1f3001cc5 ("tcp: add a spinlock to protect struct request_sock_queue") +Fixes: 168a8f58059a ("tcp: TCP Fast Open Server - main code path") +Reported-by: Ming Shu +Signed-off-by: Zhengchao Shao +Reviewed-by: Eric Dumazet +Link: https://lore.kernel.org/r/20240118012019.1751966-1-shaozhengchao@huawei.com +Signed-off-by: Jakub Kicinski +Signed-off-by: Sasha Levin +--- + include/net/inet_connection_sock.h | 8 ++++++++ + net/core/request_sock.c | 3 --- + net/ipv4/af_inet.c | 3 +++ + net/ipv4/inet_connection_sock.c | 4 ++++ + 4 files changed, 15 insertions(+), 3 deletions(-) + +diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h +index d0a2f827d5f2..9ab4bf704e86 100644 +--- a/include/net/inet_connection_sock.h ++++ b/include/net/inet_connection_sock.h +@@ -357,4 +357,12 @@ static inline bool inet_csk_has_ulp(const struct sock *sk) + return inet_test_bit(IS_ICSK, sk) && !!inet_csk(sk)->icsk_ulp_ops; + } + ++static inline void inet_init_csk_locks(struct sock *sk) ++{ ++ struct inet_connection_sock *icsk = inet_csk(sk); ++ ++ spin_lock_init(&icsk->icsk_accept_queue.rskq_lock); ++ spin_lock_init(&icsk->icsk_accept_queue.fastopenq.lock); ++} ++ + #endif /* _INET_CONNECTION_SOCK_H */ +diff --git a/net/core/request_sock.c b/net/core/request_sock.c +index f35c2e998406..63de5c635842 100644 +--- a/net/core/request_sock.c ++++ b/net/core/request_sock.c +@@ -33,9 +33,6 @@ + + void reqsk_queue_alloc(struct request_sock_queue *queue) + { +- spin_lock_init(&queue->rskq_lock); +- +- spin_lock_init(&queue->fastopenq.lock); + queue->fastopenq.rskq_rst_head = NULL; + queue->fastopenq.rskq_rst_tail = NULL; + queue->fastopenq.qlen = 0; +diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c +index ea0b0334a0fb..1c58bd72e124 100644 
+--- a/net/ipv4/af_inet.c ++++ b/net/ipv4/af_inet.c +@@ -330,6 +330,9 @@ static int inet_create(struct net *net, struct socket *sock, int protocol, + if (INET_PROTOSW_REUSE & answer_flags) + sk->sk_reuse = SK_CAN_REUSE; + ++ if (INET_PROTOSW_ICSK & answer_flags) ++ inet_init_csk_locks(sk); ++ + inet = inet_sk(sk); + inet_assign_bit(IS_ICSK, sk, INET_PROTOSW_ICSK & answer_flags); + +diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c +index 394a498c2823..762817d6c8d7 100644 +--- a/net/ipv4/inet_connection_sock.c ++++ b/net/ipv4/inet_connection_sock.c +@@ -730,6 +730,10 @@ struct sock *inet_csk_accept(struct sock *sk, int flags, int *err, bool kern) + } + if (req) + reqsk_put(req); ++ ++ if (newsk) ++ inet_init_csk_locks(newsk); ++ + return newsk; + out_err: + newsk = NULL; +-- +2.43.0 + diff --git a/queue-6.7/tracing-ensure-visibility-when-inserting-an-element-.patch b/queue-6.7/tracing-ensure-visibility-when-inserting-an-element-.patch new file mode 100644 index 00000000000..b73afa982e1 --- /dev/null +++ b/queue-6.7/tracing-ensure-visibility-when-inserting-an-element-.patch @@ -0,0 +1,129 @@ +From bb9fda497e2b1f6927c979fc39505e3da60e56bb Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Mon, 22 Jan 2024 16:09:28 +0100 +Subject: tracing: Ensure visibility when inserting an element into tracing_map + +From: Petr Pavlu + +[ Upstream commit 2b44760609e9eaafc9d234a6883d042fc21132a7 ] + +Running the following two commands in parallel on a multi-processor +AArch64 machine can sporadically produce an unexpected warning about +duplicate histogram entries: + + $ while true; do + echo hist:key=id.syscall:val=hitcount > \ + /sys/kernel/debug/tracing/events/raw_syscalls/sys_enter/trigger + cat /sys/kernel/debug/tracing/events/raw_syscalls/sys_enter/hist + sleep 0.001 + done + $ stress-ng --sysbadaddr $(nproc) + +The warning looks as follows: + +[ 2911.172474] ------------[ cut here ]------------ +[ 2911.173111] Duplicates detected: 1 +[ 2911.173574] 
WARNING: CPU: 2 PID: 12247 at kernel/trace/tracing_map.c:983 tracing_map_sort_entries+0x3e0/0x408 +[ 2911.174702] Modules linked in: iscsi_ibft(E) iscsi_boot_sysfs(E) rfkill(E) af_packet(E) nls_iso8859_1(E) nls_cp437(E) vfat(E) fat(E) ena(E) tiny_power_button(E) qemu_fw_cfg(E) button(E) fuse(E) efi_pstore(E) ip_tables(E) x_tables(E) xfs(E) libcrc32c(E) aes_ce_blk(E) aes_ce_cipher(E) crct10dif_ce(E) polyval_ce(E) polyval_generic(E) ghash_ce(E) gf128mul(E) sm4_ce_gcm(E) sm4_ce_ccm(E) sm4_ce(E) sm4_ce_cipher(E) sm4(E) sm3_ce(E) sm3(E) sha3_ce(E) sha512_ce(E) sha512_arm64(E) sha2_ce(E) sha256_arm64(E) nvme(E) sha1_ce(E) nvme_core(E) nvme_auth(E) t10_pi(E) sg(E) scsi_mod(E) scsi_common(E) efivarfs(E) +[ 2911.174738] Unloaded tainted modules: cppc_cpufreq(E):1 +[ 2911.180985] CPU: 2 PID: 12247 Comm: cat Kdump: loaded Tainted: G E 6.7.0-default #2 1b58bbb22c97e4399dc09f92d309344f69c44a01 +[ 2911.182398] Hardware name: Amazon EC2 c7g.8xlarge/, BIOS 1.0 11/1/2018 +[ 2911.183208] pstate: 61400005 (nZCv daif +PAN -UAO -TCO +DIT -SSBS BTYPE=--) +[ 2911.184038] pc : tracing_map_sort_entries+0x3e0/0x408 +[ 2911.184667] lr : tracing_map_sort_entries+0x3e0/0x408 +[ 2911.185310] sp : ffff8000a1513900 +[ 2911.185750] x29: ffff8000a1513900 x28: ffff0003f272fe80 x27: 0000000000000001 +[ 2911.186600] x26: ffff0003f272fe80 x25: 0000000000000030 x24: 0000000000000008 +[ 2911.187458] x23: ffff0003c5788000 x22: ffff0003c16710c8 x21: ffff80008017f180 +[ 2911.188310] x20: ffff80008017f000 x19: ffff80008017f180 x18: ffffffffffffffff +[ 2911.189160] x17: 0000000000000000 x16: 0000000000000000 x15: ffff8000a15134b8 +[ 2911.190015] x14: 0000000000000000 x13: 205d373432323154 x12: 5b5d313131333731 +[ 2911.190844] x11: 00000000fffeffff x10: 00000000fffeffff x9 : ffffd1b78274a13c +[ 2911.191716] x8 : 000000000017ffe8 x7 : c0000000fffeffff x6 : 000000000057ffa8 +[ 2911.192554] x5 : ffff0012f6c24ec0 x4 : 0000000000000000 x3 : ffff2e5b72b5d000 +[ 2911.193404] x2 : 0000000000000000 x1 : 
0000000000000000 x0 : ffff0003ff254480 +[ 2911.194259] Call trace: +[ 2911.194626] tracing_map_sort_entries+0x3e0/0x408 +[ 2911.195220] hist_show+0x124/0x800 +[ 2911.195692] seq_read_iter+0x1d4/0x4e8 +[ 2911.196193] seq_read+0xe8/0x138 +[ 2911.196638] vfs_read+0xc8/0x300 +[ 2911.197078] ksys_read+0x70/0x108 +[ 2911.197534] __arm64_sys_read+0x24/0x38 +[ 2911.198046] invoke_syscall+0x78/0x108 +[ 2911.198553] el0_svc_common.constprop.0+0xd0/0xf8 +[ 2911.199157] do_el0_svc+0x28/0x40 +[ 2911.199613] el0_svc+0x40/0x178 +[ 2911.200048] el0t_64_sync_handler+0x13c/0x158 +[ 2911.200621] el0t_64_sync+0x1a8/0x1b0 +[ 2911.201115] ---[ end trace 0000000000000000 ]--- + +The problem appears to be caused by CPU reordering of writes issued from +__tracing_map_insert(). + +The check for the presence of an element with a given key in this +function is: + + val = READ_ONCE(entry->val); + if (val && keys_match(key, val->key, map->key_size)) ... + +The write of a new entry is: + + elt = get_free_elt(map); + memcpy(elt->key, key, map->key_size); + entry->val = elt; + +The "memcpy(elt->key, key, map->key_size);" and "entry->val = elt;" +stores may become visible in the reversed order on another CPU. This +second CPU might then incorrectly determine that a new key doesn't match +an already present val->key and subsequently insert a new element, +resulting in a duplicate. + +Fix the problem by adding a write barrier between +"memcpy(elt->key, key, map->key_size);" and "entry->val = elt;", and for +good measure, also use WRITE_ONCE(entry->val, elt) for publishing the +element. The sequence pairs with the mentioned "READ_ONCE(entry->val);" +and the "val->key" check which has an address dependency. + +The barrier is placed on a path executed when adding an element for +a new key. Subsequent updates targeting the same key remain unaffected. 
+ +From the user's perspective, the issue was introduced by commit +c193707dde77 ("tracing: Remove code which merges duplicates"), which +followed commit cbf4100efb8f ("tracing: Add support to detect and avoid +duplicates"). The previous code operated differently; it inherently +expected potential races which result in duplicates but merged them +later when they occurred. + +Link: https://lore.kernel.org/linux-trace-kernel/20240122150928.27725-1-petr.pavlu@suse.com + +Fixes: c193707dde77 ("tracing: Remove code which merges duplicates") +Signed-off-by: Petr Pavlu +Acked-by: Tom Zanussi +Signed-off-by: Steven Rostedt (Google) +Signed-off-by: Sasha Levin +--- + kernel/trace/tracing_map.c | 7 ++++++- + 1 file changed, 6 insertions(+), 1 deletion(-) + +diff --git a/kernel/trace/tracing_map.c b/kernel/trace/tracing_map.c +index c774e560f2f9..a4dcf0f24352 100644 +--- a/kernel/trace/tracing_map.c ++++ b/kernel/trace/tracing_map.c +@@ -574,7 +574,12 @@ __tracing_map_insert(struct tracing_map *map, void *key, bool lookup_only) + } + + memcpy(elt->key, key, map->key_size); +- entry->val = elt; ++ /* ++ * Ensure the initialization is visible and ++ * publish the elt. ++ */ ++ smp_wmb(); ++ WRITE_ONCE(entry->val, elt); + atomic64_inc(&map->hits); + + return entry->val; +-- +2.43.0 + diff --git a/queue-6.7/tsnep-fix-xdp_ring_need_wakeup-for-empty-fill-ring.patch b/queue-6.7/tsnep-fix-xdp_ring_need_wakeup-for-empty-fill-ring.patch new file mode 100644 index 00000000000..e2a58b727f5 --- /dev/null +++ b/queue-6.7/tsnep-fix-xdp_ring_need_wakeup-for-empty-fill-ring.patch @@ -0,0 +1,52 @@ +From 96e806d8a4c5fc2829444a2c28f953e512431242 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Tue, 23 Jan 2024 21:09:18 +0100 +Subject: tsnep: Fix XDP_RING_NEED_WAKEUP for empty fill ring + +From: Gerhard Engleder + +[ Upstream commit 9a91c05f4bd6f6bdd6b8f90445e0da92e3ac956c ] + +The fill ring of the XDP socket may contain not enough buffers to +completely fill the RX queue during socket creation.
In this case the +flag XDP_RING_NEED_WAKEUP is not set as this flag is only set if the RX +queue is not completely filled during polling. + +Set XDP_RING_NEED_WAKEUP flag also if RX queue is not completely filled +during XDP socket creation. + +Fixes: 3fc2333933fd ("tsnep: Add XDP socket zero-copy RX support") +Signed-off-by: Gerhard Engleder +Signed-off-by: Paolo Abeni +Signed-off-by: Sasha Levin +--- + drivers/net/ethernet/engleder/tsnep_main.c | 13 +++++++++++++ + 1 file changed, 13 insertions(+) + +diff --git a/drivers/net/ethernet/engleder/tsnep_main.c b/drivers/net/ethernet/engleder/tsnep_main.c +index 456e0336f3f6..9aeff2b37a61 100644 +--- a/drivers/net/ethernet/engleder/tsnep_main.c ++++ b/drivers/net/ethernet/engleder/tsnep_main.c +@@ -1762,6 +1762,19 @@ static void tsnep_rx_reopen_xsk(struct tsnep_rx *rx) + allocated--; + } + } ++ ++ /* set need wakeup flag immediately if ring is not filled completely, ++ * first polling would be too late as need wakeup signalisation would ++ * be delayed for an indefinite time ++ */ ++ if (xsk_uses_need_wakeup(rx->xsk_pool)) { ++ int desc_available = tsnep_rx_desc_available(rx); ++ ++ if (desc_available) ++ xsk_set_rx_need_wakeup(rx->xsk_pool); ++ else ++ xsk_clear_rx_need_wakeup(rx->xsk_pool); ++ } + } + + static bool tsnep_pending(struct tsnep_queue *queue) +-- +2.43.0 + diff --git a/queue-6.7/tsnep-remove-fcs-for-xdp-data-path.patch b/queue-6.7/tsnep-remove-fcs-for-xdp-data-path.patch new file mode 100644 index 00000000000..7215c8fbefa --- /dev/null +++ b/queue-6.7/tsnep-remove-fcs-for-xdp-data-path.patch @@ -0,0 +1,49 @@ +From 2d21d1e8559b9b89588155510d926751ac77c1ba Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Tue, 23 Jan 2024 21:09:17 +0100 +Subject: tsnep: Remove FCS for XDP data path + +From: Gerhard Engleder + +[ Upstream commit 50bad6f797d4d501c5ef416a6f92e1912ab5aa8b ] + +The RX data buffer includes the FCS. The FCS is already stripped for the +normal data path. 
But for the XDP data path the FCS is included and +acts like additional/useless data. + +Remove the FCS from the RX data buffer also for XDP. + +Fixes: 65b28c810035 ("tsnep: Add XDP RX support") +Fixes: 3fc2333933fd ("tsnep: Add XDP socket zero-copy RX support") +Signed-off-by: Gerhard Engleder +Signed-off-by: Paolo Abeni +Signed-off-by: Sasha Levin +--- + drivers/net/ethernet/engleder/tsnep_main.c | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +diff --git a/drivers/net/ethernet/engleder/tsnep_main.c b/drivers/net/ethernet/engleder/tsnep_main.c +index df40c720e7b2..456e0336f3f6 100644 +--- a/drivers/net/ethernet/engleder/tsnep_main.c ++++ b/drivers/net/ethernet/engleder/tsnep_main.c +@@ -1485,7 +1485,7 @@ static int tsnep_rx_poll(struct tsnep_rx *rx, struct napi_struct *napi, + + xdp_prepare_buff(&xdp, page_address(entry->page), + XDP_PACKET_HEADROOM + TSNEP_RX_INLINE_METADATA_SIZE, +- length, false); ++ length - ETH_FCS_LEN, false); + + consume = tsnep_xdp_run_prog(rx, prog, &xdp, + &xdp_status, tx_nq, tx); +@@ -1568,7 +1568,7 @@ static int tsnep_rx_poll_zc(struct tsnep_rx *rx, struct napi_struct *napi, + prefetch(entry->xdp->data); + length = __le32_to_cpu(entry->desc_wb->properties) & + TSNEP_DESC_LENGTH_MASK; +- xsk_buff_set_size(entry->xdp, length); ++ xsk_buff_set_size(entry->xdp, length - ETH_FCS_LEN); + xsk_buff_dma_sync_for_cpu(entry->xdp, rx->xsk_pool); + + /* RX metadata with timestamps is in front of actual data, +-- +2.43.0 + diff --git a/queue-6.7/tun-add-missing-rx-stats-accounting-in-tun_xdp_act.patch b/queue-6.7/tun-add-missing-rx-stats-accounting-in-tun_xdp_act.patch new file mode 100644 index 00000000000..e698419b95e --- /dev/null +++ b/queue-6.7/tun-add-missing-rx-stats-accounting-in-tun_xdp_act.patch @@ -0,0 +1,49 @@ +From 1cc6dc39a13a171888b77d808036b7e7b1013f78 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Fri, 19 Jan 2024 18:22:56 +0800 +Subject: tun: add missing rx stats accounting in tun_xdp_act + +From: Yunjian Wang + 
+[ Upstream commit f1084c427f55d573fcd5688d9ba7b31b78019716 ] + +The TUN can be used as vhost-net backend, and it is necessary to +count the packets transmitted from TUN to vhost-net/virtio-net. +However, there are some places in the receive path that were not +taken into account when using XDP. It would be beneficial to also +include new accounting for successfully received bytes using +dev_sw_netstats_rx_add. + +Fixes: 761876c857cb ("tap: XDP support") +Signed-off-by: Yunjian Wang +Reviewed-by: Willem de Bruijn +Acked-by: Jason Wang +Signed-off-by: David S. Miller +Signed-off-by: Sasha Levin +--- + drivers/net/tun.c | 2 ++ + 1 file changed, 2 insertions(+) + +diff --git a/drivers/net/tun.c b/drivers/net/tun.c +index 237fef557ba5..4a4f8c8e79fa 100644 +--- a/drivers/net/tun.c ++++ b/drivers/net/tun.c +@@ -1634,6 +1634,7 @@ static int tun_xdp_act(struct tun_struct *tun, struct bpf_prog *xdp_prog, + dev_core_stats_rx_dropped_inc(tun->dev); + return err; + } ++ dev_sw_netstats_rx_add(tun->dev, xdp->data_end - xdp->data); + break; + case XDP_TX: + err = tun_xdp_tx(tun->dev, xdp); +@@ -1641,6 +1642,7 @@ static int tun_xdp_act(struct tun_struct *tun, struct bpf_prog *xdp_prog, + dev_core_stats_rx_dropped_inc(tun->dev); + return err; + } ++ dev_sw_netstats_rx_add(tun->dev, xdp->data_end - xdp->data); + break; + case XDP_PASS: + break; +-- +2.43.0 + diff --git a/queue-6.7/tun-fix-missing-dropped-counter-in-tun_xdp_act.patch b/queue-6.7/tun-fix-missing-dropped-counter-in-tun_xdp_act.patch new file mode 100644 index 00000000000..f10345d4357 --- /dev/null +++ b/queue-6.7/tun-fix-missing-dropped-counter-in-tun_xdp_act.patch @@ -0,0 +1,52 @@ +From c8a46f874ec240aa00fd746cab6f00c704ef999a Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Fri, 19 Jan 2024 18:22:35 +0800 +Subject: tun: fix missing dropped counter in tun_xdp_act + +From: Yunjian Wang + +[ Upstream commit 5744ba05e7c4bff8fec133dd0f9e51ddffba92f5 ] + +The commit 8ae1aff0b331 ("tuntap: split out XDP logic") includes 
+dropped counter for XDP_DROP, XDP_ABORTED, and invalid XDP actions. +Unfortunately, that commit missed the dropped counter when error +occurs during XDP_TX and XDP_REDIRECT actions. This patch fixes +this issue. + +Fixes: 8ae1aff0b331 ("tuntap: split out XDP logic") +Signed-off-by: Yunjian Wang +Reviewed-by: Willem de Bruijn +Acked-by: Jason Wang +Signed-off-by: David S. Miller +Signed-off-by: Sasha Levin +--- + drivers/net/tun.c | 8 ++++++-- + 1 file changed, 6 insertions(+), 2 deletions(-) + +diff --git a/drivers/net/tun.c b/drivers/net/tun.c +index afa5497f7c35..237fef557ba5 100644 +--- a/drivers/net/tun.c ++++ b/drivers/net/tun.c +@@ -1630,13 +1630,17 @@ static int tun_xdp_act(struct tun_struct *tun, struct bpf_prog *xdp_prog, + switch (act) { + case XDP_REDIRECT: + err = xdp_do_redirect(tun->dev, xdp, xdp_prog); +- if (err) ++ if (err) { ++ dev_core_stats_rx_dropped_inc(tun->dev); + return err; ++ } + break; + case XDP_TX: + err = tun_xdp_tx(tun->dev, xdp); +- if (err < 0) ++ if (err < 0) { ++ dev_core_stats_rx_dropped_inc(tun->dev); + return err; ++ } + break; + case XDP_PASS: + break; +-- +2.43.0 + diff --git a/queue-6.7/udp-fix-busy-polling.patch b/queue-6.7/udp-fix-busy-polling.patch new file mode 100644 index 00000000000..7751b6a56d5 --- /dev/null +++ b/queue-6.7/udp-fix-busy-polling.patch @@ -0,0 +1,134 @@ +From 40295213484304936b40dafc21ab65a5dd7cce8d Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Thu, 18 Jan 2024 20:17:49 +0000 +Subject: udp: fix busy polling + +From: Eric Dumazet + +[ Upstream commit a54d51fb2dfb846aedf3751af501e9688db447f5 ] + +Generic sk_busy_loop_end() only looks at sk->sk_receive_queue +for presence of packets. + +Problem is that for UDP sockets after blamed commit, some packets +could be present in another queue: udp_sk(sk)->reader_queue + +In some cases, a busy poller could spin until timeout expiration, +even if some packets are available in udp_sk(sk)->reader_queue. 
+ +v3: - make sk_busy_loop_end() nicer (Willem) + +v2: - add a READ_ONCE(sk->sk_family) in sk_is_inet() to avoid KCSAN splats. + - add a sk_is_inet() check in sk_is_udp() (Willem feedback) + - add a sk_is_inet() check in sk_is_tcp(). + +Fixes: 2276f58ac589 ("udp: use a separate rx queue for packet reception") +Signed-off-by: Eric Dumazet +Reviewed-by: Paolo Abeni +Reviewed-by: Willem de Bruijn +Reviewed-by: Kuniyuki Iwashima +Signed-off-by: David S. Miller +Signed-off-by: Sasha Levin +--- + include/linux/skmsg.h | 6 ------ + include/net/inet_sock.h | 5 ----- + include/net/sock.h | 18 +++++++++++++++++- + net/core/sock.c | 11 +++++++++-- + 4 files changed, 26 insertions(+), 14 deletions(-) + +diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h +index c953b8c0d2f4..bd4418377bac 100644 +--- a/include/linux/skmsg.h ++++ b/include/linux/skmsg.h +@@ -500,12 +500,6 @@ static inline bool sk_psock_strp_enabled(struct sk_psock *psock) + return !!psock->saved_data_ready; + } + +-static inline bool sk_is_udp(const struct sock *sk) +-{ +- return sk->sk_type == SOCK_DGRAM && +- sk->sk_protocol == IPPROTO_UDP; +-} +- + #if IS_ENABLED(CONFIG_NET_SOCK_MSG) + + #define BPF_F_STRPARSER (1UL << 1) +diff --git a/include/net/inet_sock.h b/include/net/inet_sock.h +index 74db6d97cae1..8d5fe15b0f6f 100644 +--- a/include/net/inet_sock.h ++++ b/include/net/inet_sock.h +@@ -310,11 +310,6 @@ static inline unsigned long inet_cmsg_flags(const struct inet_sock *inet) + #define inet_assign_bit(nr, sk, val) \ + assign_bit(INET_FLAGS_##nr, &inet_sk(sk)->inet_flags, val) + +-static inline bool sk_is_inet(struct sock *sk) +-{ +- return sk->sk_family == AF_INET || sk->sk_family == AF_INET6; +-} +- + /** + * sk_to_full_sk - Access to a full socket + * @sk: pointer to a socket +diff --git a/include/net/sock.h b/include/net/sock.h +index 0201136b0b9c..f9a9f61fa122 100644 +--- a/include/net/sock.h ++++ b/include/net/sock.h +@@ -2794,9 +2794,25 @@ static inline void skb_setup_tx_timestamp(struct 
sk_buff *skb, __u16 tsflags) + &skb_shinfo(skb)->tskey); + } + ++static inline bool sk_is_inet(const struct sock *sk) ++{ ++ int family = READ_ONCE(sk->sk_family); ++ ++ return family == AF_INET || family == AF_INET6; ++} ++ + static inline bool sk_is_tcp(const struct sock *sk) + { +- return sk->sk_type == SOCK_STREAM && sk->sk_protocol == IPPROTO_TCP; ++ return sk_is_inet(sk) && ++ sk->sk_type == SOCK_STREAM && ++ sk->sk_protocol == IPPROTO_TCP; ++} ++ ++static inline bool sk_is_udp(const struct sock *sk) ++{ ++ return sk_is_inet(sk) && ++ sk->sk_type == SOCK_DGRAM && ++ sk->sk_protocol == IPPROTO_UDP; + } + + static inline bool sk_is_stream_unix(const struct sock *sk) +diff --git a/net/core/sock.c b/net/core/sock.c +index d02534c77413..e5d43a068f8e 100644 +--- a/net/core/sock.c ++++ b/net/core/sock.c +@@ -107,6 +107,7 @@ + #include + #include + #include ++#include + #include + #include + #include +@@ -4148,8 +4149,14 @@ bool sk_busy_loop_end(void *p, unsigned long start_time) + { + struct sock *sk = p; + +- return !skb_queue_empty_lockless(&sk->sk_receive_queue) || +- sk_busy_loop_timeout(sk, start_time); ++ if (!skb_queue_empty_lockless(&sk->sk_receive_queue)) ++ return true; ++ ++ if (sk_is_udp(sk) && ++ !skb_queue_empty_lockless(&udp_sk(sk)->reader_queue)) ++ return true; ++ ++ return sk_busy_loop_timeout(sk, start_time); + } + EXPORT_SYMBOL(sk_busy_loop_end); + #endif /* CONFIG_NET_RX_BUSY_POLL */ +-- +2.43.0 + diff --git a/queue-6.7/vlan-skip-nested-type-that-is-not-ifla_vlan_qos_mapp.patch b/queue-6.7/vlan-skip-nested-type-that-is-not-ifla_vlan_qos_mapp.patch new file mode 100644 index 00000000000..baba253505b --- /dev/null +++ b/queue-6.7/vlan-skip-nested-type-that-is-not-ifla_vlan_qos_mapp.patch @@ -0,0 +1,58 @@ +From b3cc206ba9b0e1375f74191078ca0f84f0e06365 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Thu, 18 Jan 2024 21:03:06 +0800 +Subject: vlan: skip nested type that is not IFLA_VLAN_QOS_MAPPING + +From: Lin Ma + +[ Upstream commit 
6c21660fe221a15c789dee2bc2fd95516bc5aeaf ] + +In the vlan_changelink function, a loop is used to parse the nested +attributes IFLA_VLAN_EGRESS_QOS and IFLA_VLAN_INGRESS_QOS in order to +obtain the struct ifla_vlan_qos_mapping. These two nested attributes are +checked in the vlan_validate_qos_map function, which calls +nla_validate_nested_deprecated with the vlan_map_policy. + +However, this deprecated validator applies a LIBERAL strictness, allowing +the presence of an attribute with the type IFLA_VLAN_QOS_UNSPEC. +Consequently, the loop in vlan_changelink may parse an attribute of type +IFLA_VLAN_QOS_UNSPEC and believe it carries a payload of +struct ifla_vlan_qos_mapping, which is not necessarily true. + +To address this issue and ensure compatibility, this patch introduces two +type checks that skip attributes whose type is not IFLA_VLAN_QOS_MAPPING. + +Fixes: 07b5b17e157b ("[VLAN]: Use rtnl_link API") +Signed-off-by: Lin Ma +Reviewed-by: Simon Horman +Link: https://lore.kernel.org/r/20240118130306.1644001-1-linma@zju.edu.cn +Signed-off-by: Jakub Kicinski +Signed-off-by: Sasha Levin +--- + net/8021q/vlan_netlink.c | 4 ++++ + 1 file changed, 4 insertions(+) + +diff --git a/net/8021q/vlan_netlink.c b/net/8021q/vlan_netlink.c +index 214532173536..a3b68243fd4b 100644 +--- a/net/8021q/vlan_netlink.c ++++ b/net/8021q/vlan_netlink.c +@@ -118,12 +118,16 @@ static int vlan_changelink(struct net_device *dev, struct nlattr *tb[], + } + if (data[IFLA_VLAN_INGRESS_QOS]) { + nla_for_each_nested(attr, data[IFLA_VLAN_INGRESS_QOS], rem) { ++ if (nla_type(attr) != IFLA_VLAN_QOS_MAPPING) ++ continue; + m = nla_data(attr); + vlan_dev_set_ingress_priority(dev, m->to, m->from); + } + } + if (data[IFLA_VLAN_EGRESS_QOS]) { + nla_for_each_nested(attr, data[IFLA_VLAN_EGRESS_QOS], rem) { ++ if (nla_type(attr) != IFLA_VLAN_QOS_MAPPING) ++ continue; + m = nla_data(attr); + err = vlan_dev_set_egress_priority(dev, m->from, m->to); + if (err) +-- +2.43.0 + diff --git 
a/queue-6.7/wifi-mac80211-fix-potential-sta-link-leak.patch b/queue-6.7/wifi-mac80211-fix-potential-sta-link-leak.patch new file mode 100644 index 00000000000..3826328a829 --- /dev/null +++ b/queue-6.7/wifi-mac80211-fix-potential-sta-link-leak.patch @@ -0,0 +1,44 @@ +From fd2890505f1dd291a5dc74f190d704c71b303d92 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Thu, 11 Jan 2024 18:17:44 +0200 +Subject: wifi: mac80211: fix potential sta-link leak + +From: Johannes Berg + +[ Upstream commit b01a74b3ca6fd51b62c67733ba7c3280fa6c5d26 ] + +When a station is allocated, links are added but not +set to valid yet (e.g. during connection to an AP MLD), +we might remove the station without ever marking links +valid, and leak them. Fix that. + +Fixes: cb71f1d136a6 ("wifi: mac80211: add sta link addition/removal") +Signed-off-by: Johannes Berg +Reviewed-by: Ilan Peer +Signed-off-by: Miri Korenblit +Link: https://msgid.link/20240111181514.6573998beaf8.I09ac2e1d41c80f82a5a616b8bd1d9d8dd709a6a6@changeid +Signed-off-by: Johannes Berg +Signed-off-by: Sasha Levin +--- + net/mac80211/sta_info.c | 5 ++++- + 1 file changed, 4 insertions(+), 1 deletion(-) + +diff --git a/net/mac80211/sta_info.c b/net/mac80211/sta_info.c +index 0ba613dd1cc4..c33decbb97f2 100644 +--- a/net/mac80211/sta_info.c ++++ b/net/mac80211/sta_info.c +@@ -404,7 +404,10 @@ void sta_info_free(struct ieee80211_local *local, struct sta_info *sta) + int i; + + for (i = 0; i < ARRAY_SIZE(sta->link); i++) { +- if (!(sta->sta.valid_links & BIT(i))) ++ struct link_sta_info *link_sta; ++ ++ link_sta = rcu_access_pointer(sta->link[i]); ++ if (!link_sta) + continue; + + sta_remove_link(sta, i, false); +-- +2.43.0 + diff --git a/queue-6.7/xdp-reflect-tail-increase-for-mem_type_xsk_buff_pool.patch b/queue-6.7/xdp-reflect-tail-increase-for-mem_type_xsk_buff_pool.patch new file mode 100644 index 00000000000..bbfd0da92b1 --- /dev/null +++ b/queue-6.7/xdp-reflect-tail-increase-for-mem_type_xsk_buff_pool.patch @@ -0,0 +1,42 @@ +From 
ce88e3847c9d2c03cbd9e1a47d10d20adfe0bdc3 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Wed, 24 Jan 2024 20:16:00 +0100 +Subject: xdp: reflect tail increase for MEM_TYPE_XSK_BUFF_POOL + +From: Maciej Fijalkowski + +[ Upstream commit fbadd83a612c3b7aad2987893faca6bd24aaebb3 ] + +XSK ZC Rx path calculates the size of data that will be posted to XSK Rx +queue via subtracting xdp_buff::data from xdp_buff::data_end. + +In bpf_xdp_frags_increase_tail(), when underlying memory type of +xdp_rxq_info is MEM_TYPE_XSK_BUFF_POOL, add offset to data_end in tail +fragment, so that later on user space will be able to take into account +the amount of bytes added by XDP program. + +Fixes: 24ea50127ecf ("xsk: support mbuf on ZC RX") +Signed-off-by: Maciej Fijalkowski +Link: https://lore.kernel.org/r/20240124191602.566724-10-maciej.fijalkowski@intel.com +Signed-off-by: Alexei Starovoitov +Signed-off-by: Sasha Levin +--- + net/core/filter.c | 2 ++ + 1 file changed, 2 insertions(+) + +diff --git a/net/core/filter.c b/net/core/filter.c +index 6575288b8580..cee53838310f 100644 +--- a/net/core/filter.c ++++ b/net/core/filter.c +@@ -4091,6 +4091,8 @@ static int bpf_xdp_frags_increase_tail(struct xdp_buff *xdp, int offset) + memset(skb_frag_address(frag) + skb_frag_size(frag), 0, offset); + skb_frag_size_add(frag, offset); + sinfo->xdp_frags_size += offset; ++ if (rxq->mem.type == MEM_TYPE_XSK_BUFF_POOL) ++ xsk_buff_get_tail(xdp)->data_end += offset; + + return 0; + } +-- +2.43.0 + diff --git a/queue-6.7/xsk-fix-usage-of-multi-buffer-bpf-helpers-for-zc-xdp.patch b/queue-6.7/xsk-fix-usage-of-multi-buffer-bpf-helpers-for-zc-xdp.patch new file mode 100644 index 00000000000..0ae327d2f8a --- /dev/null +++ b/queue-6.7/xsk-fix-usage-of-multi-buffer-bpf-helpers-for-zc-xdp.patch @@ -0,0 +1,195 @@ +From b345185d903cd3418f8b01e7cdd56bdcb02fcac4 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Wed, 24 Jan 2024 20:15:54 +0100 +Subject: xsk: fix usage of multi-buffer BPF helpers for ZC XDP + +From: 
Maciej Fijalkowski + +[ Upstream commit c5114710c8ce86b8317e9b448f4fd15c711c2a82 ] + +Currently when packet is shrunk via bpf_xdp_adjust_tail() and memory +type is set to MEM_TYPE_XSK_BUFF_POOL, null ptr dereference happens: + +[1136314.192256] BUG: kernel NULL pointer dereference, address: +0000000000000034 +[1136314.203943] #PF: supervisor read access in kernel mode +[1136314.213768] #PF: error_code(0x0000) - not-present page +[1136314.223550] PGD 0 P4D 0 +[1136314.230684] Oops: 0000 [#1] PREEMPT SMP NOPTI +[1136314.239621] CPU: 8 PID: 54203 Comm: xdpsock Not tainted 6.6.0+ #257 +[1136314.250469] Hardware name: Intel Corporation S2600WFT/S2600WFT, +BIOS SE5C620.86B.02.01.0008.031920191559 03/19/2019 +[1136314.265615] RIP: 0010:__xdp_return+0x6c/0x210 +[1136314.274653] Code: ad 00 48 8b 47 08 49 89 f8 a8 01 0f 85 9b 01 00 00 0f 1f 44 00 00 f0 41 ff 48 34 75 32 4c 89 c7 e9 79 cd 80 ff 83 fe 03 75 17 41 34 01 0f 85 02 01 00 00 48 89 cf e9 22 cc 1e 00 e9 3d d2 86 +[1136314.302907] RSP: 0018:ffffc900089f8db0 EFLAGS: 00010246 +[1136314.312967] RAX: ffffc9003168aed0 RBX: ffff8881c3300000 RCX: +0000000000000000 +[1136314.324953] RDX: 0000000000000000 RSI: 0000000000000003 RDI: +ffffc9003168c000 +[1136314.336929] RBP: 0000000000000ae0 R08: 0000000000000002 R09: +0000000000010000 +[1136314.348844] R10: ffffc9000e495000 R11: 0000000000000040 R12: +0000000000000001 +[1136314.360706] R13: 0000000000000524 R14: ffffc9003168aec0 R15: +0000000000000001 +[1136314.373298] FS: 00007f8df8bbcb80(0000) GS:ffff8897e0e00000(0000) +knlGS:0000000000000000 +[1136314.386105] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033 +[1136314.396532] CR2: 0000000000000034 CR3: 00000001aa912002 CR4: +00000000007706f0 +[1136314.408377] DR0: 0000000000000000 DR1: 0000000000000000 DR2: +0000000000000000 +[1136314.420173] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: +0000000000000400 +[1136314.431890] PKRU: 55555554 +[1136314.439143] Call Trace: +[1136314.446058] +[1136314.452465] ? 
__die+0x20/0x70 +[1136314.459881] ? page_fault_oops+0x15b/0x440 +[1136314.468305] ? exc_page_fault+0x6a/0x150 +[1136314.476491] ? asm_exc_page_fault+0x22/0x30 +[1136314.484927] ? __xdp_return+0x6c/0x210 +[1136314.492863] bpf_xdp_adjust_tail+0x155/0x1d0 +[1136314.501269] bpf_prog_ccc47ae29d3b6570_xdp_sock_prog+0x15/0x60 +[1136314.511263] ice_clean_rx_irq_zc+0x206/0xc60 [ice] +[1136314.520222] ? ice_xmit_zc+0x6e/0x150 [ice] +[1136314.528506] ice_napi_poll+0x467/0x670 [ice] +[1136314.536858] ? ttwu_do_activate.constprop.0+0x8f/0x1a0 +[1136314.546010] __napi_poll+0x29/0x1b0 +[1136314.553462] net_rx_action+0x133/0x270 +[1136314.561619] __do_softirq+0xbe/0x28e +[1136314.569303] do_softirq+0x3f/0x60 + +This comes from __xdp_return() call with xdp_buff argument passed as +NULL which is supposed to be consumed by xsk_buff_free() call. + +To address this properly, in ZC case, a node that represents the frag +being removed has to be pulled out of xskb_list. Introduce +appropriate xsk helpers to do such node operation and use them +accordingly within bpf_xdp_adjust_tail(). 
+ +Fixes: 24ea50127ecf ("xsk: support mbuf on ZC RX") +Acked-by: Magnus Karlsson # For the xsk header part +Signed-off-by: Maciej Fijalkowski +Link: https://lore.kernel.org/r/20240124191602.566724-4-maciej.fijalkowski@intel.com +Signed-off-by: Alexei Starovoitov +Signed-off-by: Sasha Levin +--- + include/net/xdp_sock_drv.h | 26 +++++++++++++++++++++++ + net/core/filter.c | 42 ++++++++++++++++++++++++++++++++------ + 2 files changed, 62 insertions(+), 6 deletions(-) + +diff --git a/include/net/xdp_sock_drv.h b/include/net/xdp_sock_drv.h +index 7290eb721c07..5425f7ad5ebd 100644 +--- a/include/net/xdp_sock_drv.h ++++ b/include/net/xdp_sock_drv.h +@@ -147,6 +147,23 @@ static inline struct xdp_buff *xsk_buff_get_frag(struct xdp_buff *first) + return ret; + } + ++static inline void xsk_buff_del_tail(struct xdp_buff *tail) ++{ ++ struct xdp_buff_xsk *xskb = container_of(tail, struct xdp_buff_xsk, xdp); ++ ++ list_del(&xskb->xskb_list_node); ++} ++ ++static inline struct xdp_buff *xsk_buff_get_tail(struct xdp_buff *first) ++{ ++ struct xdp_buff_xsk *xskb = container_of(first, struct xdp_buff_xsk, xdp); ++ struct xdp_buff_xsk *frag; ++ ++ frag = list_last_entry(&xskb->pool->xskb_list, struct xdp_buff_xsk, ++ xskb_list_node); ++ return &frag->xdp; ++} ++ + static inline void xsk_buff_set_size(struct xdp_buff *xdp, u32 size) + { + xdp->data = xdp->data_hard_start + XDP_PACKET_HEADROOM; +@@ -310,6 +327,15 @@ static inline struct xdp_buff *xsk_buff_get_frag(struct xdp_buff *first) + return NULL; + } + ++static inline void xsk_buff_del_tail(struct xdp_buff *tail) ++{ ++} ++ ++static inline struct xdp_buff *xsk_buff_get_tail(struct xdp_buff *first) ++{ ++ return NULL; ++} ++ + static inline void xsk_buff_set_size(struct xdp_buff *xdp, u32 size) + { + } +diff --git a/net/core/filter.c b/net/core/filter.c +index 1737884be52f..6575288b8580 100644 +--- a/net/core/filter.c ++++ b/net/core/filter.c +@@ -83,6 +83,7 @@ + #include + #include + #include ++#include + + #include "dev.h" + 
+@@ -4094,6 +4095,40 @@ static int bpf_xdp_frags_increase_tail(struct xdp_buff *xdp, int offset) + return 0; + } + ++static void bpf_xdp_shrink_data_zc(struct xdp_buff *xdp, int shrink, ++ struct xdp_mem_info *mem_info, bool release) ++{ ++ struct xdp_buff *zc_frag = xsk_buff_get_tail(xdp); ++ ++ if (release) { ++ xsk_buff_del_tail(zc_frag); ++ __xdp_return(NULL, mem_info, false, zc_frag); ++ } else { ++ zc_frag->data_end -= shrink; ++ } ++} ++ ++static bool bpf_xdp_shrink_data(struct xdp_buff *xdp, skb_frag_t *frag, ++ int shrink) ++{ ++ struct xdp_mem_info *mem_info = &xdp->rxq->mem; ++ bool release = skb_frag_size(frag) == shrink; ++ ++ if (mem_info->type == MEM_TYPE_XSK_BUFF_POOL) { ++ bpf_xdp_shrink_data_zc(xdp, shrink, mem_info, release); ++ goto out; ++ } ++ ++ if (release) { ++ struct page *page = skb_frag_page(frag); ++ ++ __xdp_return(page_address(page), mem_info, false, NULL); ++ } ++ ++out: ++ return release; ++} ++ + static int bpf_xdp_frags_shrink_tail(struct xdp_buff *xdp, int offset) + { + struct skb_shared_info *sinfo = xdp_get_shared_info_from_buff(xdp); +@@ -4108,12 +4143,7 @@ static int bpf_xdp_frags_shrink_tail(struct xdp_buff *xdp, int offset) + + len_free += shrink; + offset -= shrink; +- +- if (skb_frag_size(frag) == shrink) { +- struct page *page = skb_frag_page(frag); +- +- __xdp_return(page_address(page), &xdp->rxq->mem, +- false, NULL); ++ if (bpf_xdp_shrink_data(xdp, frag, shrink)) { + n_frags_free++; + } else { + skb_frag_size_sub(frag, shrink); +-- +2.43.0 + diff --git a/queue-6.7/xsk-make-xsk_buff_pool-responsible-for-clearing-xdp_.patch b/queue-6.7/xsk-make-xsk_buff_pool-responsible-for-clearing-xdp_.patch new file mode 100644 index 00000000000..be79226dedc --- /dev/null +++ b/queue-6.7/xsk-make-xsk_buff_pool-responsible-for-clearing-xdp_.patch @@ -0,0 +1,107 @@ +From 4b6c54fb2e4f8e8a3de0a7e6e4fd37f4ccaf58d0 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Wed, 24 Jan 2024 20:15:53 +0100 +Subject: xsk: make xsk_buff_pool 
responsible for clearing xdp_buff::flags + +From: Maciej Fijalkowski + +[ Upstream commit f7f6aa8e24383fbb11ac55942e66da9660110f80 ] + +XDP multi-buffer support introduced XDP_FLAGS_HAS_FRAGS flag that is +used by drivers to notify data path whether xdp_buff contains fragments +or not. Data path looks up mentioned flag on first buffer that occupies +the linear part of xdp_buff, so drivers only modify it there. This is +sufficient for SKB and XDP_DRV modes as usually xdp_buff is allocated on +stack or it resides within struct representing driver's queue and +fragments are carried via skb_frag_t structs. IOW, we are dealing with +only one xdp_buff. + +ZC mode though relies on list of xdp_buff structs that is carried via +xsk_buff_pool::xskb_list, so ZC data path has to make sure that +fragments do *not* have XDP_FLAGS_HAS_FRAGS set. Otherwise, +xsk_buff_free() could misbehave if it would be executed against xdp_buff +that carries a frag with XDP_FLAGS_HAS_FRAGS flag set. Such scenario can +take place when within supplied XDP program bpf_xdp_adjust_tail() is +used with negative offset that would in turn release the tail fragment +from multi-buffer frame. + +Calling xsk_buff_free() on tail fragment with XDP_FLAGS_HAS_FRAGS would +result in releasing all the nodes from xskb_list that were produced by +driver before XDP program execution, which is not what is intended - +only tail fragment should be deleted from xskb_list and then it should +be put onto xsk_buff_pool::free_list. Such multi-buffer frame will never +make it up to user space, so from AF_XDP application POV there would be +no traffic running, however due to free_list getting constantly new +nodes, driver will be able to feed HW Rx queue with recycled buffers. +Bottom line is that instead of traffic being redirected to user space, +it would be continuously dropped. 
+ +To fix this, let us clear the mentioned flag on xsk_buff_pool side +during xdp_buff initialization, which is what should have been done +right from the start of XSK multi-buffer support. + +Fixes: 1bbc04de607b ("ice: xsk: add RX multi-buffer support") +Fixes: 1c9ba9c14658 ("i40e: xsk: add RX multi-buffer support") +Fixes: 24ea50127ecf ("xsk: support mbuf on ZC RX") +Signed-off-by: Maciej Fijalkowski +Link: https://lore.kernel.org/r/20240124191602.566724-3-maciej.fijalkowski@intel.com +Signed-off-by: Alexei Starovoitov +Signed-off-by: Sasha Levin +--- + drivers/net/ethernet/intel/i40e/i40e_xsk.c | 1 - + drivers/net/ethernet/intel/ice/ice_xsk.c | 1 - + include/net/xdp_sock_drv.h | 1 + + net/xdp/xsk_buff_pool.c | 1 + + 4 files changed, 2 insertions(+), 2 deletions(-) + +diff --git a/drivers/net/ethernet/intel/i40e/i40e_xsk.c b/drivers/net/ethernet/intel/i40e/i40e_xsk.c +index e99fa854d17f..fede0bb3e047 100644 +--- a/drivers/net/ethernet/intel/i40e/i40e_xsk.c ++++ b/drivers/net/ethernet/intel/i40e/i40e_xsk.c +@@ -499,7 +499,6 @@ int i40e_clean_rx_irq_zc(struct i40e_ring *rx_ring, int budget) + xdp_res = i40e_run_xdp_zc(rx_ring, first, xdp_prog); + i40e_handle_xdp_result_zc(rx_ring, first, rx_desc, &rx_packets, + &rx_bytes, xdp_res, &failure); +- first->flags = 0; + next_to_clean = next_to_process; + if (failure) + break; +diff --git a/drivers/net/ethernet/intel/ice/ice_xsk.c b/drivers/net/ethernet/intel/ice/ice_xsk.c +index 99954508184f..951f84bfdf2b 100644 +--- a/drivers/net/ethernet/intel/ice/ice_xsk.c ++++ b/drivers/net/ethernet/intel/ice/ice_xsk.c +@@ -891,7 +891,6 @@ int ice_clean_rx_irq_zc(struct ice_rx_ring *rx_ring, int budget) + + if (!first) { + first = xdp; +- xdp_buff_clear_frags_flag(first); + } else if (ice_add_xsk_frag(rx_ring, first, xdp, size)) { + break; + } +diff --git a/include/net/xdp_sock_drv.h b/include/net/xdp_sock_drv.h +index 1f6fc8c7a84c..7290eb721c07 100644 +--- a/include/net/xdp_sock_drv.h ++++ b/include/net/xdp_sock_drv.h +@@ -152,6 
+152,7 @@ static inline void xsk_buff_set_size(struct xdp_buff *xdp, u32 size) + xdp->data = xdp->data_hard_start + XDP_PACKET_HEADROOM; + xdp->data_meta = xdp->data; + xdp->data_end = xdp->data + size; ++ xdp->flags = 0; + } + + static inline dma_addr_t xsk_buff_raw_get_dma(struct xsk_buff_pool *pool, +diff --git a/net/xdp/xsk_buff_pool.c b/net/xdp/xsk_buff_pool.c +index 49cb9f9a09be..b0a611677865 100644 +--- a/net/xdp/xsk_buff_pool.c ++++ b/net/xdp/xsk_buff_pool.c +@@ -541,6 +541,7 @@ struct xdp_buff *xp_alloc(struct xsk_buff_pool *pool) + + xskb->xdp.data = xskb->xdp.data_hard_start + XDP_PACKET_HEADROOM; + xskb->xdp.data_meta = xskb->xdp.data; ++ xskb->xdp.flags = 0; + + if (pool->dma_need_sync) { + dma_sync_single_range_for_device(pool->dev, xskb->dma, 0, +-- +2.43.0 + diff --git a/queue-6.7/xsk-recycle-buffer-in-case-rx-queue-was-full.patch b/queue-6.7/xsk-recycle-buffer-in-case-rx-queue-was-full.patch new file mode 100644 index 00000000000..0df44b920ef --- /dev/null +++ b/queue-6.7/xsk-recycle-buffer-in-case-rx-queue-was-full.patch @@ -0,0 +1,58 @@ +From c2fe6af64698a43889e90bbda82f4a926d00e464 Mon Sep 17 00:00:00 2001 +From: Sasha Levin +Date: Wed, 24 Jan 2024 20:15:52 +0100 +Subject: xsk: recycle buffer in case Rx queue was full + +From: Maciej Fijalkowski + +[ Upstream commit 269009893146c495f41e9572dd9319e787c2eba9 ] + +Add missing xsk_buff_free() call when __xsk_rcv_zc() failed to produce +descriptor to XSK Rx queue. 
+ +Fixes: 24ea50127ecf ("xsk: support mbuf on ZC RX") +Acked-by: Magnus Karlsson +Signed-off-by: Maciej Fijalkowski +Link: https://lore.kernel.org/r/20240124191602.566724-2-maciej.fijalkowski@intel.com +Signed-off-by: Alexei Starovoitov +Signed-off-by: Sasha Levin +--- + net/xdp/xsk.c | 12 ++++++++---- + 1 file changed, 8 insertions(+), 4 deletions(-) + +diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c +index 3da0b52f308d..688e641cd278 100644 +--- a/net/xdp/xsk.c ++++ b/net/xdp/xsk.c +@@ -167,8 +167,10 @@ static int xsk_rcv_zc(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len) + contd = XDP_PKT_CONTD; + + err = __xsk_rcv_zc(xs, xskb, len, contd); +- if (err || likely(!frags)) +- goto out; ++ if (err) ++ goto err; ++ if (likely(!frags)) ++ return 0; + + xskb_list = &xskb->pool->xskb_list; + list_for_each_entry_safe(pos, tmp, xskb_list, xskb_list_node) { +@@ -177,11 +179,13 @@ static int xsk_rcv_zc(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len) + len = pos->xdp.data_end - pos->xdp.data; + err = __xsk_rcv_zc(xs, pos, len, contd); + if (err) +- return err; ++ goto err; + list_del(&pos->xskb_list_node); + } + +-out: ++ return 0; ++err: ++ xsk_buff_free(xdp); + return err; + } + +-- +2.43.0 +