--- /dev/null
+From d4c784e91102c0fe5bfb9431f7842247c55a3d6d Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Wed, 18 Oct 2023 08:42:18 +0100
+Subject: afs: Add comments on abort handling
+
+From: David Howells <dhowells@redhat.com>
+
+[ Upstream commit fe245c8fcdac339e6b42076c828a6bede3a5e948 ]
+
+Add some comments on AFS abort code handling in the rotation algorithm and
+adjust the errors produced to match.
+
+Reported-by: Jeffrey E Altman <jaltman@auristor.com>
+Signed-off-by: David Howells <dhowells@redhat.com>
+Reviewed-by: Jeffrey Altman <jaltman@auristor.com>
+cc: Marc Dionne <marc.dionne@auristor.com>
+cc: linux-afs@lists.infradead.org
+Stable-dep-of: 17ba6f0bd14f ("afs: Fix error handling with lookup via FS.InlineBulkStatus")
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ fs/afs/rotate.c | 101 ++++++++++++++++++++++++++++++++++++++++++------
+ 1 file changed, 90 insertions(+), 11 deletions(-)
+
+diff --git a/fs/afs/rotate.c b/fs/afs/rotate.c
+index a840c3588ebb..a3d127953ac6 100644
+--- a/fs/afs/rotate.c
++++ b/fs/afs/rotate.c
+@@ -13,6 +13,7 @@
+ #include <linux/sched/signal.h>
+ #include "internal.h"
+ #include "afs_fs.h"
++#include "protocol_uae.h"
+
+ /*
+ * Begin iteration through a server list, starting with the vnode's last used
+@@ -143,6 +144,11 @@ bool afs_select_fileserver(struct afs_operation *op)
+ case -ECONNABORTED:
+ /* The far side rejected the operation on some grounds. This
+ * might involve the server being busy or the volume having been moved.
++ *
++ * Note that various V* errors should not be sent to a cache manager
++ * by a fileserver as they should be translated to more modern UAE*
++ * errors instead. IBM AFS and OpenAFS fileservers, however, do leak
++ * these abort codes.
+ */
+ switch (op->ac.abort_code) {
+ case VNOVOL:
+@@ -150,6 +156,11 @@ bool afs_select_fileserver(struct afs_operation *op)
+ * - May indicate that the VL is wrong - retry once and compare
+ * the results.
+ * - May indicate that the fileserver couldn't attach to the vol.
++ * - The volume might have been temporarily removed so that it can
++ * be replaced by a volume restore. "vos" might have ended one
++ * transaction and has yet to create the next.
++ * - The volume might not be blessed or might not be in-service
++ * (administrative action).
+ */
+ if (op->flags & AFS_OPERATION_VNOVOL) {
+ op->error = -EREMOTEIO;
+@@ -183,16 +194,56 @@ bool afs_select_fileserver(struct afs_operation *op)
+ _leave(" = t [vnovol]");
+ return true;
+
+- case VSALVAGE: /* TODO: Should this return an error or iterate? */
+ case VVOLEXISTS:
+- case VNOSERVICE:
+ case VONLINE:
+- case VDISKFULL:
+- case VOVERQUOTA:
+- op->error = afs_abort_to_error(op->ac.abort_code);
++ /* These should not be returned from the fileserver. */
++ pr_warn("Fileserver returned unexpected abort %d\n",
++ op->ac.abort_code);
++ op->error = -EREMOTEIO;
++ goto next_server;
++
++ case VNOSERVICE:
++ /* Prior to AFS 3.2 VNOSERVICE was returned from the fileserver
++ * if the volume was neither in-service nor administratively
++ * blessed. All usage was replaced by VNOVOL because AFS 3.1 and
++ * earlier cache managers did not handle VNOSERVICE and assumed
++ * it was the client OS's errno 105.
++ *
++ * Starting with OpenAFS 1.4.8 VNOSERVICE was repurposed as the
++ * fileserver idle dead time error which was sent in place of
++ * RX_CALL_TIMEOUT (-3). The error was intended to be sent if the
++ * fileserver took too long to send a reply to the client.
++ * RX_CALL_TIMEOUT would have caused the cache manager to mark the
++ * server down whereas VNOSERVICE since AFS 3.2 would cause the
++ * cache manager to temporarily (up to 15 minutes) mark the volume
++ * instance as unusable.
++ *
++ * The idle dead logic resulted in cache inconsistency since a
++ * state changing call that the cache manager assumed was dead
++ * could still be processed to completion by the fileserver. This
++ * logic was removed in OpenAFS 1.8.0 and VNOSERVICE is no longer
++ * returned. However, many 1.4.8 through 1.6.24 fileservers are
++ * still in existence.
++ *
++ * AuriStorFS fileservers have never returned VNOSERVICE.
++ *
++ * VNOSERVICE should be treated as an alias for RX_CALL_TIMEOUT.
++ */
++ case RX_CALL_TIMEOUT:
++ op->error = -ETIMEDOUT;
+ goto next_server;
+
++ case VSALVAGING: /* This error should not be leaked to cache managers
++ * but is from OpenAFS demand attach fileservers.
++ * It should be treated as an alias for VOFFLINE.
++ */
++ case VSALVAGE: /* VSALVAGE should be treated as a synonym of VOFFLINE */
+ case VOFFLINE:
++ /* The volume is in use by the volserver or another volume utility
++ * for an operation that might alter the contents. The volume is
++ * expected to come back but it might take a long time (could be
++ * days).
++ */
+ if (!test_and_set_bit(AFS_VOLUME_OFFLINE, &op->volume->flags)) {
+ afs_busy(op->volume, op->ac.abort_code);
+ clear_bit(AFS_VOLUME_BUSY, &op->volume->flags);
+@@ -207,11 +258,20 @@ bool afs_select_fileserver(struct afs_operation *op)
+ }
+ goto busy;
+
+- case VSALVAGING:
+- case VRESTARTING:
++ case VRESTARTING: /* The fileserver is either shutting down or starting up. */
+ case VBUSY:
+- /* Retry after going round all the servers unless we
+- * have a file lock we need to maintain.
++ /* The volume is in use by the volserver or another volume
++ * utility for an operation that is not expected to alter the
++ * contents of the volume. VBUSY does not need to be returned
++ * for a ROVOL or BACKVOL bound to an ITBusy volserver
++ * transaction. The fileserver is permitted to continue serving
++ * content from ROVOLs and BACKVOLs during an ITBusy transaction
++ * because the content will not change. However, many fileserver
++ * releases do return VBUSY for ROVOL and BACKVOL instances under
++ * many circumstances.
++ *
++ * Retry after going round all the servers unless we have a file
++ * lock we need to maintain.
+ */
+ if (op->flags & AFS_OPERATION_NO_VSLEEP) {
+ op->error = -EBUSY;
+@@ -226,7 +286,7 @@ bool afs_select_fileserver(struct afs_operation *op)
+ if (!afs_sleep_and_retry(op))
+ goto failed;
+
+- /* Retry with same server & address */
++ /* Retry with same server & address */
+ _leave(" = t [vbusy]");
+ return true;
+ }
+@@ -270,10 +330,29 @@ bool afs_select_fileserver(struct afs_operation *op)
+
+ goto restart_from_beginning;
+
++ case VDISKFULL:
++ case UAENOSPC:
++ /* The partition is full. Only applies to RWVOLs.
++ * Translate locally and return ENOSPC.
++ * No replicas to failover to.
++ */
++ op->error = -ENOSPC;
++ goto failed_but_online;
++
++ case VOVERQUOTA:
++ case UAEDQUOT:
++ /* Volume is full. Only applies to RWVOLs.
++ * Translate locally and return EDQUOT.
++ * No replicas to failover to.
++ */
++ op->error = -EDQUOT;
++ goto failed_but_online;
++
+ default:
++ op->error = afs_abort_to_error(op->ac.abort_code);
++ failed_but_online:
+ clear_bit(AFS_VOLUME_OFFLINE, &op->volume->flags);
+ clear_bit(AFS_VOLUME_BUSY, &op->volume->flags);
+- op->error = afs_abort_to_error(op->ac.abort_code);
+ goto failed;
+ }
+
+--
+2.43.0
+
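The comment block added by this patch amounts to a decision table: each abort
code either rotates the operation to another fileserver, sleeps and retries the
volume, or is translated to an errno and fails the operation. Below is a
minimal stand-alone C sketch of that table, not the kernel's implementation;
the EX_* constants and the afs_next_action enum are invented stand-ins (the
real definitions live under fs/afs).

/*
 * Minimal sketch of the decision table described by the comments above.
 * The EX_* values and enum afs_next_action are illustrative only.
 */
#include <errno.h>

enum afs_next_action {
	AFS_NEXT_SERVER,	/* rotate to another fileserver */
	AFS_SLEEP_RETRY,	/* transient: wait, then retry */
	AFS_FAIL,		/* record errno and stop */
};

enum {	/* hypothetical abort-code values, for illustration */
	EX_VSALVAGE = 101, EX_VNOSERVICE = 105, EX_VOFFLINE = 106,
	EX_VDISKFULL = 108, EX_VOVERQUOTA = 109, EX_VBUSY = 110,
	EX_VSALVAGING = 113, EX_VRESTARTING = 114,
};

static enum afs_next_action classify_abort(int abort_code, int *errp)
{
	switch (abort_code) {
	case EX_VBUSY:		/* non-content-changing volserver op */
	case EX_VRESTARTING:	/* fileserver stopping or starting */
	case EX_VSALVAGING:	/* treated as an alias of VOFFLINE */
	case EX_VSALVAGE:	/* treated as an alias of VOFFLINE */
	case EX_VOFFLINE:	/* content-changing volserver op */
		*errp = -EBUSY;
		return AFS_SLEEP_RETRY;
	case EX_VNOSERVICE:	/* alias of RX_CALL_TIMEOUT since 1.4.8 */
		*errp = -ETIMEDOUT;
		return AFS_NEXT_SERVER;
	case EX_VDISKFULL:	/* partition full; no replica helps */
		*errp = -ENOSPC;
		return AFS_FAIL;
	case EX_VOVERQUOTA:	/* volume quota hit; no replica helps */
		*errp = -EDQUOT;
		return AFS_FAIL;
	default:		/* unexpected abort from the server */
		*errp = -EREMOTEIO;
		return AFS_FAIL;
	}
}
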
--- /dev/null
+From 2885f7375cc37abe94fcd4895fa663b6b24e4904 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Thu, 26 Oct 2023 09:54:07 +0100
+Subject: afs: Don't put afs_call in afs_wait_for_call_to_complete()
+
+From: David Howells <dhowells@redhat.com>
+
+[ Upstream commit 6f2ff7e89bd05677f4c08fccafcf625ca3e09c1c ]
+
+Don't put the afs_call struct in afs_wait_for_call_to_complete() but rather
+have the caller do it. This will allow the caller to fish stuff out of the
+afs_call struct rather than the afs_addr_cursor struct, thereby allowing a
+subsequent patch to subsume it.
+
+Signed-off-by: David Howells <dhowells@redhat.com>
+cc: Marc Dionne <marc.dionne@auristor.com>
+cc: linux-afs@lists.infradead.org
+Stable-dep-of: 17ba6f0bd14f ("afs: Fix error handling with lookup via FS.InlineBulkStatus")
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ fs/afs/fs_operation.c | 7 +++--
+ fs/afs/fsclient.c | 5 ++-
+ fs/afs/internal.h | 2 +-
+ fs/afs/rxrpc.c | 73 ++++++++++++++++---------------------------
+ fs/afs/vlclient.c | 64 ++++++++++++++++++++++---------------
+ 5 files changed, 75 insertions(+), 76 deletions(-)
+
+diff --git a/fs/afs/fs_operation.c b/fs/afs/fs_operation.c
+index bfb9a7634bd9..1c22d6e77846 100644
+--- a/fs/afs/fs_operation.c
++++ b/fs/afs/fs_operation.c
+@@ -191,8 +191,11 @@ void afs_wait_for_operation(struct afs_operation *op)
+ else
+ op->ac.error = -ENOTSUPP;
+
+- if (op->call)
+- op->error = afs_wait_for_call_to_complete(op->call, &op->ac);
++ if (op->call) {
++ afs_wait_for_call_to_complete(op->call, &op->ac);
++ op->error = op->ac.error;
++ afs_put_call(op->call);
++ }
+ }
+
+ switch (op->error) {
+diff --git a/fs/afs/fsclient.c b/fs/afs/fsclient.c
+index 6821ce0f9d63..020073387111 100644
+--- a/fs/afs/fsclient.c
++++ b/fs/afs/fsclient.c
+@@ -1612,6 +1612,7 @@ int afs_fs_give_up_all_callbacks(struct afs_net *net,
+ {
+ struct afs_call *call;
+ __be32 *bp;
++ int ret;
+
+ _enter("");
+
+@@ -1627,7 +1628,9 @@ int afs_fs_give_up_all_callbacks(struct afs_net *net,
+
+ call->server = afs_use_server(server, afs_server_trace_give_up_cb);
+ afs_make_call(ac, call, GFP_NOFS);
+- return afs_wait_for_call_to_complete(call, ac);
++ afs_wait_for_call_to_complete(call, ac);
++ afs_put_call(call);
++ return ret;
+ }
+
+ /*
+diff --git a/fs/afs/internal.h b/fs/afs/internal.h
+index 1a306df267b0..45c4526b56be 100644
+--- a/fs/afs/internal.h
++++ b/fs/afs/internal.h
+@@ -1291,7 +1291,7 @@ extern void __net_exit afs_close_socket(struct afs_net *);
+ extern void afs_charge_preallocation(struct work_struct *);
+ extern void afs_put_call(struct afs_call *);
+ extern void afs_make_call(struct afs_addr_cursor *, struct afs_call *, gfp_t);
+-extern long afs_wait_for_call_to_complete(struct afs_call *, struct afs_addr_cursor *);
++void afs_wait_for_call_to_complete(struct afs_call *call, struct afs_addr_cursor *ac);
+ extern struct afs_call *afs_alloc_flat_call(struct afs_net *,
+ const struct afs_call_type *,
+ size_t, size_t);
+diff --git a/fs/afs/rxrpc.c b/fs/afs/rxrpc.c
+index 2603db03b7ff..dad8efadbc44 100644
+--- a/fs/afs/rxrpc.c
++++ b/fs/afs/rxrpc.c
+@@ -575,48 +575,44 @@ static void afs_deliver_to_call(struct afs_call *call)
+ /*
+ * Wait synchronously for a call to complete and clean up the call struct.
+ */
+-long afs_wait_for_call_to_complete(struct afs_call *call,
+- struct afs_addr_cursor *ac)
++void afs_wait_for_call_to_complete(struct afs_call *call, struct afs_addr_cursor *ac)
+ {
+- long ret;
+ bool rxrpc_complete = false;
+
+- DECLARE_WAITQUEUE(myself, current);
+-
+ _enter("");
+
+- ret = call->error;
+- if (ret < 0)
+- goto out;
++ if (!afs_check_call_state(call, AFS_CALL_COMPLETE)) {
++ DECLARE_WAITQUEUE(myself, current);
++
++ add_wait_queue(&call->waitq, &myself);
++ for (;;) {
++ set_current_state(TASK_UNINTERRUPTIBLE);
++
++ /* deliver any messages that are in the queue */
++ if (!afs_check_call_state(call, AFS_CALL_COMPLETE) &&
++ call->need_attention) {
++ call->need_attention = false;
++ __set_current_state(TASK_RUNNING);
++ afs_deliver_to_call(call);
++ continue;
++ }
+
+- add_wait_queue(&call->waitq, &myself);
+- for (;;) {
+- set_current_state(TASK_UNINTERRUPTIBLE);
+-
+- /* deliver any messages that are in the queue */
+- if (!afs_check_call_state(call, AFS_CALL_COMPLETE) &&
+- call->need_attention) {
+- call->need_attention = false;
+- __set_current_state(TASK_RUNNING);
+- afs_deliver_to_call(call);
+- continue;
+- }
++ if (afs_check_call_state(call, AFS_CALL_COMPLETE))
++ break;
+
+- if (afs_check_call_state(call, AFS_CALL_COMPLETE))
+- break;
++ if (!rxrpc_kernel_check_life(call->net->socket, call->rxcall)) {
++ /* rxrpc terminated the call. */
++ rxrpc_complete = true;
++ break;
++ }
+
+- if (!rxrpc_kernel_check_life(call->net->socket, call->rxcall)) {
+- /* rxrpc terminated the call. */
+- rxrpc_complete = true;
+- break;
++ schedule();
+ }
+
+- schedule();
++ remove_wait_queue(&call->waitq, &myself);
++ __set_current_state(TASK_RUNNING);
+ }
+
+- remove_wait_queue(&call->waitq, &myself);
+- __set_current_state(TASK_RUNNING);
+-
+ if (!afs_check_call_state(call, AFS_CALL_COMPLETE)) {
+ if (rxrpc_complete) {
+ afs_set_call_complete(call, call->error, call->abort_code);
+@@ -635,23 +631,8 @@ long afs_wait_for_call_to_complete(struct afs_call *call,
+ ac->error = call->error;
+ spin_unlock_bh(&call->state_lock);
+
+- ret = ac->error;
+- switch (ret) {
+- case 0:
+- ret = call->ret0;
+- call->ret0 = 0;
+-
+- fallthrough;
+- case -ECONNABORTED:
++ if (call->error == 0 || call->error == -ECONNABORTED)
+ ac->responded = true;
+- break;
+- }
+-
+-out:
+- _debug("call complete");
+- afs_put_call(call);
+- _leave(" = %p", (void *)ret);
+- return ret;
+ }
+
+ /*
+diff --git a/fs/afs/vlclient.c b/fs/afs/vlclient.c
+index 41e7932d75c6..650534892a20 100644
+--- a/fs/afs/vlclient.c
++++ b/fs/afs/vlclient.c
+@@ -106,12 +106,6 @@ static int afs_deliver_vl_get_entry_by_name_u(struct afs_call *call)
+ return 0;
+ }
+
+-static void afs_destroy_vl_get_entry_by_name_u(struct afs_call *call)
+-{
+- kfree(call->ret_vldb);
+- afs_flat_call_destructor(call);
+-}
+-
+ /*
+ * VL.GetEntryByNameU operation type.
+ */
+@@ -119,7 +113,7 @@ static const struct afs_call_type afs_RXVLGetEntryByNameU = {
+ .name = "VL.GetEntryByNameU",
+ .op = afs_VL_GetEntryByNameU,
+ .deliver = afs_deliver_vl_get_entry_by_name_u,
+- .destructor = afs_destroy_vl_get_entry_by_name_u,
++ .destructor = afs_flat_call_destructor,
+ };
+
+ /*
+@@ -166,7 +160,13 @@ struct afs_vldb_entry *afs_vl_get_entry_by_name_u(struct afs_vl_cursor *vc,
+
+ trace_afs_make_vl_call(call);
+ afs_make_call(&vc->ac, call, GFP_KERNEL);
+- return (struct afs_vldb_entry *)afs_wait_for_call_to_complete(call, &vc->ac);
++ afs_wait_for_call_to_complete(call, &vc->ac);
++ afs_put_call(call);
++ if (vc->ac.error) {
++ kfree(entry);
++ return ERR_PTR(vc->ac.error);
++ }
++ return entry;
+ }
+
+ /*
+@@ -249,12 +249,6 @@ static int afs_deliver_vl_get_addrs_u(struct afs_call *call)
+ return 0;
+ }
+
+-static void afs_vl_get_addrs_u_destructor(struct afs_call *call)
+-{
+- afs_put_addrlist(call->ret_alist);
+- return afs_flat_call_destructor(call);
+-}
+-
+ /*
+ * VL.GetAddrsU operation type.
+ */
+@@ -262,7 +256,7 @@ static const struct afs_call_type afs_RXVLGetAddrsU = {
+ .name = "VL.GetAddrsU",
+ .op = afs_VL_GetAddrsU,
+ .deliver = afs_deliver_vl_get_addrs_u,
+- .destructor = afs_vl_get_addrs_u_destructor,
++ .destructor = afs_flat_call_destructor,
+ };
+
+ /*
+@@ -273,6 +267,7 @@ struct afs_addr_list *afs_vl_get_addrs_u(struct afs_vl_cursor *vc,
+ const uuid_t *uuid)
+ {
+ struct afs_ListAddrByAttributes__xdr *r;
++ struct afs_addr_list *alist;
+ const struct afs_uuid *u = (const struct afs_uuid *)uuid;
+ struct afs_call *call;
+ struct afs_net *net = vc->cell->net;
+@@ -309,7 +304,14 @@ struct afs_addr_list *afs_vl_get_addrs_u(struct afs_vl_cursor *vc,
+
+ trace_afs_make_vl_call(call);
+ afs_make_call(&vc->ac, call, GFP_KERNEL);
+- return (struct afs_addr_list *)afs_wait_for_call_to_complete(call, &vc->ac);
++ afs_wait_for_call_to_complete(call, &vc->ac);
++ alist = call->ret_alist;
++ afs_put_call(call);
++ if (vc->ac.error) {
++ afs_put_addrlist(alist);
++ return ERR_PTR(vc->ac.error);
++ }
++ return alist;
+ }
+
+ /*
+@@ -618,7 +620,7 @@ static const struct afs_call_type afs_YFSVLGetEndpoints = {
+ .name = "YFSVL.GetEndpoints",
+ .op = afs_YFSVL_GetEndpoints,
+ .deliver = afs_deliver_yfsvl_get_endpoints,
+- .destructor = afs_vl_get_addrs_u_destructor,
++ .destructor = afs_flat_call_destructor,
+ };
+
+ /*
+@@ -628,6 +630,7 @@ static const struct afs_call_type afs_YFSVLGetEndpoints = {
+ struct afs_addr_list *afs_yfsvl_get_endpoints(struct afs_vl_cursor *vc,
+ const uuid_t *uuid)
+ {
++ struct afs_addr_list *alist;
+ struct afs_call *call;
+ struct afs_net *net = vc->cell->net;
+ __be32 *bp;
+@@ -652,7 +655,14 @@ struct afs_addr_list *afs_yfsvl_get_endpoints(struct afs_vl_cursor *vc,
+
+ trace_afs_make_vl_call(call);
+ afs_make_call(&vc->ac, call, GFP_KERNEL);
+- return (struct afs_addr_list *)afs_wait_for_call_to_complete(call, &vc->ac);
++ afs_wait_for_call_to_complete(call, &vc->ac);
++ alist = call->ret_alist;
++ afs_put_call(call);
++ if (vc->ac.error) {
++ afs_put_addrlist(alist);
++ return ERR_PTR(vc->ac.error);
++ }
++ return alist;
+ }
+
+ /*
+@@ -717,12 +727,6 @@ static int afs_deliver_yfsvl_get_cell_name(struct afs_call *call)
+ return 0;
+ }
+
+-static void afs_destroy_yfsvl_get_cell_name(struct afs_call *call)
+-{
+- kfree(call->ret_str);
+- afs_flat_call_destructor(call);
+-}
+-
+ /*
+ * VL.GetCapabilities operation type
+ */
+@@ -730,7 +734,7 @@ static const struct afs_call_type afs_YFSVLGetCellName = {
+ .name = "YFSVL.GetCellName",
+ .op = afs_YFSVL_GetCellName,
+ .deliver = afs_deliver_yfsvl_get_cell_name,
+- .destructor = afs_destroy_yfsvl_get_cell_name,
++ .destructor = afs_flat_call_destructor,
+ };
+
+ /*
+@@ -745,6 +749,7 @@ char *afs_yfsvl_get_cell_name(struct afs_vl_cursor *vc)
+ struct afs_call *call;
+ struct afs_net *net = vc->cell->net;
+ __be32 *bp;
++ char *cellname;
+
+ _enter("");
+
+@@ -763,5 +768,12 @@ char *afs_yfsvl_get_cell_name(struct afs_vl_cursor *vc)
+ /* Can't take a ref on server */
+ trace_afs_make_vl_call(call);
+ afs_make_call(&vc->ac, call, GFP_KERNEL);
+- return (char *)afs_wait_for_call_to_complete(call, &vc->ac);
++ afs_wait_for_call_to_complete(call, &vc->ac);
++ cellname = call->ret_str;
++ afs_put_call(call);
++ if (vc->ac.error) {
++ kfree(cellname);
++ return ERR_PTR(vc->ac.error);
++ }
++ return cellname;
+ }
+--
+2.43.0
+
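The refactor above changes reference ownership: afs_wait_for_call_to_complete()
used to multiplex its result into a long and drop the final reference, whereas
now the caller waits, fishes the results out of the afs_call, and then puts the
call itself. A compilable stand-alone sketch of that ownership pattern follows;
every name here (fake_call, put_call, issue_and_wait) is invented, not the
kernel's API.

/*
 * Sketch of the ownership change: the wait helper no longer consumes
 * the caller's reference, so results can still be read out of the
 * request structure after waiting.  Invented types throughout.
 */
#include <stdio.h>
#include <stdlib.h>

struct fake_call {
	int refcount;
	int error;		/* final call status */
	char *ret_str;		/* reply payload, read out by the caller */
};

static void put_call(struct fake_call *call)
{
	if (--call->refcount == 0)
		free(call);
}

static void wait_for_call(struct fake_call *call)
{
	/* Stands in for blocking until the reply is final. */
	call->error = 0;
	call->ret_str = "reply";
}

static char *issue_and_wait(struct fake_call *call)
{
	char *result;

	wait_for_call(call);	/* no longer consumes the reference */
	result = call->error ? NULL : call->ret_str;
	put_call(call);		/* the caller drops its own reference */
	return result;
}

int main(void)
{
	struct fake_call *call = calloc(1, sizeof(*call));

	if (!call)
		return 1;
	call->refcount = 1;
	printf("%s\n", issue_and_wait(call));
	return 0;
}
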
--- /dev/null
+From 118984ee79c3e4b1b67ee3211d94d7d945c76fec Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Tue, 2 Jan 2024 14:02:37 +0000
+Subject: afs: Fix error handling with lookup via FS.InlineBulkStatus
+
+From: David Howells <dhowells@redhat.com>
+
+[ Upstream commit 17ba6f0bd14fe3ac606aac6bebe5e69bdaad8ba1 ]
+
+When afs does a lookup, it tries to use FS.InlineBulkStatus to preemptively
+look up a bunch of files in the parent directory and cache this locally, on
+the basis that we might want to look at them too (for example if someone
+does an ls on a directory, they may want to then stat every file
+listed).
+
+FS.InlineBulkStatus can be considered a compound op with the normal abort
+code applying to the compound as a whole. Each status fetch within the
+compound is then given its own individual abort code - but, assuming no
+error prevents the bulk fetch from returning, the compound result will
+be 0, even if all the constituent status fetches failed.
+
+At the conclusion of afs_do_lookup(), we should use the abort code from the
+appropriate status to determine the error to return, if any - but instead
+it is assumed that we were successful if the op as a whole succeeded and we
+return an incompletely initialised inode, resulting in ENOENT, no matter
+the actual reason. In the particular instance reported, a vnode with no
+permission granted to be accessed is being given a UAEACCES abort code
+which should be reported as EACCES, but is instead being reported as
+ENOENT.
+
+Fix this by abandoning the inode (which will be cleaned up with the op) if
+file[1] has an abort code indicated and turn that abort code into an error
+instead.
+
+Whilst we're at it, add a tracepoint so that the abort codes of the
+individual subrequests of FS.InlineBulkStatus can be logged. At the moment
+only the container abort code can be logged.
+
+Fixes: e49c7b2f6de7 ("afs: Build an abstraction around an "operation" concept")
+Reported-by: Jeffrey Altman <jaltman@auristor.com>
+Signed-off-by: David Howells <dhowells@redhat.com>
+Reviewed-by: Marc Dionne <marc.dionne@auristor.com>
+cc: linux-afs@lists.infradead.org
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ fs/afs/dir.c | 12 +++++++++---
+ include/trace/events/afs.h | 25 +++++++++++++++++++++++++
+ 2 files changed, 34 insertions(+), 3 deletions(-)
+
+diff --git a/fs/afs/dir.c b/fs/afs/dir.c
+index 75896a677b96..9140780be5a4 100644
+--- a/fs/afs/dir.c
++++ b/fs/afs/dir.c
+@@ -716,6 +716,8 @@ static void afs_do_lookup_success(struct afs_operation *op)
+ break;
+ }
+
++ if (vp->scb.status.abort_code)
++ trace_afs_bulkstat_error(op, &vp->fid, i, vp->scb.status.abort_code);
+ if (!vp->scb.have_status && !vp->scb.have_error)
+ continue;
+
+@@ -905,12 +907,16 @@ static struct inode *afs_do_lookup(struct inode *dir, struct dentry *dentry,
+ afs_begin_vnode_operation(op);
+ afs_wait_for_operation(op);
+ }
+- inode = ERR_PTR(afs_op_error(op));
+
+ out_op:
+ if (!afs_op_error(op)) {
+- inode = &op->file[1].vnode->netfs.inode;
+- op->file[1].vnode = NULL;
++ if (op->file[1].scb.status.abort_code) {
++ afs_op_accumulate_error(op, -ECONNABORTED,
++ op->file[1].scb.status.abort_code);
++ } else {
++ inode = &op->file[1].vnode->netfs.inode;
++ op->file[1].vnode = NULL;
++ }
+ }
+
+ if (op->file[0].scb.have_status)
+diff --git a/include/trace/events/afs.h b/include/trace/events/afs.h
+index e9d412d19dbb..caec276515dc 100644
+--- a/include/trace/events/afs.h
++++ b/include/trace/events/afs.h
+@@ -1216,6 +1216,31 @@ TRACE_EVENT(afs_file_error,
+ __print_symbolic(__entry->where, afs_file_errors))
+ );
+
++TRACE_EVENT(afs_bulkstat_error,
++ TP_PROTO(struct afs_operation *op, struct afs_fid *fid, unsigned int index, s32 abort),
++
++ TP_ARGS(op, fid, index, abort),
++
++ TP_STRUCT__entry(
++ __field_struct(struct afs_fid, fid)
++ __field(unsigned int, op)
++ __field(unsigned int, index)
++ __field(s32, abort)
++ ),
++
++ TP_fast_assign(
++ __entry->op = op->debug_id;
++ __entry->fid = *fid;
++ __entry->index = index;
++ __entry->abort = abort;
++ ),
++
++ TP_printk("OP=%08x[%02x] %llx:%llx:%x a=%d",
++ __entry->op, __entry->index,
++ __entry->fid.vid, __entry->fid.vnode, __entry->fid.unique,
++ __entry->abort)
++ );
++
+ TRACE_EVENT(afs_cm_no_server,
+ TP_PROTO(struct afs_call *call, struct sockaddr_rxrpc *srx),
+
+--
+2.43.0
+
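The fix above hinges on one rule: the FS.InlineBulkStatus container can
complete with abort code 0 while the entry actually being looked up carries its
own abort code, and that per-entry code must be surfaced instead of handing
back a half-initialised inode. A stand-alone sketch of the check follows, with
invented types; the kernel records the (-ECONNABORTED, abort_code) pair via
afs_op_accumulate_error() and translates it to an errno later.

/*
 * Sketch of the per-entry check: the container RPC returned 0, but the
 * target entry may still have aborted individually.  The structures
 * are invented stand-ins for the kernel's status-cache types.
 */
#include <errno.h>
#include <stddef.h>

struct status_entry {
	int abort_code;		/* per-entry abort, 0 on success */
	void *inode;		/* valid only when abort_code == 0 */
};

struct op_result {
	int error;		/* e.g. -ECONNABORTED */
	int abort_code;		/* remote abort to translate later */
};

static void *bulk_lookup_result(struct status_entry *entries, size_t target,
				struct op_result *res)
{
	struct status_entry *e = &entries[target];

	if (e->abort_code) {
		/* Record the abort instead of trusting e->inode. */
		res->error = -ECONNABORTED;
		res->abort_code = e->abort_code;
		return NULL;
	}
	res->error = 0;
	return e->inode;
}
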
--- /dev/null
+From f7c86f260e13437b71984ae0f0cd27554335461d Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Thu, 30 Nov 2023 12:56:14 +0100
+Subject: afs: fix the usage of read_seqbegin_or_lock() in afs_find_server*()
+
+From: Oleg Nesterov <oleg@redhat.com>
+
+[ Upstream commit 1702e0654ca9a7bcd7c7619c8a5004db58945b71 ]
+
+David Howells says:
+
+ (5) afs_find_server().
+
+ There could be a lot of servers in the list and each server can have
+ multiple addresses, so I think this would be better with an exclusive
+ second pass.
+
+ The server list isn't likely to change all that often, but when it does
+ change, there's a good chance several servers are going to be
+ added/removed one after the other. Further, this is only going to be
+ used for incoming cache management/callback requests from the server,
+ which hopefully aren't going to happen too often - but it is remotely
+ drivable.
+
+ (6) afs_find_server_by_uuid().
+
+ Similarly to (5), there could be a lot of servers to search through, but
+ they are in a tree not a flat list, so it should be faster to process.
+ Again, it's not likely to change that often and, again, when it does
+ change it's likely to involve multiple changes. This can be driven
+ remotely by an incoming cache management request but is mostly going to
+ be driven by setting up or reconfiguring a volume's server list -
+ something that also isn't likely to happen often.
+
+Make the "seq" counter odd on the 2nd pass, otherwise read_seqbegin_or_lock()
+never takes the lock.
+
+Signed-off-by: Oleg Nesterov <oleg@redhat.com>
+Signed-off-by: David Howells <dhowells@redhat.com>
+cc: Marc Dionne <marc.dionne@auristor.com>
+cc: linux-afs@lists.infradead.org
+Link: https://lore.kernel.org/r/20231130115614.GA21581@redhat.com/
+Stable-dep-of: 17ba6f0bd14f ("afs: Fix error handling with lookup via FS.InlineBulkStatus")
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ fs/afs/server.c | 7 ++++---
+ 1 file changed, 4 insertions(+), 3 deletions(-)
+
+diff --git a/fs/afs/server.c b/fs/afs/server.c
+index b5237206eac3..0bd2f5ba6900 100644
+--- a/fs/afs/server.c
++++ b/fs/afs/server.c
+@@ -27,7 +27,7 @@ struct afs_server *afs_find_server(struct afs_net *net,
+ const struct afs_addr_list *alist;
+ struct afs_server *server = NULL;
+ unsigned int i;
+- int seq = 0, diff;
++ int seq = 1, diff;
+
+ rcu_read_lock();
+
+@@ -35,6 +35,7 @@ struct afs_server *afs_find_server(struct afs_net *net,
+ if (server)
+ afs_unuse_server_notime(net, server, afs_server_trace_put_find_rsq);
+ server = NULL;
++ seq++; /* 2 on the 1st/lockless path, otherwise odd */
+ read_seqbegin_or_lock(&net->fs_addr_lock, &seq);
+
+ if (srx->transport.family == AF_INET6) {
+@@ -90,7 +91,7 @@ struct afs_server *afs_find_server_by_uuid(struct afs_net *net, const uuid_t *uu
+ {
+ struct afs_server *server = NULL;
+ struct rb_node *p;
+- int diff, seq = 0;
++ int diff, seq = 1;
+
+ _enter("%pU", uuid);
+
+@@ -102,7 +103,7 @@ struct afs_server *afs_find_server_by_uuid(struct afs_net *net, const uuid_t *uu
+ if (server)
+ afs_unuse_server(net, server, afs_server_trace_put_uuid_rsq);
+ server = NULL;
+-
++ seq++; /* 2 on the 1st/lockless path, otherwise odd */
+ read_seqbegin_or_lock(&net->fs_lock, &seq);
+
+ p = net->fs_servers.rb_node;
+--
+2.43.0
+
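The underlying pattern is the seqlock lockless-then-locked retry loop:
read_seqbegin_or_lock() only takes the spinlock when the sequence number passed
in is odd, so the counter must be stepped each time round, exactly as the two
seq++ lines above do. A kernel-style sketch of the corrected loop follows;
struct thing and thing_lock are invented, and this builds only in kernel
context.

/*
 * Kernel-style sketch of the corrected retry loop.  The first pass runs
 * locklessly (seq becomes even); if a concurrent write is detected, the
 * retry pass takes the lock (seq becomes odd), which is what
 * read_seqbegin_or_lock() requires.
 */
#include <linux/seqlock.h>
#include <linux/list.h>

struct thing {
	int key;
	struct list_head link;
};

static DEFINE_SEQLOCK(thing_lock);
static LIST_HEAD(things);

static struct thing *find_thing(int key)
{
	struct thing *t, *found;
	int seq = 1;

	do {
		found = NULL;
		seq++;	/* 2 on the 1st/lockless pass, odd on retries */
		read_seqbegin_or_lock(&thing_lock, &seq);
		list_for_each_entry(t, &things, link) {
			if (t->key == key) {
				found = t;
				break;
			}
		}
	} while (need_seqretry(&thing_lock, seq));
	done_seqretry(&thing_lock, seq);

	return found;
}
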
--- /dev/null
+From e9106b5b7e80e3ab85a87fbea7ea3ecdc53673cd Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Fri, 20 Oct 2023 16:00:18 +0100
+Subject: afs: Handle the VIO and UAEIO aborts explicitly
+
+From: David Howells <dhowells@redhat.com>
+
+[ Upstream commit eb8eae65f0c713bcef84b082aa919f72c3d83268 ]
+
+When processing the result of a call, handle the VIO and UAEIO abort
+specifically rather than leaving it to a default case. Rather than
+erroring out unconditionally, see if there's another server if the volume
+has more than one server available, otherwise return -EREMOTEIO.
+
+Signed-off-by: David Howells <dhowells@redhat.com>
+cc: Marc Dionne <marc.dionne@auristor.com>
+cc: linux-afs@lists.infradead.org
+Stable-dep-of: 17ba6f0bd14f ("afs: Fix error handling with lookup via FS.InlineBulkStatus")
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ fs/afs/rotate.c | 7 +++++++
+ 1 file changed, 7 insertions(+)
+
+diff --git a/fs/afs/rotate.c b/fs/afs/rotate.c
+index 59aed7a6dd11..a108cd55bb4e 100644
+--- a/fs/afs/rotate.c
++++ b/fs/afs/rotate.c
+@@ -330,6 +330,13 @@ bool afs_select_fileserver(struct afs_operation *op)
+
+ goto restart_from_beginning;
+
++ case UAEIO:
++ case VIO:
++ op->error = -EREMOTEIO;
++ if (op->volume->type != AFSVL_RWVOL)
++ goto next_server;
++ goto failed;
++
+ case VDISKFULL:
+ case UAENOSPC:
+ /* The partition is full. Only applies to RWVOLs.
+--
+2.43.0
+
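The rule encoded above is small: a hard I/O abort is terminal for a read/write
volume, which has exactly one instance, but replicated read-only and backup
volumes may still be served by another fileserver. A tiny illustrative sketch,
with an invented vol_type enum standing in for the kernel's check of
op->volume->type against AFSVL_RWVOL:

/*
 * Sketch of the VIO/UAEIO rule: only replicated volume types are worth
 * a rotation; a RW volume's I/O abort is final.  Illustrative only.
 */
#include <errno.h>
#include <stdbool.h>

enum vol_type { VOL_RW, VOL_RO, VOL_BACKUP };

/* true: rotate to the next server; false: fail the operation. */
static bool io_abort_should_rotate(enum vol_type type, int *errp)
{
	*errp = -EREMOTEIO;	/* recorded in either case */
	return type != VOL_RW;	/* only replicated volumes can fail over */
}
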
--- /dev/null
+From b8adfd03eeab12713e571102691a0551705137fc Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Mon, 8 Jan 2024 17:22:36 +0000
+Subject: afs: Hide silly-rename files from userspace
+
+From: David Howells <dhowells@redhat.com>
+
+[ Upstream commit 57e9d49c54528c49b8bffe6d99d782ea051ea534 ]
+
+There appears to be a race between silly-rename files being created/removed
+and various userspace tools iterating over the contents of a directory,
+leading to such errors as:
+
+ find: './kernel/.tmp_cpio_dir/include/dt-bindings/reset/.__afs2080': No such file or directory
+ tar: ./include/linux/greybus/.__afs3C95: File removed before we read it
+
+when building a kernel.
+
+Fix afs_readdir() so that it doesn't return .__afsXXXX silly-rename files
+to userspace. This doesn't stop them being looked up directly by name as
+we need to be able to look them up from within the kernel as part of the
+silly-rename algorithm.
+
+Fixes: 79ddbfa500b3 ("afs: Implement sillyrename for unlink and rename")
+Signed-off-by: David Howells <dhowells@redhat.com>
+cc: Marc Dionne <marc.dionne@auristor.com>
+cc: linux-afs@lists.infradead.org
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ fs/afs/dir.c | 8 ++++++++
+ 1 file changed, 8 insertions(+)
+
+diff --git a/fs/afs/dir.c b/fs/afs/dir.c
+index 5219182e52e1..2df2e9ee130d 100644
+--- a/fs/afs/dir.c
++++ b/fs/afs/dir.c
+@@ -474,6 +474,14 @@ static int afs_dir_iterate_block(struct afs_vnode *dvnode,
+ continue;
+ }
+
++ /* Don't expose silly rename entries to userspace. */
++ if (nlen > 6 &&
++ dire->u.name[0] == '.' &&
++ ctx->actor != afs_lookup_filldir &&
++ ctx->actor != afs_lookup_one_filldir &&
++ memcmp(dire->u.name, ".__afs", 6) == 0)
++ continue;
++
+ /* found the next entry */
+ if (!dir_emit(ctx, dire->u.name, nlen,
+ ntohl(dire->u.vnode),
+--
+2.43.0
+
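The filter reduces to a name predicate applied only on the enumeration path;
the two lookup filldirs are exempted so silly-renamed files can still be
resolved by exact name during cleanup. A stand-alone sketch of the predicate,
assuming only the ".__afs" prefix convention described above:

/*
 * Sketch of the readdir filter: names of the form ".__afsXXXX" are
 * silly-rename placeholders and are skipped during enumeration, while
 * a direct lookup by exact name must still succeed.
 */
#include <stdbool.h>
#include <string.h>

static bool is_silly_rename_name(const char *name, size_t nlen)
{
	/* ".__afs" plus at least one more character, e.g. ".__afs2080" */
	return nlen > 6 && memcmp(name, ".__afs", 6) == 0;
}
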
--- /dev/null
+From d2f27e70f3691aa4364cbb1f807ed0811f3a3ab0 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Wed, 25 Oct 2023 17:53:33 +0100
+Subject: afs: Simplify error handling
+
+From: David Howells <dhowells@redhat.com>
+
+[ Upstream commit aa453becce5d1ae1b94b7fc22f47d7b05d22b14e ]
+
+Simplify error handling a bit by moving it from the afs_addr_cursor struct
+to the afs_operation and afs_vl_cursor structs and using the error
+prioritisation function for accumulating errors from multiple sources (AFS
+tries to rotate between multiple fileservers, some of which may be
+inaccessible or in some state of offlinedness).
+
+Signed-off-by: David Howells <dhowells@redhat.com>
+cc: Marc Dionne <marc.dionne@auristor.com>
+cc: linux-afs@lists.infradead.org
+Stable-dep-of: 17ba6f0bd14f ("afs: Fix error handling with lookup via FS.InlineBulkStatus")
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ fs/afs/addr_list.c | 8 ++--
+ fs/afs/dir.c | 14 ++++---
+ fs/afs/dir_silly.c | 2 +-
+ fs/afs/file.c | 3 --
+ fs/afs/fs_operation.c | 24 +++++------
+ fs/afs/fsclient.c | 1 +
+ fs/afs/internal.h | 44 +++++++++++++++------
+ fs/afs/misc.c | 10 ++++-
+ fs/afs/rotate.c | 58 ++++++++++++++-------------
+ fs/afs/rxrpc.c | 17 ++++----
+ fs/afs/server.c | 1 -
+ fs/afs/vl_alias.c | 2 +-
+ fs/afs/vl_probe.c | 7 ++--
+ fs/afs/vl_rotate.c | 92 +++++++++++++++++++++----------------------
+ fs/afs/vlclient.c | 34 ++++++++++------
+ 15 files changed, 174 insertions(+), 143 deletions(-)
+
+diff --git a/fs/afs/addr_list.c b/fs/afs/addr_list.c
+index 519821f5aedc..f4837c3b8ae2 100644
+--- a/fs/afs/addr_list.c
++++ b/fs/afs/addr_list.c
+@@ -386,26 +386,24 @@ bool afs_iterate_addresses(struct afs_addr_cursor *ac)
+ selected:
+ ac->index = index;
+ set_bit(index, &ac->tried);
+- ac->responded = false;
++ ac->call_responded = false;
+ return true;
+ }
+
+ /*
+ * Release an address list cursor.
+ */
+-int afs_end_cursor(struct afs_addr_cursor *ac)
++void afs_end_cursor(struct afs_addr_cursor *ac)
+ {
+ struct afs_addr_list *alist;
+
+ alist = ac->alist;
+ if (alist) {
+- if (ac->responded &&
++ if (ac->call_responded &&
+ ac->index != alist->preferred &&
+ test_bit(ac->alist->preferred, &ac->tried))
+ WRITE_ONCE(alist->preferred, ac->index);
+ afs_put_addrlist(alist);
+ ac->alist = NULL;
+ }
+-
+- return ac->error;
+ }
+diff --git a/fs/afs/dir.c b/fs/afs/dir.c
+index 15763418a938..75896a677b96 100644
+--- a/fs/afs/dir.c
++++ b/fs/afs/dir.c
+@@ -701,8 +701,9 @@ static void afs_do_lookup_success(struct afs_operation *op)
+ vp = &op->file[0];
+ abort_code = vp->scb.status.abort_code;
+ if (abort_code != 0) {
+- op->ac.abort_code = abort_code;
+- op->error = afs_abort_to_error(abort_code);
++ op->call_abort_code = abort_code;
++ afs_op_set_error(op, afs_abort_to_error(abort_code));
++ op->cumul_error.abort_code = abort_code;
+ }
+ break;
+
+@@ -854,13 +855,14 @@ static struct inode *afs_do_lookup(struct inode *dir, struct dentry *dentry,
+ _debug("nr_files %u", op->nr_files);
+
+ /* Need space for examining all the selected files */
+- op->error = -ENOMEM;
+ if (op->nr_files > 2) {
+ op->more_files = kvcalloc(op->nr_files - 2,
+ sizeof(struct afs_vnode_param),
+ GFP_KERNEL);
+- if (!op->more_files)
++ if (!op->more_files) {
++ afs_op_nomem(op);
+ goto out_op;
++ }
+
+ for (i = 2; i < op->nr_files; i++) {
+ vp = &op->more_files[i - 2];
+@@ -1263,7 +1265,7 @@ void afs_check_for_remote_deletion(struct afs_operation *op)
+ {
+ struct afs_vnode *vnode = op->file[0].vnode;
+
+- switch (op->ac.abort_code) {
++ switch (afs_op_abort_code(op)) {
+ case VNOVNODE:
+ set_bit(AFS_VNODE_DELETED, &vnode->flags);
+ afs_break_callback(vnode, afs_cb_break_for_deleted);
+@@ -1288,7 +1290,7 @@ static void afs_vnode_new_inode(struct afs_operation *op)
+ /* ENOMEM or EINTR at a really inconvenient time - just abandon
+ * the new directory on the server.
+ */
+- op->error = PTR_ERR(inode);
++ afs_op_accumulate_error(op, PTR_ERR(inode), 0);
+ return;
+ }
+
+diff --git a/fs/afs/dir_silly.c b/fs/afs/dir_silly.c
+index bb5807e87fa4..a1e581946b93 100644
+--- a/fs/afs/dir_silly.c
++++ b/fs/afs/dir_silly.c
+@@ -218,7 +218,7 @@ static int afs_do_silly_unlink(struct afs_vnode *dvnode, struct afs_vnode *vnode
+ /* If there was a conflict with a third party, check the status of the
+ * unlinked vnode.
+ */
+- if (op->error == 0 && (op->flags & AFS_OPERATION_DIR_CONFLICT)) {
++ if (op->cumul_error.error == 0 && (op->flags & AFS_OPERATION_DIR_CONFLICT)) {
+ op->file[1].update_ctime = false;
+ op->fetch_status.which = 1;
+ op->ops = &afs_fetch_status_operation;
+diff --git a/fs/afs/file.c b/fs/afs/file.c
+index 0c81c39c32f5..8f9b42427569 100644
+--- a/fs/afs/file.c
++++ b/fs/afs/file.c
+@@ -245,10 +245,7 @@ static void afs_fetch_data_notify(struct afs_operation *op)
+ struct netfs_io_subrequest *subreq = req->subreq;
+ int error = afs_op_error(op);
+
+- if (error == -ECONNABORTED)
+- error = afs_abort_to_error(op->ac.abort_code);
+ req->error = error;
+-
+ if (subreq) {
+ __set_bit(NETFS_SREQ_CLEAR_TAIL, &subreq->flags);
+ netfs_subreq_terminated(subreq, error ?: req->actual_len, false);
+diff --git a/fs/afs/fs_operation.c b/fs/afs/fs_operation.c
+index 1c22d6e77846..cebe4fad8192 100644
+--- a/fs/afs/fs_operation.c
++++ b/fs/afs/fs_operation.c
+@@ -169,9 +169,6 @@ static void afs_end_vnode_operation(struct afs_operation *op)
+ }
+
+ afs_drop_io_locks(op);
+-
+- if (op->error == -ECONNABORTED)
+- op->error = afs_abort_to_error(op->ac.abort_code);
+ }
+
+ /*
+@@ -182,6 +179,8 @@ void afs_wait_for_operation(struct afs_operation *op)
+ _enter("");
+
+ while (afs_select_fileserver(op)) {
++ op->call_error = 0;
++ op->call_abort_code = 0;
+ op->cb_s_break = op->server->cb_s_break;
+ if (test_bit(AFS_SERVER_FL_IS_YFS, &op->server->flags) &&
+ op->ops->issue_yfs_rpc)
+@@ -189,28 +188,29 @@ void afs_wait_for_operation(struct afs_operation *op)
+ else if (op->ops->issue_afs_rpc)
+ op->ops->issue_afs_rpc(op);
+ else
+- op->ac.error = -ENOTSUPP;
++ op->call_error = -ENOTSUPP;
+
+ if (op->call) {
+ afs_wait_for_call_to_complete(op->call, &op->ac);
+- op->error = op->ac.error;
++ op->call_abort_code = op->call->abort_code;
++ op->call_error = op->call->error;
++ op->call_responded = op->call->responded;
++ op->ac.call_responded = true;
++ WRITE_ONCE(op->ac.alist->addrs[op->ac.index].last_error,
++ op->call_error);
+ afs_put_call(op->call);
+ }
+ }
+
+- switch (op->error) {
+- case 0:
++ if (!afs_op_error(op)) {
+ _debug("success");
+ op->ops->success(op);
+- break;
+- case -ECONNABORTED:
++ } else if (op->cumul_error.aborted) {
+ if (op->ops->aborted)
+ op->ops->aborted(op);
+- fallthrough;
+- default:
++ } else {
+ if (op->ops->failed)
+ op->ops->failed(op);
+- break;
+ }
+
+ afs_end_vnode_operation(op);
+diff --git a/fs/afs/fsclient.c b/fs/afs/fsclient.c
+index 020073387111..2a56dea22519 100644
+--- a/fs/afs/fsclient.c
++++ b/fs/afs/fsclient.c
+@@ -1629,6 +1629,7 @@ int afs_fs_give_up_all_callbacks(struct afs_net *net,
+ call->server = afs_use_server(server, afs_server_trace_give_up_cb);
+ afs_make_call(ac, call, GFP_NOFS);
+ afs_wait_for_call_to_complete(call, ac);
++ ret = call->error;
+ afs_put_call(call);
+ return ret;
+ }
+diff --git a/fs/afs/internal.h b/fs/afs/internal.h
+index 45c4526b56be..5f6db0ac06ac 100644
+--- a/fs/afs/internal.h
++++ b/fs/afs/internal.h
+@@ -75,6 +75,7 @@ enum afs_call_state {
+ struct afs_address {
+ struct rxrpc_peer *peer;
+ u16 service_id;
++ short last_error; /* Last error from this address */
+ };
+
+ /*
+@@ -121,7 +122,6 @@ struct afs_call {
+ };
+ void *buffer; /* reply receive buffer */
+ union {
+- long ret0; /* Value to reply with instead of 0 */
+ struct afs_addr_list *ret_alist;
+ struct afs_vldb_entry *ret_vldb;
+ char *ret_str;
+@@ -145,6 +145,7 @@ struct afs_call {
+ bool upgrade; /* T to request service upgrade */
+ bool intr; /* T if interruptible */
+ bool unmarshalling_error; /* T if an unmarshalling error occurred */
++ bool responded; /* Got a response from the call (may be abort) */
+ u16 service_id; /* Actual service ID (after upgrade) */
+ unsigned int debug_id; /* Trace ID */
+ u32 operation_ID; /* operation ID for an incoming call */
+@@ -719,8 +720,10 @@ struct afs_permits {
+ * Error prioritisation and accumulation.
+ */
+ struct afs_error {
+- short error; /* Accumulated error */
++ s32 abort_code; /* Cumulative abort code */
++ short error; /* Cumulative error */
+ bool responded; /* T if server responded */
++ bool aborted; /* T if ->error is from an abort */
+ };
+
+ /*
+@@ -730,10 +733,8 @@ struct afs_addr_cursor {
+ struct afs_addr_list *alist; /* Current address list (pins ref) */
+ unsigned long tried; /* Tried addresses */
+ signed char index; /* Current address */
+- bool responded; /* T if the current address responded */
+ unsigned short nr_iterations; /* Number of address iterations */
+- short error;
+- u32 abort_code;
++ bool call_responded;
+ };
+
+ /*
+@@ -746,13 +747,16 @@ struct afs_vl_cursor {
+ struct afs_vlserver *server; /* Server on which this resides */
+ struct key *key; /* Key for the server */
+ unsigned long untried; /* Bitmask of untried servers */
++ struct afs_error cumul_error; /* Cumulative error */
++ s32 call_abort_code;
+ short index; /* Current server */
+- short error;
++ short call_error; /* Error from single call */
+ unsigned short flags;
+ #define AFS_VL_CURSOR_STOP 0x0001 /* Set to cease iteration */
+ #define AFS_VL_CURSOR_RETRY 0x0002 /* Set to do a retry */
+ #define AFS_VL_CURSOR_RETRIED 0x0004 /* Set if started a retry */
+- unsigned short nr_iterations; /* Number of server iterations */
++ short nr_iterations; /* Number of server iterations */
++ bool call_responded; /* T if the current address responded */
+ };
+
+ /*
+@@ -803,8 +807,10 @@ struct afs_operation {
+ struct dentry *dentry_2; /* Second dentry to be altered */
+ struct timespec64 mtime; /* Modification time to record */
+ struct timespec64 ctime; /* Change time to set */
++ struct afs_error cumul_error; /* Cumulative error */
+ short nr_files; /* Number of entries in file[], more_files */
+- short error;
++ short call_error; /* Error from single call */
++ s32 call_abort_code; /* Abort code from single call */
+ unsigned int debug_id;
+
+ unsigned int cb_v_break; /* Volume break counter before op */
+@@ -860,6 +866,8 @@ struct afs_operation {
+ unsigned long untried; /* Bitmask of untried servers */
+ short index; /* Current server */
+ short nr_iterations; /* Number of server iterations */
++ bool call_responded; /* T if the current address responded */
++
+
+ unsigned int flags;
+ #define AFS_OPERATION_STOP 0x0001 /* Set to cease iteration */
+@@ -976,7 +984,7 @@ bool afs_addr_list_same(const struct afs_addr_list *a,
+ const struct afs_addr_list *b);
+ extern struct afs_vlserver_list *afs_dns_query(struct afs_cell *, time64_t *);
+ extern bool afs_iterate_addresses(struct afs_addr_cursor *);
+-extern int afs_end_cursor(struct afs_addr_cursor *);
++extern void afs_end_cursor(struct afs_addr_cursor *ac);
+
+ extern int afs_merge_fs_addr4(struct afs_net *net, struct afs_addr_list *addr,
+ __be32 xdr, u16 port);
+@@ -1235,17 +1243,27 @@ extern void afs_prioritise_error(struct afs_error *, int, u32);
+
+ static inline void afs_op_nomem(struct afs_operation *op)
+ {
+- op->error = -ENOMEM;
++ op->cumul_error.error = -ENOMEM;
+ }
+
+ static inline int afs_op_error(const struct afs_operation *op)
+ {
+- return op->error;
++ return op->cumul_error.error;
++}
++
++static inline s32 afs_op_abort_code(const struct afs_operation *op)
++{
++ return op->cumul_error.abort_code;
+ }
+
+ static inline int afs_op_set_error(struct afs_operation *op, int error)
+ {
+- return op->error = error;
++ return op->cumul_error.error = error;
++}
++
++static inline void afs_op_accumulate_error(struct afs_operation *op, int error, s32 abort_code)
++{
++ afs_prioritise_error(&op->cumul_error, error, abort_code);
+ }
+
+ /*
+@@ -1619,7 +1637,7 @@ static inline void afs_update_dentry_version(struct afs_operation *op,
+ struct afs_vnode_param *dir_vp,
+ struct dentry *dentry)
+ {
+- if (!op->error)
++ if (!op->cumul_error.error)
+ dentry->d_fsdata =
+ (void *)(unsigned long)dir_vp->scb.status.data_version;
+ }
+diff --git a/fs/afs/misc.c b/fs/afs/misc.c
+index 805328ca5428..b8180bf2281f 100644
+--- a/fs/afs/misc.c
++++ b/fs/afs/misc.c
+@@ -116,6 +116,8 @@ void afs_prioritise_error(struct afs_error *e, int error, u32 abort_code)
+ {
+ switch (error) {
+ case 0:
++ e->aborted = false;
++ e->error = 0;
+ return;
+ default:
+ if (e->error == -ETIMEDOUT ||
+@@ -161,12 +163,16 @@ void afs_prioritise_error(struct afs_error *e, int error, u32 abort_code)
+ if (e->responded)
+ return;
+ e->error = error;
++ e->aborted = false;
+ return;
+
+ case -ECONNABORTED:
+- error = afs_abort_to_error(abort_code);
+- fallthrough;
++ e->error = afs_abort_to_error(abort_code);
++ e->aborted = true;
++ e->responded = true;
++ return;
+ case -ENETRESET: /* Responded, but we seem to have changed address */
++ e->aborted = false;
+ e->responded = true;
+ e->error = error;
+ return;
+diff --git a/fs/afs/rotate.c b/fs/afs/rotate.c
+index d64c1d90faed..68c88e3a0916 100644
+--- a/fs/afs/rotate.c
++++ b/fs/afs/rotate.c
+@@ -112,9 +112,9 @@ bool afs_select_fileserver(struct afs_operation *op)
+ struct afs_addr_list *alist;
+ struct afs_server *server;
+ struct afs_vnode *vnode = op->file[0].vnode;
+- struct afs_error e;
+ unsigned int rtt;
+- int error = op->ac.error, i;
++ s32 abort_code = op->call_abort_code;
++ int error = op->call_error, i;
+
+ op->nr_iterations++;
+
+@@ -122,7 +122,7 @@ bool afs_select_fileserver(struct afs_operation *op)
+ op->debug_id, op->nr_iterations, op->volume->vid,
+ op->untried, op->index,
+ op->ac.tried, op->ac.index,
+- error, op->ac.abort_code);
++ error, abort_code);
+
+ if (op->flags & AFS_OPERATION_STOP) {
+ _leave(" = f [stopped]");
+@@ -133,8 +133,10 @@ bool afs_select_fileserver(struct afs_operation *op)
+ goto start;
+
+ /* Evaluate the result of the previous operation, if there was one. */
+- switch (error) {
++ switch (op->call_error) {
+ case 0:
++ op->cumul_error.responded = true;
++ fallthrough;
+ default:
+ /* Success or local failure. Stop. */
+ afs_op_set_error(op, error);
+@@ -151,7 +153,8 @@ bool afs_select_fileserver(struct afs_operation *op)
+ * errors instead. IBM AFS and OpenAFS fileservers, however, do leak
+ * these abort codes.
+ */
+- switch (op->ac.abort_code) {
++ op->cumul_error.responded = true;
++ switch (abort_code) {
+ case VNOVOL:
+ /* This fileserver doesn't know about the volume.
+ * - May indicate that the VL is wrong - retry once and compare
+@@ -164,7 +167,7 @@ bool afs_select_fileserver(struct afs_operation *op)
+ * (administrative action).
+ */
+ if (op->flags & AFS_OPERATION_VNOVOL) {
+- op->error = -EREMOTEIO;
++ afs_op_accumulate_error(op, -EREMOTEIO, abort_code);
+ goto next_server;
+ }
+
+@@ -188,7 +191,7 @@ bool afs_select_fileserver(struct afs_operation *op)
+ * it's the fileserver having trouble.
+ */
+ if (rcu_access_pointer(op->volume->servers) == op->server_list) {
+- op->error = -EREMOTEIO;
++ afs_op_accumulate_error(op, -EREMOTEIO, abort_code);
+ goto next_server;
+ }
+
+@@ -201,8 +204,8 @@ bool afs_select_fileserver(struct afs_operation *op)
+ case VONLINE:
+ /* These should not be returned from the fileserver. */
+ pr_warn("Fileserver returned unexpected abort %d\n",
+- op->ac.abort_code);
+- op->error = -EREMOTEIO;
++ abort_code);
++ afs_op_accumulate_error(op, -EREMOTEIO, abort_code);
+ goto next_server;
+
+ case VNOSERVICE:
+@@ -233,7 +236,7 @@ bool afs_select_fileserver(struct afs_operation *op)
+ * VNOSERVICE should be treated as an alias for RX_CALL_TIMEOUT.
+ */
+ case RX_CALL_TIMEOUT:
+- op->error = -ETIMEDOUT;
++ afs_op_accumulate_error(op, -ETIMEDOUT, abort_code);
+ goto next_server;
+
+ case VSALVAGING: /* This error should not be leaked to cache managers
+@@ -248,7 +251,7 @@ bool afs_select_fileserver(struct afs_operation *op)
+ * days).
+ */
+ if (!test_and_set_bit(AFS_VOLUME_OFFLINE, &op->volume->flags)) {
+- afs_busy(op->volume, op->ac.abort_code);
++ afs_busy(op->volume, abort_code);
+ clear_bit(AFS_VOLUME_BUSY, &op->volume->flags);
+ }
+ if (op->flags & AFS_OPERATION_NO_VSLEEP) {
+@@ -281,7 +284,7 @@ bool afs_select_fileserver(struct afs_operation *op)
+ goto failed;
+ }
+ if (!test_and_set_bit(AFS_VOLUME_BUSY, &op->volume->flags)) {
+- afs_busy(op->volume, op->ac.abort_code);
++ afs_busy(op->volume, abort_code);
+ clear_bit(AFS_VOLUME_OFFLINE, &op->volume->flags);
+ }
+ busy:
+@@ -329,7 +332,7 @@ bool afs_select_fileserver(struct afs_operation *op)
+ * TODO: Retry a few times with sleeps.
+ */
+ if (rcu_access_pointer(op->volume->servers) == op->server_list) {
+- op->error = -ENOMEDIUM;
++ afs_op_accumulate_error(op, -ENOMEDIUM, abort_code);
+ goto failed;
+ }
+
+@@ -337,7 +340,7 @@ bool afs_select_fileserver(struct afs_operation *op)
+
+ case UAEIO:
+ case VIO:
+- op->error = -EREMOTEIO;
++ afs_op_accumulate_error(op, -EREMOTEIO, abort_code);
+ if (op->volume->type != AFSVL_RWVOL)
+ goto next_server;
+ goto failed;
+@@ -361,7 +364,7 @@ bool afs_select_fileserver(struct afs_operation *op)
+ goto failed_but_online;
+
+ default:
+- op->error = afs_abort_to_error(op->ac.abort_code);
++ afs_op_accumulate_error(op, error, abort_code);
+ failed_but_online:
+ clear_bit(AFS_VOLUME_OFFLINE, &op->volume->flags);
+ clear_bit(AFS_VOLUME_BUSY, &op->volume->flags);
+@@ -380,7 +383,7 @@ bool afs_select_fileserver(struct afs_operation *op)
+ case -EHOSTDOWN:
+ case -ECONNREFUSED:
+ _debug("no conn");
+- op->error = error;
++ afs_op_accumulate_error(op, error, 0);
+ goto iterate_address;
+
+ case -ENETRESET:
+@@ -506,6 +509,7 @@ bool afs_select_fileserver(struct afs_operation *op)
+ op->index, op->ac.index, op->ac.alist->nr_addrs,
+ rxrpc_kernel_remote_addr(op->ac.alist->addrs[op->ac.index].peer));
+
++ op->call_responded = false;
+ _leave(" = t");
+ return true;
+
+@@ -543,17 +547,14 @@ bool afs_select_fileserver(struct afs_operation *op)
+ if (op->flags & AFS_OPERATION_VBUSY)
+ goto restart_from_beginning;
+
+- e.error = -EDESTADDRREQ;
+- e.responded = false;
+ for (i = 0; i < op->server_list->nr_servers; i++) {
+ struct afs_server *s = op->server_list->servers[i].server;
+
+- afs_prioritise_error(&e, READ_ONCE(s->probe.error),
+- s->probe.abort_code);
++ error = READ_ONCE(s->probe.error);
++ if (error < 0)
++ afs_op_accumulate_error(op, error, s->probe.abort_code);
+ }
+
+- error = e.error;
+- op->error = error;
+ failed:
+ op->flags |= AFS_OPERATION_STOP;
+ afs_end_cursor(&op->ac);
+@@ -576,11 +577,13 @@ void afs_dump_edestaddrreq(const struct afs_operation *op)
+ rcu_read_lock();
+
+ pr_notice("EDESTADDR occurred\n");
+- pr_notice("FC: cbb=%x cbb2=%x fl=%x err=%hd\n",
++ pr_notice("OP: cbb=%x cbb2=%x fl=%x err=%hd\n",
+ op->file[0].cb_break_before,
+- op->file[1].cb_break_before, op->flags, op->error);
+- pr_notice("FC: ut=%lx ix=%d ni=%u\n",
++ op->file[1].cb_break_before, op->flags, op->cumul_error.error);
++ pr_notice("OP: ut=%lx ix=%d ni=%u\n",
+ op->untried, op->index, op->nr_iterations);
++ pr_notice("OP: call er=%d ac=%d r=%u\n",
++ op->call_error, op->call_abort_code, op->call_responded);
+
+ if (op->server_list) {
+ const struct afs_server_list *sl = op->server_list;
+@@ -605,8 +608,7 @@ void afs_dump_edestaddrreq(const struct afs_operation *op)
+ }
+ }
+
+- pr_notice("AC: t=%lx ax=%u ac=%d er=%d r=%u ni=%u\n",
+- op->ac.tried, op->ac.index, op->ac.abort_code, op->ac.error,
+- op->ac.responded, op->ac.nr_iterations);
++ pr_notice("AC: t=%lx ax=%u ni=%u\n",
++ op->ac.tried, op->ac.index, op->ac.nr_iterations);
+ rcu_read_unlock();
+ }
+diff --git a/fs/afs/rxrpc.c b/fs/afs/rxrpc.c
+index dad8efadbc44..0b3e2f20b0e0 100644
+--- a/fs/afs/rxrpc.c
++++ b/fs/afs/rxrpc.c
+@@ -408,8 +408,7 @@ void afs_make_call(struct afs_addr_cursor *ac, struct afs_call *call, gfp_t gfp)
+ rxrpc_kernel_recv_data(call->net->socket, rxcall,
+ &msg.msg_iter, &len, false,
+ &call->abort_code, &call->service_id);
+- ac->abort_code = call->abort_code;
+- ac->responded = true;
++ call->responded = true;
+ }
+ call->error = ret;
+ trace_afs_call_done(call);
+@@ -429,7 +428,7 @@ void afs_make_call(struct afs_addr_cursor *ac, struct afs_call *call, gfp_t gfp)
+ afs_set_call_complete(call, ret, 0);
+ }
+
+- ac->error = ret;
++ call->error = ret;
+ call->state = AFS_CALL_COMPLETE;
+ _leave(" = %d", ret);
+ }
+@@ -510,6 +509,7 @@ static void afs_deliver_to_call(struct afs_call *call)
+ ret = -EBADMSG;
+ switch (ret) {
+ case 0:
++ call->responded = true;
+ afs_queue_call_work(call);
+ if (state == AFS_CALL_CL_PROC_REPLY) {
+ if (call->op)
+@@ -524,9 +524,11 @@ static void afs_deliver_to_call(struct afs_call *call)
+ goto out;
+ case -ECONNABORTED:
+ ASSERTCMP(state, ==, AFS_CALL_COMPLETE);
++ call->responded = true;
+ afs_log_error(call, call->abort_code);
+ goto done;
+ case -ENOTSUPP:
++ call->responded = true;
+ abort_code = RXGEN_OPCODE;
+ rxrpc_kernel_abort_call(call->net->socket, call->rxcall,
+ abort_code, ret,
+@@ -573,7 +575,7 @@ static void afs_deliver_to_call(struct afs_call *call)
+ }
+
+ /*
+- * Wait synchronously for a call to complete and clean up the call struct.
++ * Wait synchronously for a call to complete.
+ */
+ void afs_wait_for_call_to_complete(struct afs_call *call, struct afs_addr_cursor *ac)
+ {
+@@ -626,13 +628,8 @@ void afs_wait_for_call_to_complete(struct afs_call *call, struct afs_addr_cursor
+ }
+ }
+
+- spin_lock_bh(&call->state_lock);
+- ac->abort_code = call->abort_code;
+- ac->error = call->error;
+- spin_unlock_bh(&call->state_lock);
+-
+ if (call->error == 0 || call->error == -ECONNABORTED)
+- ac->responded = true;
++ call->responded = true;
+ }
+
+ /*
+diff --git a/fs/afs/server.c b/fs/afs/server.c
+index 2826e6eced71..f7791ef13618 100644
+--- a/fs/afs/server.c
++++ b/fs/afs/server.c
+@@ -437,7 +437,6 @@ static void afs_give_up_callbacks(struct afs_net *net, struct afs_server *server
+ struct afs_addr_cursor ac = {
+ .alist = alist,
+ .index = alist->preferred,
+- .error = 0,
+ };
+
+ afs_fs_give_up_all_callbacks(net, server, &ac, NULL);
+diff --git a/fs/afs/vl_alias.c b/fs/afs/vl_alias.c
+index 6fdf9f1bedc0..89cadd9a69e1 100644
+--- a/fs/afs/vl_alias.c
++++ b/fs/afs/vl_alias.c
+@@ -236,7 +236,7 @@ static char *afs_vl_get_cell_name(struct afs_cell *cell, struct key *key)
+
+ while (afs_select_vlserver(&vc)) {
+ if (!test_bit(AFS_VLSERVER_FL_IS_YFS, &vc.server->flags)) {
+- vc.ac.error = -EOPNOTSUPP;
++ vc.call_error = -EOPNOTSUPP;
+ skipped = true;
+ continue;
+ }
+diff --git a/fs/afs/vl_probe.c b/fs/afs/vl_probe.c
+index 9551aef07cee..2f8a13c2bf0c 100644
+--- a/fs/afs/vl_probe.c
++++ b/fs/afs/vl_probe.c
+@@ -169,10 +169,11 @@ static bool afs_do_probe_vlserver(struct afs_net *net,
+ call = afs_vl_get_capabilities(net, &ac, key, server,
+ server_index);
+ if (!IS_ERR(call)) {
++ afs_prioritise_error(_e, call->error, call->abort_code);
+ afs_put_call(call);
+ in_progress = true;
+ } else {
+- afs_prioritise_error(_e, PTR_ERR(call), ac.abort_code);
++ afs_prioritise_error(_e, PTR_ERR(call), 0);
+ afs_done_one_vl_probe(server, false);
+ }
+ }
+@@ -187,12 +188,10 @@ int afs_send_vl_probes(struct afs_net *net, struct key *key,
+ struct afs_vlserver_list *vllist)
+ {
+ struct afs_vlserver *server;
+- struct afs_error e;
++ struct afs_error e = {};
+ bool in_progress = false;
+ int i;
+
+- e.error = 0;
+- e.responded = false;
+ for (i = 0; i < vllist->nr_servers; i++) {
+ server = vllist->servers[i].server;
+ if (test_bit(AFS_VLSERVER_FL_PROBED, &server->flags))
+diff --git a/fs/afs/vl_rotate.c b/fs/afs/vl_rotate.c
+index f8f255c966ae..e2dc54082a05 100644
+--- a/fs/afs/vl_rotate.c
++++ b/fs/afs/vl_rotate.c
+@@ -20,11 +20,11 @@ bool afs_begin_vlserver_operation(struct afs_vl_cursor *vc, struct afs_cell *cel
+ memset(vc, 0, sizeof(*vc));
+ vc->cell = cell;
+ vc->key = key;
+- vc->error = -EDESTADDRREQ;
+- vc->ac.error = SHRT_MAX;
++ vc->cumul_error.error = -EDESTADDRREQ;
++ vc->nr_iterations = -1;
+
+ if (signal_pending(current)) {
+- vc->error = -EINTR;
++ vc->cumul_error.error = -EINTR;
+ vc->flags |= AFS_VL_CURSOR_STOP;
+ return false;
+ }
+@@ -52,7 +52,7 @@ static bool afs_start_vl_iteration(struct afs_vl_cursor *vc)
+ &cell->dns_lookup_count,
+ smp_load_acquire(&cell->dns_lookup_count)
+ != dns_lookup_count) < 0) {
+- vc->error = -ERESTARTSYS;
++ vc->cumul_error.error = -ERESTARTSYS;
+ return false;
+ }
+ }
+@@ -60,12 +60,12 @@ static bool afs_start_vl_iteration(struct afs_vl_cursor *vc)
+ /* Status load is ordered after lookup counter load */
+ if (cell->dns_status == DNS_LOOKUP_GOT_NOT_FOUND) {
+ pr_warn("No record of cell %s\n", cell->name);
+- vc->error = -ENOENT;
++ vc->cumul_error.error = -ENOENT;
+ return false;
+ }
+
+ if (cell->dns_source == DNS_RECORD_UNAVAILABLE) {
+- vc->error = -EDESTADDRREQ;
++ vc->cumul_error.error = -EDESTADDRREQ;
+ return false;
+ }
+ }
+@@ -91,52 +91,52 @@ bool afs_select_vlserver(struct afs_vl_cursor *vc)
+ {
+ struct afs_addr_list *alist;
+ struct afs_vlserver *vlserver;
+- struct afs_error e;
+ unsigned int rtt;
+- int error = vc->ac.error, i;
++ s32 abort_code = vc->call_abort_code;
++ int error = vc->call_error, i;
++
++ vc->nr_iterations++;
+
+ _enter("%lx[%d],%lx[%d],%d,%d",
+ vc->untried, vc->index,
+ vc->ac.tried, vc->ac.index,
+- error, vc->ac.abort_code);
++ error, abort_code);
+
+ if (vc->flags & AFS_VL_CURSOR_STOP) {
+ _leave(" = f [stopped]");
+ return false;
+ }
+
+- vc->nr_iterations++;
++ if (vc->nr_iterations == 0)
++ goto start;
+
+ /* Evaluate the result of the previous operation, if there was one. */
+ switch (error) {
+- case SHRT_MAX:
+- goto start;
+-
+ default:
+ case 0:
+ /* Success or local failure. Stop. */
+- vc->error = error;
++ vc->cumul_error.error = error;
+ vc->flags |= AFS_VL_CURSOR_STOP;
+- _leave(" = f [okay/local %d]", vc->ac.error);
++ _leave(" = f [okay/local %d]", vc->cumul_error.error);
+ return false;
+
+ case -ECONNABORTED:
+ /* The far side rejected the operation on some grounds. This
+ * might involve the server being busy or the volume having been moved.
+ */
+- switch (vc->ac.abort_code) {
++ switch (abort_code) {
+ case AFSVL_IO:
+ case AFSVL_BADVOLOPER:
+ case AFSVL_NOMEM:
+ /* The server went weird. */
+- vc->error = -EREMOTEIO;
++ afs_prioritise_error(&vc->cumul_error, -EREMOTEIO, abort_code);
+ //write_lock(&vc->cell->vl_servers_lock);
+ //vc->server_list->weird_mask |= 1 << vc->index;
+ //write_unlock(&vc->cell->vl_servers_lock);
+ goto next_server;
+
+ default:
+- vc->error = afs_abort_to_error(vc->ac.abort_code);
++ afs_prioritise_error(&vc->cumul_error, error, abort_code);
+ goto failed;
+ }
+
+@@ -149,12 +149,12 @@ bool afs_select_vlserver(struct afs_vl_cursor *vc)
+ case -ETIMEDOUT:
+ case -ETIME:
+ _debug("no conn %d", error);
+- vc->error = error;
++ afs_prioritise_error(&vc->cumul_error, error, 0);
+ goto iterate_address;
+
+ case -ECONNRESET:
+ _debug("call reset");
+- vc->error = error;
++ afs_prioritise_error(&vc->cumul_error, error, 0);
+ vc->flags |= AFS_VL_CURSOR_RETRY;
+ goto next_server;
+
+@@ -178,15 +178,19 @@ bool afs_select_vlserver(struct afs_vl_cursor *vc)
+ goto failed;
+
+ error = afs_send_vl_probes(vc->cell->net, vc->key, vc->server_list);
+- if (error < 0)
+- goto failed_set_error;
++ if (error < 0) {
++ afs_prioritise_error(&vc->cumul_error, error, 0);
++ goto failed;
++ }
+
+ pick_server:
+ _debug("pick [%lx]", vc->untried);
+
+ error = afs_wait_for_vl_probes(vc->server_list, vc->untried);
+- if (error < 0)
+- goto failed_set_error;
++ if (error < 0) {
++ afs_prioritise_error(&vc->cumul_error, error, 0);
++ goto failed;
++ }
+
+ /* Pick the untried server with the lowest RTT. */
+ vc->index = vc->server_list->preferred;
+@@ -249,6 +253,7 @@ bool afs_select_vlserver(struct afs_vl_cursor *vc)
+
+ _debug("VL address %d/%d", vc->ac.index, vc->ac.alist->nr_addrs);
+
++ vc->call_responded = false;
+ _leave(" = t %pISpc", rxrpc_kernel_remote_addr(vc->ac.alist->addrs[vc->ac.index].peer));
+ return true;
+
+@@ -264,25 +269,19 @@ bool afs_select_vlserver(struct afs_vl_cursor *vc)
+ if (vc->flags & AFS_VL_CURSOR_RETRY)
+ goto restart_from_beginning;
+
+- e.error = -EDESTADDRREQ;
+- e.responded = false;
+ for (i = 0; i < vc->server_list->nr_servers; i++) {
+ struct afs_vlserver *s = vc->server_list->servers[i].server;
+
+ if (test_bit(AFS_VLSERVER_FL_RESPONDING, &s->flags))
+- e.responded = true;
+- afs_prioritise_error(&e, READ_ONCE(s->probe.error),
++ vc->cumul_error.responded = true;
++ afs_prioritise_error(&vc->cumul_error, READ_ONCE(s->probe.error),
+ s->probe.abort_code);
+ }
+
+- error = e.error;
+-
+-failed_set_error:
+- vc->error = error;
+ failed:
+ vc->flags |= AFS_VL_CURSOR_STOP;
+ afs_end_cursor(&vc->ac);
+- _leave(" = f [failed %d]", vc->error);
++ _leave(" = f [failed %d]", vc->cumul_error.error);
+ return false;
+ }
+
+@@ -305,7 +304,10 @@ static void afs_vl_dump_edestaddrreq(const struct afs_vl_cursor *vc)
+ pr_notice("DNS: src=%u st=%u lc=%x\n",
+ cell->dns_source, cell->dns_status, cell->dns_lookup_count);
+ pr_notice("VC: ut=%lx ix=%u ni=%hu fl=%hx err=%hd\n",
+- vc->untried, vc->index, vc->nr_iterations, vc->flags, vc->error);
++ vc->untried, vc->index, vc->nr_iterations, vc->flags,
++ vc->cumul_error.error);
++ pr_notice("VC: call er=%d ac=%d r=%u\n",
++ vc->call_error, vc->call_abort_code, vc->call_responded);
+
+ if (vc->server_list) {
+ const struct afs_vlserver_list *sl = vc->server_list;
+@@ -329,9 +331,8 @@ static void afs_vl_dump_edestaddrreq(const struct afs_vl_cursor *vc)
+ }
+ }
+
+- pr_notice("AC: t=%lx ax=%u ac=%d er=%d r=%u ni=%u\n",
+- vc->ac.tried, vc->ac.index, vc->ac.abort_code, vc->ac.error,
+- vc->ac.responded, vc->ac.nr_iterations);
++ pr_notice("AC: t=%lx ax=%u ni=%u\n",
++ vc->ac.tried, vc->ac.index, vc->ac.nr_iterations);
+ rcu_read_unlock();
+ }
+
+@@ -342,17 +343,16 @@ int afs_end_vlserver_operation(struct afs_vl_cursor *vc)
+ {
+ struct afs_net *net = vc->cell->net;
+
+- if (vc->error == -EDESTADDRREQ ||
+- vc->error == -EADDRNOTAVAIL ||
+- vc->error == -ENETUNREACH ||
+- vc->error == -EHOSTUNREACH)
++ switch (vc->cumul_error.error) {
++ case -EDESTADDRREQ:
++ case -EADDRNOTAVAIL:
++ case -ENETUNREACH:
++ case -EHOSTUNREACH:
+ afs_vl_dump_edestaddrreq(vc);
++ break;
++ }
+
+ afs_end_cursor(&vc->ac);
+ afs_put_vlserverlist(net, vc->server_list);
+-
+- if (vc->error == -ECONNABORTED)
+- vc->error = afs_abort_to_error(vc->ac.abort_code);
+-
+- return vc->error;
++ return vc->cumul_error.error;
+ }
+diff --git a/fs/afs/vlclient.c b/fs/afs/vlclient.c
+index 650534892a20..db7e94584e87 100644
+--- a/fs/afs/vlclient.c
++++ b/fs/afs/vlclient.c
+@@ -161,10 +161,13 @@ struct afs_vldb_entry *afs_vl_get_entry_by_name_u(struct afs_vl_cursor *vc,
+ trace_afs_make_vl_call(call);
+ afs_make_call(&vc->ac, call, GFP_KERNEL);
+ afs_wait_for_call_to_complete(call, &vc->ac);
++ vc->call_abort_code = call->abort_code;
++ vc->call_error = call->error;
++ vc->call_responded = call->responded;
+ afs_put_call(call);
+- if (vc->ac.error) {
++ if (vc->call_error) {
+ kfree(entry);
+- return ERR_PTR(vc->ac.error);
++ return ERR_PTR(vc->call_error);
+ }
+ return entry;
+ }
+@@ -305,11 +308,14 @@ struct afs_addr_list *afs_vl_get_addrs_u(struct afs_vl_cursor *vc,
+ trace_afs_make_vl_call(call);
+ afs_make_call(&vc->ac, call, GFP_KERNEL);
+ afs_wait_for_call_to_complete(call, &vc->ac);
+- alist = call->ret_alist;
++ vc->call_abort_code = call->abort_code;
++ vc->call_error = call->error;
++ vc->call_responded = call->responded;
++ alist = call->ret_alist;
+ afs_put_call(call);
+- if (vc->ac.error) {
++ if (vc->call_error) {
+ afs_put_addrlist(alist);
+- return ERR_PTR(vc->ac.error);
++ return ERR_PTR(vc->call_error);
+ }
+ return alist;
+ }
+@@ -656,11 +662,14 @@ struct afs_addr_list *afs_yfsvl_get_endpoints(struct afs_vl_cursor *vc,
+ trace_afs_make_vl_call(call);
+ afs_make_call(&vc->ac, call, GFP_KERNEL);
+ afs_wait_for_call_to_complete(call, &vc->ac);
+- alist = call->ret_alist;
++ vc->call_abort_code = call->abort_code;
++ vc->call_error = call->error;
++ vc->call_responded = call->responded;
++ alist = call->ret_alist;
+ afs_put_call(call);
+- if (vc->ac.error) {
++ if (vc->call_error) {
+ afs_put_addrlist(alist);
+- return ERR_PTR(vc->ac.error);
++ return ERR_PTR(vc->call_error);
+ }
+ return alist;
+ }
+@@ -769,11 +778,14 @@ char *afs_yfsvl_get_cell_name(struct afs_vl_cursor *vc)
+ trace_afs_make_vl_call(call);
+ afs_make_call(&vc->ac, call, GFP_KERNEL);
+ afs_wait_for_call_to_complete(call, &vc->ac);
+- cellname = call->ret_str;
++ vc->call_abort_code = call->abort_code;
++ vc->call_error = call->error;
++ vc->call_responded = call->responded;
++ cellname = call->ret_str;
+ afs_put_call(call);
+- if (vc->ac.error) {
++ if (vc->call_error) {
+ kfree(cellname);
+- return ERR_PTR(vc->ac.error);
++ return ERR_PTR(vc->call_error);
+ }
+ return cellname;
+ }
+--
+2.43.0
+
--- /dev/null
+From 8001a9917176e5da09c7619530be845008e837ed Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Wed, 18 Oct 2023 15:38:14 +0100
+Subject: afs: Turn the afs_addr_list address array into an array of structs
+
+From: David Howells <dhowells@redhat.com>
+
+[ Upstream commit 07f3502b33a260f873e35708d2fa693eb52225cb ]
+
+Turn the afs_addr_list address array into an array of structs, thereby
+allowing per-address (such as RTT) info to be added.
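+
+A condensed sketch of the change (the real struct carries more fields):
+
+  struct afs_addr_list {
+          ...
+          struct {
+                  struct sockaddr_rxrpc srx;
+                  /* room for per-address info, e.g. RTT */
+          } addrs[];
+  };
+
+Accesses then change from alist->addrs[i] to alist->addrs[i].srx.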
+
+Signed-off-by: David Howells <dhowells@redhat.com>
+cc: Marc Dionne <marc.dionne@auristor.com>
+cc: linux-afs@lists.infradead.org
+Stable-dep-of: 17ba6f0bd14f ("afs: Fix error handling with lookup via FS.InlineBulkStatus")
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ fs/afs/addr_list.c | 10 +++++-----
+ fs/afs/fs_probe.c | 6 +++---
+ fs/afs/internal.h | 6 +++++-
+ fs/afs/proc.c | 4 ++--
+ fs/afs/rotate.c | 2 +-
+ fs/afs/rxrpc.c | 4 ++--
+ fs/afs/server.c | 4 ++--
+ fs/afs/vl_alias.c | 4 ++--
+ fs/afs/vl_probe.c | 6 +++---
+ fs/afs/vl_rotate.c | 2 +-
+ 10 files changed, 26 insertions(+), 22 deletions(-)
+
+diff --git a/fs/afs/addr_list.c b/fs/afs/addr_list.c
+index de1ae0bead3b..ac05a59e9d46 100644
+--- a/fs/afs/addr_list.c
++++ b/fs/afs/addr_list.c
+@@ -45,7 +45,7 @@ struct afs_addr_list *afs_alloc_addrlist(unsigned int nr,
+ alist->max_addrs = nr;
+
+ for (i = 0; i < nr; i++) {
+- struct sockaddr_rxrpc *srx = &alist->addrs[i];
++ struct sockaddr_rxrpc *srx = &alist->addrs[i].srx;
+ srx->srx_family = AF_RXRPC;
+ srx->srx_service = service;
+ srx->transport_type = SOCK_DGRAM;
+@@ -281,7 +281,7 @@ void afs_merge_fs_addr4(struct afs_addr_list *alist, __be32 xdr, u16 port)
+ return;
+
+ for (i = 0; i < alist->nr_ipv4; i++) {
+- struct sockaddr_in *a = &alist->addrs[i].transport.sin;
++ struct sockaddr_in *a = &alist->addrs[i].srx.transport.sin;
+ u32 a_addr = ntohl(a->sin_addr.s_addr);
+ u16 a_port = ntohs(a->sin_port);
+
+@@ -298,7 +298,7 @@ void afs_merge_fs_addr4(struct afs_addr_list *alist, __be32 xdr, u16 port)
+ alist->addrs + i,
+ sizeof(alist->addrs[0]) * (alist->nr_addrs - i));
+
+- srx = &alist->addrs[i];
++ srx = &alist->addrs[i].srx;
+ srx->srx_family = AF_RXRPC;
+ srx->transport_type = SOCK_DGRAM;
+ srx->transport_len = sizeof(srx->transport.sin);
+@@ -321,7 +321,7 @@ void afs_merge_fs_addr6(struct afs_addr_list *alist, __be32 *xdr, u16 port)
+ return;
+
+ for (i = alist->nr_ipv4; i < alist->nr_addrs; i++) {
+- struct sockaddr_in6 *a = &alist->addrs[i].transport.sin6;
++ struct sockaddr_in6 *a = &alist->addrs[i].srx.transport.sin6;
+ u16 a_port = ntohs(a->sin6_port);
+
+ diff = memcmp(xdr, &a->sin6_addr, 16);
+@@ -338,7 +338,7 @@ void afs_merge_fs_addr6(struct afs_addr_list *alist, __be32 *xdr, u16 port)
+ alist->addrs + i,
+ sizeof(alist->addrs[0]) * (alist->nr_addrs - i));
+
+- srx = &alist->addrs[i];
++ srx = &alist->addrs[i].srx;
+ srx->srx_family = AF_RXRPC;
+ srx->transport_type = SOCK_DGRAM;
+ srx->transport_len = sizeof(srx->transport.sin6);
+diff --git a/fs/afs/fs_probe.c b/fs/afs/fs_probe.c
+index daaf3810cc92..3dd24842f277 100644
+--- a/fs/afs/fs_probe.c
++++ b/fs/afs/fs_probe.c
+@@ -153,12 +153,12 @@ void afs_fileserver_probe_result(struct afs_call *call)
+ if (call->service_id == YFS_FS_SERVICE) {
+ server->probe.is_yfs = true;
+ set_bit(AFS_SERVER_FL_IS_YFS, &server->flags);
+- alist->addrs[index].srx_service = call->service_id;
++ alist->addrs[index].srx.srx_service = call->service_id;
+ } else {
+ server->probe.not_yfs = true;
+ if (!server->probe.is_yfs) {
+ clear_bit(AFS_SERVER_FL_IS_YFS, &server->flags);
+- alist->addrs[index].srx_service = call->service_id;
++ alist->addrs[index].srx.srx_service = call->service_id;
+ }
+ cap0 = ntohl(call->tmp);
+ if (cap0 & AFS3_VICED_CAPABILITY_64BITFILES)
+@@ -182,7 +182,7 @@ void afs_fileserver_probe_result(struct afs_call *call)
+ spin_unlock(&server->probe_lock);
+
+ _debug("probe %pU [%u] %pISpc rtt=%u ret=%d",
+- &server->uuid, index, &alist->addrs[index].transport,
++ &server->uuid, index, &alist->addrs[index].srx.transport,
+ rtt_us, ret);
+
+ return afs_done_one_fs_probe(call->net, server);
+diff --git a/fs/afs/internal.h b/fs/afs/internal.h
+index 7385d62c8cf5..e2adb314ab6a 100644
+--- a/fs/afs/internal.h
++++ b/fs/afs/internal.h
+@@ -87,7 +87,9 @@ struct afs_addr_list {
+ enum dns_lookup_status status:8;
+ unsigned long failed; /* Mask of addrs that failed locally/ICMP */
+ unsigned long responded; /* Mask of addrs that responded */
+- struct sockaddr_rxrpc addrs[] __counted_by(max_addrs);
++ struct {
++ struct sockaddr_rxrpc srx;
++ } addrs[] __counted_by(max_addrs);
+ #define AFS_MAX_ADDRESSES ((unsigned int)(sizeof(unsigned long) * 8))
+ };
+
+@@ -969,6 +971,8 @@ extern void afs_put_addrlist(struct afs_addr_list *);
+ extern struct afs_vlserver_list *afs_parse_text_addrs(struct afs_net *,
+ const char *, size_t, char,
+ unsigned short, unsigned short);
++bool afs_addr_list_same(const struct afs_addr_list *a,
++ const struct afs_addr_list *b);
+ extern struct afs_vlserver_list *afs_dns_query(struct afs_cell *, time64_t *);
+ extern bool afs_iterate_addresses(struct afs_addr_cursor *);
+ extern int afs_end_cursor(struct afs_addr_cursor *);
+diff --git a/fs/afs/proc.c b/fs/afs/proc.c
+index 2a0c83d71565..ab9cd986cfd9 100644
+--- a/fs/afs/proc.c
++++ b/fs/afs/proc.c
+@@ -307,7 +307,7 @@ static int afs_proc_cell_vlservers_show(struct seq_file *m, void *v)
+ for (i = 0; i < alist->nr_addrs; i++)
+ seq_printf(m, " %c %pISpc\n",
+ alist->preferred == i ? '>' : '-',
+- &alist->addrs[i].transport);
++ &alist->addrs[i].srx.transport);
+ }
+ seq_printf(m, " info: fl=%lx rtt=%d\n", vlserver->flags, vlserver->rtt);
+ seq_printf(m, " probe: fl=%x e=%d ac=%d out=%d\n",
+@@ -399,7 +399,7 @@ static int afs_proc_servers_show(struct seq_file *m, void *v)
+ alist->version, alist->responded, alist->failed);
+ for (i = 0; i < alist->nr_addrs; i++)
+ seq_printf(m, " [%x] %pISpc%s\n",
+- i, &alist->addrs[i].transport,
++ i, &alist->addrs[i].srx.transport,
+ alist->preferred == i ? "*" : "");
+ return 0;
+ }
+diff --git a/fs/afs/rotate.c b/fs/afs/rotate.c
+index a3d127953ac6..46081e5da6f5 100644
+--- a/fs/afs/rotate.c
++++ b/fs/afs/rotate.c
+@@ -488,7 +488,7 @@ bool afs_select_fileserver(struct afs_operation *op)
+
+ _debug("address [%u] %u/%u %pISp",
+ op->index, op->ac.index, op->ac.alist->nr_addrs,
+- &op->ac.alist->addrs[op->ac.index].transport);
++ &op->ac.alist->addrs[op->ac.index].srx.transport);
+
+ _leave(" = t");
+ return true;
+diff --git a/fs/afs/rxrpc.c b/fs/afs/rxrpc.c
+index d642d06a453b..181317126e43 100644
+--- a/fs/afs/rxrpc.c
++++ b/fs/afs/rxrpc.c
+@@ -296,7 +296,7 @@ static void afs_notify_end_request_tx(struct sock *sock,
+ */
+ void afs_make_call(struct afs_addr_cursor *ac, struct afs_call *call, gfp_t gfp)
+ {
+- struct sockaddr_rxrpc *srx = &ac->alist->addrs[ac->index];
++ struct sockaddr_rxrpc *srx = &ac->alist->addrs[ac->index].srx;
+ struct rxrpc_call *rxcall;
+ struct msghdr msg;
+ struct kvec iov[1];
+@@ -461,7 +461,7 @@ static void afs_log_error(struct afs_call *call, s32 remote_abort)
+ max = m + 1;
+ pr_notice("kAFS: Peer reported %s failure on %s [%pISp]\n",
+ msg, call->type->name,
+- &call->alist->addrs[call->addr_ix].transport);
++ &call->alist->addrs[call->addr_ix].srx.transport);
+ }
+ }
+
+diff --git a/fs/afs/server.c b/fs/afs/server.c
+index 0bd2f5ba6900..b8e2d211d4a1 100644
+--- a/fs/afs/server.c
++++ b/fs/afs/server.c
+@@ -43,7 +43,7 @@ struct afs_server *afs_find_server(struct afs_net *net,
+ hlist_for_each_entry_rcu(server, &net->fs_addresses6, addr6_link) {
+ alist = rcu_dereference(server->addresses);
+ for (i = alist->nr_ipv4; i < alist->nr_addrs; i++) {
+- b = &alist->addrs[i].transport.sin6;
++ b = &alist->addrs[i].srx.transport.sin6;
+ diff = ((u16 __force)a->sin6_port -
+ (u16 __force)b->sin6_port);
+ if (diff == 0)
+@@ -59,7 +59,7 @@ struct afs_server *afs_find_server(struct afs_net *net,
+ hlist_for_each_entry_rcu(server, &net->fs_addresses4, addr4_link) {
+ alist = rcu_dereference(server->addresses);
+ for (i = 0; i < alist->nr_ipv4; i++) {
+- b = &alist->addrs[i].transport.sin;
++ b = &alist->addrs[i].srx.transport.sin;
+ diff = ((u16 __force)a->sin_port -
+ (u16 __force)b->sin_port);
+ if (diff == 0)
+diff --git a/fs/afs/vl_alias.c b/fs/afs/vl_alias.c
+index f04a80e4f5c3..d3c0df70a1a5 100644
+--- a/fs/afs/vl_alias.c
++++ b/fs/afs/vl_alias.c
+@@ -94,8 +94,8 @@ static int afs_compare_fs_alists(const struct afs_server *server_a,
+ lb = rcu_dereference(server_b->addresses);
+
+ while (a < la->nr_addrs && b < lb->nr_addrs) {
+- const struct sockaddr_rxrpc *srx_a = &la->addrs[a];
+- const struct sockaddr_rxrpc *srx_b = &lb->addrs[b];
++ const struct sockaddr_rxrpc *srx_a = &la->addrs[a].srx;
++ const struct sockaddr_rxrpc *srx_b = &lb->addrs[b].srx;
+ int diff = afs_compare_addrs(srx_a, srx_b);
+
+ if (diff < 0) {
+diff --git a/fs/afs/vl_probe.c b/fs/afs/vl_probe.c
+index 58452b86e672..bdd9372e3fb2 100644
+--- a/fs/afs/vl_probe.c
++++ b/fs/afs/vl_probe.c
+@@ -106,12 +106,12 @@ void afs_vlserver_probe_result(struct afs_call *call)
+ if (call->service_id == YFS_VL_SERVICE) {
+ server->probe.flags |= AFS_VLSERVER_PROBE_IS_YFS;
+ set_bit(AFS_VLSERVER_FL_IS_YFS, &server->flags);
+- alist->addrs[index].srx_service = call->service_id;
++ alist->addrs[index].srx.srx_service = call->service_id;
+ } else {
+ server->probe.flags |= AFS_VLSERVER_PROBE_NOT_YFS;
+ if (!(server->probe.flags & AFS_VLSERVER_PROBE_IS_YFS)) {
+ clear_bit(AFS_VLSERVER_FL_IS_YFS, &server->flags);
+- alist->addrs[index].srx_service = call->service_id;
++ alist->addrs[index].srx.srx_service = call->service_id;
+ }
+ }
+
+@@ -131,7 +131,7 @@ void afs_vlserver_probe_result(struct afs_call *call)
+ spin_unlock(&server->probe_lock);
+
+ _debug("probe [%u][%u] %pISpc rtt=%u ret=%d",
+- server_index, index, &alist->addrs[index].transport, rtt_us, ret);
++ server_index, index, &alist->addrs[index].srx.transport, rtt_us, ret);
+
+ afs_done_one_vl_probe(server, have_result);
+ }
+diff --git a/fs/afs/vl_rotate.c b/fs/afs/vl_rotate.c
+index eb415ce56360..e52b9d4c8a0a 100644
+--- a/fs/afs/vl_rotate.c
++++ b/fs/afs/vl_rotate.c
+@@ -249,7 +249,7 @@ bool afs_select_vlserver(struct afs_vl_cursor *vc)
+
+ _debug("VL address %d/%d", vc->ac.index, vc->ac.alist->nr_addrs);
+
+- _leave(" = t %pISpc", &vc->ac.alist->addrs[vc->ac.index].transport);
++ _leave(" = t %pISpc", &vc->ac.alist->addrs[vc->ac.index].srx.transport);
+ return true;
+
+ next_server:
+--
+2.43.0
+
--- /dev/null
+From c0d15d3cc0ef5a5944b6b80309a57145d4500e26 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Fri, 20 Oct 2023 16:04:52 +0100
+Subject: afs: Use op->nr_iterations=-1 to indicate to begin fileserver
+ iteration
+
+From: David Howells <dhowells@redhat.com>
+
+[ Upstream commit 075171fd22be33acf4ab354814bfa6de1c3412ce ]
+
+Set op->nr_iterations to -1 to indicate that we need to begin fileserver
+iteration rather than setting error to SHRT_MAX. This makes it easier to
+eliminate the address cursor.
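+
+Condensed, the flow in afs_select_fileserver() becomes:
+
+  op->nr_iterations = -1;      /* set at operation allocation time */
+  ...
+  op->nr_iterations++;         /* on entry to afs_select_fileserver() */
+  if (op->nr_iterations == 0)
+          goto start;          /* first call: begin the iteration */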
+
+Signed-off-by: David Howells <dhowells@redhat.com>
+cc: Marc Dionne <marc.dionne@auristor.com>
+cc: linux-afs@lists.infradead.org
+Stable-dep-of: 17ba6f0bd14f ("afs: Fix error handling with lookup via FS.InlineBulkStatus")
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ fs/afs/fs_operation.c | 2 +-
+ fs/afs/internal.h | 2 +-
+ fs/afs/rotate.c | 11 ++++++-----
+ 3 files changed, 8 insertions(+), 7 deletions(-)
+
+diff --git a/fs/afs/fs_operation.c b/fs/afs/fs_operation.c
+index 7a3803ce3a22..3e31fae9a149 100644
+--- a/fs/afs/fs_operation.c
++++ b/fs/afs/fs_operation.c
+@@ -41,7 +41,7 @@ struct afs_operation *afs_alloc_operation(struct key *key, struct afs_volume *vo
+ op->cb_v_break = volume->cb_v_break;
+ op->debug_id = atomic_inc_return(&afs_operation_debug_counter);
+ op->error = -EDESTADDRREQ;
+- op->ac.error = SHRT_MAX;
++ op->nr_iterations = -1;
+
+ _leave(" = [op=%08x]", op->debug_id);
+ return op;
+diff --git a/fs/afs/internal.h b/fs/afs/internal.h
+index ec08b4a7e499..88381935bd66 100644
+--- a/fs/afs/internal.h
++++ b/fs/afs/internal.h
+@@ -859,7 +859,7 @@ struct afs_operation {
+ struct afs_call *call;
+ unsigned long untried; /* Bitmask of untried servers */
+ short index; /* Current server */
+- unsigned short nr_iterations; /* Number of server iterations */
++ short nr_iterations; /* Number of server iterations */
+
+ unsigned int flags;
+ #define AFS_OPERATION_STOP 0x0001 /* Set to cease iteration */
+diff --git a/fs/afs/rotate.c b/fs/afs/rotate.c
+index a108cd55bb4e..4084e023ff43 100644
+--- a/fs/afs/rotate.c
++++ b/fs/afs/rotate.c
+@@ -116,7 +116,10 @@ bool afs_select_fileserver(struct afs_operation *op)
+ unsigned int rtt;
+ int error = op->ac.error, i;
+
+- _enter("%lx[%d],%lx[%d],%d,%d",
++ op->nr_iterations++;
++
++ _enter("OP=%x+%x,%llx,%lx[%d],%lx[%d],%d,%d",
++ op->debug_id, op->nr_iterations, op->volume->vid,
+ op->untried, op->index,
+ op->ac.tried, op->ac.index,
+ error, op->ac.abort_code);
+@@ -126,13 +129,11 @@ bool afs_select_fileserver(struct afs_operation *op)
+ return false;
+ }
+
+- op->nr_iterations++;
++ if (op->nr_iterations == 0)
++ goto start;
+
+ /* Evaluate the result of the previous operation, if there was one. */
+ switch (error) {
+- case SHRT_MAX:
+- goto start;
+-
+ case 0:
+ default:
+ /* Success or local failure. Stop. */
+--
+2.43.0
+
--- /dev/null
+From 2dc8dd6e4d297a768f1c515165a1918554b25b85 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Thu, 26 Oct 2023 09:43:23 +0100
+Subject: afs: Wrap most op->error accesses with inline funcs
+
+From: David Howells <dhowells@redhat.com>
+
+[ Upstream commit 2de5599f63babb416e09b1a6be429a47910dd47c ]
+
+Wrap most op->error accesses with inline funcs, which will make it easier
+for a subsequent patch to replace op->error with something else. Two
+functions are added to this end:
+
+ (1) afs_op_error() - Get the error code.
+
+ (2) afs_op_set_error() - Set the error code.
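+
+Call sites then change along these lines:
+
+  if (afs_op_error(op) == -ENOTSUPP)    /* was: op->error == -ENOTSUPP */
+          ...;
+  afs_op_set_error(op, -ERESTARTSYS);   /* was: op->error = -ERESTARTSYS */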
+
+Signed-off-by: David Howells <dhowells@redhat.com>
+cc: Marc Dionne <marc.dionne@auristor.com>
+cc: linux-afs@lists.infradead.org
+Stable-dep-of: 17ba6f0bd14f ("afs: Fix error handling with lookup via FS.InlineBulkStatus")
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ fs/afs/dir.c | 38 +++++++++++++++---------------
+ fs/afs/file.c | 4 ++--
+ fs/afs/fs_operation.c | 21 ++++++++++-------
+ fs/afs/fsclient.c | 2 +-
+ fs/afs/inode.c | 2 +-
+ fs/afs/internal.h | 20 ++++++++++++----
+ fs/afs/rotate.c | 55 ++++++++++++++++++++++++-------------------
+ fs/afs/server.c | 6 ++---
+ fs/afs/write.c | 6 ++---
+ 9 files changed, 87 insertions(+), 67 deletions(-)
+
+diff --git a/fs/afs/dir.c b/fs/afs/dir.c
+index 2df2e9ee130d..15763418a938 100644
+--- a/fs/afs/dir.c
++++ b/fs/afs/dir.c
+@@ -886,14 +886,14 @@ static struct inode *afs_do_lookup(struct inode *dir, struct dentry *dentry,
+ * lookups contained therein are stored in the reply without aborting
+ * the whole operation.
+ */
+- op->error = -ENOTSUPP;
++ afs_op_set_error(op, -ENOTSUPP);
+ if (!cookie->one_only) {
+ op->ops = &afs_inline_bulk_status_operation;
+ afs_begin_vnode_operation(op);
+ afs_wait_for_operation(op);
+ }
+
+- if (op->error == -ENOTSUPP) {
++ if (afs_op_error(op) == -ENOTSUPP) {
+ /* We could try FS.BulkStatus next, but this aborts the entire
+ * op if any of the lookups fails - so, for the moment, revert
+ * to FS.FetchStatus for op->file[1].
+@@ -903,10 +903,10 @@ static struct inode *afs_do_lookup(struct inode *dir, struct dentry *dentry,
+ afs_begin_vnode_operation(op);
+ afs_wait_for_operation(op);
+ }
+- inode = ERR_PTR(op->error);
++ inode = ERR_PTR(afs_op_error(op));
+
+ out_op:
+- if (op->error == 0) {
++ if (!afs_op_error(op)) {
+ inode = &op->file[1].vnode->netfs.inode;
+ op->file[1].vnode = NULL;
+ }
+@@ -1281,7 +1281,7 @@ static void afs_vnode_new_inode(struct afs_operation *op)
+
+ _enter("");
+
+- ASSERTCMP(op->error, ==, 0);
++ ASSERTCMP(afs_op_error(op), ==, 0);
+
+ inode = afs_iget(op, vp);
+ if (IS_ERR(inode)) {
+@@ -1294,7 +1294,7 @@ static void afs_vnode_new_inode(struct afs_operation *op)
+
+ vnode = AFS_FS_I(inode);
+ set_bit(AFS_VNODE_NEW_CONTENT, &vnode->flags);
+- if (!op->error)
++ if (!afs_op_error(op))
+ afs_cache_permit(vnode, op->key, vnode->cb_break, &vp->scb);
+ d_instantiate(op->dentry, inode);
+ }
+@@ -1328,7 +1328,7 @@ static void afs_create_put(struct afs_operation *op)
+ {
+ _enter("op=%08x", op->debug_id);
+
+- if (op->error)
++ if (afs_op_error(op))
+ d_drop(op->dentry);
+ }
+
+@@ -1488,7 +1488,7 @@ static void afs_dir_remove_link(struct afs_operation *op)
+ struct dentry *dentry = op->dentry;
+ int ret;
+
+- if (op->error != 0 ||
++ if (afs_op_error(op) ||
+ (op->file[1].scb.have_status && op->file[1].scb.have_error))
+ return;
+ if (d_really_is_positive(dentry))
+@@ -1512,10 +1512,10 @@ static void afs_dir_remove_link(struct afs_operation *op)
+
+ ret = afs_validate(vnode, op->key);
+ if (ret != -ESTALE)
+- op->error = ret;
++ afs_op_set_error(op, ret);
+ }
+
+- _debug("nlink %d [val %d]", vnode->netfs.inode.i_nlink, op->error);
++ _debug("nlink %d [val %d]", vnode->netfs.inode.i_nlink, afs_op_error(op));
+ }
+
+ static void afs_unlink_success(struct afs_operation *op)
+@@ -1546,7 +1546,7 @@ static void afs_unlink_edit_dir(struct afs_operation *op)
+ static void afs_unlink_put(struct afs_operation *op)
+ {
+ _enter("op=%08x", op->debug_id);
+- if (op->unlink.need_rehash && op->error < 0 && op->error != -ENOENT)
++ if (op->unlink.need_rehash && afs_op_error(op) < 0 && afs_op_error(op) != -ENOENT)
+ d_rehash(op->dentry);
+ }
+
+@@ -1587,7 +1587,7 @@ static int afs_unlink(struct inode *dir, struct dentry *dentry)
+ /* Try to make sure we have a callback promise on the victim. */
+ ret = afs_validate(vnode, op->key);
+ if (ret < 0) {
+- op->error = ret;
++ afs_op_set_error(op, ret);
+ goto error;
+ }
+
+@@ -1596,7 +1596,7 @@ static int afs_unlink(struct inode *dir, struct dentry *dentry)
+ spin_unlock(&dentry->d_lock);
+ /* Start asynchronous writeout of the inode */
+ write_inode_now(d_inode(dentry), 0);
+- op->error = afs_sillyrename(dvnode, vnode, dentry, op->key);
++ afs_op_set_error(op, afs_sillyrename(dvnode, vnode, dentry, op->key));
+ goto error;
+ }
+ if (!d_unhashed(dentry)) {
+@@ -1617,7 +1617,7 @@ static int afs_unlink(struct inode *dir, struct dentry *dentry)
+ /* If there was a conflict with a third party, check the status of the
+ * unlinked vnode.
+ */
+- if (op->error == 0 && (op->flags & AFS_OPERATION_DIR_CONFLICT)) {
++ if (afs_op_error(op) == 0 && (op->flags & AFS_OPERATION_DIR_CONFLICT)) {
+ op->file[1].update_ctime = false;
+ op->fetch_status.which = 1;
+ op->ops = &afs_fetch_status_operation;
+@@ -1699,7 +1699,7 @@ static void afs_link_success(struct afs_operation *op)
+ static void afs_link_put(struct afs_operation *op)
+ {
+ _enter("op=%08x", op->debug_id);
+- if (op->error)
++ if (afs_op_error(op))
+ d_drop(op->dentry);
+ }
+
+@@ -1897,7 +1897,7 @@ static void afs_rename_put(struct afs_operation *op)
+ if (op->rename.rehash)
+ d_rehash(op->rename.rehash);
+ dput(op->rename.tmp);
+- if (op->error)
++ if (afs_op_error(op))
+ d_rehash(op->dentry);
+ }
+
+@@ -1942,7 +1942,7 @@ static int afs_rename(struct mnt_idmap *idmap, struct inode *old_dir,
+ return PTR_ERR(op);
+
+ ret = afs_validate(vnode, op->key);
+- op->error = ret;
++ afs_op_set_error(op, ret);
+ if (ret < 0)
+ goto error;
+
+@@ -1979,7 +1979,7 @@ static int afs_rename(struct mnt_idmap *idmap, struct inode *old_dir,
+ op->rename.tmp = d_alloc(new_dentry->d_parent,
+ &new_dentry->d_name);
+ if (!op->rename.tmp) {
+- op->error = -ENOMEM;
++ afs_op_nomem(op);
+ goto error;
+ }
+
+@@ -1987,7 +1987,7 @@ static int afs_rename(struct mnt_idmap *idmap, struct inode *old_dir,
+ AFS_FS_I(d_inode(new_dentry)),
+ new_dentry, op->key);
+ if (ret) {
+- op->error = ret;
++ afs_op_set_error(op, ret);
+ goto error;
+ }
+
+diff --git a/fs/afs/file.c b/fs/afs/file.c
+index d37dd201752b..0c81c39c32f5 100644
+--- a/fs/afs/file.c
++++ b/fs/afs/file.c
+@@ -243,7 +243,7 @@ static void afs_fetch_data_notify(struct afs_operation *op)
+ {
+ struct afs_read *req = op->fetch.req;
+ struct netfs_io_subrequest *subreq = req->subreq;
+- int error = op->error;
++ int error = afs_op_error(op);
+
+ if (error == -ECONNABORTED)
+ error = afs_abort_to_error(op->ac.abort_code);
+@@ -271,7 +271,7 @@ static void afs_fetch_data_success(struct afs_operation *op)
+
+ static void afs_fetch_data_put(struct afs_operation *op)
+ {
+- op->fetch.req->error = op->error;
++ op->fetch.req->error = afs_op_error(op);
+ afs_put_read(op->fetch.req);
+ }
+
+diff --git a/fs/afs/fs_operation.c b/fs/afs/fs_operation.c
+index 3e31fae9a149..bfb9a7634bd9 100644
+--- a/fs/afs/fs_operation.c
++++ b/fs/afs/fs_operation.c
+@@ -40,8 +40,8 @@ struct afs_operation *afs_alloc_operation(struct key *key, struct afs_volume *vo
+ op->net = volume->cell->net;
+ op->cb_v_break = volume->cb_v_break;
+ op->debug_id = atomic_inc_return(&afs_operation_debug_counter);
+- op->error = -EDESTADDRREQ;
+ op->nr_iterations = -1;
++ afs_op_set_error(op, -EDESTADDRREQ);
+
+ _leave(" = [op=%08x]", op->debug_id);
+ return op;
+@@ -71,7 +71,7 @@ static bool afs_get_io_locks(struct afs_operation *op)
+ swap(vnode, vnode2);
+
+ if (mutex_lock_interruptible(&vnode->io_lock) < 0) {
+- op->error = -ERESTARTSYS;
++ afs_op_set_error(op, -ERESTARTSYS);
+ op->flags |= AFS_OPERATION_STOP;
+ _leave(" = f [I 0]");
+ return false;
+@@ -80,7 +80,7 @@ static bool afs_get_io_locks(struct afs_operation *op)
+
+ if (vnode2) {
+ if (mutex_lock_interruptible_nested(&vnode2->io_lock, 1) < 0) {
+- op->error = -ERESTARTSYS;
++ afs_op_set_error(op, -ERESTARTSYS);
+ op->flags |= AFS_OPERATION_STOP;
+ mutex_unlock(&vnode->io_lock);
+ op->flags &= ~AFS_OPERATION_LOCK_0;
+@@ -159,11 +159,14 @@ static void afs_end_vnode_operation(struct afs_operation *op)
+ {
+ _enter("");
+
+- if (op->error == -EDESTADDRREQ ||
+- op->error == -EADDRNOTAVAIL ||
+- op->error == -ENETUNREACH ||
+- op->error == -EHOSTUNREACH)
++ switch (afs_op_error(op)) {
++ case -EDESTADDRREQ:
++ case -EADDRNOTAVAIL:
++ case -ENETUNREACH:
++ case -EHOSTUNREACH:
+ afs_dump_edestaddrreq(op);
++ break;
++ }
+
+ afs_drop_io_locks(op);
+
+@@ -209,7 +212,7 @@ void afs_wait_for_operation(struct afs_operation *op)
+
+ afs_end_vnode_operation(op);
+
+- if (op->error == 0 && op->ops->edit_dir) {
++ if (!afs_op_error(op) && op->ops->edit_dir) {
+ _debug("edit_dir");
+ op->ops->edit_dir(op);
+ }
+@@ -221,7 +224,7 @@ void afs_wait_for_operation(struct afs_operation *op)
+ */
+ int afs_put_operation(struct afs_operation *op)
+ {
+- int i, ret = op->error;
++ int i, ret = afs_op_error(op);
+
+ _enter("op=%08x,%d", op->debug_id, ret);
+
+diff --git a/fs/afs/fsclient.c b/fs/afs/fsclient.c
+index 7d37f63ef0f0..6821ce0f9d63 100644
+--- a/fs/afs/fsclient.c
++++ b/fs/afs/fsclient.c
+@@ -1899,7 +1899,7 @@ void afs_fs_inline_bulk_status(struct afs_operation *op)
+ int i;
+
+ if (test_bit(AFS_SERVER_FL_NO_IBULK, &op->server->flags)) {
+- op->error = -ENOTSUPP;
++ afs_op_set_error(op, -ENOTSUPP);
+ return;
+ }
+
+diff --git a/fs/afs/inode.c b/fs/afs/inode.c
+index 78efc9719349..d6eed332507f 100644
+--- a/fs/afs/inode.c
++++ b/fs/afs/inode.c
+@@ -331,7 +331,7 @@ static void afs_fetch_status_success(struct afs_operation *op)
+
+ if (vnode->netfs.inode.i_state & I_NEW) {
+ ret = afs_inode_init_from_status(op, vp, vnode);
+- op->error = ret;
++ afs_op_set_error(op, ret);
+ if (ret == 0)
+ afs_cache_permit(vnode, op->key, vp->cb_break_before, &vp->scb);
+ } else {
+diff --git a/fs/afs/internal.h b/fs/afs/internal.h
+index 88381935bd66..1a306df267b0 100644
+--- a/fs/afs/internal.h
++++ b/fs/afs/internal.h
+@@ -1140,11 +1140,6 @@ extern bool afs_begin_vnode_operation(struct afs_operation *);
+ extern void afs_wait_for_operation(struct afs_operation *);
+ extern int afs_do_sync_operation(struct afs_operation *);
+
+-static inline void afs_op_nomem(struct afs_operation *op)
+-{
+- op->error = -ENOMEM;
+-}
+-
+ static inline void afs_op_set_vnode(struct afs_operation *op, unsigned int n,
+ struct afs_vnode *vnode)
+ {
+@@ -1238,6 +1233,21 @@ static inline void __afs_stat(atomic_t *s)
+ extern int afs_abort_to_error(u32);
+ extern void afs_prioritise_error(struct afs_error *, int, u32);
+
++static inline void afs_op_nomem(struct afs_operation *op)
++{
++ op->error = -ENOMEM;
++}
++
++static inline int afs_op_error(const struct afs_operation *op)
++{
++ return op->error;
++}
++
++static inline int afs_op_set_error(struct afs_operation *op, int error)
++{
++ return op->error = error;
++}
++
+ /*
+ * mntpt.c
+ */
+diff --git a/fs/afs/rotate.c b/fs/afs/rotate.c
+index 4084e023ff43..d64c1d90faed 100644
+--- a/fs/afs/rotate.c
++++ b/fs/afs/rotate.c
+@@ -51,7 +51,7 @@ static bool afs_start_fs_iteration(struct afs_operation *op,
+ * and have to return an error.
+ */
+ if (op->flags & AFS_OPERATION_CUR_ONLY) {
+- op->error = -ESTALE;
++ afs_op_set_error(op, -ESTALE);
+ return false;
+ }
+
+@@ -93,7 +93,7 @@ static bool afs_sleep_and_retry(struct afs_operation *op)
+ if (!(op->flags & AFS_OPERATION_UNINTR)) {
+ msleep_interruptible(1000);
+ if (signal_pending(current)) {
+- op->error = -ERESTARTSYS;
++ afs_op_set_error(op, -ERESTARTSYS);
+ return false;
+ }
+ } else {
+@@ -137,7 +137,7 @@ bool afs_select_fileserver(struct afs_operation *op)
+ case 0:
+ default:
+ /* Success or local failure. Stop. */
+- op->error = error;
++ afs_op_set_error(op, error);
+ op->flags |= AFS_OPERATION_STOP;
+ _leave(" = f [okay/local %d]", error);
+ return false;
+@@ -174,11 +174,13 @@ bool afs_select_fileserver(struct afs_operation *op)
+
+ set_bit(AFS_VOLUME_NEEDS_UPDATE, &op->volume->flags);
+ error = afs_check_volume_status(op->volume, op);
+- if (error < 0)
+- goto failed_set_error;
++ if (error < 0) {
++ afs_op_set_error(op, error);
++ goto failed;
++ }
+
+ if (test_bit(AFS_VOLUME_DELETED, &op->volume->flags)) {
+- op->error = -ENOMEDIUM;
++ afs_op_set_error(op, -ENOMEDIUM);
+ goto failed;
+ }
+
+@@ -250,11 +252,11 @@ bool afs_select_fileserver(struct afs_operation *op)
+ clear_bit(AFS_VOLUME_BUSY, &op->volume->flags);
+ }
+ if (op->flags & AFS_OPERATION_NO_VSLEEP) {
+- op->error = -EADV;
++ afs_op_set_error(op, -EADV);
+ goto failed;
+ }
+ if (op->flags & AFS_OPERATION_CUR_ONLY) {
+- op->error = -ESTALE;
++ afs_op_set_error(op, -ESTALE);
+ goto failed;
+ }
+ goto busy;
+@@ -275,7 +277,7 @@ bool afs_select_fileserver(struct afs_operation *op)
+ * lock we need to maintain.
+ */
+ if (op->flags & AFS_OPERATION_NO_VSLEEP) {
+- op->error = -EBUSY;
++ afs_op_set_error(op, -EBUSY);
+ goto failed;
+ }
+ if (!test_and_set_bit(AFS_VOLUME_BUSY, &op->volume->flags)) {
+@@ -304,7 +306,7 @@ bool afs_select_fileserver(struct afs_operation *op)
+ * honour, just in case someone sets up a loop.
+ */
+ if (op->flags & AFS_OPERATION_VMOVED) {
+- op->error = -EREMOTEIO;
++ afs_op_set_error(op, -EREMOTEIO);
+ goto failed;
+ }
+ op->flags |= AFS_OPERATION_VMOVED;
+@@ -312,8 +314,10 @@ bool afs_select_fileserver(struct afs_operation *op)
+ set_bit(AFS_VOLUME_WAIT, &op->volume->flags);
+ set_bit(AFS_VOLUME_NEEDS_UPDATE, &op->volume->flags);
+ error = afs_check_volume_status(op->volume, op);
+- if (error < 0)
+- goto failed_set_error;
++ if (error < 0) {
++ afs_op_set_error(op, error);
++ goto failed;
++ }
+
+ /* If the server list didn't change, then the VLDB is
+ * out of sync with the fileservers. This is hopefully
+@@ -344,7 +348,7 @@ bool afs_select_fileserver(struct afs_operation *op)
+ * Translate locally and return ENOSPC.
+ * No replicas to failover to.
+ */
+- op->error = -ENOSPC;
++ afs_op_set_error(op, -ENOSPC);
+ goto failed_but_online;
+
+ case VOVERQUOTA:
+@@ -353,7 +357,7 @@ bool afs_select_fileserver(struct afs_operation *op)
+ * Translate locally and return EDQUOT.
+ * No replicas to failover to.
+ */
+- op->error = -EDQUOT;
++ afs_op_set_error(op, -EDQUOT);
+ goto failed_but_online;
+
+ default:
+@@ -366,7 +370,7 @@ bool afs_select_fileserver(struct afs_operation *op)
+
+ case -ETIMEDOUT:
+ case -ETIME:
+- if (op->error != -EDESTADDRREQ)
++ if (afs_op_error(op) != -EDESTADDRREQ)
+ goto iterate_address;
+ fallthrough;
+ case -ERFKILL:
+@@ -385,7 +389,7 @@ bool afs_select_fileserver(struct afs_operation *op)
+ fallthrough;
+ case -ECONNRESET:
+ _debug("call reset");
+- op->error = error;
++ afs_op_set_error(op, error);
+ goto failed;
+ }
+
+@@ -401,8 +405,10 @@ bool afs_select_fileserver(struct afs_operation *op)
+ * volume may have moved or even have been deleted.
+ */
+ error = afs_check_volume_status(op->volume, op);
+- if (error < 0)
+- goto failed_set_error;
++ if (error < 0) {
++ afs_op_set_error(op, error);
++ goto failed;
++ }
+
+ if (!afs_start_fs_iteration(op, vnode))
+ goto failed;
+@@ -413,8 +419,10 @@ bool afs_select_fileserver(struct afs_operation *op)
+ _debug("pick [%lx]", op->untried);
+
+ error = afs_wait_for_fs_probes(op->server_list, op->untried);
+- if (error < 0)
+- goto failed_set_error;
++ if (error < 0) {
++ afs_op_set_error(op, error);
++ goto failed;
++ }
+
+ /* Pick the untried server with the lowest RTT. If we have outstanding
+ * callbacks, we stick with the server we're already using if we can.
+@@ -515,7 +523,8 @@ bool afs_select_fileserver(struct afs_operation *op)
+ op->flags &= ~AFS_OPERATION_RETRY_SERVER;
+ goto retry_server;
+ case -ERESTARTSYS:
+- goto failed_set_error;
++ afs_op_set_error(op, error);
++ goto failed;
+ case -ETIME:
+ case -EDESTADDRREQ:
+ goto next_server;
+@@ -544,13 +553,11 @@ bool afs_select_fileserver(struct afs_operation *op)
+ }
+
+ error = e.error;
+-
+-failed_set_error:
+ op->error = error;
+ failed:
+ op->flags |= AFS_OPERATION_STOP;
+ afs_end_cursor(&op->ac);
+- _leave(" = f [failed %d]", op->error);
++ _leave(" = f [failed %d]", afs_op_error(op));
+ return false;
+ }
+
+diff --git a/fs/afs/server.c b/fs/afs/server.c
+index 5b5fa94005c9..2826e6eced71 100644
+--- a/fs/afs/server.c
++++ b/fs/afs/server.c
+@@ -629,8 +629,8 @@ static noinline bool afs_update_server_record(struct afs_operation *op,
+ _leave(" = t [intr]");
+ return true;
+ }
+- op->error = PTR_ERR(alist);
+- _leave(" = f [%d]", op->error);
++ afs_op_set_error(op, PTR_ERR(alist));
++ _leave(" = f [%d]", afs_op_error(op));
+ return false;
+ }
+
+@@ -684,7 +684,7 @@ bool afs_check_server_record(struct afs_operation *op, struct afs_server *server
+ (op->flags & AFS_OPERATION_UNINTR) ?
+ TASK_UNINTERRUPTIBLE : TASK_INTERRUPTIBLE);
+ if (ret == -ERESTARTSYS) {
+- op->error = ret;
++ afs_op_set_error(op, ret);
+ _leave(" = f [intr]");
+ return false;
+ }
+diff --git a/fs/afs/write.c b/fs/afs/write.c
+index 4a168781936b..9f90d8970ce9 100644
+--- a/fs/afs/write.c
++++ b/fs/afs/write.c
+@@ -366,7 +366,7 @@ static void afs_store_data_success(struct afs_operation *op)
+
+ op->ctime = op->file[0].scb.status.mtime_client;
+ afs_vnode_commit_status(op, &op->file[0]);
+- if (op->error == 0) {
++ if (!afs_op_error(op)) {
+ if (!op->store.laundering)
+ afs_pages_written_back(vnode, op->store.pos, op->store.size);
+ afs_stat_v(vnode, n_stores);
+@@ -428,7 +428,7 @@ static int afs_store_data(struct afs_vnode *vnode, struct iov_iter *iter, loff_t
+
+ afs_wait_for_operation(op);
+
+- switch (op->error) {
++ switch (afs_op_error(op)) {
+ case -EACCES:
+ case -EPERM:
+ case -ENOKEY:
+@@ -447,7 +447,7 @@ static int afs_store_data(struct afs_vnode *vnode, struct iov_iter *iter, loff_t
+ }
+
+ afs_put_wb_key(wbk);
+- _leave(" = %d", op->error);
++ _leave(" = %d", afs_op_error(op));
+ return afs_put_operation(op);
+ }
+
+--
+2.43.0
+
--- /dev/null
+From 171db765ad289c8587da5dec137d4deb2f99c402 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Wed, 17 Jan 2024 15:45:14 -0800
+Subject: bnxt_en: Prevent kernel warning when running offline self test
+
+From: Michael Chan <michael.chan@broadcom.com>
+
+[ Upstream commit c20f482129a582455f02eb9a6dcb2a4215274599 ]
+
+We call bnxt_half_open_nic() to setup the chip partially to run
+loopback tests. The rings and buffers are initialized normally
+so that we can transmit and receive packets in loopback mode.
+That means page pool buffers are allocated for the aggregation ring
+just like the normal case. NAPI is not needed because we are just
+polling for the loopback packets.
+
+When we're done with the loopback tests, we call bnxt_half_close_nic()
+to clean up. When freeing the page pools, we hit a WARN_ON()
+in page_pool_unlink_napi() because the NAPI state linked to the
+page pool is uninitialized.
+
+The simplest way to avoid this warning is just to initialize the
+NAPIs during half open and delete the NAPIs during half close.
+Trying to skip the page pool initialization or skip linking of
+NAPI during half open will be more complicated.
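+
+In outline, condensed from the diff below:
+
+  int bnxt_half_open_nic(struct bnxt *bp)
+  {
+          ...
+          bnxt_init_napi(bp);          /* link NAPIs before ring init */
+          set_bit(BNXT_STATE_HALF_OPEN, &bp->state);
+          rc = bnxt_init_nic(bp, true);
+          if (rc) {
+                  clear_bit(BNXT_STATE_HALF_OPEN, &bp->state);
+                  bnxt_del_napi(bp);   /* unwind on failure */
+                  ...
+          }
+          ...
+  }
+
+  void bnxt_half_close_nic(struct bnxt *bp)
+  {
+          bnxt_hwrm_resource_free(bp, false, true);
+          bnxt_del_napi(bp);           /* delete NAPIs before freeing pools */
+          bnxt_free_skbs(bp);
+          bnxt_free_mem(bp, true);     /* page_pool_destroy() happens here */
+          clear_bit(BNXT_STATE_HALF_OPEN, &bp->state);
+  }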
+
+This fix avoids this warning:
+
+WARNING: CPU: 4 PID: 46967 at net/core/page_pool.c:946 page_pool_unlink_napi+0x1f/0x30
+CPU: 4 PID: 46967 Comm: ethtool Tainted: G S W 6.7.0-rc5+ #22
+Hardware name: Dell Inc. PowerEdge R750/06V45N, BIOS 1.3.8 08/31/2021
+RIP: 0010:page_pool_unlink_napi+0x1f/0x30
+Code: 90 90 90 90 90 90 90 90 90 90 90 0f 1f 44 00 00 48 8b 47 18 48 85 c0 74 1b 48 8b 50 10 83 e2 01 74 08 8b 40 34 83 f8 ff 74 02 <0f> 0b 48 c7 47 18 00 00 00 00 c3 cc cc cc cc 66 90 90 90 90 90 90
+RSP: 0018:ffa000003d0dfbe8 EFLAGS: 00010246
+RAX: ff110003607ce640 RBX: ff110010baf5d000 RCX: 0000000000000008
+RDX: 0000000000000000 RSI: ff110001e5e522c0 RDI: ff110010baf5d000
+RBP: ff11000145539b40 R08: 0000000000000001 R09: ffffffffc063f641
+R10: ff110001361eddb8 R11: 000000000040000f R12: 0000000000000001
+R13: 000000000000001c R14: ff1100014553a080 R15: 0000000000003fc0
+FS: 00007f9301c4f740(0000) GS:ff1100103fd00000(0000) knlGS:0000000000000000
+CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
+CR2: 00007f91344fa8f0 CR3: 00000003527cc005 CR4: 0000000000771ef0
+DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
+DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
+PKRU: 55555554
+Call Trace:
+ <TASK>
+ ? __warn+0x81/0x140
+ ? page_pool_unlink_napi+0x1f/0x30
+ ? report_bug+0x102/0x200
+ ? handle_bug+0x44/0x70
+ ? exc_invalid_op+0x13/0x60
+ ? asm_exc_invalid_op+0x16/0x20
+ ? bnxt_free_ring.isra.123+0xb1/0xd0 [bnxt_en]
+ ? page_pool_unlink_napi+0x1f/0x30
+ page_pool_destroy+0x3e/0x150
+ bnxt_free_mem+0x441/0x5e0 [bnxt_en]
+ bnxt_half_close_nic+0x2a/0x40 [bnxt_en]
+ bnxt_self_test+0x21d/0x450 [bnxt_en]
+ __dev_ethtool+0xeda/0x2e30
+ ? native_queued_spin_lock_slowpath+0x17f/0x2b0
+ ? __link_object+0xa1/0x160
+ ? _raw_spin_unlock_irqrestore+0x23/0x40
+ ? __create_object+0x5f/0x90
+ ? __kmem_cache_alloc_node+0x317/0x3c0
+ ? dev_ethtool+0x59/0x170
+ dev_ethtool+0xa7/0x170
+ dev_ioctl+0xc3/0x530
+ sock_do_ioctl+0xa8/0xf0
+ sock_ioctl+0x270/0x310
+ __x64_sys_ioctl+0x8c/0xc0
+ do_syscall_64+0x3e/0xf0
+ entry_SYSCALL_64_after_hwframe+0x6e/0x76
+
+Fixes: 294e39e0d034 ("bnxt: hook NAPIs to page pools")
+Reviewed-by: Andy Gospodarek <andrew.gospodarek@broadcom.com>
+Reviewed-by: Ajit Khaparde <ajit.khaparde@broadcom.com>
+Signed-off-by: Michael Chan <michael.chan@broadcom.com>
+Link: https://lore.kernel.org/r/20240117234515.226944-5-michael.chan@broadcom.com
+Signed-off-by: Jakub Kicinski <kuba@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/net/ethernet/broadcom/bnxt/bnxt.c | 3 +++
+ 1 file changed, 3 insertions(+)
+
+diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
+index 1019b4dc7bed..22c8bfb5ed9d 100644
+--- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c
++++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
+@@ -10627,10 +10627,12 @@ int bnxt_half_open_nic(struct bnxt *bp)
+ netdev_err(bp->dev, "bnxt_alloc_mem err: %x\n", rc);
+ goto half_open_err;
+ }
++ bnxt_init_napi(bp);
+ set_bit(BNXT_STATE_HALF_OPEN, &bp->state);
+ rc = bnxt_init_nic(bp, true);
+ if (rc) {
+ clear_bit(BNXT_STATE_HALF_OPEN, &bp->state);
++ bnxt_del_napi(bp);
+ netdev_err(bp->dev, "bnxt_init_nic err: %x\n", rc);
+ goto half_open_err;
+ }
+@@ -10649,6 +10651,7 @@ int bnxt_half_open_nic(struct bnxt *bp)
+ void bnxt_half_close_nic(struct bnxt *bp)
+ {
+ bnxt_hwrm_resource_free(bp, false, true);
++ bnxt_del_napi(bp);
+ bnxt_free_skbs(bp);
+ bnxt_free_mem(bp, true);
+ clear_bit(BNXT_STATE_HALF_OPEN, &bp->state);
+--
+2.43.0
+
--- /dev/null
+From c6cfa8547d19c5c8f5f9a9fe22bd0b1064af03a7 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Wed, 17 Jan 2024 15:45:11 -0800
+Subject: bnxt_en: Wait for FLR to complete during probe
+
+From: Michael Chan <michael.chan@broadcom.com>
+
+[ Upstream commit 3c1069fa42872f95cf3c6fedf80723d391e12d57 ]
+
+The first message to firmware may fail if the device is undergoing FLR.
+The driver has some recovery logic for this failure scenario but we must
+wait 100 msec for FLR to complete before proceeding. Otherwise the
+recovery will always fail.
+
+Fixes: ba02629ff6cb ("bnxt_en: log firmware status on firmware init failure")
+Reviewed-by: Damodharam Ammepalli <damodharam.ammepalli@broadcom.com>
+Signed-off-by: Michael Chan <michael.chan@broadcom.com>
+Link: https://lore.kernel.org/r/20240117234515.226944-2-michael.chan@broadcom.com
+Signed-off-by: Jakub Kicinski <kuba@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/net/ethernet/broadcom/bnxt/bnxt.c | 5 +++++
+ 1 file changed, 5 insertions(+)
+
+diff --git a/drivers/net/ethernet/broadcom/bnxt/bnxt.c b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
+index e1f1e646cf48..1019b4dc7bed 100644
+--- a/drivers/net/ethernet/broadcom/bnxt/bnxt.c
++++ b/drivers/net/ethernet/broadcom/bnxt/bnxt.c
+@@ -12298,6 +12298,11 @@ static int bnxt_fw_init_one_p1(struct bnxt *bp)
+
+ bp->fw_cap = 0;
+ rc = bnxt_hwrm_ver_get(bp);
++ /* FW may be unresponsive after FLR. FLR must complete within 100 msec
++ * so wait before continuing with recovery.
++ */
++ if (rc)
++ msleep(100);
+ bnxt_try_map_fw_health_reg(bp);
+ if (rc) {
+ rc = bnxt_try_recover_fw(bp);
+--
+2.43.0
+
--- /dev/null
+From 7893c91364a20cba4d0c74f3b3455ab5e3175dec Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Wed, 17 Jan 2024 11:02:25 +1030
+Subject: btrfs: scrub: avoid use-after-free when chunk length is not 64K
+ aligned
+
+From: Qu Wenruo <wqu@suse.com>
+
+[ Upstream commit f546c4282673497a06ecb6190b50ae7f6c85b02f ]
+
+[BUG]
+There is a bug report that, on an ext4-converted btrfs, scrub leads to
+various problems, including:
+
+- "unable to find chunk map" errors
+ BTRFS info (device vdb): scrub: started on devid 1
+ BTRFS critical (device vdb): unable to find chunk map for logical 2214744064 length 4096
+ BTRFS critical (device vdb): unable to find chunk map for logical 2214744064 length 45056
+
+  This would lead to unrepairable errors.
+
+- Use-after-free KASAN reports:
+ ==================================================================
+ BUG: KASAN: slab-use-after-free in __blk_rq_map_sg+0x18f/0x7c0
+ Read of size 8 at addr ffff8881013c9040 by task btrfs/909
+ CPU: 0 PID: 909 Comm: btrfs Not tainted 6.7.0-x64v3-dbg #11 c50636e9419a8354555555245df535e380563b2b
+ Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS 2023.11-2 12/24/2023
+ Call Trace:
+ <TASK>
+ dump_stack_lvl+0x43/0x60
+ print_report+0xcf/0x640
+ kasan_report+0xa6/0xd0
+ __blk_rq_map_sg+0x18f/0x7c0
+ virtblk_prep_rq.isra.0+0x215/0x6a0 [virtio_blk 19a65eeee9ae6fcf02edfad39bb9ddee07dcdaff]
+ virtio_queue_rqs+0xc4/0x310 [virtio_blk 19a65eeee9ae6fcf02edfad39bb9ddee07dcdaff]
+ blk_mq_flush_plug_list.part.0+0x780/0x860
+ __blk_flush_plug+0x1ba/0x220
+ blk_finish_plug+0x3b/0x60
+ submit_initial_group_read+0x10a/0x290 [btrfs e57987a360bed82fe8756dcd3e0de5406ccfe965]
+ flush_scrub_stripes+0x38e/0x430 [btrfs e57987a360bed82fe8756dcd3e0de5406ccfe965]
+ scrub_stripe+0x82a/0xae0 [btrfs e57987a360bed82fe8756dcd3e0de5406ccfe965]
+ scrub_chunk+0x178/0x200 [btrfs e57987a360bed82fe8756dcd3e0de5406ccfe965]
+ scrub_enumerate_chunks+0x4bc/0xa30 [btrfs e57987a360bed82fe8756dcd3e0de5406ccfe965]
+ btrfs_scrub_dev+0x398/0x810 [btrfs e57987a360bed82fe8756dcd3e0de5406ccfe965]
+ btrfs_ioctl+0x4b9/0x3020 [btrfs e57987a360bed82fe8756dcd3e0de5406ccfe965]
+ __x64_sys_ioctl+0xbd/0x100
+ do_syscall_64+0x5d/0xe0
+ entry_SYSCALL_64_after_hwframe+0x63/0x6b
+ RIP: 0033:0x7f47e5e0952b
+
+- Crash, mostly due to above use-after-free
+
+[CAUSE]
+The converted fs has the following data chunk layout:
+
+ item 2 key (FIRST_CHUNK_TREE CHUNK_ITEM 2214658048) itemoff 16025 itemsize 80
+ length 86016 owner 2 stripe_len 65536 type DATA|single
+
+For the above logical bytenr 2214744064, it's at the chunk end
+(2214658048 + 86016 = 2214744064).
+
+This means btrfs_submit_bio() would split the bio, and trigger the endio
+function for each of the two halves.
+
+However, scrub_submit_initial_read() expects the endio function to be
+called only once.
+This means the first endio call would already have freed the bbio::bio,
+leaving the bvecs freed, so the second endio call leads to a
+use-after-free.
+
+[FIX]
+- Make sure scrub_read_endio() only updates bits in its range
+  Since we may read less than 64K at the end of the chunk, we should not
+  touch the bits beyond the chunk boundary.
+
+- Make sure scrub_submit_initial_read() only reads the chunk range
+  This is done by calculating the real number of sectors we need to
+  read, and adding them to the bio sector by sector.
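+
+  Condensed from the diff below, the two calculations are:
+
+    /* scrub_read_endio(): only touch the bits this bio covered */
+    sector_nr = calc_sector_number(stripe, bio_first_bvec_all(&bbio->bio));
+    bio_for_each_bvec_all(bvec, &bbio->bio, i)
+            bio_size += bvec->bv_len;
+    num_sectors = bio_size >> stripe->bg->fs_info->sectorsize_bits;
+
+    /* scrub_submit_initial_read(): stop at the chunk boundary */
+    nr_sectors = min(BTRFS_STRIPE_LEN, stripe->bg->start +
+                     stripe->bg->length - stripe->logical) >>
+                 fs_info->sectorsize_bits;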
+
+Thankfully the scrub read repair path won't need extra fixes:
+
+- scrub_stripe_submit_repair_read()
+  With the above fixes, we won't update error bits for ranges beyond the
+  chunk, thus scrub_stripe_submit_repair_read() should never submit any
+  read beyond the chunk.
+
+Reported-by: Rongrong <i@rong.moe>
+Fixes: e02ee89baa66 ("btrfs: scrub: switch scrub_simple_mirror() to scrub_stripe infrastructure")
+Tested-by: Rongrong <i@rong.moe>
+Reviewed-by: Johannes Thumshirn <johannes.thumshirn@wdc.com>
+Signed-off-by: Qu Wenruo <wqu@suse.com>
+Signed-off-by: David Sterba <dsterba@suse.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ fs/btrfs/scrub.c | 29 ++++++++++++++++++++++-------
+ 1 file changed, 22 insertions(+), 7 deletions(-)
+
+diff --git a/fs/btrfs/scrub.c b/fs/btrfs/scrub.c
+index f62a408671cb..443d2519f0a9 100644
+--- a/fs/btrfs/scrub.c
++++ b/fs/btrfs/scrub.c
+@@ -1099,12 +1099,22 @@ static void scrub_stripe_read_repair_worker(struct work_struct *work)
+ static void scrub_read_endio(struct btrfs_bio *bbio)
+ {
+ struct scrub_stripe *stripe = bbio->private;
++ struct bio_vec *bvec;
++ int sector_nr = calc_sector_number(stripe, bio_first_bvec_all(&bbio->bio));
++ int num_sectors;
++ u32 bio_size = 0;
++ int i;
++
++ ASSERT(sector_nr < stripe->nr_sectors);
++ bio_for_each_bvec_all(bvec, &bbio->bio, i)
++ bio_size += bvec->bv_len;
++ num_sectors = bio_size >> stripe->bg->fs_info->sectorsize_bits;
+
+ if (bbio->bio.bi_status) {
+- bitmap_set(&stripe->io_error_bitmap, 0, stripe->nr_sectors);
+- bitmap_set(&stripe->error_bitmap, 0, stripe->nr_sectors);
++ bitmap_set(&stripe->io_error_bitmap, sector_nr, num_sectors);
++ bitmap_set(&stripe->error_bitmap, sector_nr, num_sectors);
+ } else {
+- bitmap_clear(&stripe->io_error_bitmap, 0, stripe->nr_sectors);
++ bitmap_clear(&stripe->io_error_bitmap, sector_nr, num_sectors);
+ }
+ bio_put(&bbio->bio);
+ if (atomic_dec_and_test(&stripe->pending_io)) {
+@@ -1705,6 +1715,9 @@ static void scrub_submit_initial_read(struct scrub_ctx *sctx,
+ {
+ struct btrfs_fs_info *fs_info = sctx->fs_info;
+ struct btrfs_bio *bbio;
++ unsigned int nr_sectors = min(BTRFS_STRIPE_LEN, stripe->bg->start +
++ stripe->bg->length - stripe->logical) >>
++ fs_info->sectorsize_bits;
+ int mirror = stripe->mirror_num;
+
+ ASSERT(stripe->bg);
+@@ -1719,14 +1732,16 @@ static void scrub_submit_initial_read(struct scrub_ctx *sctx,
+ bbio = btrfs_bio_alloc(SCRUB_STRIPE_PAGES, REQ_OP_READ, fs_info,
+ scrub_read_endio, stripe);
+
+- /* Read the whole stripe. */
+ bbio->bio.bi_iter.bi_sector = stripe->logical >> SECTOR_SHIFT;
+- for (int i = 0; i < BTRFS_STRIPE_LEN >> PAGE_SHIFT; i++) {
++ /* Read the whole range inside the chunk boundary. */
++ for (unsigned int cur = 0; cur < nr_sectors; cur++) {
++ struct page *page = scrub_stripe_get_page(stripe, cur);
++ unsigned int pgoff = scrub_stripe_get_page_offset(stripe, cur);
+ int ret;
+
+- ret = bio_add_page(&bbio->bio, stripe->pages[i], PAGE_SIZE, 0);
++ ret = bio_add_page(&bbio->bio, page, fs_info->sectorsize, pgoff);
+ /* We should have allocated enough bio vectors. */
+- ASSERT(ret == PAGE_SIZE);
++ ASSERT(ret == fs_info->sectorsize);
+ }
+ atomic_inc(&stripe->pending_io);
+
+--
+2.43.0
+
--- /dev/null
+From 1a1ebca1fa42f6ee08f20960c224d7176929bbc5 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Fri, 19 Jan 2024 14:43:01 +0100
+Subject: dpll: fix broken error path in dpll_pin_alloc(..)
+
+From: Arkadiusz Kubalewski <arkadiusz.kubalewski@intel.com>
+
+[ Upstream commit b6a11a7fc4d6337f7ea720b9287d1b9749c4eae0 ]
+
+If the pin type is not one of the expected values, or allocating memory
+for the pin properties fails, the unwind error path must not destroy the
+pin's xarrays, which have not yet been initialized.
+Add a new goto label and use it to fix the broken error path.
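+
+The resulting unwind ladder, as a sketch:
+
+  ret = xa_alloc(&dpll_pin_xa, &pin->id, pin, xa_limit_16b, GFP_KERNEL);
+  if (ret)
+          goto err_xa_alloc;
+  return pin;
+err_xa_alloc:
+  xa_destroy(&pin->dpll_refs);     /* only reached after xa_init_flags() */
+  xa_destroy(&pin->parent_refs);
+err_pin_prop:
+  kfree(pin);                      /* xarrays not yet initialized here */
+  return ERR_PTR(ret);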
+
+Reviewed-by: Jiri Pirko <jiri@nvidia.com>
+Signed-off-by: Arkadiusz Kubalewski <arkadiusz.kubalewski@intel.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Stable-dep-of: 830ead5fb0c5 ("dpll: fix pin dump crash for rebound module")
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/dpll/dpll_core.c | 7 ++++---
+ 1 file changed, 4 insertions(+), 3 deletions(-)
+
+diff --git a/drivers/dpll/dpll_core.c b/drivers/dpll/dpll_core.c
+index 3568149b9562..36f5c0eaf604 100644
+--- a/drivers/dpll/dpll_core.c
++++ b/drivers/dpll/dpll_core.c
+@@ -440,7 +440,7 @@ dpll_pin_alloc(u64 clock_id, u32 pin_idx, struct module *module,
+ if (WARN_ON(prop->type < DPLL_PIN_TYPE_MUX ||
+ prop->type > DPLL_PIN_TYPE_MAX)) {
+ ret = -EINVAL;
+- goto err;
++ goto err_pin_prop;
+ }
+ pin->prop = prop;
+ refcount_set(&pin->refcount, 1);
+@@ -448,11 +448,12 @@ dpll_pin_alloc(u64 clock_id, u32 pin_idx, struct module *module,
+ xa_init_flags(&pin->parent_refs, XA_FLAGS_ALLOC);
+ ret = xa_alloc(&dpll_pin_xa, &pin->id, pin, xa_limit_16b, GFP_KERNEL);
+ if (ret)
+- goto err;
++ goto err_xa_alloc;
+ return pin;
+-err:
++err_xa_alloc:
+ xa_destroy(&pin->dpll_refs);
+ xa_destroy(&pin->parent_refs);
++err_pin_prop:
+ kfree(pin);
+ return ERR_PTR(ret);
+ }
+--
+2.43.0
+
--- /dev/null
+From c92ec4869e0b3bdd99cf0a23d92cc463e45348dd Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Fri, 19 Jan 2024 14:43:02 +0100
+Subject: dpll: fix pin dump crash for rebound module
+
+From: Arkadiusz Kubalewski <arkadiusz.kubalewski@intel.com>
+
+[ Upstream commit 830ead5fb0c5855ce4d70ba2ed4a673b5f1e7d9b ]
+
+When a kernel module is unbound but the pin resources were not entirely
+freed (another kernel module instance of the same PCI device kept a
+reference to that pin), and the kernel module is bound again, the pin
+properties are not updated (the properties are only assigned when memory
+for the pin is allocated), so the prop pointer still points into the
+memory of the kernel module that was unloaded on unbind.
+
+If the pin dump is invoked in this state, the result is a kernel crash.
+Prevent the crash by storing persistent pin properties in the dpll
+subsystem: copy their content from the kernel module when the pin is
+allocated, instead of referencing the kernel module's memory.
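+
+Sketch of the copy now done at pin allocation:
+
+  ret = dpll_pin_prop_dup(prop, &pin->prop);  /* deep-copies the labels and
+                                               * supported-frequency ranges
+                                               */
+  if (ret)
+          goto err_pin_prop;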
+
+Fixes: 9431063ad323 ("dpll: core: Add DPLL framework base functions")
+Fixes: 9d71b54b65b1 ("dpll: netlink: Add DPLL framework base functions")
+Reviewed-by: Jan Glaza <jan.glaza@intel.com>
+Reviewed-by: Przemek Kitszel <przemyslaw.kitszel@intel.com>
+Signed-off-by: Arkadiusz Kubalewski <arkadiusz.kubalewski@intel.com>
+Reviewed-by: Jiri Pirko <jiri@nvidia.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/dpll/dpll_core.c | 55 +++++++++++++++++++++++++++++++++++--
+ drivers/dpll/dpll_core.h | 4 +--
+ drivers/dpll/dpll_netlink.c | 28 +++++++++----------
+ 3 files changed, 69 insertions(+), 18 deletions(-)
+
+diff --git a/drivers/dpll/dpll_core.c b/drivers/dpll/dpll_core.c
+index 36f5c0eaf604..5e3b9b5679f9 100644
+--- a/drivers/dpll/dpll_core.c
++++ b/drivers/dpll/dpll_core.c
+@@ -424,6 +424,53 @@ void dpll_device_unregister(struct dpll_device *dpll,
+ }
+ EXPORT_SYMBOL_GPL(dpll_device_unregister);
+
++static void dpll_pin_prop_free(struct dpll_pin_properties *prop)
++{
++ kfree(prop->package_label);
++ kfree(prop->panel_label);
++ kfree(prop->board_label);
++ kfree(prop->freq_supported);
++}
++
++static int dpll_pin_prop_dup(const struct dpll_pin_properties *src,
++ struct dpll_pin_properties *dst)
++{
++ memcpy(dst, src, sizeof(*dst));
++ if (src->freq_supported && src->freq_supported_num) {
++ size_t freq_size = src->freq_supported_num *
++ sizeof(*src->freq_supported);
++ dst->freq_supported = kmemdup(src->freq_supported,
++ freq_size, GFP_KERNEL);
++		if (!dst->freq_supported)
++ return -ENOMEM;
++ }
++ if (src->board_label) {
++ dst->board_label = kstrdup(src->board_label, GFP_KERNEL);
++ if (!dst->board_label)
++ goto err_board_label;
++ }
++ if (src->panel_label) {
++ dst->panel_label = kstrdup(src->panel_label, GFP_KERNEL);
++ if (!dst->panel_label)
++ goto err_panel_label;
++ }
++ if (src->package_label) {
++ dst->package_label = kstrdup(src->package_label, GFP_KERNEL);
++ if (!dst->package_label)
++ goto err_package_label;
++ }
++
++ return 0;
++
++err_package_label:
++ kfree(dst->panel_label);
++err_panel_label:
++ kfree(dst->board_label);
++err_board_label:
++ kfree(dst->freq_supported);
++ return -ENOMEM;
++}
++
+ static struct dpll_pin *
+ dpll_pin_alloc(u64 clock_id, u32 pin_idx, struct module *module,
+ const struct dpll_pin_properties *prop)
+@@ -442,7 +489,9 @@ dpll_pin_alloc(u64 clock_id, u32 pin_idx, struct module *module,
+ ret = -EINVAL;
+ goto err_pin_prop;
+ }
+- pin->prop = prop;
++ ret = dpll_pin_prop_dup(prop, &pin->prop);
++ if (ret)
++ goto err_pin_prop;
+ refcount_set(&pin->refcount, 1);
+ xa_init_flags(&pin->dpll_refs, XA_FLAGS_ALLOC);
+ xa_init_flags(&pin->parent_refs, XA_FLAGS_ALLOC);
+@@ -453,6 +502,7 @@ dpll_pin_alloc(u64 clock_id, u32 pin_idx, struct module *module,
+ err_xa_alloc:
+ xa_destroy(&pin->dpll_refs);
+ xa_destroy(&pin->parent_refs);
++ dpll_pin_prop_free(&pin->prop);
+ err_pin_prop:
+ kfree(pin);
+ return ERR_PTR(ret);
+@@ -513,6 +563,7 @@ void dpll_pin_put(struct dpll_pin *pin)
+ xa_destroy(&pin->dpll_refs);
+ xa_destroy(&pin->parent_refs);
+ xa_erase(&dpll_pin_xa, pin->id);
++ dpll_pin_prop_free(&pin->prop);
+ kfree(pin);
+ }
+ mutex_unlock(&dpll_lock);
+@@ -635,7 +686,7 @@ int dpll_pin_on_pin_register(struct dpll_pin *parent, struct dpll_pin *pin,
+ unsigned long i, stop;
+ int ret;
+
+- if (WARN_ON(parent->prop->type != DPLL_PIN_TYPE_MUX))
++ if (WARN_ON(parent->prop.type != DPLL_PIN_TYPE_MUX))
+ return -EINVAL;
+
+ if (WARN_ON(!ops) ||
+diff --git a/drivers/dpll/dpll_core.h b/drivers/dpll/dpll_core.h
+index 5585873c5c1b..717f715015c7 100644
+--- a/drivers/dpll/dpll_core.h
++++ b/drivers/dpll/dpll_core.h
+@@ -44,7 +44,7 @@ struct dpll_device {
+ * @module: module of creator
+ * @dpll_refs: hold referencees to dplls pin was registered with
+ * @parent_refs: hold references to parent pins pin was registered with
+- * @prop: pointer to pin properties given by registerer
++ * @prop: pin properties copied from the registerer
+ * @rclk_dev_name: holds name of device when pin can recover clock from it
+ * @refcount: refcount
+ **/
+@@ -55,7 +55,7 @@ struct dpll_pin {
+ struct module *module;
+ struct xarray dpll_refs;
+ struct xarray parent_refs;
+- const struct dpll_pin_properties *prop;
++ struct dpll_pin_properties prop;
+ refcount_t refcount;
+ };
+
+diff --git a/drivers/dpll/dpll_netlink.c b/drivers/dpll/dpll_netlink.c
+index ce7cf736f020..4c64611d32ac 100644
+--- a/drivers/dpll/dpll_netlink.c
++++ b/drivers/dpll/dpll_netlink.c
+@@ -278,17 +278,17 @@ dpll_msg_add_pin_freq(struct sk_buff *msg, struct dpll_pin *pin,
+ if (nla_put_64bit(msg, DPLL_A_PIN_FREQUENCY, sizeof(freq), &freq,
+ DPLL_A_PIN_PAD))
+ return -EMSGSIZE;
+- for (fs = 0; fs < pin->prop->freq_supported_num; fs++) {
++ for (fs = 0; fs < pin->prop.freq_supported_num; fs++) {
+ nest = nla_nest_start(msg, DPLL_A_PIN_FREQUENCY_SUPPORTED);
+ if (!nest)
+ return -EMSGSIZE;
+- freq = pin->prop->freq_supported[fs].min;
++ freq = pin->prop.freq_supported[fs].min;
+ if (nla_put_64bit(msg, DPLL_A_PIN_FREQUENCY_MIN, sizeof(freq),
+ &freq, DPLL_A_PIN_PAD)) {
+ nla_nest_cancel(msg, nest);
+ return -EMSGSIZE;
+ }
+- freq = pin->prop->freq_supported[fs].max;
++ freq = pin->prop.freq_supported[fs].max;
+ if (nla_put_64bit(msg, DPLL_A_PIN_FREQUENCY_MAX, sizeof(freq),
+ &freq, DPLL_A_PIN_PAD)) {
+ nla_nest_cancel(msg, nest);
+@@ -304,9 +304,9 @@ static bool dpll_pin_is_freq_supported(struct dpll_pin *pin, u32 freq)
+ {
+ int fs;
+
+- for (fs = 0; fs < pin->prop->freq_supported_num; fs++)
+- if (freq >= pin->prop->freq_supported[fs].min &&
+- freq <= pin->prop->freq_supported[fs].max)
++ for (fs = 0; fs < pin->prop.freq_supported_num; fs++)
++ if (freq >= pin->prop.freq_supported[fs].min &&
++ freq <= pin->prop.freq_supported[fs].max)
+ return true;
+ return false;
+ }
+@@ -396,7 +396,7 @@ static int
+ dpll_cmd_pin_get_one(struct sk_buff *msg, struct dpll_pin *pin,
+ struct netlink_ext_ack *extack)
+ {
+- const struct dpll_pin_properties *prop = pin->prop;
++ const struct dpll_pin_properties *prop = &pin->prop;
+ struct dpll_pin_ref *ref;
+ int ret;
+
+@@ -689,7 +689,7 @@ dpll_pin_on_pin_state_set(struct dpll_pin *pin, u32 parent_idx,
+ int ret;
+
+ if (!(DPLL_PIN_CAPABILITIES_STATE_CAN_CHANGE &
+- pin->prop->capabilities)) {
++ pin->prop.capabilities)) {
+ NL_SET_ERR_MSG(extack, "state changing is not allowed");
+ return -EOPNOTSUPP;
+ }
+@@ -725,7 +725,7 @@ dpll_pin_state_set(struct dpll_device *dpll, struct dpll_pin *pin,
+ int ret;
+
+ if (!(DPLL_PIN_CAPABILITIES_STATE_CAN_CHANGE &
+- pin->prop->capabilities)) {
++ pin->prop.capabilities)) {
+ NL_SET_ERR_MSG(extack, "state changing is not allowed");
+ return -EOPNOTSUPP;
+ }
+@@ -752,7 +752,7 @@ dpll_pin_prio_set(struct dpll_device *dpll, struct dpll_pin *pin,
+ int ret;
+
+ if (!(DPLL_PIN_CAPABILITIES_PRIORITY_CAN_CHANGE &
+- pin->prop->capabilities)) {
++ pin->prop.capabilities)) {
+ NL_SET_ERR_MSG(extack, "prio changing is not allowed");
+ return -EOPNOTSUPP;
+ }
+@@ -780,7 +780,7 @@ dpll_pin_direction_set(struct dpll_pin *pin, struct dpll_device *dpll,
+ int ret;
+
+ if (!(DPLL_PIN_CAPABILITIES_DIRECTION_CAN_CHANGE &
+- pin->prop->capabilities)) {
++ pin->prop.capabilities)) {
+ NL_SET_ERR_MSG(extack, "direction changing is not allowed");
+ return -EOPNOTSUPP;
+ }
+@@ -810,8 +810,8 @@ dpll_pin_phase_adj_set(struct dpll_pin *pin, struct nlattr *phase_adj_attr,
+ int ret;
+
+ phase_adj = nla_get_s32(phase_adj_attr);
+- if (phase_adj > pin->prop->phase_range.max ||
+- phase_adj < pin->prop->phase_range.min) {
++ if (phase_adj > pin->prop.phase_range.max ||
++ phase_adj < pin->prop.phase_range.min) {
+ NL_SET_ERR_MSG_ATTR(extack, phase_adj_attr,
+ "phase adjust value not supported");
+ return -EINVAL;
+@@ -995,7 +995,7 @@ dpll_pin_find(u64 clock_id, struct nlattr *mod_name_attr,
+ unsigned long i;
+
+ xa_for_each_marked(&dpll_pin_xa, i, pin, DPLL_REGISTERED) {
+- prop = pin->prop;
++ prop = &pin->prop;
+ cid_match = clock_id ? pin->clock_id == clock_id : true;
+ mod_match = mod_name_attr && module_name(pin->module) ?
+ !nla_strcmp(mod_name_attr,
+--
+2.43.0
+
--- /dev/null
+From c414d49fa449a866c343aa87835bb4d65c568c92 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Fri, 19 Jan 2024 14:43:04 +0100
+Subject: dpll: fix register pin with unregistered parent pin
+
+From: Arkadiusz Kubalewski <arkadiusz.kubalewski@intel.com>
+
+[ Upstream commit 7dc5b18ff71bd6f948810ab8a08b6a6ff8b315c5 ]
+
+In case of multiple kernel module instances using the same dpll device:
+if only one of them registers the dpll device, then only that one can
+register directly connected pins with it. As long as an unregistered
+parent is responsible for determining whether a muxed pin can be
+registered with it or not, the drivers need to be loaded in a serialized
+order to work correctly - first the driver instance which registers the
+direct pins needs to be loaded, then the other instances can register
+muxed-type pins.
+
+Allow registration of a pin with a parent even if the parent was not
+yet registered, thus allowing an unserialized driver instance load
+order.
+Do not WARN_ON a notification for an unregistered pin, which can be
+invoked in the described case; instead just return an error.
+
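+As a rough sketch (not part of the patch), the unserialized order this
+enables looks as follows; the ops and priv pointers are hypothetical
+driver-specific values:
+
+  /* instance B may run before instance A has registered the parent
+   * with the dpll device; only the pin objects must already exist
+   */
+  err = dpll_pin_on_pin_register(parent_pin, mux_pin, &b_pin_ops, b_priv);
+
+  /* instance A, loaded earlier or later, registers the direct pin */
+  err = dpll_pin_register(dpll, parent_pin, &a_pin_ops, a_priv);
+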
+Fixes: 9431063ad323 ("dpll: core: Add DPLL framework base functions")
+Fixes: 9d71b54b65b1 ("dpll: netlink: Add DPLL framework base functions")
+Reviewed-by: Jan Glaza <jan.glaza@intel.com>
+Reviewed-by: Jiri Pirko <jiri@nvidia.com>
+Signed-off-by: Arkadiusz Kubalewski <arkadiusz.kubalewski@intel.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/dpll/dpll_core.c | 6 ------
+ 1 file changed, 6 deletions(-)
+
+diff --git a/drivers/dpll/dpll_core.c b/drivers/dpll/dpll_core.c
+index 5e3b9b5679f9..f8fbf0394288 100644
+--- a/drivers/dpll/dpll_core.c
++++ b/drivers/dpll/dpll_core.c
+@@ -28,8 +28,6 @@ static u32 dpll_xa_id;
+ WARN_ON_ONCE(!xa_get_mark(&dpll_device_xa, (d)->id, DPLL_REGISTERED))
+ #define ASSERT_DPLL_NOT_REGISTERED(d) \
+ WARN_ON_ONCE(xa_get_mark(&dpll_device_xa, (d)->id, DPLL_REGISTERED))
+-#define ASSERT_PIN_REGISTERED(p) \
+- WARN_ON_ONCE(!xa_get_mark(&dpll_pin_xa, (p)->id, DPLL_REGISTERED))
+
+ struct dpll_device_registration {
+ struct list_head list;
+@@ -614,8 +612,6 @@ dpll_pin_register(struct dpll_device *dpll, struct dpll_pin *pin,
+ WARN_ON(!ops->state_on_dpll_get) ||
+ WARN_ON(!ops->direction_get))
+ return -EINVAL;
+- if (ASSERT_DPLL_REGISTERED(dpll))
+- return -EINVAL;
+
+ mutex_lock(&dpll_lock);
+ if (WARN_ON(!(dpll->module == pin->module &&
+@@ -693,8 +689,6 @@ int dpll_pin_on_pin_register(struct dpll_pin *parent, struct dpll_pin *pin,
+ WARN_ON(!ops->state_on_pin_get) ||
+ WARN_ON(!ops->direction_get))
+ return -EINVAL;
+- if (ASSERT_PIN_REGISTERED(parent))
+- return -EINVAL;
+
+ mutex_lock(&dpll_lock);
+ ret = dpll_xa_ref_pin_add(&pin->parent_refs, parent, ops, priv);
+--
+2.43.0
+
--- /dev/null
+From be5f9be3b22c6869a4688312effac793c9550550 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Fri, 19 Jan 2024 14:43:03 +0100
+Subject: dpll: fix userspace availability of pins
+
+From: Arkadiusz Kubalewski <arkadiusz.kubalewski@intel.com>
+
+[ Upstream commit db2ec3c94667eaeecc6a74d96594fab6baf80fdc ]
+
+If a parent pin was unregistered but a child pin was not, userspace
+would see the "zombie" pins - the ones that were registered with
+a parent pin (dpll_pin_on_pin_register(..)).
+Technically those are not available, as no dpll device in the system is
+reachable through them. Do not dump those pins and prevent userspace
+from any interaction with them. Provide a unified function to determine
+whether a pin is available and use it before acting on or responding to
+user requests.
+
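+As an illustration (a hypothetical driver sequence, not taken from the
+patch), the "zombie" state arises like this:
+
+  /* child was attached to a parent pin */
+  dpll_pin_on_pin_register(parent_pin, mux_pin, &pin_ops, priv);
+  /* ... later the parent goes away, but the child does not */
+  dpll_pin_unregister(dpll, parent_pin, &pin_ops, priv);
+  /* mux_pin still carries the DPLL_REGISTERED mark, yet no registered
+   * parent pin or dpll device is reachable from it, so the new
+   * dpll_pin_available() reports it as unavailable
+   */
+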
+Fixes: 9d71b54b65b1 ("dpll: netlink: Add DPLL framework base functions")
+Reviewed-by: Jan Glaza <jan.glaza@intel.com>
+Reviewed-by: Jiri Pirko <jiri@nvidia.com>
+Signed-off-by: Arkadiusz Kubalewski <arkadiusz.kubalewski@intel.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/dpll/dpll_netlink.c | 29 +++++++++++++++++++++++++++--
+ 1 file changed, 27 insertions(+), 2 deletions(-)
+
+diff --git a/drivers/dpll/dpll_netlink.c b/drivers/dpll/dpll_netlink.c
+index 4c64611d32ac..7cc99d627942 100644
+--- a/drivers/dpll/dpll_netlink.c
++++ b/drivers/dpll/dpll_netlink.c
+@@ -525,6 +525,24 @@ __dpll_device_change_ntf(struct dpll_device *dpll)
+ return dpll_device_event_send(DPLL_CMD_DEVICE_CHANGE_NTF, dpll);
+ }
+
++static bool dpll_pin_available(struct dpll_pin *pin)
++{
++ struct dpll_pin_ref *par_ref;
++ unsigned long i;
++
++ if (!xa_get_mark(&dpll_pin_xa, pin->id, DPLL_REGISTERED))
++ return false;
++ xa_for_each(&pin->parent_refs, i, par_ref)
++ if (xa_get_mark(&dpll_pin_xa, par_ref->pin->id,
++ DPLL_REGISTERED))
++ return true;
++ xa_for_each(&pin->dpll_refs, i, par_ref)
++ if (xa_get_mark(&dpll_device_xa, par_ref->dpll->id,
++ DPLL_REGISTERED))
++ return true;
++ return false;
++}
++
+ /**
+ * dpll_device_change_ntf - notify that the dpll device has been changed
+ * @dpll: registered dpll pointer
+@@ -551,7 +569,7 @@ dpll_pin_event_send(enum dpll_cmd event, struct dpll_pin *pin)
+ int ret = -ENOMEM;
+ void *hdr;
+
+- if (WARN_ON(!xa_get_mark(&dpll_pin_xa, pin->id, DPLL_REGISTERED)))
++ if (!dpll_pin_available(pin))
+ return -ENODEV;
+
+ msg = genlmsg_new(NLMSG_GOODSIZE, GFP_KERNEL);
+@@ -1102,6 +1120,10 @@ int dpll_nl_pin_id_get_doit(struct sk_buff *skb, struct genl_info *info)
+ }
+ pin = dpll_pin_find_from_nlattr(info);
+ if (!IS_ERR(pin)) {
++ if (!dpll_pin_available(pin)) {
++ nlmsg_free(msg);
++ return -ENODEV;
++ }
+ ret = dpll_msg_add_pin_handle(msg, pin);
+ if (ret) {
+ nlmsg_free(msg);
+@@ -1151,6 +1173,8 @@ int dpll_nl_pin_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb)
+
+ xa_for_each_marked_start(&dpll_pin_xa, i, pin, DPLL_REGISTERED,
+ ctx->idx) {
++ if (!dpll_pin_available(pin))
++ continue;
+ hdr = genlmsg_put(skb, NETLINK_CB(cb->skb).portid,
+ cb->nlh->nlmsg_seq,
+ &dpll_nl_family, NLM_F_MULTI,
+@@ -1413,7 +1437,8 @@ int dpll_pin_pre_doit(const struct genl_split_ops *ops, struct sk_buff *skb,
+ }
+ info->user_ptr[0] = xa_load(&dpll_pin_xa,
+ nla_get_u32(info->attrs[DPLL_A_PIN_ID]));
+- if (!info->user_ptr[0]) {
++ if (!info->user_ptr[0] ||
++ !dpll_pin_available(info->user_ptr[0])) {
+ NL_SET_ERR_MSG(info->extack, "pin not found");
+ ret = -ENODEV;
+ goto unlock_dev;
+--
+2.43.0
+
--- /dev/null
+From 2b38a16abde53bfda995910e39ff9466933e189c Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Tue, 23 Jan 2024 01:24:42 +0800
+Subject: fjes: fix memleaks in fjes_hw_setup
+
+From: Zhipeng Lu <alexious@zju.edu.cn>
+
+[ Upstream commit f6cc4b6a3ae53df425771000e9c9540cce9b7bb1 ]
+
+fjes_hw_setup allocates several memory regions and delays their
+deallocation to fjes_hw_exit in fjes_probe, through the following call
+chain:
+
+fjes_probe
+ |-> fjes_hw_init
+ |-> fjes_hw_setup
+ |-> fjes_hw_exit
+
+However, when fjes_hw_setup fails, fjes_hw_exit won't be called and thus
+all the resources allocated in fjes_hw_setup will be leaked. In this
+patch, we free those resources in fjes_hw_setup's error paths and
+prevent such leaks.
+
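+The shape of the fix is the usual kernel goto-unwind ladder: each
+allocation that can fail jumps to a label that releases everything
+allocated before it, in reverse order. A minimal sketch of the pattern
+(struct and size names are illustrative only; kernel context assumed
+for kzalloc/kfree):
+
+  struct bufs { void *req; void *res; };
+
+  static int bufs_setup(struct bufs *b, size_t req_sz, size_t res_sz)
+  {
+          b->req = kzalloc(req_sz, GFP_KERNEL);
+          if (!b->req)
+                  return -ENOMEM;
+          b->res = kzalloc(res_sz, GFP_KERNEL);
+          if (!b->res)
+                  goto free_req;     /* undo the earlier allocation */
+          return 0;
+
+  free_req:
+          kfree(b->req);
+          b->req = NULL;
+          return -ENOMEM;
+  }
+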
+Fixes: 2fcbca687702 ("fjes: platform_driver's .probe and .remove routine")
+Signed-off-by: Zhipeng Lu <alexious@zju.edu.cn>
+Reviewed-by: Simon Horman <horms@kernel.org>
+Link: https://lore.kernel.org/r/20240122172445.3841883-1-alexious@zju.edu.cn
+Signed-off-by: Jakub Kicinski <kuba@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/net/fjes/fjes_hw.c | 37 ++++++++++++++++++++++++++++++-------
+ 1 file changed, 30 insertions(+), 7 deletions(-)
+
+diff --git a/drivers/net/fjes/fjes_hw.c b/drivers/net/fjes/fjes_hw.c
+index 704e949484d0..b9b5554ea862 100644
+--- a/drivers/net/fjes/fjes_hw.c
++++ b/drivers/net/fjes/fjes_hw.c
+@@ -221,21 +221,25 @@ static int fjes_hw_setup(struct fjes_hw *hw)
+
+ mem_size = FJES_DEV_REQ_BUF_SIZE(hw->max_epid);
+ hw->hw_info.req_buf = kzalloc(mem_size, GFP_KERNEL);
+- if (!(hw->hw_info.req_buf))
+- return -ENOMEM;
++ if (!(hw->hw_info.req_buf)) {
++ result = -ENOMEM;
++ goto free_ep_info;
++ }
+
+ hw->hw_info.req_buf_size = mem_size;
+
+ mem_size = FJES_DEV_RES_BUF_SIZE(hw->max_epid);
+ hw->hw_info.res_buf = kzalloc(mem_size, GFP_KERNEL);
+- if (!(hw->hw_info.res_buf))
+- return -ENOMEM;
++ if (!(hw->hw_info.res_buf)) {
++ result = -ENOMEM;
++ goto free_req_buf;
++ }
+
+ hw->hw_info.res_buf_size = mem_size;
+
+ result = fjes_hw_alloc_shared_status_region(hw);
+ if (result)
+- return result;
++ goto free_res_buf;
+
+ hw->hw_info.buffer_share_bit = 0;
+ hw->hw_info.buffer_unshare_reserve_bit = 0;
+@@ -246,11 +250,11 @@ static int fjes_hw_setup(struct fjes_hw *hw)
+
+ result = fjes_hw_alloc_epbuf(&buf_pair->tx);
+ if (result)
+- return result;
++ goto free_epbuf;
+
+ result = fjes_hw_alloc_epbuf(&buf_pair->rx);
+ if (result)
+- return result;
++ goto free_epbuf;
+
+ spin_lock_irqsave(&hw->rx_status_lock, flags);
+ fjes_hw_setup_epbuf(&buf_pair->tx, mac,
+@@ -273,6 +277,25 @@ static int fjes_hw_setup(struct fjes_hw *hw)
+ 	fjes_hw_init_command_registers(hw, &param);
+
+ return 0;
++
++free_epbuf:
++ for (epidx = 0; epidx < hw->max_epid ; epidx++) {
++ if (epidx == hw->my_epid)
++ continue;
++ fjes_hw_free_epbuf(&hw->ep_shm_info[epidx].tx);
++ fjes_hw_free_epbuf(&hw->ep_shm_info[epidx].rx);
++ }
++ fjes_hw_free_shared_status_region(hw);
++free_res_buf:
++ kfree(hw->hw_info.res_buf);
++ hw->hw_info.res_buf = NULL;
++free_req_buf:
++ kfree(hw->hw_info.req_buf);
++ hw->hw_info.req_buf = NULL;
++free_ep_info:
++ kfree(hw->ep_shm_info);
++ hw->ep_shm_info = NULL;
++ return result;
+ }
+
+ static void fjes_hw_cleanup(struct fjes_hw *hw)
+--
+2.43.0
+
--- /dev/null
+From 4186d2f90184f83aa949ca818e6eb18bc87b6253 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Wed, 24 Jan 2024 20:15:56 +0100
+Subject: i40e: handle multi-buffer packets that are shrunk by xdp prog
+
+From: Tirthendu Sarkar <tirthendu.sarkar@intel.com>
+
+[ Upstream commit 83014323c642b8faa2d64a5f303b41c019322478 ]
+
+XDP programs can shrink packets by calling the bpf_xdp_adjust_tail()
+helper function. For multi-buffer packets this may lead to reduction of
+frag count stored in skb_shared_info area of the xdp_buff struct. This
+results in issues with the current handling of XDP_PASS and XDP_DROP
+cases.
+
+For XDP_PASS, the skb is currently built using the frag count of the
+xdp_buff from before it was processed by the XDP prog, which results in
+an inconsistent skb when the frag count gets reduced by the XDP prog.
+To fix this, get the correct frag count while building the skb instead
+of using the pre-obtained frag count.
+
+For XDP_DROP, the current page recycling logic will not reuse the page
+but instead will adjust the pagecnt_bias so that the page can be freed.
+This again results in inconsistent behavior, as the page refcnt has
+already been changed by the helper while freeing the frag(s) as part of
+shrinking the packet. To fix this, only adjust pagecnt_bias for buffers
+that are still part of the packet after the XDP prog has run.
+
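+A hedged sketch of the XDP_PASS side of the fix: read the frag count
+from the shared info only after the program has run, since
+bpf_xdp_adjust_tail() may have changed it (names as in the diff below):
+
+  struct skb_shared_info *sinfo;
+  u32 nr_frags = 0;
+
+  if (unlikely(xdp_buff_has_frags(xdp))) {
+          sinfo = xdp_get_shared_info_from_buff(xdp);
+          nr_frags = sinfo->nr_frags;   /* post-XDP-prog value */
+  }
+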
+Fixes: e213ced19bef ("i40e: add support for XDP multi-buffer Rx")
+Reported-by: Maciej Fijalkowski <maciej.fijalkowski@intel.com>
+Signed-off-by: Tirthendu Sarkar <tirthendu.sarkar@intel.com>
+Link: https://lore.kernel.org/r/20240124191602.566724-6-maciej.fijalkowski@intel.com
+Signed-off-by: Alexei Starovoitov <ast@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/net/ethernet/intel/i40e/i40e_txrx.c | 40 ++++++++++++---------
+ 1 file changed, 23 insertions(+), 17 deletions(-)
+
+diff --git a/drivers/net/ethernet/intel/i40e/i40e_txrx.c b/drivers/net/ethernet/intel/i40e/i40e_txrx.c
+index dd410b15000f..35e1bb6fe5e1 100644
+--- a/drivers/net/ethernet/intel/i40e/i40e_txrx.c
++++ b/drivers/net/ethernet/intel/i40e/i40e_txrx.c
+@@ -2099,7 +2099,8 @@ static void i40e_put_rx_buffer(struct i40e_ring *rx_ring,
+ static void i40e_process_rx_buffs(struct i40e_ring *rx_ring, int xdp_res,
+ struct xdp_buff *xdp)
+ {
+- u32 next = rx_ring->next_to_clean;
++ u32 nr_frags = xdp_get_shared_info_from_buff(xdp)->nr_frags;
++ u32 next = rx_ring->next_to_clean, i = 0;
+ struct i40e_rx_buffer *rx_buffer;
+
+ xdp->flags = 0;
+@@ -2112,10 +2113,10 @@ static void i40e_process_rx_buffs(struct i40e_ring *rx_ring, int xdp_res,
+ if (!rx_buffer->page)
+ continue;
+
+- if (xdp_res == I40E_XDP_CONSUMED)
+- rx_buffer->pagecnt_bias++;
+- else
++ if (xdp_res != I40E_XDP_CONSUMED)
+ i40e_rx_buffer_flip(rx_buffer, xdp->frame_sz);
++ else if (i++ <= nr_frags)
++ rx_buffer->pagecnt_bias++;
+
+ /* EOP buffer will be put in i40e_clean_rx_irq() */
+ if (next == rx_ring->next_to_process)
+@@ -2129,20 +2130,20 @@ static void i40e_process_rx_buffs(struct i40e_ring *rx_ring, int xdp_res,
+ * i40e_construct_skb - Allocate skb and populate it
+ * @rx_ring: rx descriptor ring to transact packets on
+ * @xdp: xdp_buff pointing to the data
+- * @nr_frags: number of buffers for the packet
+ *
+ * This function allocates an skb. It then populates it with the page
+ * data from the current receive descriptor, taking care to set up the
+ * skb correctly.
+ */
+ static struct sk_buff *i40e_construct_skb(struct i40e_ring *rx_ring,
+- struct xdp_buff *xdp,
+- u32 nr_frags)
++ struct xdp_buff *xdp)
+ {
+ unsigned int size = xdp->data_end - xdp->data;
+ struct i40e_rx_buffer *rx_buffer;
++ struct skb_shared_info *sinfo;
+ unsigned int headlen;
+ struct sk_buff *skb;
++ u32 nr_frags = 0;
+
+ /* prefetch first cache line of first page */
+ net_prefetch(xdp->data);
+@@ -2180,6 +2181,10 @@ static struct sk_buff *i40e_construct_skb(struct i40e_ring *rx_ring,
+ memcpy(__skb_put(skb, headlen), xdp->data,
+ ALIGN(headlen, sizeof(long)));
+
++ if (unlikely(xdp_buff_has_frags(xdp))) {
++ sinfo = xdp_get_shared_info_from_buff(xdp);
++ nr_frags = sinfo->nr_frags;
++ }
+ rx_buffer = i40e_rx_bi(rx_ring, rx_ring->next_to_clean);
+ /* update all of the pointers */
+ size -= headlen;
+@@ -2199,9 +2204,8 @@ static struct sk_buff *i40e_construct_skb(struct i40e_ring *rx_ring,
+ }
+
+ if (unlikely(xdp_buff_has_frags(xdp))) {
+- struct skb_shared_info *sinfo, *skinfo = skb_shinfo(skb);
++ struct skb_shared_info *skinfo = skb_shinfo(skb);
+
+- sinfo = xdp_get_shared_info_from_buff(xdp);
+ memcpy(&skinfo->frags[skinfo->nr_frags], &sinfo->frags[0],
+ sizeof(skb_frag_t) * nr_frags);
+
+@@ -2224,17 +2228,17 @@ static struct sk_buff *i40e_construct_skb(struct i40e_ring *rx_ring,
+ * i40e_build_skb - Build skb around an existing buffer
+ * @rx_ring: Rx descriptor ring to transact packets on
+ * @xdp: xdp_buff pointing to the data
+- * @nr_frags: number of buffers for the packet
+ *
+ * This function builds an skb around an existing Rx buffer, taking care
+ * to set up the skb correctly and avoid any memcpy overhead.
+ */
+ static struct sk_buff *i40e_build_skb(struct i40e_ring *rx_ring,
+- struct xdp_buff *xdp,
+- u32 nr_frags)
++ struct xdp_buff *xdp)
+ {
+ unsigned int metasize = xdp->data - xdp->data_meta;
++ struct skb_shared_info *sinfo;
+ struct sk_buff *skb;
++ u32 nr_frags;
+
+ /* Prefetch first cache line of first page. If xdp->data_meta
+ * is unused, this points exactly as xdp->data, otherwise we
+@@ -2243,6 +2247,11 @@ static struct sk_buff *i40e_build_skb(struct i40e_ring *rx_ring,
+ */
+ net_prefetch(xdp->data_meta);
+
++ if (unlikely(xdp_buff_has_frags(xdp))) {
++ sinfo = xdp_get_shared_info_from_buff(xdp);
++ nr_frags = sinfo->nr_frags;
++ }
++
+ /* build an skb around the page buffer */
+ skb = napi_build_skb(xdp->data_hard_start, xdp->frame_sz);
+ if (unlikely(!skb))
+@@ -2255,9 +2264,6 @@ static struct sk_buff *i40e_build_skb(struct i40e_ring *rx_ring,
+ skb_metadata_set(skb, metasize);
+
+ if (unlikely(xdp_buff_has_frags(xdp))) {
+- struct skb_shared_info *sinfo;
+-
+- sinfo = xdp_get_shared_info_from_buff(xdp);
+ xdp_update_skb_shared_info(skb, nr_frags,
+ sinfo->xdp_frags_size,
+ nr_frags * xdp->frame_sz,
+@@ -2602,9 +2608,9 @@ static int i40e_clean_rx_irq(struct i40e_ring *rx_ring, int budget,
+ total_rx_bytes += size;
+ } else {
+ if (ring_uses_build_skb(rx_ring))
+- skb = i40e_build_skb(rx_ring, xdp, nfrags);
++ skb = i40e_build_skb(rx_ring, xdp);
+ else
+- skb = i40e_construct_skb(rx_ring, xdp, nfrags);
++ skb = i40e_construct_skb(rx_ring, xdp);
+
+ /* drop if we failed to retrieve a buffer */
+ if (!skb) {
+--
+2.43.0
+
--- /dev/null
+From 8a934672dca82b456b04642db14ad547d932075d Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Wed, 24 Jan 2024 20:16:01 +0100
+Subject: i40e: set xdp_rxq_info::frag_size
+
+From: Maciej Fijalkowski <maciej.fijalkowski@intel.com>
+
+[ Upstream commit a045d2f2d03d23e7db6772dd83e0ba2705dfad93 ]
+
+i40e supports XDP multi-buffer, so it is supposed to use
+__xdp_rxq_info_reg() instead of xdp_rxq_info_reg() and set the
+frag_size. It cannot simply be converted at the existing call site
+because rx_buf_len could be uninitialized, so let us register
+xdp_rxq_info within i40e_configure_rx_ring(), which happens to be
+called with an already initialized rx_buf_len value.
+
+Commit 5180ff1364bc ("i40e: use int for i40e_status") converted 'err' to
+int, so two variables to deal with return codes are not needed within
+i40e_configure_rx_ring(). Remove 'ret' and use 'err' to handle status
+from xdp_rxq_info registration.
+
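+For context, xdp_rxq_info_reg() is a thin wrapper that passes a
+frag_size of 0 (no room to grow multi-buffer frames), while the
+double-underscore variant lets the driver supply the real value;
+roughly:
+
+  /* frag_size implicitly 0: */
+  err = xdp_rxq_info_reg(&ring->xdp_rxq, ring->netdev,
+                         ring->queue_index,
+                         ring->q_vector->napi.napi_id);
+
+  /* frag_size set to the Rx buffer length: */
+  err = __xdp_rxq_info_reg(&ring->xdp_rxq, ring->netdev,
+                           ring->queue_index,
+                           ring->q_vector->napi.napi_id,
+                           ring->rx_buf_len);
+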
+Fixes: e213ced19bef ("i40e: add support for XDP multi-buffer Rx")
+Signed-off-by: Maciej Fijalkowski <maciej.fijalkowski@intel.com>
+Link: https://lore.kernel.org/r/20240124191602.566724-11-maciej.fijalkowski@intel.com
+Signed-off-by: Alexei Starovoitov <ast@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/net/ethernet/intel/i40e/i40e_main.c | 40 ++++++++++++---------
+ drivers/net/ethernet/intel/i40e/i40e_txrx.c | 9 -----
+ 2 files changed, 24 insertions(+), 25 deletions(-)
+
+diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c b/drivers/net/ethernet/intel/i40e/i40e_main.c
+index d5519af34657..f97a63812141 100644
+--- a/drivers/net/ethernet/intel/i40e/i40e_main.c
++++ b/drivers/net/ethernet/intel/i40e/i40e_main.c
+@@ -3588,40 +3588,48 @@ static int i40e_configure_rx_ring(struct i40e_ring *ring)
+ struct i40e_hmc_obj_rxq rx_ctx;
+ int err = 0;
+ bool ok;
+- int ret;
+
+ bitmap_zero(ring->state, __I40E_RING_STATE_NBITS);
+
+ /* clear the context structure first */
+ memset(&rx_ctx, 0, sizeof(rx_ctx));
+
+- if (ring->vsi->type == I40E_VSI_MAIN)
+- xdp_rxq_info_unreg_mem_model(&ring->xdp_rxq);
++ ring->rx_buf_len = vsi->rx_buf_len;
++
++ /* XDP RX-queue info only needed for RX rings exposed to XDP */
++ if (ring->vsi->type != I40E_VSI_MAIN)
++ goto skip;
++
++ if (!xdp_rxq_info_is_reg(&ring->xdp_rxq)) {
++ err = __xdp_rxq_info_reg(&ring->xdp_rxq, ring->netdev,
++ ring->queue_index,
++ ring->q_vector->napi.napi_id,
++ ring->rx_buf_len);
++ if (err)
++ return err;
++ }
+
+ ring->xsk_pool = i40e_xsk_pool(ring);
+ if (ring->xsk_pool) {
+- ring->rx_buf_len =
+- xsk_pool_get_rx_frame_size(ring->xsk_pool);
+- ret = xdp_rxq_info_reg_mem_model(&ring->xdp_rxq,
++ ring->rx_buf_len = xsk_pool_get_rx_frame_size(ring->xsk_pool);
++ err = xdp_rxq_info_reg_mem_model(&ring->xdp_rxq,
+ MEM_TYPE_XSK_BUFF_POOL,
+ NULL);
+- if (ret)
+- return ret;
++ if (err)
++ return err;
+ dev_info(&vsi->back->pdev->dev,
+ "Registered XDP mem model MEM_TYPE_XSK_BUFF_POOL on Rx ring %d\n",
+ ring->queue_index);
+
+ } else {
+- ring->rx_buf_len = vsi->rx_buf_len;
+- if (ring->vsi->type == I40E_VSI_MAIN) {
+- ret = xdp_rxq_info_reg_mem_model(&ring->xdp_rxq,
+- MEM_TYPE_PAGE_SHARED,
+- NULL);
+- if (ret)
+- return ret;
+- }
++ err = xdp_rxq_info_reg_mem_model(&ring->xdp_rxq,
++ MEM_TYPE_PAGE_SHARED,
++ NULL);
++ if (err)
++ return err;
+ }
+
++skip:
+ xdp_init_buff(&ring->xdp, i40e_rx_pg_size(ring) / 2, &ring->xdp_rxq);
+
+ rx_ctx.dbuff = DIV_ROUND_UP(ring->rx_buf_len,
+diff --git a/drivers/net/ethernet/intel/i40e/i40e_txrx.c b/drivers/net/ethernet/intel/i40e/i40e_txrx.c
+index 35e1bb6fe5e1..071ef309a3a4 100644
+--- a/drivers/net/ethernet/intel/i40e/i40e_txrx.c
++++ b/drivers/net/ethernet/intel/i40e/i40e_txrx.c
+@@ -1555,7 +1555,6 @@ void i40e_free_rx_resources(struct i40e_ring *rx_ring)
+ int i40e_setup_rx_descriptors(struct i40e_ring *rx_ring)
+ {
+ struct device *dev = rx_ring->dev;
+- int err;
+
+ u64_stats_init(&rx_ring->syncp);
+
+@@ -1576,14 +1575,6 @@ int i40e_setup_rx_descriptors(struct i40e_ring *rx_ring)
+ rx_ring->next_to_process = 0;
+ rx_ring->next_to_use = 0;
+
+- /* XDP RX-queue info only needed for RX rings exposed to XDP */
+- if (rx_ring->vsi->type == I40E_VSI_MAIN) {
+- err = xdp_rxq_info_reg(&rx_ring->xdp_rxq, rx_ring->netdev,
+- rx_ring->queue_index, rx_ring->q_vector->napi.napi_id);
+- if (err < 0)
+- return err;
+- }
+-
+ rx_ring->xdp_prog = rx_ring->vsi->xdp_prog;
+
+ rx_ring->rx_bi =
+--
+2.43.0
+
--- /dev/null
+From 27105d0dd212b950eacaae0e22bf6cccdf54c566 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Wed, 24 Jan 2024 20:16:02 +0100
+Subject: i40e: update xdp_rxq_info::frag_size for ZC enabled Rx queue
+
+From: Maciej Fijalkowski <maciej.fijalkowski@intel.com>
+
+[ Upstream commit 0cbb08707c932b3f004bc1a8ec6200ef572c1f5f ]
+
+Now that i40e driver correctly sets up frag_size in xdp_rxq_info, let us
+make it work for ZC multi-buffer as well. i40e_ring::rx_buf_len for ZC
+is being set via xsk_pool_get_rx_frame_size() and this needs to be
+propagated up to xdp_rxq_info.
+
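+In isolation, the sequence the patch adds looks like this (a sketch;
+the error handling around it is in the diff below):
+
+  xdp_rxq_info_unreg(&ring->xdp_rxq);
+  ring->rx_buf_len = xsk_pool_get_rx_frame_size(ring->xsk_pool);
+  err = __xdp_rxq_info_reg(&ring->xdp_rxq, ring->netdev,
+                           ring->queue_index,
+                           ring->q_vector->napi.napi_id,
+                           ring->rx_buf_len);
+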
+Fixes: 1c9ba9c14658 ("i40e: xsk: add RX multi-buffer support")
+Acked-by: Magnus Karlsson <magnus.karlsson@intel.com>
+Signed-off-by: Maciej Fijalkowski <maciej.fijalkowski@intel.com>
+Link: https://lore.kernel.org/r/20240124191602.566724-12-maciej.fijalkowski@intel.com
+Signed-off-by: Alexei Starovoitov <ast@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/net/ethernet/intel/i40e/i40e_main.c | 7 +++++++
+ 1 file changed, 7 insertions(+)
+
+diff --git a/drivers/net/ethernet/intel/i40e/i40e_main.c b/drivers/net/ethernet/intel/i40e/i40e_main.c
+index f97a63812141..2bd7b29fb251 100644
+--- a/drivers/net/ethernet/intel/i40e/i40e_main.c
++++ b/drivers/net/ethernet/intel/i40e/i40e_main.c
+@@ -3611,7 +3611,14 @@ static int i40e_configure_rx_ring(struct i40e_ring *ring)
+
+ ring->xsk_pool = i40e_xsk_pool(ring);
+ if (ring->xsk_pool) {
++ xdp_rxq_info_unreg(&ring->xdp_rxq);
+ ring->rx_buf_len = xsk_pool_get_rx_frame_size(ring->xsk_pool);
++ err = __xdp_rxq_info_reg(&ring->xdp_rxq, ring->netdev,
++ ring->queue_index,
++ ring->q_vector->napi.napi_id,
++ ring->rx_buf_len);
++ if (err)
++ return err;
+ err = xdp_rxq_info_reg_mem_model(&ring->xdp_rxq,
+ MEM_TYPE_XSK_BUFF_POOL,
+ NULL);
+--
+2.43.0
+
--- /dev/null
+From 6567d90f1860790c7a73d15c47a2d8cfa8de7aae Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Wed, 24 Jan 2024 20:15:57 +0100
+Subject: ice: remove redundant xdp_rxq_info registration
+
+From: Maciej Fijalkowski <maciej.fijalkowski@intel.com>
+
+[ Upstream commit 2ee788c06493d02ee85855414cca39825e768aaf ]
+
+xdp_rxq_info struct can be registered by drivers via two functions -
+xdp_rxq_info_reg() and __xdp_rxq_info_reg(). The latter one allows
+drivers that support XDP multi-buffer to set up xdp_rxq_info::frag_size
+which in turn will make it possible to grow the packet via
+bpf_xdp_adjust_tail() BPF helper.
+
+Currently, ice registers xdp_rxq_info in two spots:
+1) ice_setup_rx_ring() // via xdp_rxq_info_reg(), BUG
+2) ice_vsi_cfg_rxq() // via __xdp_rxq_info_reg(), OK
+
+The commit cited under the Fixes tag took care of setting up frag_size
+and updated the registration scheme in 2), but it did not help, as 1)
+is called before 2) and, as shown above, uses the old registration
+function. This means that 2) sees that xdp_rxq_info is already
+registered and never calls __xdp_rxq_info_reg(), which leaves us with
+xdp_rxq_info::frag_size being set to 0.
+
+To fix this misbehavior, simply remove xdp_rxq_info_reg() call from
+ice_setup_rx_ring().
+
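+A sketch of why the early registration masks the correct one (return
+value checks elided; field names as used elsewhere in the ice driver):
+
+  /* 1) ice_setup_rx_ring(), before this fix: frag_size ends up 0 */
+  xdp_rxq_info_reg(&rx_ring->xdp_rxq, rx_ring->netdev,
+                   rx_ring->q_index, rx_ring->q_vector->napi.napi_id);
+
+  /* 2) ice_vsi_cfg_rxq(): the guard then skips the proper variant */
+  if (!xdp_rxq_info_is_reg(&ring->xdp_rxq))
+          __xdp_rxq_info_reg(&ring->xdp_rxq, ring->netdev,
+                             ring->q_index,
+                             ring->q_vector->napi.napi_id,
+                             ring->rx_buf_len);
+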
+Fixes: 2fba7dc5157b ("ice: Add support for XDP multi-buffer on Rx side")
+Acked-by: Magnus Karlsson <magnus.karlsson@intel.com>
+Signed-off-by: Maciej Fijalkowski <maciej.fijalkowski@intel.com>
+Link: https://lore.kernel.org/r/20240124191602.566724-7-maciej.fijalkowski@intel.com
+Signed-off-by: Alexei Starovoitov <ast@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/net/ethernet/intel/ice/ice_txrx.c | 5 -----
+ 1 file changed, 5 deletions(-)
+
+diff --git a/drivers/net/ethernet/intel/ice/ice_txrx.c b/drivers/net/ethernet/intel/ice/ice_txrx.c
+index 6878448ba112..9170a3e8f088 100644
+--- a/drivers/net/ethernet/intel/ice/ice_txrx.c
++++ b/drivers/net/ethernet/intel/ice/ice_txrx.c
+@@ -513,11 +513,6 @@ int ice_setup_rx_ring(struct ice_rx_ring *rx_ring)
+ if (ice_is_xdp_ena_vsi(rx_ring->vsi))
+ WRITE_ONCE(rx_ring->xdp_prog, rx_ring->vsi->xdp_prog);
+
+- if (rx_ring->vsi->type == ICE_VSI_PF &&
+- !xdp_rxq_info_is_reg(&rx_ring->xdp_rxq))
+- if (xdp_rxq_info_reg(&rx_ring->xdp_rxq, rx_ring->netdev,
+- rx_ring->q_index, rx_ring->q_vector->napi.napi_id))
+- goto err;
+ return 0;
+
+ err:
+--
+2.43.0
+
--- /dev/null
+From ba8440c493d603f075c11edb241c244ce6a007fa Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Wed, 24 Jan 2024 20:15:59 +0100
+Subject: ice: update xdp_rxq_info::frag_size for ZC enabled Rx queue
+
+From: Maciej Fijalkowski <maciej.fijalkowski@intel.com>
+
+[ Upstream commit 3de38c87174225487fc93befeea7d380db80aef6 ]
+
+Now that ice driver correctly sets up frag_size in xdp_rxq_info, let us
+make it work for ZC multi-buffer as well. ice_rx_ring::rx_buf_len for ZC
+is being set via xsk_pool_get_rx_frame_size() and this needs to be
+propagated up to xdp_rxq_info.
+
+Use a bigger hammer: instead of unregistering only xdp_rxq_info's
+memory model, unregister it altogether and register it again, so that
+xdp_rxq_info carries the correct frag_size value.
+
+Fixes: 1bbc04de607b ("ice: xsk: add RX multi-buffer support")
+Signed-off-by: Maciej Fijalkowski <maciej.fijalkowski@intel.com>
+Link: https://lore.kernel.org/r/20240124191602.566724-9-maciej.fijalkowski@intel.com
+Signed-off-by: Alexei Starovoitov <ast@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/net/ethernet/intel/ice/ice_base.c | 37 ++++++++++++++---------
+ 1 file changed, 23 insertions(+), 14 deletions(-)
+
+diff --git a/drivers/net/ethernet/intel/ice/ice_base.c b/drivers/net/ethernet/intel/ice/ice_base.c
+index 7fa43827a3f0..4f3e65b47cdc 100644
+--- a/drivers/net/ethernet/intel/ice/ice_base.c
++++ b/drivers/net/ethernet/intel/ice/ice_base.c
+@@ -534,19 +534,27 @@ int ice_vsi_cfg_rxq(struct ice_rx_ring *ring)
+ ring->rx_buf_len = ring->vsi->rx_buf_len;
+
+ if (ring->vsi->type == ICE_VSI_PF) {
+- if (!xdp_rxq_info_is_reg(&ring->xdp_rxq))
+- /* coverity[check_return] */
+- __xdp_rxq_info_reg(&ring->xdp_rxq, ring->netdev,
+- ring->q_index,
+- ring->q_vector->napi.napi_id,
+- ring->vsi->rx_buf_len);
++ if (!xdp_rxq_info_is_reg(&ring->xdp_rxq)) {
++ err = __xdp_rxq_info_reg(&ring->xdp_rxq, ring->netdev,
++ ring->q_index,
++ ring->q_vector->napi.napi_id,
++ ring->rx_buf_len);
++ if (err)
++ return err;
++ }
+
+ ring->xsk_pool = ice_xsk_pool(ring);
+ if (ring->xsk_pool) {
+- xdp_rxq_info_unreg_mem_model(&ring->xdp_rxq);
++ xdp_rxq_info_unreg(&ring->xdp_rxq);
+
+ ring->rx_buf_len =
+ xsk_pool_get_rx_frame_size(ring->xsk_pool);
++ err = __xdp_rxq_info_reg(&ring->xdp_rxq, ring->netdev,
++ ring->q_index,
++ ring->q_vector->napi.napi_id,
++ ring->rx_buf_len);
++ if (err)
++ return err;
+ err = xdp_rxq_info_reg_mem_model(&ring->xdp_rxq,
+ MEM_TYPE_XSK_BUFF_POOL,
+ NULL);
+@@ -557,13 +565,14 @@ int ice_vsi_cfg_rxq(struct ice_rx_ring *ring)
+ dev_info(dev, "Registered XDP mem model MEM_TYPE_XSK_BUFF_POOL on Rx ring %d\n",
+ ring->q_index);
+ } else {
+- if (!xdp_rxq_info_is_reg(&ring->xdp_rxq))
+- /* coverity[check_return] */
+- __xdp_rxq_info_reg(&ring->xdp_rxq,
+- ring->netdev,
+- ring->q_index,
+- ring->q_vector->napi.napi_id,
+- ring->vsi->rx_buf_len);
++ if (!xdp_rxq_info_is_reg(&ring->xdp_rxq)) {
++ err = __xdp_rxq_info_reg(&ring->xdp_rxq, ring->netdev,
++ ring->q_index,
++ ring->q_vector->napi.napi_id,
++ ring->rx_buf_len);
++ if (err)
++ return err;
++ }
+
+ err = xdp_rxq_info_reg_mem_model(&ring->xdp_rxq,
+ MEM_TYPE_PAGE_SHARED,
+--
+2.43.0
+
--- /dev/null
+From 62da34a963fc7911cae3aa180d1e61801a97258d Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Wed, 24 Jan 2024 20:15:55 +0100
+Subject: ice: work on pre-XDP prog frag count
+
+From: Maciej Fijalkowski <maciej.fijalkowski@intel.com>
+
+[ Upstream commit ad2047cf5d9313200e308612aed516548873d124 ]
+
+Fix an OOM panic in XDP_DRV mode when a XDP program shrinks a
+multi-buffer packet by 4k bytes and then redirects it to an AF_XDP
+socket.
+
+Since support for handling multi-buffer frames was added to XDP, use of
+the bpf_xdp_adjust_tail() helper within an XDP program can free the
+page that a given fragment occupies and in turn decrease the fragment
+count within the skb_shared_info that is embedded in the xdp_buff
+struct. In the current ice driver codebase, this can become problematic
+when the page recycling logic decides not to reuse the page. In such a
+case, __page_frag_cache_drain() is used with an
+ice_rx_buf::pagecnt_bias that was not adjusted after the page's
+refcount was changed by the XDP prog, which in turn does not drain the
+refcount to 0, and the page is never freed.
+
+To address this, let us store the count of frags from before the XDP
+program was executed on the Rx ring struct. This will be compared with
+the current frag count from the skb_shared_info embedded in the
+xdp_buff. A smaller value in the latter indicates that the XDP prog
+freed frag(s). Then, for the given delta, decrement pagecnt_bias for
+the XDP_DROP verdict.
+
+While at it, let us also handle the EOP frag within
+ice_set_rx_bufs_act() to make our life easier, so that all of the
+adjustments that need to be applied against freed frags are performed
+in a single place.
+
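+A condensed sketch of the accounting idea (the helper name is
+hypothetical; the real adjustment lives in ice_set_rx_bufs_act() in
+the diff below):
+
+  u32 before = rx_ring->nr_frags;  /* saved before the prog ran */
+  u32 after = xdp_get_shared_info_from_buff(xdp)->nr_frags;
+
+  if (verdict == ICE_XDP_CONSUMED && after < before)
+          /* walk (before - after) buffers backwards from the EOP one
+           * and decrement pagecnt_bias on buffers whose pages the
+           * program already released
+           */
+          undo_bias_for_freed_frags(rx_ring, before - after);
+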
+Fixes: 2fba7dc5157b ("ice: Add support for XDP multi-buffer on Rx side")
+Acked-by: Magnus Karlsson <magnus.karlsson@intel.com>
+Signed-off-by: Maciej Fijalkowski <maciej.fijalkowski@intel.com>
+Link: https://lore.kernel.org/r/20240124191602.566724-5-maciej.fijalkowski@intel.com
+Signed-off-by: Alexei Starovoitov <ast@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/net/ethernet/intel/ice/ice_txrx.c | 14 ++++++---
+ drivers/net/ethernet/intel/ice/ice_txrx.h | 1 +
+ drivers/net/ethernet/intel/ice/ice_txrx_lib.h | 31 +++++++++++++------
+ 3 files changed, 32 insertions(+), 14 deletions(-)
+
+diff --git a/drivers/net/ethernet/intel/ice/ice_txrx.c b/drivers/net/ethernet/intel/ice/ice_txrx.c
+index 9e97ea863068..6878448ba112 100644
+--- a/drivers/net/ethernet/intel/ice/ice_txrx.c
++++ b/drivers/net/ethernet/intel/ice/ice_txrx.c
+@@ -600,9 +600,7 @@ ice_run_xdp(struct ice_rx_ring *rx_ring, struct xdp_buff *xdp,
+ ret = ICE_XDP_CONSUMED;
+ }
+ exit:
+- rx_buf->act = ret;
+- if (unlikely(xdp_buff_has_frags(xdp)))
+- ice_set_rx_bufs_act(xdp, rx_ring, ret);
++ ice_set_rx_bufs_act(xdp, rx_ring, ret);
+ }
+
+ /**
+@@ -890,14 +888,17 @@ ice_add_xdp_frag(struct ice_rx_ring *rx_ring, struct xdp_buff *xdp,
+ }
+
+ if (unlikely(sinfo->nr_frags == MAX_SKB_FRAGS)) {
+- if (unlikely(xdp_buff_has_frags(xdp)))
+- ice_set_rx_bufs_act(xdp, rx_ring, ICE_XDP_CONSUMED);
++ ice_set_rx_bufs_act(xdp, rx_ring, ICE_XDP_CONSUMED);
+ return -ENOMEM;
+ }
+
+ __skb_fill_page_desc_noacc(sinfo, sinfo->nr_frags++, rx_buf->page,
+ rx_buf->page_offset, size);
+ sinfo->xdp_frags_size += size;
++ /* remember frag count before XDP prog execution; bpf_xdp_adjust_tail()
++ * can pop off frags but driver has to handle it on its own
++ */
++ rx_ring->nr_frags = sinfo->nr_frags;
+
+ if (page_is_pfmemalloc(rx_buf->page))
+ xdp_buff_set_frag_pfmemalloc(xdp);
+@@ -1249,6 +1250,7 @@ int ice_clean_rx_irq(struct ice_rx_ring *rx_ring, int budget)
+
+ xdp->data = NULL;
+ rx_ring->first_desc = ntc;
++ rx_ring->nr_frags = 0;
+ continue;
+ construct_skb:
+ if (likely(ice_ring_uses_build_skb(rx_ring)))
+@@ -1264,10 +1266,12 @@ int ice_clean_rx_irq(struct ice_rx_ring *rx_ring, int budget)
+ ICE_XDP_CONSUMED);
+ xdp->data = NULL;
+ rx_ring->first_desc = ntc;
++ rx_ring->nr_frags = 0;
+ break;
+ }
+ xdp->data = NULL;
+ rx_ring->first_desc = ntc;
++ rx_ring->nr_frags = 0;
+
+ stat_err_bits = BIT(ICE_RX_FLEX_DESC_STATUS0_RXE_S);
+ if (unlikely(ice_test_staterr(rx_desc->wb.status_error0,
+diff --git a/drivers/net/ethernet/intel/ice/ice_txrx.h b/drivers/net/ethernet/intel/ice/ice_txrx.h
+index daf7b9dbb143..b28b9826bbcd 100644
+--- a/drivers/net/ethernet/intel/ice/ice_txrx.h
++++ b/drivers/net/ethernet/intel/ice/ice_txrx.h
+@@ -333,6 +333,7 @@ struct ice_rx_ring {
+ struct ice_channel *ch;
+ struct ice_tx_ring *xdp_ring;
+ struct xsk_buff_pool *xsk_pool;
++ u32 nr_frags;
+ dma_addr_t dma; /* physical address of ring */
+ u64 cached_phctime;
+ u16 rx_buf_len;
+diff --git a/drivers/net/ethernet/intel/ice/ice_txrx_lib.h b/drivers/net/ethernet/intel/ice/ice_txrx_lib.h
+index 115969ecdf7b..b0e56675f98b 100644
+--- a/drivers/net/ethernet/intel/ice/ice_txrx_lib.h
++++ b/drivers/net/ethernet/intel/ice/ice_txrx_lib.h
+@@ -12,26 +12,39 @@
+ * act: action to store onto Rx buffers related to XDP buffer parts
+ *
+ * Set action that should be taken before putting Rx buffer from first frag
+- * to one before last. Last one is handled by caller of this function as it
+- * is the EOP frag that is currently being processed. This function is
+- * supposed to be called only when XDP buffer contains frags.
++ * to the last.
+ */
+ static inline void
+ ice_set_rx_bufs_act(struct xdp_buff *xdp, const struct ice_rx_ring *rx_ring,
+ const unsigned int act)
+ {
+- const struct skb_shared_info *sinfo = xdp_get_shared_info_from_buff(xdp);
+- u32 first = rx_ring->first_desc;
+- u32 nr_frags = sinfo->nr_frags;
++ u32 sinfo_frags = xdp_get_shared_info_from_buff(xdp)->nr_frags;
++ u32 nr_frags = rx_ring->nr_frags + 1;
++ u32 idx = rx_ring->first_desc;
+ u32 cnt = rx_ring->count;
+ struct ice_rx_buf *buf;
+
+ for (int i = 0; i < nr_frags; i++) {
+- buf = &rx_ring->rx_buf[first];
++ buf = &rx_ring->rx_buf[idx];
+ buf->act = act;
+
+- if (++first == cnt)
+- first = 0;
++ if (++idx == cnt)
++ idx = 0;
++ }
++
++ /* adjust pagecnt_bias on frags freed by XDP prog */
++ if (sinfo_frags < rx_ring->nr_frags && act == ICE_XDP_CONSUMED) {
++ u32 delta = rx_ring->nr_frags - sinfo_frags;
++
++ while (delta) {
++ if (idx == 0)
++ idx = cnt - 1;
++ else
++ idx--;
++ buf = &rx_ring->rx_buf[idx];
++ buf->pagecnt_bias--;
++ delta--;
++ }
+ }
+ }
+
+--
+2.43.0
+
--- /dev/null
+From 5d5966086cb8bd78aab1a1b25b336edea51fe324 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Thu, 18 Jan 2024 21:50:40 +0100
+Subject: idpf: distinguish vports by the dev_port attribute
+
+From: Michal Schmidt <mschmidt@redhat.com>
+
+[ Upstream commit 359724fa3ab79fbe9f42c6263cddc2afae32eef3 ]
+
+idpf registers multiple netdevs (virtual ports) for one PCI function,
+but it does not provide a way for userspace to distinguish them with
+sysfs attributes. Per Documentation/ABI/testing/sysfs-class-net, it is
+a bug not to set dev_port for independent ports on the same PCI bus,
+device and function.
+
+Without dev_port set, systemd-udevd's default naming policy attempts
+to assign the same name ("ens2f0") to all four idpf netdevs on my test
+system and obviously fails, leaving three of them with the initial
+eth<N> name.
+
+With this patch, systemd-udevd is able to assign unique names to the
+netdevs (e.g. "ens2f0", "ens2f0d1", "ens2f0d2", "ens2f0d3").
+
+The Intel-provided out-of-tree idpf driver already sets dev_port. In
+this patch I chose to do it in the same place in the idpf_cfg_netdev
+function.
+
+Fixes: 0fe45467a104 ("idpf: add create vport and netdev configuration")
+Signed-off-by: Michal Schmidt <mschmidt@redhat.com>
+Reviewed-by: Jesse Brandeburg <jesse.brandeburg@intel.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/net/ethernet/intel/idpf/idpf_lib.c | 2 ++
+ 1 file changed, 2 insertions(+)
+
+diff --git a/drivers/net/ethernet/intel/idpf/idpf_lib.c b/drivers/net/ethernet/intel/idpf/idpf_lib.c
+index 19809b0ddcd9..0241e498cc20 100644
+--- a/drivers/net/ethernet/intel/idpf/idpf_lib.c
++++ b/drivers/net/ethernet/intel/idpf/idpf_lib.c
+@@ -783,6 +783,8 @@ static int idpf_cfg_netdev(struct idpf_vport *vport)
+ /* setup watchdog timeout value to be 5 second */
+ netdev->watchdog_timeo = 5 * HZ;
+
++ netdev->dev_port = idx;
++
+ /* configure default MTU size */
+ netdev->min_mtu = ETH_MIN_MTU;
+ netdev->max_mtu = vport->max_mtu;
+--
+2.43.0
+
--- /dev/null
+From 0166c869022f93a949db8088d9cffb95e3db16bf Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Wed, 24 Jan 2024 20:15:58 +0100
+Subject: intel: xsk: initialize skb_frag_t::bv_offset in ZC drivers
+
+From: Maciej Fijalkowski <maciej.fijalkowski@intel.com>
+
+[ Upstream commit 290779905d09d5fdf6caa4f58ddefc3f4db0c0a9 ]
+
+Ice and i40e ZC drivers currently set the offset of a frag within
+skb_shared_info to 0, which is incorrect. xdp_buffs that come from
+xsk_buff_pool always have 256 bytes of headroom, which needs to be
+taken into account to retrieve xdp_buff::data via skb_frag_address().
+Otherwise, bpf_xdp_frags_increase_tail() would start its job from
+xdp_buff::data_hard_start, which would result in overwriting existing
+payload.
+
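+A sketch of the address relation the fix restores (assuming the usual
+xsk layout, where the payload starts XDP_PACKET_HEADROOM bytes past
+the hard start):
+
+  struct page *pg = virt_to_page(xdp->data_hard_start);
+
+  /* offset 0 made skb_frag_address() point at data_hard_start;
+   * XDP_PACKET_HEADROOM (256) makes it point at the payload instead
+   */
+  __skb_fill_page_desc_noacc(sinfo, sinfo->nr_frags++, pg,
+                             XDP_PACKET_HEADROOM, size);
+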
+Fixes: 1c9ba9c14658 ("i40e: xsk: add RX multi-buffer support")
+Fixes: 1bbc04de607b ("ice: xsk: add RX multi-buffer support")
+Acked-by: Magnus Karlsson <magnus.karlsson@intel.com>
+Signed-off-by: Maciej Fijalkowski <maciej.fijalkowski@intel.com>
+Link: https://lore.kernel.org/r/20240124191602.566724-8-maciej.fijalkowski@intel.com
+Signed-off-by: Alexei Starovoitov <ast@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/net/ethernet/intel/i40e/i40e_xsk.c | 3 ++-
+ drivers/net/ethernet/intel/ice/ice_xsk.c | 3 ++-
+ 2 files changed, 4 insertions(+), 2 deletions(-)
+
+diff --git a/drivers/net/ethernet/intel/i40e/i40e_xsk.c b/drivers/net/ethernet/intel/i40e/i40e_xsk.c
+index fede0bb3e047..65f38a57b3df 100644
+--- a/drivers/net/ethernet/intel/i40e/i40e_xsk.c
++++ b/drivers/net/ethernet/intel/i40e/i40e_xsk.c
+@@ -414,7 +414,8 @@ i40e_add_xsk_frag(struct i40e_ring *rx_ring, struct xdp_buff *first,
+ }
+
+ __skb_fill_page_desc_noacc(sinfo, sinfo->nr_frags++,
+- virt_to_page(xdp->data_hard_start), 0, size);
++ virt_to_page(xdp->data_hard_start),
++ XDP_PACKET_HEADROOM, size);
+ sinfo->xdp_frags_size += size;
+ xsk_buff_add_frag(xdp);
+
+diff --git a/drivers/net/ethernet/intel/ice/ice_xsk.c b/drivers/net/ethernet/intel/ice/ice_xsk.c
+index 951f84bfdf2b..f3663b3f6390 100644
+--- a/drivers/net/ethernet/intel/ice/ice_xsk.c
++++ b/drivers/net/ethernet/intel/ice/ice_xsk.c
+@@ -820,7 +820,8 @@ ice_add_xsk_frag(struct ice_rx_ring *rx_ring, struct xdp_buff *first,
+ }
+
+ __skb_fill_page_desc_noacc(sinfo, sinfo->nr_frags++,
+- virt_to_page(xdp->data_hard_start), 0, size);
++ virt_to_page(xdp->data_hard_start),
++ XDP_PACKET_HEADROOM, size);
+ sinfo->xdp_frags_size += size;
+ xsk_buff_add_frag(xdp);
+
+--
+2.43.0
+
--- /dev/null
+From eae4daf6f79b22f71f6a35ab4233136066e16449 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Mon, 22 Jan 2024 18:20:01 +0800
+Subject: ipv6: init the accept_queue's spinlocks in inet6_create
+
+From: Zhengchao Shao <shaozhengchao@huawei.com>
+
+[ Upstream commit 435e202d645c197dcfd39d7372eb2a56529b6640 ]
+
+In commit 198bc90e0e73 ("tcp: make sure init the accept_queue's
+spinlocks once"), the spinlocks of the accept_queue are initialized only
+when a socket is created in the inet4 scenario. They are not initialized
+when a socket is created in the inet6 scenario. The kernel reports the
+following error:
+INFO: trying to register non-static key.
+The code is fine but needs lockdep annotation, or maybe
+you didn't initialize this object before use?
+turning off the locking correctness validator.
+Hardware name: Red Hat KVM, BIOS 0.5.1 01/01/2011
+Call Trace:
+<TASK>
+ dump_stack_lvl (lib/dump_stack.c:107)
+ register_lock_class (kernel/locking/lockdep.c:1289)
+ __lock_acquire (kernel/locking/lockdep.c:5015)
+ lock_acquire.part.0 (kernel/locking/lockdep.c:5756)
+ _raw_spin_lock_bh (kernel/locking/spinlock.c:178)
+ inet_csk_listen_stop (net/ipv4/inet_connection_sock.c:1386)
+ tcp_disconnect (net/ipv4/tcp.c:2981)
+ inet_shutdown (net/ipv4/af_inet.c:935)
+ __sys_shutdown (./include/linux/file.h:32 net/socket.c:2438)
+ __x64_sys_shutdown (net/socket.c:2445)
+ do_syscall_64 (arch/x86/entry/common.c:52)
+ entry_SYSCALL_64_after_hwframe (arch/x86/entry/entry_64.S:129)
+RIP: 0033:0x7f52ecd05a3d
+Code: 5b 41 5c c3 66 0f 1f 84 00 00 00 00 00 f3 0f 1e fa 48 89 f8 48 89 f7
+48 89 d6 48 89 ca 4d 89 c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff
+ff 73 01 c3 48 8b 0d ab a3 0e 00 f7 d8 64 89 01 48
+RSP: 002b:00007f52ecf5dde8 EFLAGS: 00000293 ORIG_RAX: 0000000000000030
+RAX: ffffffffffffffda RBX: 00007f52ecf5e640 RCX: 00007f52ecd05a3d
+RDX: 00007f52ecc8b188 RSI: 0000000000000000 RDI: 0000000000000004
+RBP: 00007f52ecf5de20 R08: 00007ffdae45c69f R09: 0000000000000000
+R10: 0000000000000000 R11: 0000000000000293 R12: 00007f52ecf5e640
+R13: 0000000000000000 R14: 00007f52ecc8b060 R15: 00007ffdae45c6e0
+
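+For reference, the helper being wired up here (added by the commit in
+the Fixes tag) boils down to initializing the two accept-queue
+spinlocks; roughly:
+
+  static inline void inet_init_csk_locks(struct sock *sk)
+  {
+          struct inet_connection_sock *icsk = inet_csk(sk);
+
+          spin_lock_init(&icsk->icsk_accept_queue.rskq_lock);
+          spin_lock_init(&icsk->icsk_accept_queue.fastopenq.lock);
+  }
+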
+Fixes: 198bc90e0e73 ("tcp: make sure init the accept_queue's spinlocks once")
+Signed-off-by: Zhengchao Shao <shaozhengchao@huawei.com>
+Reviewed-by: Eric Dumazet <edumazet@google.com>
+Link: https://lore.kernel.org/r/20240122102001.2851701-1-shaozhengchao@huawei.com
+Signed-off-by: Paolo Abeni <pabeni@redhat.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ net/ipv6/af_inet6.c | 3 +++
+ 1 file changed, 3 insertions(+)
+
+diff --git a/net/ipv6/af_inet6.c b/net/ipv6/af_inet6.c
+index 13a1833a4df5..959bfd9f6344 100644
+--- a/net/ipv6/af_inet6.c
++++ b/net/ipv6/af_inet6.c
+@@ -199,6 +199,9 @@ static int inet6_create(struct net *net, struct socket *sock, int protocol,
+ if (INET_PROTOSW_REUSE & answer_flags)
+ sk->sk_reuse = SK_CAN_REUSE;
+
++ if (INET_PROTOSW_ICSK & answer_flags)
++ inet_init_csk_locks(sk);
++
+ inet = inet_sk(sk);
+ inet_assign_bit(IS_ICSK, sk, INET_PROTOSW_ICSK & answer_flags);
+
+--
+2.43.0
+
--- /dev/null
+From 3c2fb71fcd92c98c689bdd8a2a2c278559759d4d Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Thu, 18 Jan 2024 17:55:15 -0800
+Subject: llc: Drop support for ETH_P_TR_802_2.
+
+From: Kuniyuki Iwashima <kuniyu@amazon.com>
+
+[ Upstream commit e3f9bed9bee261e3347131764e42aeedf1ffea61 ]
+
+syzbot reported an uninit-value bug below. [0]
+
+llc supports ETH_P_802_2 (0x0004) and used to support ETH_P_TR_802_2
+(0x0011), and syzbot abused the latter to trigger the bug.
+
+ write$tun(r0, &(0x7f0000000040)={@val={0x0, 0x11}, @val, @mpls={[], @llc={@snap={0xaa, 0x1, ')', "90e5dd"}}}}, 0x16)
+
+llc_conn_handler() initialises local variables {saddr,daddr}.mac
+based on skb in llc_pdu_decode_sa()/llc_pdu_decode_da() and passes
+them to __llc_lookup().
+
+However, the initialisation is done only when skb->protocol is
+htons(ETH_P_802_2); otherwise, __llc_lookup_established() and
+__llc_lookup_listener() will read garbage.
+
+The missing initialisation existed prior to commit 211ed865108e
+("net: delete all instances of special processing for token ring").
+
+It removed the part to kick out the token ring stuff but forgot to
+close the door allowing ETH_P_TR_802_2 packets to sneak into llc_rcv().
+
+Let's remove llc_tr_packet_type and complete the deprecation.
+
+[0]:
+BUG: KMSAN: uninit-value in __llc_lookup_established+0xe9d/0xf90
+ __llc_lookup_established+0xe9d/0xf90
+ __llc_lookup net/llc/llc_conn.c:611 [inline]
+ llc_conn_handler+0x4bd/0x1360 net/llc/llc_conn.c:791
+ llc_rcv+0xfbb/0x14a0 net/llc/llc_input.c:206
+ __netif_receive_skb_one_core net/core/dev.c:5527 [inline]
+ __netif_receive_skb+0x1a6/0x5a0 net/core/dev.c:5641
+ netif_receive_skb_internal net/core/dev.c:5727 [inline]
+ netif_receive_skb+0x58/0x660 net/core/dev.c:5786
+ tun_rx_batched+0x3ee/0x980 drivers/net/tun.c:1555
+ tun_get_user+0x53af/0x66d0 drivers/net/tun.c:2002
+ tun_chr_write_iter+0x3af/0x5d0 drivers/net/tun.c:2048
+ call_write_iter include/linux/fs.h:2020 [inline]
+ new_sync_write fs/read_write.c:491 [inline]
+ vfs_write+0x8ef/0x1490 fs/read_write.c:584
+ ksys_write+0x20f/0x4c0 fs/read_write.c:637
+ __do_sys_write fs/read_write.c:649 [inline]
+ __se_sys_write fs/read_write.c:646 [inline]
+ __x64_sys_write+0x93/0xd0 fs/read_write.c:646
+ do_syscall_x64 arch/x86/entry/common.c:51 [inline]
+ do_syscall_64+0x44/0x110 arch/x86/entry/common.c:82
+ entry_SYSCALL_64_after_hwframe+0x63/0x6b
+
+Local variable daddr created at:
+ llc_conn_handler+0x53/0x1360 net/llc/llc_conn.c:783
+ llc_rcv+0xfbb/0x14a0 net/llc/llc_input.c:206
+
+CPU: 1 PID: 5004 Comm: syz-executor994 Not tainted 6.6.0-syzkaller-14500-g1c41041124bd #0
+Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 10/09/2023
+
+Fixes: 211ed865108e ("net: delete all instances of special processing for token ring")
+Reported-by: syzbot+b5ad66046b913bc04c6f@syzkaller.appspotmail.com
+Closes: https://syzkaller.appspot.com/bug?extid=b5ad66046b913bc04c6f
+Signed-off-by: Kuniyuki Iwashima <kuniyu@amazon.com>
+Reviewed-by: Eric Dumazet <edumazet@google.com>
+Link: https://lore.kernel.org/r/20240119015515.61898-1-kuniyu@amazon.com
+Signed-off-by: Jakub Kicinski <kuba@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ include/net/llc_pdu.h | 6 ++----
+ net/llc/llc_core.c | 7 -------
+ 2 files changed, 2 insertions(+), 11 deletions(-)
+
+diff --git a/include/net/llc_pdu.h b/include/net/llc_pdu.h
+index 7e73f8e5e497..1d55ba7c45be 100644
+--- a/include/net/llc_pdu.h
++++ b/include/net/llc_pdu.h
+@@ -262,8 +262,7 @@ static inline void llc_pdu_header_init(struct sk_buff *skb, u8 type,
+ */
+ static inline void llc_pdu_decode_sa(struct sk_buff *skb, u8 *sa)
+ {
+- if (skb->protocol == htons(ETH_P_802_2))
+- memcpy(sa, eth_hdr(skb)->h_source, ETH_ALEN);
++ memcpy(sa, eth_hdr(skb)->h_source, ETH_ALEN);
+ }
+
+ /**
+@@ -275,8 +274,7 @@ static inline void llc_pdu_decode_sa(struct sk_buff *skb, u8 *sa)
+ */
+ static inline void llc_pdu_decode_da(struct sk_buff *skb, u8 *da)
+ {
+- if (skb->protocol == htons(ETH_P_802_2))
+- memcpy(da, eth_hdr(skb)->h_dest, ETH_ALEN);
++ memcpy(da, eth_hdr(skb)->h_dest, ETH_ALEN);
+ }
+
+ /**
+diff --git a/net/llc/llc_core.c b/net/llc/llc_core.c
+index 6e387aadffce..4f16d9c88350 100644
+--- a/net/llc/llc_core.c
++++ b/net/llc/llc_core.c
+@@ -135,22 +135,15 @@ static struct packet_type llc_packet_type __read_mostly = {
+ .func = llc_rcv,
+ };
+
+-static struct packet_type llc_tr_packet_type __read_mostly = {
+- .type = cpu_to_be16(ETH_P_TR_802_2),
+- .func = llc_rcv,
+-};
+-
+ static int __init llc_init(void)
+ {
+ dev_add_pack(&llc_packet_type);
+- dev_add_pack(&llc_tr_packet_type);
+ return 0;
+ }
+
+ static void __exit llc_exit(void)
+ {
+ dev_remove_pack(&llc_packet_type);
+- dev_remove_pack(&llc_tr_packet_type);
+ }
+
+ module_init(llc_init);
+--
+2.43.0
+
--- /dev/null
+From 4655ed34031dacfc8ec060c94def23c23f158ea5 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Thu, 18 Jan 2024 18:36:25 +0000
+Subject: llc: make llc_ui_sendmsg() more robust against bonding changes
+
+From: Eric Dumazet <edumazet@google.com>
+
+[ Upstream commit dad555c816a50c6a6a8a86be1f9177673918c647 ]
+
+syzbot was able to trick llc_ui_sendmsg() into allocating an skb with
+no headroom, but subsequently trying to push 14 bytes of Ethernet
+header [1].
+
+Like some others, llc_ui_sendmsg() releases the socket lock before
+calling sock_alloc_send_skb().
+Then it acquires the lock again, but does not redo all the sanity
+checks that were performed earlier.
+
+This fix:
+
+- Uses LL_RESERVED_SPACE() to reserve space.
+- Check all conditions again after socket lock is held again.
+- Do not account Ethernet header for mtu limitation.
+
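+A minimal sketch of the snapshot-and-recheck pattern the three points
+above amount to (simplified from the diff below):
+
+  struct net_device *dev = llc->dev;       /* snapshot under lock */
+  int hh_len = LL_RESERVED_SPACE(dev);
+
+  release_sock(sk);
+  skb = sock_alloc_send_skb(sk, hh_len + size, noblock, &rc);
+  lock_sock(sk);
+  if (!skb)
+          goto out;
+  /* bonding may have swapped the underlying device meanwhile */
+  if (llc->dev != dev || hh_len != LL_RESERVED_SPACE(dev))
+          goto out;
+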
+[1]
+
+skbuff: skb_under_panic: text:ffff800088baa334 len:1514 put:14 head:ffff0000c9c37000 data:ffff0000c9c36ff2 tail:0x5dc end:0x6c0 dev:bond0
+
+ kernel BUG at net/core/skbuff.c:193 !
+Internal error: Oops - BUG: 00000000f2000800 [#1] PREEMPT SMP
+Modules linked in:
+CPU: 0 PID: 6875 Comm: syz-executor.0 Not tainted 6.7.0-rc8-syzkaller-00101-g0802e17d9aca-dirty #0
+Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 11/17/2023
+pstate: 60400005 (nZCv daif +PAN -UAO -TCO -DIT -SSBS BTYPE=--)
+ pc : skb_panic net/core/skbuff.c:189 [inline]
+ pc : skb_under_panic+0x13c/0x140 net/core/skbuff.c:203
+ lr : skb_panic net/core/skbuff.c:189 [inline]
+ lr : skb_under_panic+0x13c/0x140 net/core/skbuff.c:203
+sp : ffff800096f97000
+x29: ffff800096f97010 x28: ffff80008cc8d668 x27: dfff800000000000
+x26: ffff0000cb970c90 x25: 00000000000005dc x24: ffff0000c9c36ff2
+x23: ffff0000c9c37000 x22: 00000000000005ea x21: 00000000000006c0
+x20: 000000000000000e x19: ffff800088baa334 x18: 1fffe000368261ce
+x17: ffff80008e4ed000 x16: ffff80008a8310f8 x15: 0000000000000001
+x14: 1ffff00012df2d58 x13: 0000000000000000 x12: 0000000000000000
+x11: 0000000000000001 x10: 0000000000ff0100 x9 : e28a51f1087e8400
+x8 : e28a51f1087e8400 x7 : ffff80008028f8d0 x6 : 0000000000000000
+x5 : 0000000000000001 x4 : 0000000000000001 x3 : ffff800082b78714
+x2 : 0000000000000001 x1 : 0000000100000000 x0 : 0000000000000089
+Call trace:
+ skb_panic net/core/skbuff.c:189 [inline]
+ skb_under_panic+0x13c/0x140 net/core/skbuff.c:203
+ skb_push+0xf0/0x108 net/core/skbuff.c:2451
+ eth_header+0x44/0x1f8 net/ethernet/eth.c:83
+ dev_hard_header include/linux/netdevice.h:3188 [inline]
+ llc_mac_hdr_init+0x110/0x17c net/llc/llc_output.c:33
+ llc_sap_action_send_xid_c+0x170/0x344 net/llc/llc_s_ac.c:85
+ llc_exec_sap_trans_actions net/llc/llc_sap.c:153 [inline]
+ llc_sap_next_state net/llc/llc_sap.c:182 [inline]
+ llc_sap_state_process+0x1ec/0x774 net/llc/llc_sap.c:209
+ llc_build_and_send_xid_pkt+0x12c/0x1c0 net/llc/llc_sap.c:270
+ llc_ui_sendmsg+0x7bc/0xb1c net/llc/af_llc.c:997
+ sock_sendmsg_nosec net/socket.c:730 [inline]
+ __sock_sendmsg net/socket.c:745 [inline]
+ sock_sendmsg+0x194/0x274 net/socket.c:767
+ splice_to_socket+0x7cc/0xd58 fs/splice.c:881
+ do_splice_from fs/splice.c:933 [inline]
+ direct_splice_actor+0xe4/0x1c0 fs/splice.c:1142
+ splice_direct_to_actor+0x2a0/0x7e4 fs/splice.c:1088
+ do_splice_direct+0x20c/0x348 fs/splice.c:1194
+ do_sendfile+0x4bc/0xc70 fs/read_write.c:1254
+ __do_sys_sendfile64 fs/read_write.c:1322 [inline]
+ __se_sys_sendfile64 fs/read_write.c:1308 [inline]
+ __arm64_sys_sendfile64+0x160/0x3b4 fs/read_write.c:1308
+ __invoke_syscall arch/arm64/kernel/syscall.c:37 [inline]
+ invoke_syscall+0x98/0x2b8 arch/arm64/kernel/syscall.c:51
+ el0_svc_common+0x130/0x23c arch/arm64/kernel/syscall.c:136
+ do_el0_svc+0x48/0x58 arch/arm64/kernel/syscall.c:155
+ el0_svc+0x54/0x158 arch/arm64/kernel/entry-common.c:678
+ el0t_64_sync_handler+0x84/0xfc arch/arm64/kernel/entry-common.c:696
+ el0t_64_sync+0x190/0x194 arch/arm64/kernel/entry.S:595
+Code: aa1803e6 aa1903e7 a90023f5 94792f6a (d4210000)
+
+Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2")
+Reported-and-tested-by: syzbot+2a7024e9502df538e8ef@syzkaller.appspotmail.com
+Signed-off-by: Eric Dumazet <edumazet@google.com>
+Reviewed-by: Kuniyuki Iwashima <kuniyu@amazon.com>
+Link: https://lore.kernel.org/r/20240118183625.4007013-1-edumazet@google.com
+Signed-off-by: Jakub Kicinski <kuba@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ net/llc/af_llc.c | 24 ++++++++++++++++--------
+ 1 file changed, 16 insertions(+), 8 deletions(-)
+
+diff --git a/net/llc/af_llc.c b/net/llc/af_llc.c
+index 9b06c380866b..20551cfb7da6 100644
+--- a/net/llc/af_llc.c
++++ b/net/llc/af_llc.c
+@@ -928,14 +928,15 @@ static int llc_ui_recvmsg(struct socket *sock, struct msghdr *msg, size_t len,
+ */
+ static int llc_ui_sendmsg(struct socket *sock, struct msghdr *msg, size_t len)
+ {
++ DECLARE_SOCKADDR(struct sockaddr_llc *, addr, msg->msg_name);
+ struct sock *sk = sock->sk;
+ struct llc_sock *llc = llc_sk(sk);
+- DECLARE_SOCKADDR(struct sockaddr_llc *, addr, msg->msg_name);
+ int flags = msg->msg_flags;
+ int noblock = flags & MSG_DONTWAIT;
++ int rc = -EINVAL, copied = 0, hdrlen, hh_len;
+ struct sk_buff *skb = NULL;
++ struct net_device *dev;
+ size_t size = 0;
+- int rc = -EINVAL, copied = 0, hdrlen;
+
+ dprintk("%s: sending from %02X to %02X\n", __func__,
+ llc->laddr.lsap, llc->daddr.lsap);
+@@ -955,22 +956,29 @@ static int llc_ui_sendmsg(struct socket *sock, struct msghdr *msg, size_t len)
+ if (rc)
+ goto out;
+ }
+- hdrlen = llc->dev->hard_header_len + llc_ui_header_len(sk, addr);
++ dev = llc->dev;
++ hh_len = LL_RESERVED_SPACE(dev);
++ hdrlen = llc_ui_header_len(sk, addr);
+ size = hdrlen + len;
+- if (size > llc->dev->mtu)
+- size = llc->dev->mtu;
++ size = min_t(size_t, size, READ_ONCE(dev->mtu));
+ copied = size - hdrlen;
+ rc = -EINVAL;
+ if (copied < 0)
+ goto out;
+ release_sock(sk);
+- skb = sock_alloc_send_skb(sk, size, noblock, &rc);
++ skb = sock_alloc_send_skb(sk, hh_len + size, noblock, &rc);
+ lock_sock(sk);
+ if (!skb)
+ goto out;
+- skb->dev = llc->dev;
++ if (sock_flag(sk, SOCK_ZAPPED) ||
++ llc->dev != dev ||
++ hdrlen != llc_ui_header_len(sk, addr) ||
++ hh_len != LL_RESERVED_SPACE(dev) ||
++ size > READ_ONCE(dev->mtu))
++ goto out;
++ skb->dev = dev;
+ skb->protocol = llc_proto_type(addr->sllc_arphrd);
+- skb_reserve(skb, hdrlen);
++ skb_reserve(skb, hh_len + hdrlen);
+ rc = memcpy_from_msg(skb_put(skb, copied), msg, copied);
+ if (rc)
+ goto out;
+--
+2.43.0
+
--- /dev/null
+From 673b7bbd4cec69e76f7c6790c24d1041bcf45ca4 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Tue, 23 Jan 2024 10:51:41 -0600
+Subject: net: fec: fix the unhandled context fault from smmu
+
+From: Shenwei Wang <shenwei.wang@nxp.com>
+
+[ Upstream commit 5e344807735023cd3a67c37a1852b849caa42620 ]
+
+When repeatedly changing the interface link speed using the commands below:
+
+ethtool -s eth0 speed 100 duplex full
+ethtool -s eth0 speed 1000 duplex full
+
+The following errors may sometimes be reported by the ARM SMMU driver:
+
+[ 5395.035364] fec 5b040000.ethernet eth0: Link is Down
+[ 5395.039255] arm-smmu 51400000.iommu: Unhandled context fault:
+fsr=0x402, iova=0x00000000, fsynr=0x100001, cbfrsynra=0x852, cb=2
+[ 5398.108460] fec 5b040000.ethernet eth0: Link is Up - 100Mbps/Full -
+flow control off
+
+It was identified that the FEC driver does not properly stop the TX queue
+during link speed transitions, which results in invalid virtual
+I/O address translations from the SMMU and causes the context faults.
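+
+As a rough sketch of the corrected ordering (not the driver's exact
+code; my_priv and my_restart_mac() are hypothetical stand-ins for the
+FEC private struct and fec_restart()):
+
+ #include <linux/netdevice.h>
+
+ struct my_priv {
+         struct napi_struct napi;
+ };
+
+ static void my_restart_mac(struct net_device *ndev); /* hypothetical */
+
+ static void my_adjust_link_restart(struct net_device *ndev,
+                                    struct my_priv *fep)
+ {
+         netif_stop_queue(ndev);       /* block new TX before touching HW */
+         napi_disable(&fep->napi);     /* quiesce the polling context */
+         netif_tx_lock_bh(ndev);
+         my_restart_mac(ndev);         /* hypothetical: reprogram the MAC */
+         netif_tx_unlock_bh(ndev);
+         napi_enable(&fep->napi);
+         netif_wake_queue(ndev);       /* allow TX once HW is consistent */
+ }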
+
+Fixes: dbc64a8ea231 ("net: fec: move calls to quiesce/resume packet processing out of fec_restart()")
+Signed-off-by: Shenwei Wang <shenwei.wang@nxp.com>
+Link: https://lore.kernel.org/r/20240123165141.2008104-1-shenwei.wang@nxp.com
+Signed-off-by: Paolo Abeni <pabeni@redhat.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/net/ethernet/freescale/fec_main.c | 2 ++
+ 1 file changed, 2 insertions(+)
+
+diff --git a/drivers/net/ethernet/freescale/fec_main.c b/drivers/net/ethernet/freescale/fec_main.c
+index e08c7b572497..c107680985e4 100644
+--- a/drivers/net/ethernet/freescale/fec_main.c
++++ b/drivers/net/ethernet/freescale/fec_main.c
+@@ -2036,6 +2036,7 @@ static void fec_enet_adjust_link(struct net_device *ndev)
+
+ /* if any of the above changed restart the FEC */
+ if (status_change) {
++ netif_stop_queue(ndev);
+ napi_disable(&fep->napi);
+ netif_tx_lock_bh(ndev);
+ fec_restart(ndev);
+@@ -2045,6 +2046,7 @@ static void fec_enet_adjust_link(struct net_device *ndev)
+ }
+ } else {
+ if (fep->link) {
++ netif_stop_queue(ndev);
+ napi_disable(&fep->napi);
+ netif_tx_lock_bh(ndev);
+ fec_stop(ndev);
+--
+2.43.0
+
--- /dev/null
+From 4638a989e1c2afb54acc2b7dc2890487a0a9c764 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Thu, 18 Jan 2024 16:58:59 -0800
+Subject: net: fix removing a namespace with conflicting altnames
+MIME-Version: 1.0
+Content-Type: text/plain; charset=UTF-8
+Content-Transfer-Encoding: 8bit
+
+From: Jakub Kicinski <kuba@kernel.org>
+
+[ Upstream commit d09486a04f5da0a812c26217213b89a3b1acf836 ]
+
+Mark reports a BUG() when a net namespace is removed.
+
+ kernel BUG at net/core/dev.c:11520!
+
+Physical interfaces moved outside of init_net get "refunded"
+to init_net when that namespace disappears. The main interface
+name may get overwritten in the process if it would have
+conflicted. We need to also discard all conflicting altnames.
+Recent fixes ensured that altnames get moved
+with the main interface, which surfaced this problem.
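+
+A minimal sketch of why a _safe iterator is needed here: the loop body
+unlinks the current entry, so the next pointer must be cached up front
+(name_in_use() is a hypothetical conflict check):
+
+ #include <linux/list.h>
+ #include <linux/slab.h>
+ #include <linux/types.h>
+
+ struct name_node {
+         struct list_head list;
+         char name[16];
+ };
+
+ static bool name_in_use(const char *name);      /* hypothetical */
+
+ static void drop_conflicting_names(struct list_head *altnames)
+ {
+         struct name_node *node, *tmp;
+
+         list_for_each_entry_safe(node, tmp, altnames, list) {
+                 if (name_in_use(node->name)) {
+                         list_del(&node->list);  /* safe: 'tmp' is cached */
+                         kfree(node);
+                 }
+         }
+ }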
+
+Reported-by: Марк Коренберг <socketpair@gmail.com>
+Link: https://lore.kernel.org/all/CAEmTpZFZ4Sv3KwqFOY2WKDHeZYdi0O7N5H1nTvcGp=SAEavtDg@mail.gmail.com/
+Fixes: 7663d522099e ("net: check for altname conflicts when changing netdev's netns")
+Signed-off-by: Jakub Kicinski <kuba@kernel.org>
+Reviewed-by: Eric Dumazet <edumazet@google.com>
+Reviewed-by: Jiri Pirko <jiri@nvidia.com>
+Reviewed-by: Xin Long <lucien.xin@gmail.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ net/core/dev.c | 9 +++++++++
+ net/core/dev.h | 3 +++
+ 2 files changed, 12 insertions(+)
+
+diff --git a/net/core/dev.c b/net/core/dev.c
+index ad20bebe153f..add22ca0dff9 100644
+--- a/net/core/dev.c
++++ b/net/core/dev.c
+@@ -11509,6 +11509,7 @@ static struct pernet_operations __net_initdata netdev_net_ops = {
+
+ static void __net_exit default_device_exit_net(struct net *net)
+ {
++ struct netdev_name_node *name_node, *tmp;
+ struct net_device *dev, *aux;
+ /*
+ * Push all migratable network devices back to the
+@@ -11531,6 +11532,14 @@ static void __net_exit default_device_exit_net(struct net *net)
+ snprintf(fb_name, IFNAMSIZ, "dev%d", dev->ifindex);
+ if (netdev_name_in_use(&init_net, fb_name))
+ snprintf(fb_name, IFNAMSIZ, "dev%%d");
++
++ netdev_for_each_altname_safe(dev, name_node, tmp)
++ if (netdev_name_in_use(&init_net, name_node->name)) {
++ netdev_name_node_del(name_node);
++ synchronize_rcu();
++ __netdev_name_node_alt_destroy(name_node);
++ }
++
+ err = dev_change_net_namespace(dev, &init_net, fb_name);
+ if (err) {
+ pr_emerg("%s: failed to move %s to init_net: %d\n",
+diff --git a/net/core/dev.h b/net/core/dev.h
+index 5aa45f0fd4ae..3f5eb92396b6 100644
+--- a/net/core/dev.h
++++ b/net/core/dev.h
+@@ -64,6 +64,9 @@ int dev_change_name(struct net_device *dev, const char *newname);
+
+ #define netdev_for_each_altname(dev, namenode) \
+ list_for_each_entry((namenode), &(dev)->name_node->list, list)
++#define netdev_for_each_altname_safe(dev, namenode, next) \
++ list_for_each_entry_safe((namenode), (next), &(dev)->name_node->list, \
++ list)
+
+ int netdev_name_node_alt_create(struct net_device *dev, const char *name);
+ int netdev_name_node_alt_destroy(struct net_device *dev, const char *name);
+--
+2.43.0
+
--- /dev/null
+From 846480519c20c1853b3e360271f8c0182d65e2a0 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Fri, 19 Jan 2024 11:47:50 +0100
+Subject: net: micrel: Fix PTP frame parsing for lan8814
+
+From: Horatiu Vultur <horatiu.vultur@microchip.com>
+
+[ Upstream commit aaf632f7ab6dec57bc9329a438f94504fe8034b9 ]
+
+The HW has the capability to check each frame: whether it is a PTP
+frame, which domain it belongs to, which PTP frame type it is, and the
+IP addresses in the frame. If any of these checks fails, the frame is
+not timestamped. Most of these checks were disabled, except the check
+of the minorVersionPTP field inside the PTP header. This means that
+once a partner sends a frame compliant with 802.1AS, which has
+minorVersionPTP set to 1, the frame was not timestamped because the HW
+expected by default a value of 0 in minorVersionPTP. This is exactly
+the same issue as on the lan8841. Fix this issue by removing the check
+so that userspace can decide on it.
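+
+A standalone userspace sketch of the value these writes program
+(macros re-created from the diff below): accept any minorVersionPTP
+from 0x00 up to 0xff instead of only 0:
+
+ #include <stdio.h>
+ #include <stdint.h>
+
+ #define GENMASK(h, l)      (((~0U) << (l)) & (~0U >> (31 - (h))))
+ #define PTP_MAX_VERSION(x) (((x) & GENMASK(7, 0)) << 8)
+ #define PTP_MIN_VERSION(x) ((x) & GENMASK(7, 0))
+
+ int main(void)
+ {
+         /* Max accepted version in the high byte, min in the low byte. */
+         uint16_t val = PTP_MAX_VERSION(0xff) | PTP_MIN_VERSION(0x0);
+
+         printf("PTP_{RX,TX}_VERSION value = 0x%04x\n", val); /* 0xff00 */
+         return 0;
+ }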
+
+Fixes: ece19502834d ("net: phy: micrel: 1588 support for LAN8814 phy")
+Signed-off-by: Horatiu Vultur <horatiu.vultur@microchip.com>
+Reviewed-by: Maxime Chevallier <maxime.chevallier@bootlin.com>
+Reviewed-by: Divya Koppera <divya.koppera@microchip.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/net/phy/micrel.c | 11 +++++++++++
+ 1 file changed, 11 insertions(+)
+
+diff --git a/drivers/net/phy/micrel.c b/drivers/net/phy/micrel.c
+index ce5ad4a82481..858175ca58cd 100644
+--- a/drivers/net/phy/micrel.c
++++ b/drivers/net/phy/micrel.c
+@@ -120,6 +120,11 @@
+ */
+ #define LAN8814_1PPM_FORMAT 17179
+
++#define PTP_RX_VERSION 0x0248
++#define PTP_TX_VERSION 0x0288
++#define PTP_MAX_VERSION(x) (((x) & GENMASK(7, 0)) << 8)
++#define PTP_MIN_VERSION(x) ((x) & GENMASK(7, 0))
++
+ #define PTP_RX_MOD 0x024F
+ #define PTP_RX_MOD_BAD_UDPV4_CHKSUM_FORCE_FCS_DIS_ BIT(3)
+ #define PTP_RX_TIMESTAMP_EN 0x024D
+@@ -3147,6 +3152,12 @@ static void lan8814_ptp_init(struct phy_device *phydev)
+ lanphy_write_page_reg(phydev, 5, PTP_TX_PARSE_IP_ADDR_EN, 0);
+ lanphy_write_page_reg(phydev, 5, PTP_RX_PARSE_IP_ADDR_EN, 0);
+
++ /* Disable checking for minorVersionPTP field */
++ lanphy_write_page_reg(phydev, 5, PTP_RX_VERSION,
++ PTP_MAX_VERSION(0xff) | PTP_MIN_VERSION(0x0));
++ lanphy_write_page_reg(phydev, 5, PTP_TX_VERSION,
++ PTP_MAX_VERSION(0xff) | PTP_MIN_VERSION(0x0));
++
+ skb_queue_head_init(&ptp_priv->tx_queue);
+ skb_queue_head_init(&ptp_priv->rx_queue);
+ INIT_LIST_HEAD(&ptp_priv->rx_ts_list);
+--
+2.43.0
+
--- /dev/null
+From 72948aec8a138908291cc00d2584c02eb45d3574 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Sat, 30 Dec 2023 22:40:37 +0200
+Subject: net/mlx5: Bridge, fix multicast packets sent to uplink
+
+From: Moshe Shemesh <moshe@nvidia.com>
+
+[ Upstream commit ec7cc38ef9f83553102e84c82536971a81630739 ]
+
+To allow multicast packets that are offloaded in bridge multicast
+offload mode to also be sent to the uplink, the FTE bit
+uplink_hairpin_en should be set. Add this bit to the FTE for the
+bridge multicast offload rules.
+
+Fixes: 18c2916cee12 ("net/mlx5: Bridge, snoop igmp/mld packets")
+Signed-off-by: Moshe Shemesh <moshe@nvidia.com>
+Reviewed-by: Gal Pressman <gal@nvidia.com>
+Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/net/ethernet/mellanox/mlx5/core/esw/bridge_mcast.c | 3 +++
+ drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c | 2 ++
+ include/linux/mlx5/fs.h | 1 +
+ include/linux/mlx5/mlx5_ifc.h | 2 +-
+ 4 files changed, 7 insertions(+), 1 deletion(-)
+
+diff --git a/drivers/net/ethernet/mellanox/mlx5/core/esw/bridge_mcast.c b/drivers/net/ethernet/mellanox/mlx5/core/esw/bridge_mcast.c
+index a7ed87e9d842..22dd30cf8033 100644
+--- a/drivers/net/ethernet/mellanox/mlx5/core/esw/bridge_mcast.c
++++ b/drivers/net/ethernet/mellanox/mlx5/core/esw/bridge_mcast.c
+@@ -83,6 +83,7 @@ mlx5_esw_bridge_mdb_flow_create(u16 esw_owner_vhca_id, struct mlx5_esw_bridge_md
+ i++;
+ }
+
++ rule_spec->flow_context.flags |= FLOW_CONTEXT_UPLINK_HAIRPIN_EN;
+ rule_spec->match_criteria_enable = MLX5_MATCH_OUTER_HEADERS;
+ dmac_v = MLX5_ADDR_OF(fte_match_param, rule_spec->match_value, outer_headers.dmac_47_16);
+ ether_addr_copy(dmac_v, entry->key.addr);
+@@ -587,6 +588,7 @@ mlx5_esw_bridge_mcast_vlan_flow_create(u16 vlan_proto, struct mlx5_esw_bridge_po
+ if (!rule_spec)
+ return ERR_PTR(-ENOMEM);
+
++ rule_spec->flow_context.flags |= FLOW_CONTEXT_UPLINK_HAIRPIN_EN;
+ rule_spec->match_criteria_enable = MLX5_MATCH_OUTER_HEADERS;
+
+ flow_act.action |= MLX5_FLOW_CONTEXT_ACTION_PACKET_REFORMAT;
+@@ -662,6 +664,7 @@ mlx5_esw_bridge_mcast_fwd_flow_create(struct mlx5_esw_bridge_port *port)
+ dest.vport.flags = MLX5_FLOW_DEST_VPORT_VHCA_ID;
+ dest.vport.vhca_id = port->esw_owner_vhca_id;
+ }
++ rule_spec->flow_context.flags |= FLOW_CONTEXT_UPLINK_HAIRPIN_EN;
+ handle = mlx5_add_flow_rules(port->mcast.ft, rule_spec, &flow_act, &dest, 1);
+
+ kvfree(rule_spec);
+diff --git a/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c b/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c
+index a4b925331661..b29299c49ab3 100644
+--- a/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c
++++ b/drivers/net/ethernet/mellanox/mlx5/core/fs_cmd.c
+@@ -566,6 +566,8 @@ static int mlx5_cmd_set_fte(struct mlx5_core_dev *dev,
+ fte->flow_context.flow_tag);
+ MLX5_SET(flow_context, in_flow_context, flow_source,
+ fte->flow_context.flow_source);
++ MLX5_SET(flow_context, in_flow_context, uplink_hairpin_en,
++ !!(fte->flow_context.flags & FLOW_CONTEXT_UPLINK_HAIRPIN_EN));
+
+ MLX5_SET(flow_context, in_flow_context, extended_destination,
+ extended_dest);
+diff --git a/include/linux/mlx5/fs.h b/include/linux/mlx5/fs.h
+index 6f7725238abc..3fb428ce7d1c 100644
+--- a/include/linux/mlx5/fs.h
++++ b/include/linux/mlx5/fs.h
+@@ -132,6 +132,7 @@ struct mlx5_flow_handle;
+
+ enum {
+ FLOW_CONTEXT_HAS_TAG = BIT(0),
++ FLOW_CONTEXT_UPLINK_HAIRPIN_EN = BIT(1),
+ };
+
+ struct mlx5_flow_context {
+diff --git a/include/linux/mlx5/mlx5_ifc.h b/include/linux/mlx5/mlx5_ifc.h
+index 3f7b664d625b..fb8d26a15df4 100644
+--- a/include/linux/mlx5/mlx5_ifc.h
++++ b/include/linux/mlx5/mlx5_ifc.h
+@@ -3557,7 +3557,7 @@ struct mlx5_ifc_flow_context_bits {
+ u8 action[0x10];
+
+ u8 extended_destination[0x1];
+- u8 reserved_at_81[0x1];
++ u8 uplink_hairpin_en[0x1];
+ u8 flow_source[0x2];
+ u8 encrypt_decrypt_type[0x4];
+ u8 destination_list_size[0x18];
+--
+2.43.0
+
--- /dev/null
+From 1dd205824d3c300e4463cf69b20f524b19dbc73d Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Sun, 17 Dec 2023 13:20:36 +0200
+Subject: net/mlx5: DR, Can't go to uplink vport on RX rule
+
+From: Yevgeny Kliteynik <kliteyn@nvidia.com>
+
+[ Upstream commit 5b2a2523eeea5f03d39a9d1ff1bad2e9f8eb98d2 ]
+
+A Go-To-Vport action on RX is not allowed when the vport is the uplink.
+In such a case, the packet should be dropped.
+
+Fixes: 9db810ed2d37 ("net/mlx5: DR, Expose steering action functionality")
+Signed-off-by: Yevgeny Kliteynik <kliteyn@nvidia.com>
+Reviewed-by: Erez Shitrit <erezsh@nvidia.com>
+Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ .../mellanox/mlx5/core/steering/dr_action.c | 16 +++++++++++-----
+ 1 file changed, 11 insertions(+), 5 deletions(-)
+
+diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_action.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_action.c
+index 74fc318b5027..d2b65a0ce47b 100644
+--- a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_action.c
++++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_action.c
+@@ -874,11 +874,17 @@ int mlx5dr_actions_build_ste_arr(struct mlx5dr_matcher *matcher,
+ action->sampler->tx_icm_addr;
+ break;
+ case DR_ACTION_TYP_VPORT:
+- attr.hit_gvmi = action->vport->caps->vhca_gvmi;
+- dest_action = action;
+- attr.final_icm_addr = rx_rule ?
+- action->vport->caps->icm_address_rx :
+- action->vport->caps->icm_address_tx;
++ if (unlikely(rx_rule && action->vport->caps->num == MLX5_VPORT_UPLINK)) {
++ /* can't go to uplink on RX rule - dropping instead */
++ attr.final_icm_addr = nic_dmn->drop_icm_addr;
++ attr.hit_gvmi = nic_dmn->drop_icm_addr >> 48;
++ } else {
++ attr.hit_gvmi = action->vport->caps->vhca_gvmi;
++ dest_action = action;
++ attr.final_icm_addr = rx_rule ?
++ action->vport->caps->icm_address_rx :
++ action->vport->caps->icm_address_tx;
++ }
+ break;
+ case DR_ACTION_TYP_POP_VLAN:
+ if (!rx_rule && !(dmn->ste_ctx->actions_caps &
+--
+2.43.0
+
--- /dev/null
+From f996a3313ebd84e0efdb7d61b590eec93fc7c6a2 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Sun, 17 Dec 2023 11:24:08 +0200
+Subject: net/mlx5: DR, Use the right GVMI number for drop action
+
+From: Yevgeny Kliteynik <kliteyn@nvidia.com>
+
+[ Upstream commit 5665954293f13642f9c052ead83c1e9d8cff186f ]
+
+When FW provides ICM addresses for drop RX/TX, the provided capability
+is a 64-bit value that contains the GVMI as well as the ICM address
+itself. In the case of TX DROP this GVMI is different from the GVMI
+that the domain is operating on.
+
+This patch fixes the action to use these GVMI IDs, as provided by FW.
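+
+A standalone sketch of the layout described above (the exact bit split
+is an assumption read off the '>> 48' in the diff below): the top 16
+bits of the capability carry the GVMI, the rest the ICM address:
+
+ #include <stdio.h>
+ #include <stdint.h>
+
+ static uint16_t drop_icm_gvmi(uint64_t cap)
+ {
+         return (uint16_t)(cap >> 48);          /* GVMI in bits 63:48 */
+ }
+
+ static uint64_t drop_icm_address(uint64_t cap)
+ {
+         return cap & ((1ULL << 48) - 1);       /* address in bits 47:0 */
+ }
+
+ int main(void)
+ {
+         uint64_t cap = 0x00a5000012345678ULL;  /* made-up example value */
+
+         printf("gvmi=0x%04x addr=0x%llx\n", drop_icm_gvmi(cap),
+                (unsigned long long)drop_icm_address(cap));
+         return 0;
+ }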
+
+Fixes: 9db810ed2d37 ("net/mlx5: DR, Expose steering action functionality")
+Signed-off-by: Yevgeny Kliteynik <kliteyn@nvidia.com>
+Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/net/ethernet/mellanox/mlx5/core/steering/dr_action.c | 1 +
+ 1 file changed, 1 insertion(+)
+
+diff --git a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_action.c b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_action.c
+index e3ec559369fa..74fc318b5027 100644
+--- a/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_action.c
++++ b/drivers/net/ethernet/mellanox/mlx5/core/steering/dr_action.c
+@@ -788,6 +788,7 @@ int mlx5dr_actions_build_ste_arr(struct mlx5dr_matcher *matcher,
+ switch (action_type) {
+ case DR_ACTION_TYP_DROP:
+ attr.final_icm_addr = nic_dmn->drop_icm_addr;
++ attr.hit_gvmi = nic_dmn->drop_icm_addr >> 48;
+ break;
+ case DR_ACTION_TYP_FT:
+ dest_action = action;
+--
+2.43.0
+
--- /dev/null
+From 35519d6efbd840bc026139266c7df4248df4cc79 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Sun, 31 Dec 2023 15:19:50 +0200
+Subject: net/mlx5: Fix a WARN upon a callback command failure
+
+From: Yishai Hadas <yishaih@nvidia.com>
+
+[ Upstream commit cc8091587779cfaddb6b29c9e9edb9079a282cad ]
+
+The WARN below [1] is reported once a callback command fails.
+
+As the callback runs in interrupt context, it needs to use the IRQ
+save/restore variant.
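+
+A minimal sketch of the locking rule (illustrative struct, not the
+mlx5 stats object): any lock that may be taken from interrupt context
+must use the save/restore variants, because spin_unlock_irq()
+unconditionally re-enables interrupts:
+
+ #include <linux/spinlock.h>
+ #include <linux/types.h>
+
+ struct my_stats {
+         spinlock_t lock;
+         u64 failed;
+         int last_errno;
+ };
+
+ static void my_stats_log_failure(struct my_stats *stats, int err)
+ {
+         unsigned long flags;
+
+         spin_lock_irqsave(&stats->lock, flags);      /* saves IRQ state */
+         stats->failed++;
+         stats->last_errno = -err;
+         spin_unlock_irqrestore(&stats->lock, flags); /* restores it as-is */
+ }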
+
+[1]
+DEBUG_LOCKS_WARN_ON(lockdep_hardirq_context())
+WARNING: CPU: 15 PID: 0 at kernel/locking/lockdep.c:4353
+ lockdep_hardirqs_on_prepare+0x11b/0x180
+Modules linked in: vhost_net vhost tap mlx5_vfio_pci
+vfio_pci vfio_pci_core vfio_iommu_type1 vfio mlx5_vdpa vringh
+vhost_iotlb vdpa nfnetlink_cttimeout openvswitch nsh ip6table_mangle
+ip6table_nat ip6table_filter ip6_tables iptable_mangle
+xt_conntrackxt_MASQUERADE nf_conntrack_netlink nfnetlink
+xt_addrtype iptable_nat nf_nat br_netfilter rpcsec_gss_krb5
+auth_rpcgss oid_registry overlay rpcrdma rdma_ucm ib_iser libiscsi
+scsi_transport_iscsi rdma_cm iw_cm ib_umad ib_ipoib ib_cm
+mlx5_ib ib_uverbs ib_core fuse mlx5_core
+CPU: 15 PID: 0 Comm: swapper/15 Tainted: G W 6.7.0-rc4+ #1587
+Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS
+rel-1.13.0-0-gf21b5a4aeb02-prebuilt.qemu.org 04/01/2014
+RIP: 0010:lockdep_hardirqs_on_prepare+0x11b/0x180
+Code: 00 5b c3 c3 e8 e6 0d 58 00 85 c0 74 d6 8b 15 f0 c3
+ 76 01 85 d2 75 cc 48 c7 c6 04 a5 3b 82 48 c7 c7 f1
+ e9 39 82 e8 95 12 f9 ff <0f> 0b 5b c3 e8 bc 0d 58 00
+ 85 c0 74 ac 8b 3d c6 c3 76 01 85 ff 75
+RSP: 0018:ffffc900003ecd18 EFLAGS: 00010086
+RAX: 0000000000000000 RBX: 0000000000000000 RCX: 0000000000000027
+RDX: 0000000000000000 RSI: ffff88885fbdb880 RDI: ffff88885fbdb888
+RBP: 00000000ffffff87 R08: 0000000000000000 R09: 0000000000000001
+R10: 0000000000000000 R11: 284e4f5f4e524157 R12: 00000000002c9aa1
+R13: ffff88810aace980 R14: ffff88810aace9b8 R15: 0000000000000003
+FS: 0000000000000000(0000) GS:ffff88885fbc0000(0000)
+knlGS:0000000000000000
+CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
+CR2: 00007f731436f4c8 CR3: 000000010aae6001 CR4: 0000000000372eb0
+DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
+DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
+Call Trace:
+ <IRQ>
+? __warn+0x81/0x170
+? lockdep_hardirqs_on_prepare+0x11b/0x180
+? report_bug+0xf8/0x1c0
+? handle_bug+0x3f/0x70
+? exc_invalid_op+0x13/0x60
+? asm_exc_invalid_op+0x16/0x20
+? lockdep_hardirqs_on_prepare+0x11b/0x180
+? lockdep_hardirqs_on_prepare+0x11b/0x180
+trace_hardirqs_on+0x4a/0xa0
+raw_spin_unlock_irq+0x24/0x30
+cmd_status_err+0xc0/0x1a0 [mlx5_core]
+cmd_status_err+0x1a0/0x1a0 [mlx5_core]
+mlx5_cmd_exec_cb_handler+0x24/0x40 [mlx5_core]
+mlx5_cmd_comp_handler+0x129/0x4b0 [mlx5_core]
+cmd_comp_notifier+0x1a/0x20 [mlx5_core]
+notifier_call_chain+0x3e/0xe0
+atomic_notifier_call_chain+0x5f/0x130
+mlx5_eq_async_int+0xe7/0x200 [mlx5_core]
+notifier_call_chain+0x3e/0xe0
+atomic_notifier_call_chain+0x5f/0x130
+irq_int_handler+0x11/0x20 [mlx5_core]
+__handle_irq_event_percpu+0x99/0x220
+? tick_irq_enter+0x5d/0x80
+handle_irq_event_percpu+0xf/0x40
+handle_irq_event+0x3a/0x60
+handle_edge_irq+0xa2/0x1c0
+__common_interrupt+0x55/0x140
+common_interrupt+0x7d/0xa0
+</IRQ>
+<TASK>
+asm_common_interrupt+0x22/0x40
+RIP: 0010:default_idle+0x13/0x20
+Code: c0 08 00 00 00 4d 29 c8 4c 01 c7 4c 29 c2 e9 72 ff
+ff ff cc cc cc cc 8b 05 ea 08 25 01 85 c0 7e 07 0f 00 2d 7f b0 26 00 fb
+f4 <fa> c3 90 66 2e 0f 1f 84 00 00 00 00 00 65 48 8b 04 25 80 d0 02 00
+RSP: 0018:ffffc9000010fec8 EFLAGS: 00000242
+RAX: 0000000000000001 RBX: 000000000000000f RCX: 4000000000000000
+RDX: 0000000000000000 RSI: 0000000000000000 RDI: ffffffff811c410c
+RBP: ffffffff829478c0 R08: 0000000000000001 R09: 0000000000000001
+R10: 0000000000000000 R11: 0000000000000000 R12: 0000000000000000
+R13: 0000000000000000 R14: 0000000000000000 R15: 0000000000000000
+? do_idle+0x1ec/0x210
+default_idle_call+0x6c/0x90
+do_idle+0x1ec/0x210
+cpu_startup_entry+0x26/0x30
+start_secondary+0x11b/0x150
+secondary_startup_64_no_verify+0x165/0x16b
+</TASK>
+irq event stamp: 833284
+hardirqs last enabled at (833283): [<ffffffff811c410c>]
+do_idle+0x1ec/0x210
+hardirqs last disabled at (833284): [<ffffffff81daf9ef>]
+common_interrupt+0xf/0xa0
+softirqs last enabled at (833224): [<ffffffff81dc199f>]
+__do_softirq+0x2bf/0x40e
+softirqs last disabled at (833177): [<ffffffff81178ddf>]
+irq_exit_rcu+0x7f/0xa0
+
+Fixes: 34f46ae0d4b3 ("net/mlx5: Add command failures data to debugfs")
+Signed-off-by: Yishai Hadas <yishaih@nvidia.com>
+Reviewed-by: Moshe Shemesh <moshe@nvidia.com>
+Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/net/ethernet/mellanox/mlx5/core/cmd.c | 5 +++--
+ 1 file changed, 3 insertions(+), 2 deletions(-)
+
+diff --git a/drivers/net/ethernet/mellanox/mlx5/core/cmd.c b/drivers/net/ethernet/mellanox/mlx5/core/cmd.c
+index a7b1f9686c09..4957412ff1f6 100644
+--- a/drivers/net/ethernet/mellanox/mlx5/core/cmd.c
++++ b/drivers/net/ethernet/mellanox/mlx5/core/cmd.c
+@@ -1923,6 +1923,7 @@ static void cmd_status_log(struct mlx5_core_dev *dev, u16 opcode, u8 status,
+ {
+ const char *namep = mlx5_command_str(opcode);
+ struct mlx5_cmd_stats *stats;
++ unsigned long flags;
+
+ if (!err || !(strcmp(namep, "unknown command opcode")))
+ return;
+@@ -1930,7 +1931,7 @@ static void cmd_status_log(struct mlx5_core_dev *dev, u16 opcode, u8 status,
+ stats = xa_load(&dev->cmd.stats, opcode);
+ if (!stats)
+ return;
+- spin_lock_irq(&stats->lock);
++ spin_lock_irqsave(&stats->lock, flags);
+ stats->failed++;
+ if (err < 0)
+ stats->last_failed_errno = -err;
+@@ -1939,7 +1940,7 @@ static void cmd_status_log(struct mlx5_core_dev *dev, u16 opcode, u8 status,
+ stats->last_failed_mbox_status = status;
+ stats->last_failed_syndrome = syndrome;
+ }
+- spin_unlock_irq(&stats->lock);
++ spin_unlock_irqrestore(&stats->lock, flags);
+ }
+
+ /* preserve -EREMOTEIO for outbox.status != OK, otherwise return err as is */
+--
+2.43.0
+
--- /dev/null
+From fee9cf89467830f61dd17acaae0daeed512142b9 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Tue, 28 Nov 2023 14:01:54 -0800
+Subject: net/mlx5: Use mlx5 device constant for selecting CQ period mode for
+ ASO
+
+From: Rahul Rameshbabu <rrameshbabu@nvidia.com>
+
+[ Upstream commit 20cbf8cbb827094197f3b17db60d71449415db1e ]
+
+mlx5 devices have specific constants for choosing the CQ period mode. These
+constants do not have to match the constants used by the kernel software
+API for DIM period mode selection.
+
+Fixes: cdd04f4d4d71 ("net/mlx5: Add support to create SQ and CQ for ASO")
+Signed-off-by: Rahul Rameshbabu <rrameshbabu@nvidia.com>
+Reviewed-by: Jianbo Liu <jianbol@nvidia.com>
+Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/net/ethernet/mellanox/mlx5/core/lib/aso.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+diff --git a/drivers/net/ethernet/mellanox/mlx5/core/lib/aso.c b/drivers/net/ethernet/mellanox/mlx5/core/lib/aso.c
+index 40c7be124041..58bd749b5e4d 100644
+--- a/drivers/net/ethernet/mellanox/mlx5/core/lib/aso.c
++++ b/drivers/net/ethernet/mellanox/mlx5/core/lib/aso.c
+@@ -98,7 +98,7 @@ static int create_aso_cq(struct mlx5_aso_cq *cq, void *cqc_data)
+ mlx5_fill_page_frag_array(&cq->wq_ctrl.buf,
+ (__be64 *)MLX5_ADDR_OF(create_cq_in, in, pas));
+
+- MLX5_SET(cqc, cqc, cq_period_mode, DIM_CQ_PERIOD_MODE_START_FROM_EQE);
++ MLX5_SET(cqc, cqc, cq_period_mode, MLX5_CQ_PERIOD_MODE_START_FROM_EQE);
+ MLX5_SET(cqc, cqc, c_eqn_or_apu_element, eqn);
+ MLX5_SET(cqc, cqc, uar_page, mdev->priv.uar->index);
+ MLX5_SET(cqc, cqc, log_page_size, cq->wq_ctrl.buf.page_shift -
+--
+2.43.0
+
--- /dev/null
+From 67398c780b74733917ea136778b926b754cfccd1 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Tue, 12 Dec 2023 13:52:55 +0200
+Subject: net/mlx5e: Allow software parsing when IPsec crypto is enabled
+
+From: Leon Romanovsky <leonro@nvidia.com>
+
+[ Upstream commit 20f5468a7988dedd94a57ba8acd65ebda6a59723 ]
+
+All ConnectX devices have the software parsing capability enabled, but it
+is more correct to set allow_swp only if the capability exists, which for
+IPsec means that crypto offload is supported.
+
+Fixes: 2451da081a34 ("net/mlx5: Unify device IPsec capabilities check")
+Signed-off-by: Leon Romanovsky <leonro@nvidia.com>
+Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/net/ethernet/mellanox/mlx5/core/en/params.c | 4 ++--
+ 1 file changed, 2 insertions(+), 2 deletions(-)
+
+diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/params.c b/drivers/net/ethernet/mellanox/mlx5/core/en/params.c
+index e097f336e1c4..30507b7c2fb1 100644
+--- a/drivers/net/ethernet/mellanox/mlx5/core/en/params.c
++++ b/drivers/net/ethernet/mellanox/mlx5/core/en/params.c
+@@ -1062,8 +1062,8 @@ void mlx5e_build_sq_param(struct mlx5_core_dev *mdev,
+ void *wq = MLX5_ADDR_OF(sqc, sqc, wq);
+ bool allow_swp;
+
+- allow_swp =
+- mlx5_geneve_tx_allowed(mdev) || !!mlx5_ipsec_device_caps(mdev);
++ allow_swp = mlx5_geneve_tx_allowed(mdev) ||
++ (mlx5_ipsec_device_caps(mdev) & MLX5_IPSEC_CAP_CRYPTO);
+ mlx5e_build_sq_param_common(mdev, param);
+ MLX5_SET(wq, wq, log_wq_sz, params->log_sq_size);
+ MLX5_SET(sqc, sqc, allow_swp, allow_swp);
+--
+2.43.0
+
--- /dev/null
+From fcb41c119b911abd8b24a93d58e8f373a36e3784 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Wed, 17 Jan 2024 15:17:36 +0800
+Subject: net/mlx5e: fix a double-free in arfs_create_groups
+
+From: Zhipeng Lu <alexious@zju.edu.cn>
+
+[ Upstream commit 3c6d5189246f590e4e1f167991558bdb72a4738b ]
+
+When the kvzalloc() of `in` fails, arfs_create_groups() will free
+ft->g and return an error. However, arfs_create_table(), the only caller
+of arfs_create_groups(), will hold on to this error and call
+mlx5e_destroy_flow_table(), in which ft->g will be freed again.
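+
+The shape of the fix is a conventional goto unwind ladder; a standalone
+sketch with illustrative names and userspace allocators standing in for
+kcalloc()/kvzalloc():
+
+ #include <stdlib.h>
+
+ struct flow_table { void **g; };
+
+ static int create_groups(struct flow_table *ft)
+ {
+         void *in;
+         int err;
+
+         ft->g = calloc(8, sizeof(*ft->g));
+         if (!ft->g)
+                 return -1;
+
+         in = calloc(1, 256);
+         if (!in) {
+                 err = -1;
+                 goto err_free_g;
+         }
+
+         /* ... group creation would go here ... */
+
+         free(in);
+         return 0;
+
+ err_free_g:
+         free(ft->g);
+         ft->g = NULL;   /* the caller's cleanup must not free it again */
+         return err;
+ }
+
+ int main(void)
+ {
+         struct flow_table ft = { 0 };
+
+         if (create_groups(&ft) == 0)
+                 free(ft.g);
+         return 0;
+ }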
+
+Fixes: 1cabe6b0965e ("net/mlx5e: Create aRFS flow tables")
+Signed-off-by: Zhipeng Lu <alexious@zju.edu.cn>
+Reviewed-by: Simon Horman <horms@kernel.org>
+Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ .../net/ethernet/mellanox/mlx5/core/en_arfs.c | 26 +++++++++++--------
+ 1 file changed, 15 insertions(+), 11 deletions(-)
+
+diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_arfs.c b/drivers/net/ethernet/mellanox/mlx5/core/en_arfs.c
+index bb7f86c993e5..e66f486faafe 100644
+--- a/drivers/net/ethernet/mellanox/mlx5/core/en_arfs.c
++++ b/drivers/net/ethernet/mellanox/mlx5/core/en_arfs.c
+@@ -254,11 +254,13 @@ static int arfs_create_groups(struct mlx5e_flow_table *ft,
+
+ ft->g = kcalloc(MLX5E_ARFS_NUM_GROUPS,
+ sizeof(*ft->g), GFP_KERNEL);
+- in = kvzalloc(inlen, GFP_KERNEL);
+- if (!in || !ft->g) {
+- kfree(ft->g);
+- kvfree(in);
++ if (!ft->g)
+ return -ENOMEM;
++
++ in = kvzalloc(inlen, GFP_KERNEL);
++ if (!in) {
++ err = -ENOMEM;
++ goto err_free_g;
+ }
+
+ mc = MLX5_ADDR_OF(create_flow_group_in, in, match_criteria);
+@@ -278,7 +280,7 @@ static int arfs_create_groups(struct mlx5e_flow_table *ft,
+ break;
+ default:
+ err = -EINVAL;
+- goto out;
++ goto err_free_in;
+ }
+
+ switch (type) {
+@@ -300,7 +302,7 @@ static int arfs_create_groups(struct mlx5e_flow_table *ft,
+ break;
+ default:
+ err = -EINVAL;
+- goto out;
++ goto err_free_in;
+ }
+
+ MLX5_SET_CFG(in, match_criteria_enable, MLX5_MATCH_OUTER_HEADERS);
+@@ -309,7 +311,7 @@ static int arfs_create_groups(struct mlx5e_flow_table *ft,
+ MLX5_SET_CFG(in, end_flow_index, ix - 1);
+ ft->g[ft->num_groups] = mlx5_create_flow_group(ft->t, in);
+ if (IS_ERR(ft->g[ft->num_groups]))
+- goto err;
++ goto err_clean_group;
+ ft->num_groups++;
+
+ memset(in, 0, inlen);
+@@ -318,18 +320,20 @@ static int arfs_create_groups(struct mlx5e_flow_table *ft,
+ MLX5_SET_CFG(in, end_flow_index, ix - 1);
+ ft->g[ft->num_groups] = mlx5_create_flow_group(ft->t, in);
+ if (IS_ERR(ft->g[ft->num_groups]))
+- goto err;
++ goto err_clean_group;
+ ft->num_groups++;
+
+ kvfree(in);
+ return 0;
+
+-err:
++err_clean_group:
+ err = PTR_ERR(ft->g[ft->num_groups]);
+ ft->g[ft->num_groups] = NULL;
+-out:
++err_free_in:
+ kvfree(in);
+-
++err_free_g:
++ kfree(ft->g);
++ ft->g = NULL;
+ return err;
+ }
+
+--
+2.43.0
+
--- /dev/null
+From a4f214befe9daf31c7e22b11d18352c1e952afd2 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Tue, 28 Nov 2023 17:29:01 +0800
+Subject: net/mlx5e: fix a potential double-free in fs_any_create_groups
+
+From: Dinghao Liu <dinghao.liu@zju.edu.cn>
+
+[ Upstream commit aef855df7e1bbd5aa4484851561211500b22707e ]
+
+When kcalloc() for ft->g succeeds but kvzalloc() for in fails,
+fs_any_create_groups() will free ft->g. However, its caller
+fs_any_create_table() will free ft->g again by calling
+mlx5e_destroy_flow_table(), which leads to a double-free.
+Fix this by setting ft->g to NULL in fs_any_create_groups().
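+
+The same class of bug in miniature (standalone sketch, illustrative
+names): without the NULL assignment, the caller's cleanup frees the
+same pointer a second time:
+
+ #include <stdlib.h>
+
+ struct flow_table { void **g; };
+
+ static int create_groups(struct flow_table *ft, int fail_second_alloc)
+ {
+         void *in;
+
+         ft->g = calloc(8, sizeof(*ft->g));
+         in = fail_second_alloc ? NULL : calloc(1, 256);
+         if (!in || !ft->g) {
+                 free(ft->g);
+                 ft->g = NULL;   /* the fix: leave no stale pointer behind */
+                 free(in);
+                 return -1;
+         }
+
+         free(in);
+         return 0;
+ }
+
+ int main(void)
+ {
+         struct flow_table ft = { 0 };
+
+         create_groups(&ft, 1);
+         free(ft.g);     /* caller cleanup: a no-op on NULL */
+         return 0;
+ }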
+
+Fixes: 0f575c20bf06 ("net/mlx5e: Introduce Flow Steering ANY API")
+Signed-off-by: Dinghao Liu <dinghao.liu@zju.edu.cn>
+Reviewed-by: Tariq Toukan <tariqt@nvidia.com>
+Reviewed-by: Simon Horman <horms@kernel.org>
+Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/net/ethernet/mellanox/mlx5/core/en/fs_tt_redirect.c | 1 +
+ 1 file changed, 1 insertion(+)
+
+diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/fs_tt_redirect.c b/drivers/net/ethernet/mellanox/mlx5/core/en/fs_tt_redirect.c
+index e1283531e0b8..671adbad0a40 100644
+--- a/drivers/net/ethernet/mellanox/mlx5/core/en/fs_tt_redirect.c
++++ b/drivers/net/ethernet/mellanox/mlx5/core/en/fs_tt_redirect.c
+@@ -436,6 +436,7 @@ static int fs_any_create_groups(struct mlx5e_flow_table *ft)
+ in = kvzalloc(inlen, GFP_KERNEL);
+ if (!in || !ft->g) {
+ kfree(ft->g);
++ ft->g = NULL;
+ kvfree(in);
+ return -ENOMEM;
+ }
+--
+2.43.0
+
--- /dev/null
+From 23db336dc559a45fa3dfa9a231d8d09a0fb1c25a Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Sun, 5 Nov 2023 17:09:46 +0200
+Subject: net/mlx5e: Fix inconsistent hairpin RQT sizes
+
+From: Tariq Toukan <tariqt@nvidia.com>
+
+[ Upstream commit c20767fd45e82d64352db82d4fc8d281a43e4783 ]
+
+The processing of traffic in hairpin queues occurs in HW/FW and does not
+involve the CPUs, hence the upper bound on the max number of channels does
+not apply to them. Using this bound for the hairpin RQT max_table_size is
+wrong. It could be too small and cause the error below [1]. As the
+RQT size provided on init does not get modified later, use the same
+value for both the actual and max table sizes.
+
+[1]
+mlx5_core 0000:08:00.1: mlx5_cmd_out_err:805:(pid 1200): CREATE_RQT(0x916) op_mod(0x0) failed, status bad parameter(0x3), syndrome (0x538faf), err(-22)
+
+Fixes: 74a8dadac17e ("net/mlx5e: Preparations for supporting larger number of channels")
+Signed-off-by: Tariq Toukan <tariqt@nvidia.com>
+Reviewed-by: Gal Pressman <gal@nvidia.com>
+Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/net/ethernet/mellanox/mlx5/core/en_tc.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
+index 96af9e2ab1d8..b61d82f08e65 100644
+--- a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
++++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
+@@ -761,7 +761,7 @@ static int mlx5e_hairpin_create_indirect_rqt(struct mlx5e_hairpin *hp)
+
+ err = mlx5e_rss_params_indir_init(&indir, mdev,
+ mlx5e_rqt_size(mdev, hp->num_channels),
+- mlx5e_rqt_size(mdev, priv->max_nch));
++ mlx5e_rqt_size(mdev, hp->num_channels));
+ if (err)
+ return err;
+
+--
+2.43.0
+
--- /dev/null
+From fa8499340268fb830497b9a417e690b2a2ee1e41 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Wed, 22 Nov 2023 18:32:11 -0800
+Subject: net/mlx5e: Fix operation precedence bug in port timestamping
+ napi_poll context
+
+From: Rahul Rameshbabu <rrameshbabu@nvidia.com>
+
+[ Upstream commit 3876638b2c7ebb2c9d181de1191db0de8cac143a ]
+
+Indirection (*) is of lower precedence than postfix increment (++). The
+logic in napi_poll context would cause an out-of-bounds read by first
+incrementing the pointer address by the byte address space and then
+dereferencing the value. Rather, the intended logic was to dereference
+first and then increment the underlying value.
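+
+A standalone demo of the precedence trap: '*p++' parses as '*(p++)',
+so the buggy form advanced the pointer instead of the counter it
+points to:
+
+ #include <stdio.h>
+
+ int main(void)
+ {
+         unsigned char md_buff[4] = { 0 };
+         int sz = 0;
+         int *md_buff_sz = &sz;
+
+         /* Intended: store at the current count, then bump the count. */
+         md_buff[(*md_buff_sz)++] = 0xab;
+
+         /* The buggy 'md_buff[*md_buff_sz++]' would instead read through
+          * the old pointer and then advance the pointer itself, leaving
+          * 'sz' at 0 and pointing past the variable for the next call. */
+         printf("count=%d first=0x%x\n", sz, md_buff[0]);
+         return 0;
+ }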
+
+Fixes: 92214be5979c ("net/mlx5e: Update doorbell for port timestamping CQ before the software counter")
+Signed-off-by: Rahul Rameshbabu <rrameshbabu@nvidia.com>
+Reviewed-by: Tariq Toukan <tariqt@nvidia.com>
+Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/net/ethernet/mellanox/mlx5/core/en/ptp.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en/ptp.c b/drivers/net/ethernet/mellanox/mlx5/core/en/ptp.c
+index af3928eddafd..803035d4e597 100644
+--- a/drivers/net/ethernet/mellanox/mlx5/core/en/ptp.c
++++ b/drivers/net/ethernet/mellanox/mlx5/core/en/ptp.c
+@@ -213,7 +213,7 @@ static void mlx5e_ptp_handle_ts_cqe(struct mlx5e_ptpsq *ptpsq,
+ mlx5e_ptpsq_mark_ts_cqes_undelivered(ptpsq, hwtstamp);
+ out:
+ napi_consume_skb(skb, budget);
+- md_buff[*md_buff_sz++] = metadata_id;
++ md_buff[(*md_buff_sz)++] = metadata_id;
+ if (unlikely(mlx5e_ptp_metadata_map_unhealthy(&ptpsq->metadata_map)) &&
+ !test_and_set_bit(MLX5E_SQ_STATE_RECOVERING, &sq->state))
+ queue_work(ptpsq->txqsq.priv->wq, &ptpsq->report_unhealthy_work);
+--
+2.43.0
+
--- /dev/null
+From 77963fd716035e94aa889dcddb5f193a5ac0f276 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Fri, 10 Nov 2023 11:10:22 +0100
+Subject: net/mlx5e: Fix peer flow lists handling
+
+From: Vlad Buslov <vladbu@nvidia.com>
+
+[ Upstream commit d76fdd31f953ac5046555171620f2562715e9b71 ]
+
+The cited change refactored mlx5e_tc_del_fdb_peer_flow() to only clear DUP
+flag when list of peer flows has become empty. However, if any concurrent
+user holds a reference to a peer flow (for example, the neighbor update
+workqueue task is updating peer flow's parent encap entry concurrently),
+then the flow will not be removed from the peer list and, consequently,
+the DUP flag will remain set. Since mlx5e_tc_del_fdb_peers_flow() calls
+mlx5e_tc_del_fdb_peer_flow() for every possible peer index the algorithm
+will try to remove the flow from eswitch instances that it has never peered
+with causing either NULL pointer dereference when trying to remove the flow
+peer list head of peer_index that was never initialized or a warning if the
+list debug config is enabled[0].
+
+Fix the issue by always removing the peer flow from the list even when not
+releasing the last reference to it.
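+
+A sketch of the corrected teardown order (names follow the diff below;
+the point is that the unlink no longer depends on holding the last
+reference):
+
+ list_for_each_entry_safe(peer_flow, tmp, &flow->peer_flows, peer_flows) {
+         if (peer_index != mlx5_get_dev_index(peer_flow->priv->mdev))
+                 continue;
+
+         list_del(&peer_flow->peer_flows);       /* always unlink */
+         if (refcount_dec_and_test(&peer_flow->refcnt)) {
+                 mlx5e_tc_del_fdb_flow(peer_flow->priv, peer_flow);
+                 kfree(peer_flow);               /* free only on last ref */
+         }
+ }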
+
+[0]:
+
+[ 3102.985806] ------------[ cut here ]------------
+[ 3102.986223] list_del corruption, ffff888139110698->next is NULL
+[ 3102.986757] WARNING: CPU: 2 PID: 22109 at lib/list_debug.c:53 __list_del_entry_valid_or_report+0x4f/0xc0
+[ 3102.987561] Modules linked in: act_ct nf_flow_table bonding act_tunnel_key act_mirred act_skbedit vxlan cls_matchall nfnetlink_cttimeout act_gact cls_flower sch_ingress mlx5_vdpa vringh vhost_iotlb vdpa openvswitch nsh xt_MASQUERADE nf_conntrack_netlink nfnetlink iptable_nat xt_addrtype xt_conntrack nf_nat br_netfilter rpcsec_gss_krb5 auth_rpcg
+ss oid_registry overlay rpcrdma rdma_ucm ib_iser libiscsi scsi_transport_iscsi ib_umad rdma_cm ib_ipoib iw_cm ib_cm mlx5_ib ib_uverbs ib_core mlx5_core [last unloaded: bonding]
+[ 3102.991113] CPU: 2 PID: 22109 Comm: revalidator28 Not tainted 6.6.0-rc6+ #3
+[ 3102.991695] Hardware name: QEMU Standard PC (Q35 + ICH9, 2009), BIOS rel-1.13.0-0-gf21b5a4aeb02-prebuilt.qemu.org 04/01/2014
+[ 3102.992605] RIP: 0010:__list_del_entry_valid_or_report+0x4f/0xc0
+[ 3102.993122] Code: 39 c2 74 56 48 8b 32 48 39 fe 75 62 48 8b 51 08 48 39 f2 75 73 b8 01 00 00 00 c3 48 89 fe 48 c7 c7 48 fd 0a 82 e8 41 0b ad ff <0f> 0b 31 c0 c3 48 89 fe 48 c7 c7 70 fd 0a 82 e8 2d 0b ad ff 0f 0b
+[ 3102.994615] RSP: 0018:ffff8881383e7710 EFLAGS: 00010286
+[ 3102.995078] RAX: 0000000000000000 RBX: 0000000000000002 RCX: 0000000000000000
+[ 3102.995670] RDX: 0000000000000001 RSI: ffff88885f89b640 RDI: ffff88885f89b640
+[ 3102.997188] DEL flow 00000000be367878 on port 0
+[ 3102.998594] RBP: dead000000000122 R08: 0000000000000000 R09: c0000000ffffdfff
+[ 3102.999604] R10: 0000000000000008 R11: ffff8881383e7598 R12: dead000000000100
+[ 3103.000198] R13: 0000000000000002 R14: ffff888139110000 R15: ffff888101901240
+[ 3103.000790] FS: 00007f424cde4700(0000) GS:ffff88885f880000(0000) knlGS:0000000000000000
+[ 3103.001486] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
+[ 3103.001986] CR2: 00007fd42e8dcb70 CR3: 000000011e68a003 CR4: 0000000000370ea0
+[ 3103.002596] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
+[ 3103.003190] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
+[ 3103.003787] Call Trace:
+[ 3103.004055] <TASK>
+[ 3103.004297] ? __warn+0x7d/0x130
+[ 3103.004623] ? __list_del_entry_valid_or_report+0x4f/0xc0
+[ 3103.005094] ? report_bug+0xf1/0x1c0
+[ 3103.005439] ? console_unlock+0x4a/0xd0
+[ 3103.005806] ? handle_bug+0x3f/0x70
+[ 3103.006149] ? exc_invalid_op+0x13/0x60
+[ 3103.006531] ? asm_exc_invalid_op+0x16/0x20
+[ 3103.007430] ? __list_del_entry_valid_or_report+0x4f/0xc0
+[ 3103.007910] mlx5e_tc_del_fdb_peers_flow+0xcf/0x240 [mlx5_core]
+[ 3103.008463] mlx5e_tc_del_flow+0x46/0x270 [mlx5_core]
+[ 3103.008944] mlx5e_flow_put+0x26/0x50 [mlx5_core]
+[ 3103.009401] mlx5e_delete_flower+0x25f/0x380 [mlx5_core]
+[ 3103.009901] tc_setup_cb_destroy+0xab/0x180
+[ 3103.010292] fl_hw_destroy_filter+0x99/0xc0 [cls_flower]
+[ 3103.010779] __fl_delete+0x2d4/0x2f0 [cls_flower]
+[ 3103.011207] fl_delete+0x36/0x80 [cls_flower]
+[ 3103.011614] tc_del_tfilter+0x56f/0x750
+[ 3103.011982] rtnetlink_rcv_msg+0xff/0x3a0
+[ 3103.012362] ? netlink_ack+0x1c7/0x4e0
+[ 3103.012719] ? rtnl_calcit.isra.44+0x130/0x130
+[ 3103.013134] netlink_rcv_skb+0x54/0x100
+[ 3103.013533] netlink_unicast+0x1ca/0x2b0
+[ 3103.013902] netlink_sendmsg+0x361/0x4d0
+[ 3103.014269] __sock_sendmsg+0x38/0x60
+[ 3103.014643] ____sys_sendmsg+0x1f2/0x200
+[ 3103.015018] ? copy_msghdr_from_user+0x72/0xa0
+[ 3103.015265] ___sys_sendmsg+0x87/0xd0
+[ 3103.016608] ? copy_msghdr_from_user+0x72/0xa0
+[ 3103.017014] ? ___sys_recvmsg+0x9b/0xd0
+[ 3103.017381] ? ttwu_do_activate.isra.137+0x58/0x180
+[ 3103.017821] ? wake_up_q+0x49/0x90
+[ 3103.018157] ? futex_wake+0x137/0x160
+[ 3103.018521] ? __sys_sendmsg+0x51/0x90
+[ 3103.018882] __sys_sendmsg+0x51/0x90
+[ 3103.019230] ? exit_to_user_mode_prepare+0x56/0x130
+[ 3103.019670] do_syscall_64+0x3c/0x80
+[ 3103.020017] entry_SYSCALL_64_after_hwframe+0x46/0xb0
+[ 3103.020469] RIP: 0033:0x7f4254811ef4
+[ 3103.020816] Code: 89 f3 48 83 ec 10 48 89 7c 24 08 48 89 14 24 e8 42 eb ff ff 48 8b 14 24 41 89 c0 48 89 de 48 8b 7c 24 08 b8 2e 00 00 00 0f 05 <48> 3d 00 f0 ff ff 77 30 44 89 c7 48 89 04 24 e8 78 eb ff ff 48 8b
+[ 3103.022290] RSP: 002b:00007f424cdd9480 EFLAGS: 00000293 ORIG_RAX: 000000000000002e
+[ 3103.022970] RAX: ffffffffffffffda RBX: 00007f424cdd9510 RCX: 00007f4254811ef4
+[ 3103.023564] RDX: 0000000000000000 RSI: 00007f424cdd9510 RDI: 0000000000000012
+[ 3103.024158] RBP: 00007f424cdda238 R08: 0000000000000000 R09: 00007f41d801a4b0
+[ 3103.024748] R10: 0000000000000000 R11: 0000000000000293 R12: 0000000000000001
+[ 3103.025341] R13: 00007f424cdd9510 R14: 00007f424cdda240 R15: 00007f424cdd99a0
+[ 3103.025931] </TASK>
+[ 3103.026182] ---[ end trace 0000000000000000 ]---
+[ 3103.027033] ------------[ cut here ]------------
+
+Fixes: 9be6c21fdcf8 ("net/mlx5e: Handle offloads flows per peer")
+Signed-off-by: Vlad Buslov <vladbu@nvidia.com>
+Reviewed-by: Mark Bloch <mbloch@nvidia.com>
+Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/net/ethernet/mellanox/mlx5/core/en_tc.c | 3 ++-
+ 1 file changed, 2 insertions(+), 1 deletion(-)
+
+diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
+index b61d82f08e65..404dd1d9b28b 100644
+--- a/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
++++ b/drivers/net/ethernet/mellanox/mlx5/core/en_tc.c
+@@ -2014,9 +2014,10 @@ static void mlx5e_tc_del_fdb_peer_flow(struct mlx5e_tc_flow *flow,
+ list_for_each_entry_safe(peer_flow, tmp, &flow->peer_flows, peer_flows) {
+ if (peer_index != mlx5_get_dev_index(peer_flow->priv->mdev))
+ continue;
++
++ list_del(&peer_flow->peer_flows);
+ if (refcount_dec_and_test(&peer_flow->refcnt)) {
+ mlx5e_tc_del_fdb_flow(peer_flow->priv, peer_flow);
+- list_del(&peer_flow->peer_flows);
+ kfree(peer_flow);
+ }
+ }
+--
+2.43.0
+
--- /dev/null
+From 87127fb62eda0f2490d2a3d442cd9fdbdbf2732a Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Sun, 26 Nov 2023 11:08:10 +0200
+Subject: net/mlx5e: Ignore IPsec replay window values on sender side
+
+From: Leon Romanovsky <leonro@nvidia.com>
+
+[ Upstream commit 315a597f9bcfe7fe9980985031413457bee95510 ]
+
+The XFRM stack doesn't prevent users from configuring a replay window
+on the TX side, and strongSwan sets replay_window to 1. This causes
+failures in the validation logic when trying to offload the SA.
+
+The replay window is not relevant on the TX side and should be ignored.
+
+Fixes: cded6d80129b ("net/mlx5e: Store replay window in XFRM attributes")
+Signed-off-by: Aya Levin <ayal@nvidia.com>
+Signed-off-by: Leon Romanovsky <leonro@nvidia.com>
+Signed-off-by: Saeed Mahameed <saeedm@nvidia.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ .../net/ethernet/mellanox/mlx5/core/en_accel/ipsec.c | 10 ++++++++--
+ 1 file changed, 8 insertions(+), 2 deletions(-)
+
+diff --git a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec.c b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec.c
+index 161c5190c236..05612d9c6080 100644
+--- a/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec.c
++++ b/drivers/net/ethernet/mellanox/mlx5/core/en_accel/ipsec.c
+@@ -336,12 +336,17 @@ void mlx5e_ipsec_build_accel_xfrm_attrs(struct mlx5e_ipsec_sa_entry *sa_entry,
+ /* iv len */
+ aes_gcm->icv_len = x->aead->alg_icv_len;
+
++ attrs->dir = x->xso.dir;
++
+ /* esn */
+ if (x->props.flags & XFRM_STATE_ESN) {
+ attrs->replay_esn.trigger = true;
+ attrs->replay_esn.esn = sa_entry->esn_state.esn;
+ attrs->replay_esn.esn_msb = sa_entry->esn_state.esn_msb;
+ attrs->replay_esn.overlap = sa_entry->esn_state.overlap;
++ if (attrs->dir == XFRM_DEV_OFFLOAD_OUT)
++ goto skip_replay_window;
++
+ switch (x->replay_esn->replay_window) {
+ case 32:
+ attrs->replay_esn.replay_window =
+@@ -365,7 +370,7 @@ void mlx5e_ipsec_build_accel_xfrm_attrs(struct mlx5e_ipsec_sa_entry *sa_entry,
+ }
+ }
+
+- attrs->dir = x->xso.dir;
++skip_replay_window:
+ /* spi */
+ attrs->spi = be32_to_cpu(x->id.spi);
+
+@@ -501,7 +506,8 @@ static int mlx5e_xfrm_validate_state(struct mlx5_core_dev *mdev,
+ return -EINVAL;
+ }
+
+- if (x->replay_esn && x->replay_esn->replay_window != 32 &&
++ if (x->replay_esn && x->xso.dir == XFRM_DEV_OFFLOAD_IN &&
++ x->replay_esn->replay_window != 32 &&
+ x->replay_esn->replay_window != 64 &&
+ x->replay_esn->replay_window != 128 &&
+ x->replay_esn->replay_window != 256) {
+--
+2.43.0
+
--- /dev/null
+From ab880a9dc6e8c24858ed3edae56d7d323782c2f6 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Thu, 18 Jan 2024 19:59:14 -0800
+Subject: net: mvpp2: clear BM pool before initialization
+
+From: Jenishkumar Maheshbhai Patel <jpatel2@marvell.com>
+
+[ Upstream commit 9f538b415db862e74b8c5d3abbccfc1b2b6caa38 ]
+
+Register values persist after booting the kernel using
+kexec, which results in a kernel panic. Thus clear the
+BM pool registers before initialisation to fix the issue.
+
+Fixes: 3f518509dedc ("ethernet: Add new driver for Marvell Armada 375 network unit")
+Signed-off-by: Jenishkumar Maheshbhai Patel <jpatel2@marvell.com>
+Reviewed-by: Maxime Chevallier <maxime.chevallier@bootlin.com>
+Link: https://lore.kernel.org/r/20240119035914.2595665-1-jpatel2@marvell.com
+Signed-off-by: Jakub Kicinski <kuba@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ .../net/ethernet/marvell/mvpp2/mvpp2_main.c | 27 ++++++++++++++++++-
+ 1 file changed, 26 insertions(+), 1 deletion(-)
+
+diff --git a/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c b/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c
+index 93137606869e..065f07392c96 100644
+--- a/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c
++++ b/drivers/net/ethernet/marvell/mvpp2/mvpp2_main.c
+@@ -614,12 +614,38 @@ static void mvpp23_bm_set_8pool_mode(struct mvpp2 *priv)
+ mvpp2_write(priv, MVPP22_BM_POOL_BASE_ADDR_HIGH_REG, val);
+ }
+
++/* Cleanup pool before actual initialization in the OS */
++static void mvpp2_bm_pool_cleanup(struct mvpp2 *priv, int pool_id)
++{
++ unsigned int thread = mvpp2_cpu_to_thread(priv, get_cpu());
++ u32 val;
++ int i;
++
++ /* Drain the BM from all possible residues left by firmware */
++ for (i = 0; i < MVPP2_BM_POOL_SIZE_MAX; i++)
++ mvpp2_thread_read(priv, thread, MVPP2_BM_PHY_ALLOC_REG(pool_id));
++
++ put_cpu();
++
++ /* Stop the BM pool */
++ val = mvpp2_read(priv, MVPP2_BM_POOL_CTRL_REG(pool_id));
++ val |= MVPP2_BM_STOP_MASK;
++ mvpp2_write(priv, MVPP2_BM_POOL_CTRL_REG(pool_id), val);
++}
++
+ static int mvpp2_bm_init(struct device *dev, struct mvpp2 *priv)
+ {
+ enum dma_data_direction dma_dir = DMA_FROM_DEVICE;
+ int i, err, poolnum = MVPP2_BM_POOLS_NUM;
+ struct mvpp2_port *port;
+
++ if (priv->percpu_pools)
++ poolnum = mvpp2_get_nrxqs(priv) * 2;
++
++ /* Clean up the pool state in case it contains stale state */
++ for (i = 0; i < poolnum; i++)
++ mvpp2_bm_pool_cleanup(priv, i);
++
+ if (priv->percpu_pools) {
+ for (i = 0; i < priv->port_count; i++) {
+ port = priv->port_list[i];
+@@ -629,7 +655,6 @@ static int mvpp2_bm_init(struct device *dev, struct mvpp2 *priv)
+ }
+ }
+
+- poolnum = mvpp2_get_nrxqs(priv) * 2;
+ for (i = 0; i < poolnum; i++) {
+ /* the pool in use */
+ int pn = i / (poolnum / 2);
+--
+2.43.0
+
--- /dev/null
+From 3a2f87de96b7fc7b6543d41e9fefa4c13dfa7cfc Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Fri, 19 Jan 2024 17:48:39 -0800
+Subject: net/rds: Fix UBSAN: array-index-out-of-bounds in rds_cmsg_recv
+
+From: Sharath Srinivasan <sharath.srinivasan@oracle.com>
+
+[ Upstream commit 13e788deb7348cc88df34bed736c3b3b9927ea52 ]
+
+A syzkaller UBSAN crash occurs in rds_cmsg_recv(),
+which reads inc->i_rx_lat_trace[j + 1] with index 4 (3 + 1),
+but with an array size of 4 (RDS_RX_MAX_TRACES).
+Here 'j' is assigned from rs->rs_rx_trace[i] and, in turn, from
+trace.rx_trace_pos[i] in rds_recv_track_latency(),
+with both arrays sized 3 (RDS_MSG_RX_DGRAM_TRACE_MAX). So fix the
+off-by-one bounds check in rds_recv_track_latency() to prevent
+a potential crash in rds_cmsg_recv().
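+
+A standalone sketch of the bounds rule: each stored position is later
+used as 'pos + 1' into a 4-entry array, so a position equal to
+RDS_MSG_RX_DGRAM_TRACE_MAX (3) must already be rejected:
+
+ #include <stdio.h>
+
+ #define RDS_MSG_RX_DGRAM_TRACE_MAX 3
+
+ static int trace_pos_valid(unsigned int pos)
+ {
+         /* The old check used '>', which let pos == 3 through and made
+          * the later 'pos + 1' access index 4 of a u64[4] array. */
+         return pos < RDS_MSG_RX_DGRAM_TRACE_MAX;
+ }
+
+ int main(void)
+ {
+         for (unsigned int pos = 0; pos <= 4; pos++)
+                 printf("pos=%u valid=%d\n", pos, trace_pos_valid(pos));
+         return 0;
+ }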
+
+Found by syzkaller:
+=================================================================
+UBSAN: array-index-out-of-bounds in net/rds/recv.c:585:39
+index 4 is out of range for type 'u64 [4]'
+CPU: 1 PID: 8058 Comm: syz-executor228 Not tainted 6.6.0-gd2f51b3516da #1
+Hardware name: QEMU Standard PC (i440FX + PIIX, 1996),
+BIOS 1.15.0-1 04/01/2014
+Call Trace:
+ <TASK>
+ __dump_stack lib/dump_stack.c:88 [inline]
+ dump_stack_lvl+0x136/0x150 lib/dump_stack.c:106
+ ubsan_epilogue lib/ubsan.c:217 [inline]
+ __ubsan_handle_out_of_bounds+0xd5/0x130 lib/ubsan.c:348
+ rds_cmsg_recv+0x60d/0x700 net/rds/recv.c:585
+ rds_recvmsg+0x3fb/0x1610 net/rds/recv.c:716
+ sock_recvmsg_nosec net/socket.c:1044 [inline]
+ sock_recvmsg+0xe2/0x160 net/socket.c:1066
+ __sys_recvfrom+0x1b6/0x2f0 net/socket.c:2246
+ __do_sys_recvfrom net/socket.c:2264 [inline]
+ __se_sys_recvfrom net/socket.c:2260 [inline]
+ __x64_sys_recvfrom+0xe0/0x1b0 net/socket.c:2260
+ do_syscall_x64 arch/x86/entry/common.c:51 [inline]
+ do_syscall_64+0x40/0x110 arch/x86/entry/common.c:82
+ entry_SYSCALL_64_after_hwframe+0x63/0x6b
+==================================================================
+
+Fixes: 3289025aedc0 ("RDS: add receive message trace used by application")
+Reported-by: Chenyuan Yang <chenyuan0y@gmail.com>
+Closes: https://lore.kernel.org/linux-rdma/CALGdzuoVdq-wtQ4Az9iottBqC5cv9ZhcE5q8N7LfYFvkRsOVcw@mail.gmail.com/
+Signed-off-by: Sharath Srinivasan <sharath.srinivasan@oracle.com>
+Reviewed-by: Simon Horman <horms@kernel.org>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ net/rds/af_rds.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+diff --git a/net/rds/af_rds.c b/net/rds/af_rds.c
+index 01c4cdfef45d..8435a20968ef 100644
+--- a/net/rds/af_rds.c
++++ b/net/rds/af_rds.c
+@@ -419,7 +419,7 @@ static int rds_recv_track_latency(struct rds_sock *rs, sockptr_t optval,
+
+ rs->rs_rx_traces = trace.rx_traces;
+ for (i = 0; i < rs->rs_rx_traces; i++) {
+- if (trace.rx_trace_pos[i] > RDS_MSG_RX_DGRAM_TRACE_MAX) {
++ if (trace.rx_trace_pos[i] >= RDS_MSG_RX_DGRAM_TRACE_MAX) {
+ rs->rs_rx_traces = 0;
+ return -EFAULT;
+ }
+--
+2.43.0
+
--- /dev/null
+From 95f0d3dbe719a42cc4c5614e70cc0a6a71b2f833 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Mon, 22 Jan 2024 15:28:43 +0200
+Subject: net/sched: flower: Fix chain template offload
+
+From: Ido Schimmel <idosch@nvidia.com>
+
+[ Upstream commit 32f2a0afa95fae0d1ceec2ff06e0e816939964b8 ]
+
+When a qdisc is deleted from a net device the stack instructs the
+underlying driver to remove its flow offload callback from the
+associated filter block using the 'FLOW_BLOCK_UNBIND' command. The stack
+then continues to replay the removal of the filters in the block for
+this driver by iterating over the chains in the block and invoking the
+'reoffload' operation of the classifier being used. In turn, the
+classifier in its 'reoffload' operation prepares and emits a
+'FLOW_CLS_DESTROY' command for each filter.
+
+However, the stack does not do the same for chain templates and the
+underlying driver never receives a 'FLOW_CLS_TMPLT_DESTROY' command when
+a qdisc is deleted. This results in a memory leak [1] which can be
+reproduced using [2].
+
+Fix by introducing a 'tmplt_reoffload' operation and have the stack
+invoke it with the appropriate arguments as part of the replay.
+Implement the operation in the sole classifier that supports chain
+templates (flower) by emitting the 'FLOW_CLS_TMPLT_{CREATE,DESTROY}'
+command based on whether a flow offload callback is being bound to a
+filter block or being unbound from one.
+
+As far as I can tell, the issue happens since cited commit which
+reordered tcf_block_offload_unbind() before tcf_block_flush_all_chains()
+in __tcf_block_put(). The order cannot be reversed as the filter block
+is expected to be freed after flushing all the chains.
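+
+On the driver side, the replay reaches the same block callback that
+handles filters; a sketch of the handler shape (my_tmplt_create/destroy
+are hypothetical driver helpers):
+
+ static int my_flow_block_cb(enum tc_setup_type type, void *type_data,
+                             void *cb_priv)
+ {
+         struct flow_cls_offload *f = type_data;
+
+         if (type != TC_SETUP_CLSFLOWER)
+                 return -EOPNOTSUPP;
+
+         switch (f->command) {
+         case FLOW_CLS_TMPLT_CREATE:
+                 return my_tmplt_create(cb_priv, f);     /* hypothetical */
+         case FLOW_CLS_TMPLT_DESTROY:
+                 my_tmplt_destroy(cb_priv, f);           /* hypothetical */
+                 return 0;
+         default:
+                 return -EOPNOTSUPP;
+         }
+ }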
+
+[1]
+unreferenced object 0xffff888107e28800 (size 2048):
+ comm "tc", pid 1079, jiffies 4294958525 (age 3074.287s)
+ hex dump (first 32 bytes):
+ b1 a6 7c 11 81 88 ff ff e0 5b b3 10 81 88 ff ff ..|......[......
+ 01 00 00 00 00 00 00 00 e0 aa b0 84 ff ff ff ff ................
+ backtrace:
+ [<ffffffff81c06a68>] __kmem_cache_alloc_node+0x1e8/0x320
+ [<ffffffff81ab374e>] __kmalloc+0x4e/0x90
+ [<ffffffff832aec6d>] mlxsw_sp_acl_ruleset_get+0x34d/0x7a0
+ [<ffffffff832bc195>] mlxsw_sp_flower_tmplt_create+0x145/0x180
+ [<ffffffff832b2e1a>] mlxsw_sp_flow_block_cb+0x1ea/0x280
+ [<ffffffff83a10613>] tc_setup_cb_call+0x183/0x340
+ [<ffffffff83a9f85a>] fl_tmplt_create+0x3da/0x4c0
+ [<ffffffff83a22435>] tc_ctl_chain+0xa15/0x1170
+ [<ffffffff838a863c>] rtnetlink_rcv_msg+0x3cc/0xed0
+ [<ffffffff83ac87f0>] netlink_rcv_skb+0x170/0x440
+ [<ffffffff83ac6270>] netlink_unicast+0x540/0x820
+ [<ffffffff83ac6e28>] netlink_sendmsg+0x8d8/0xda0
+ [<ffffffff83793def>] ____sys_sendmsg+0x30f/0xa80
+ [<ffffffff8379d29a>] ___sys_sendmsg+0x13a/0x1e0
+ [<ffffffff8379d50c>] __sys_sendmsg+0x11c/0x1f0
+ [<ffffffff843b9ce0>] do_syscall_64+0x40/0xe0
+unreferenced object 0xffff88816d2c0400 (size 1024):
+ comm "tc", pid 1079, jiffies 4294958525 (age 3074.287s)
+ hex dump (first 32 bytes):
+ 40 00 00 00 00 00 00 00 57 f6 38 be 00 00 00 00 @.......W.8.....
+ 10 04 2c 6d 81 88 ff ff 10 04 2c 6d 81 88 ff ff ..,m......,m....
+ backtrace:
+ [<ffffffff81c06a68>] __kmem_cache_alloc_node+0x1e8/0x320
+ [<ffffffff81ab36c1>] __kmalloc_node+0x51/0x90
+ [<ffffffff81a8ed96>] kvmalloc_node+0xa6/0x1f0
+ [<ffffffff82827d03>] bucket_table_alloc.isra.0+0x83/0x460
+ [<ffffffff82828d2b>] rhashtable_init+0x43b/0x7c0
+ [<ffffffff832aed48>] mlxsw_sp_acl_ruleset_get+0x428/0x7a0
+ [<ffffffff832bc195>] mlxsw_sp_flower_tmplt_create+0x145/0x180
+ [<ffffffff832b2e1a>] mlxsw_sp_flow_block_cb+0x1ea/0x280
+ [<ffffffff83a10613>] tc_setup_cb_call+0x183/0x340
+ [<ffffffff83a9f85a>] fl_tmplt_create+0x3da/0x4c0
+ [<ffffffff83a22435>] tc_ctl_chain+0xa15/0x1170
+ [<ffffffff838a863c>] rtnetlink_rcv_msg+0x3cc/0xed0
+ [<ffffffff83ac87f0>] netlink_rcv_skb+0x170/0x440
+ [<ffffffff83ac6270>] netlink_unicast+0x540/0x820
+ [<ffffffff83ac6e28>] netlink_sendmsg+0x8d8/0xda0
+ [<ffffffff83793def>] ____sys_sendmsg+0x30f/0xa80
+
+[2]
+ # tc qdisc add dev swp1 clsact
+ # tc chain add dev swp1 ingress proto ip chain 1 flower dst_ip 0.0.0.0/32
+ # tc qdisc del dev swp1 clsact
+ # devlink dev reload pci/0000:06:00.0
+
+Fixes: bbf73830cd48 ("net: sched: traverse chains in block with tcf_get_next_chain()")
+Signed-off-by: Ido Schimmel <idosch@nvidia.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ include/net/sch_generic.h | 4 ++++
+ net/sched/cls_api.c | 9 ++++++++-
+ net/sched/cls_flower.c | 23 +++++++++++++++++++++++
+ 3 files changed, 35 insertions(+), 1 deletion(-)
+
+diff --git a/include/net/sch_generic.h b/include/net/sch_generic.h
+index dcb9160e6467..959a7725c27b 100644
+--- a/include/net/sch_generic.h
++++ b/include/net/sch_generic.h
+@@ -375,6 +375,10 @@ struct tcf_proto_ops {
+ struct nlattr **tca,
+ struct netlink_ext_ack *extack);
+ void (*tmplt_destroy)(void *tmplt_priv);
++ void (*tmplt_reoffload)(struct tcf_chain *chain,
++ bool add,
++ flow_setup_cb_t *cb,
++ void *cb_priv);
+ struct tcf_exts * (*get_exts)(const struct tcf_proto *tp,
+ u32 handle);
+
+diff --git a/net/sched/cls_api.c b/net/sched/cls_api.c
+index 1976bd163986..02c594baa1d9 100644
+--- a/net/sched/cls_api.c
++++ b/net/sched/cls_api.c
+@@ -1536,6 +1536,9 @@ tcf_block_playback_offloads(struct tcf_block *block, flow_setup_cb_t *cb,
+ chain_prev = chain,
+ chain = __tcf_get_next_chain(block, chain),
+ tcf_chain_put(chain_prev)) {
++ if (chain->tmplt_ops && add)
++ chain->tmplt_ops->tmplt_reoffload(chain, true, cb,
++ cb_priv);
+ for (tp = __tcf_get_next_proto(chain, NULL); tp;
+ tp_prev = tp,
+ tp = __tcf_get_next_proto(chain, tp),
+@@ -1551,6 +1554,9 @@ tcf_block_playback_offloads(struct tcf_block *block, flow_setup_cb_t *cb,
+ goto err_playback_remove;
+ }
+ }
++ if (chain->tmplt_ops && !add)
++ chain->tmplt_ops->tmplt_reoffload(chain, false, cb,
++ cb_priv);
+ }
+
+ return 0;
+@@ -2971,7 +2977,8 @@ static int tc_chain_tmplt_add(struct tcf_chain *chain, struct net *net,
+ ops = tcf_proto_lookup_ops(name, true, extack);
+ if (IS_ERR(ops))
+ return PTR_ERR(ops);
+- if (!ops->tmplt_create || !ops->tmplt_destroy || !ops->tmplt_dump) {
++ if (!ops->tmplt_create || !ops->tmplt_destroy || !ops->tmplt_dump ||
++ !ops->tmplt_reoffload) {
+ NL_SET_ERR_MSG(extack, "Chain templates are not supported with specified classifier");
+ module_put(ops->owner);
+ return -EOPNOTSUPP;
+diff --git a/net/sched/cls_flower.c b/net/sched/cls_flower.c
+index e5314a31f75a..efb9d2811b73 100644
+--- a/net/sched/cls_flower.c
++++ b/net/sched/cls_flower.c
+@@ -2721,6 +2721,28 @@ static void fl_tmplt_destroy(void *tmplt_priv)
+ kfree(tmplt);
+ }
+
++static void fl_tmplt_reoffload(struct tcf_chain *chain, bool add,
++ flow_setup_cb_t *cb, void *cb_priv)
++{
++ struct fl_flow_tmplt *tmplt = chain->tmplt_priv;
++ struct flow_cls_offload cls_flower = {};
++
++ cls_flower.rule = flow_rule_alloc(0);
++ if (!cls_flower.rule)
++ return;
++
++ cls_flower.common.chain_index = chain->index;
++ cls_flower.command = add ? FLOW_CLS_TMPLT_CREATE :
++ FLOW_CLS_TMPLT_DESTROY;
++ cls_flower.cookie = (unsigned long) tmplt;
++ cls_flower.rule->match.dissector = &tmplt->dissector;
++ cls_flower.rule->match.mask = &tmplt->mask;
++ cls_flower.rule->match.key = &tmplt->dummy_key;
++
++ cb(TC_SETUP_CLSFLOWER, &cls_flower, cb_priv);
++ kfree(cls_flower.rule);
++}
++
+ static int fl_dump_key_val(struct sk_buff *skb,
+ void *val, int val_type,
+ void *mask, int mask_type, int len)
+@@ -3628,6 +3650,7 @@ static struct tcf_proto_ops cls_fl_ops __read_mostly = {
+ .bind_class = fl_bind_class,
+ .tmplt_create = fl_tmplt_create,
+ .tmplt_destroy = fl_tmplt_destroy,
++ .tmplt_reoffload = fl_tmplt_reoffload,
+ .tmplt_dump = fl_tmplt_dump,
+ .get_exts = fl_get_exts,
+ .owner = THIS_MODULE,
+--
+2.43.0
+
--- /dev/null
+From 1777c87b90b1b1898a582ff004304306f589756b Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Thu, 18 Jan 2024 12:32:10 +0800
+Subject: net/smc: fix illegal rmb_desc access in SMC-D connection dump
+
+From: Wen Gu <guwen@linux.alibaba.com>
+
+[ Upstream commit dbc153fd3c142909e564bb256da087e13fbf239c ]
+
+A crash was found when dumping SMC-D connections. It can be reproduced
+by the following steps:
+
+- run nginx/wrk test:
+ smc_run nginx
+ smc_run wrk -t 16 -c 1000 -d <duration> -H 'Connection: Close' <URL>
+
+- continuously dump SMC-D connections in parallel:
+ watch -n 1 'smcss -D'
+
+ BUG: kernel NULL pointer dereference, address: 0000000000000030
+ CPU: 2 PID: 7204 Comm: smcss Kdump: loaded Tainted: G E 6.7.0+ #55
+ RIP: 0010:__smc_diag_dump.constprop.0+0x5e5/0x620 [smc_diag]
+ Call Trace:
+ <TASK>
+ ? __die+0x24/0x70
+ ? page_fault_oops+0x66/0x150
+ ? exc_page_fault+0x69/0x140
+ ? asm_exc_page_fault+0x26/0x30
+ ? __smc_diag_dump.constprop.0+0x5e5/0x620 [smc_diag]
+ ? __kmalloc_node_track_caller+0x35d/0x430
+ ? __alloc_skb+0x77/0x170
+ smc_diag_dump_proto+0xd0/0xf0 [smc_diag]
+ smc_diag_dump+0x26/0x60 [smc_diag]
+ netlink_dump+0x19f/0x320
+ __netlink_dump_start+0x1dc/0x300
+ smc_diag_handler_dump+0x6a/0x80 [smc_diag]
+ ? __pfx_smc_diag_dump+0x10/0x10 [smc_diag]
+ sock_diag_rcv_msg+0x121/0x140
+ ? __pfx_sock_diag_rcv_msg+0x10/0x10
+ netlink_rcv_skb+0x5a/0x110
+ sock_diag_rcv+0x28/0x40
+ netlink_unicast+0x22a/0x330
+ netlink_sendmsg+0x1f8/0x420
+ __sock_sendmsg+0xb0/0xc0
+ ____sys_sendmsg+0x24e/0x300
+ ? copy_msghdr_from_user+0x62/0x80
+ ___sys_sendmsg+0x7c/0xd0
+ ? __do_fault+0x34/0x160
+ ? do_read_fault+0x5f/0x100
+ ? do_fault+0xb0/0x110
+ ? __handle_mm_fault+0x2b0/0x6c0
+ __sys_sendmsg+0x4d/0x80
+ do_syscall_64+0x69/0x180
+ entry_SYSCALL_64_after_hwframe+0x6e/0x76
+
+It is possible that the connection is in the process of being established
+when we dump it. Assume that the connection has been registered in a
+link group by smc_conn_create() but the rmb_desc has not yet been
+initialized by smc_buf_create(); dumping then causes the illegal access
+to conn->rmb_desc. So fix it by checking rmb_desc before dumping.
+
+Fixes: 4b1b7d3b30a6 ("net/smc: add SMC-D diag support")
+Signed-off-by: Wen Gu <guwen@linux.alibaba.com>
+Reviewed-by: Dust Li <dust.li@linux.alibaba.com>
+Reviewed-by: Wenjia Zhang <wenjia@linux.ibm.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ net/smc/smc_diag.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+diff --git a/net/smc/smc_diag.c b/net/smc/smc_diag.c
+index 5cc376834c57..fb9e5cc1285e 100644
+--- a/net/smc/smc_diag.c
++++ b/net/smc/smc_diag.c
+@@ -163,7 +163,7 @@ static int __smc_diag_dump(struct sock *sk, struct sk_buff *skb,
+ }
+ if (smc_conn_lgr_valid(&smc->conn) && smc->conn.lgr->is_smcd &&
+ (req->diag_ext & (1 << (SMC_DIAG_DMBINFO - 1))) &&
+- !list_empty(&smc->conn.lgr->list)) {
++ !list_empty(&smc->conn.lgr->list) && smc->conn.rmb_desc) {
+ struct smc_connection *conn = &smc->conn;
+ struct smcd_diag_dmbinfo dinfo;
+ struct smcd_dev *smcd = conn->lgr->smcd;
+--
+2.43.0
+
--- /dev/null
+From 174002e959ef2d8df58a9f06047e3f6d941a7e96 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Mon, 22 Jan 2024 19:19:09 +0100
+Subject: net: stmmac: Wait a bit for the reset to take effect
+
+From: Bernd Edlinger <bernd.edlinger@hotmail.de>
+
+[ Upstream commit a5f5eee282a0aae80227697e1d9c811b1726d31d ]
+
+otherwise the synopsys_id value may be read out wrongly,
+because the GMAC_VERSION register might still be in the reset
+state for at least 1 us after the reset is de-asserted.
+
+Add a wait of 10 us before continuing, to be on the safe side.
+
+> From what have you got that delay value?
+
+Just trial and error: with very old Linux versions and old gcc versions
+the synopsys_id was read out correctly most of the time (but not always);
+with recent Linux versions and recent gcc versions it was read out
+wrongly most of the time, but again not always.
+I don't have access to the VHDL code in question, so I cannot
+tell why it takes so long to get the correct values. I also do not
+have more than a few hardware samples, so I cannot tell how long
+this timeout must be in the worst case.
+Experimentally I can tell that the register is read several times
+as zero immediately after the reset is de-asserted; adding several
+no-ops is not enough, adding a printk is enough, and udelay(1) seems to
+be enough, but I did not try that very often and do not have access to
+many hardware samples, so I cannot be 100% sure about the necessary delay.
+And since the udelay here is only executed once per device instance,
+it seems acceptable to delay the boot for 10 us.
+
+BTW: my hardware's synopsys id is 0x37.
+
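+If guessing a delay ever proves fragile, a bounded poll of the version
+register would be an alternative.  A hypothetical sketch (not the
+applied fix; the register macro and the 100 us budget are assumptions)
+using the kernel's readl_poll_timeout() helper:
+
+  #include <linux/iopoll.h>
+
+  /* Poll GMAC_VERSION every 1 us until it reads non-zero,
+   * giving up after 100 us.  Returns 0 or -ETIMEDOUT.
+   */
+  static int stmmac_wait_version_ready(void __iomem *ioaddr)
+  {
+          u32 version;
+
+          return readl_poll_timeout(ioaddr + GMAC_VERSION, version,
+                                    version != 0, 1, 100);
+  }
+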
+Fixes: c5e4ddbdfa11 ("net: stmmac: Add support for optional reset control")
+Signed-off-by: Bernd Edlinger <bernd.edlinger@hotmail.de>
+Reviewed-by: Jiri Pirko <jiri@nvidia.com>
+Reviewed-by: Serge Semin <fancer.lancer@gmail.com>
+Link: https://lore.kernel.org/r/AS8P193MB1285A810BD78C111E7F6AA34E4752@AS8P193MB1285.EURP193.PROD.OUTLOOK.COM
+Signed-off-by: Jakub Kicinski <kuba@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/net/ethernet/stmicro/stmmac/stmmac_main.c | 3 +++
+ 1 file changed, 3 insertions(+)
+
+diff --git a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
+index 49b81daf7411..d094c3c1e2ee 100644
+--- a/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
++++ b/drivers/net/ethernet/stmicro/stmmac/stmmac_main.c
+@@ -7467,6 +7467,9 @@ int stmmac_dvr_probe(struct device *device,
+ dev_err(priv->device, "unable to bring out of ahb reset: %pe\n",
+ ERR_PTR(ret));
+
++ /* Wait a bit for the reset to take effect */
++ udelay(10);
++
+ /* Init MAC and get the capabilities */
+ ret = stmmac_hw_init(priv);
+ if (ret)
+--
+2.43.0
+
--- /dev/null
+From 541e41a88a00522cc6ce415e5481902002c27b4a Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Fri, 19 Jan 2024 13:34:32 +0100
+Subject: netfilter: nf_tables: restrict anonymous set and map names to 16
+ bytes
+
+From: Florian Westphal <fw@strlen.de>
+
+[ Upstream commit b462579b2b86a8f5230543cadd3a4836be27baf7 ]
+
+nftables has two types of sets/maps, one where userspace defines the
+name, and anonymous sets/maps, where userspace defines a template name.
+
+For the latter, kernel requires presence of exactly one "%d".
+nftables uses "__set%d" and "__map%d" for this. The kernel will
+expand the format specifier and replaces it with the smallest unused
+number.
+
+As-is, userspace could define a template name that allows moving
+the set name past the 256-byte upper limit (post-expansion).
+
+I don't see how this could be a problem, but I would prefer that
+userspace cannot do this, so add a limit of 16 bytes for the '%d'
+template name.
+
+16 bytes is the old total upper limit for set names that existed when
+nf_tables was merged initially.
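+
+A userspace sketch of the new bound (NFT_SET_MAX_ANONLEN here mirrors
+the kernel constant; the over-long template below, 17 bytes, is made
+up for illustration):
+
+  #include <stdio.h>
+  #include <string.h>
+
+  #define NFT_SET_MAX_ANONLEN 16
+
+  static int anon_name_ok(const char *name)
+  {
+          /* Same check as the patch: reject names of 16+ bytes. */
+          return strnlen(name, NFT_SET_MAX_ANONLEN) < NFT_SET_MAX_ANONLEN;
+  }
+
+  int main(void)
+  {
+          printf("%d\n", anon_name_ok("__set%d"));           /* 1: accepted */
+          printf("%d\n", anon_name_ok("__averylongname%d")); /* 0: rejected */
+          return 0;
+  }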
+
+Fixes: 387454901bd6 ("netfilter: nf_tables: Allow set names of up to 255 chars")
+Signed-off-by: Florian Westphal <fw@strlen.de>
+Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ net/netfilter/nf_tables_api.c | 4 ++++
+ 1 file changed, 4 insertions(+)
+
+diff --git a/net/netfilter/nf_tables_api.c b/net/netfilter/nf_tables_api.c
+index f032c29f1da6..5282e8377782 100644
+--- a/net/netfilter/nf_tables_api.c
++++ b/net/netfilter/nf_tables_api.c
+@@ -24,6 +24,7 @@
+ #include <net/sock.h>
+
+ #define NFT_MODULE_AUTOLOAD_LIMIT (MODULE_NAME_LEN - sizeof("nft-expr-255-"))
++#define NFT_SET_MAX_ANONLEN 16
+
+ unsigned int nf_tables_net_id __read_mostly;
+
+@@ -4411,6 +4412,9 @@ static int nf_tables_set_alloc_name(struct nft_ctx *ctx, struct nft_set *set,
+ if (p[1] != 'd' || strchr(p + 2, '%'))
+ return -EINVAL;
+
++ if (strnlen(name, NFT_SET_MAX_ANONLEN) >= NFT_SET_MAX_ANONLEN)
++ return -EINVAL;
++
+ inuse = (unsigned long *)get_zeroed_page(GFP_KERNEL);
+ if (inuse == NULL)
+ return -ENOMEM;
+--
+2.43.0
+
--- /dev/null
+From 3f0829fa1a89fd1aab6d15af0a0d0f7eb428dd1d Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Tue, 23 Jan 2024 16:38:25 +0100
+Subject: netfilter: nf_tables: validate NFPROTO_* family
+
+From: Pablo Neira Ayuso <pablo@netfilter.org>
+
+[ Upstream commit d0009effa8862c20a13af4cb7475d9771b905693 ]
+
+Several expressions explicitly refer to NF_INET_* hook definitions
+from expr->ops->validate; however, the family is not validated.
+
+Bail out with EOPNOTSUPP in case they are used from unsupported
+families.
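+
+The repeated guard could, hypothetically, be factored into a helper
+like the one below; the patch deliberately open-codes it in each
+->validate callback instead (and nft_compat additionally accepts
+NFPROTO_BRIDGE and NFPROTO_ARP):
+
+  #include <linux/netfilter.h>
+
+  /* Hypothetical helper, not part of the patch. */
+  static bool nft_family_is_supported(u8 family)
+  {
+          return family == NFPROTO_IPV4 ||
+                 family == NFPROTO_IPV6 ||
+                 family == NFPROTO_INET;
+  }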
+
+Fixes: 0ca743a55991 ("netfilter: nf_tables: add compatibility layer for x_tables")
+Fixes: a3c90f7a2323 ("netfilter: nf_tables: flow offload expression")
+Fixes: 2fa841938c64 ("netfilter: nf_tables: introduce routing expression")
+Fixes: 554ced0a6e29 ("netfilter: nf_tables: add support for native socket matching")
+Fixes: ad49d86e07a4 ("netfilter: nf_tables: Add synproxy support")
+Fixes: 4ed8eb6570a4 ("netfilter: nf_tables: Add native tproxy support")
+Fixes: 6c47260250fc ("netfilter: nf_tables: add xfrm expression")
+Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ net/netfilter/nft_compat.c | 12 ++++++++++++
+ net/netfilter/nft_flow_offload.c | 5 +++++
+ net/netfilter/nft_nat.c | 5 +++++
+ net/netfilter/nft_rt.c | 5 +++++
+ net/netfilter/nft_socket.c | 5 +++++
+ net/netfilter/nft_synproxy.c | 7 +++++--
+ net/netfilter/nft_tproxy.c | 5 +++++
+ net/netfilter/nft_xfrm.c | 5 +++++
+ 8 files changed, 47 insertions(+), 2 deletions(-)
+
+diff --git a/net/netfilter/nft_compat.c b/net/netfilter/nft_compat.c
+index 5284cd2ad532..f0eeda97bfcd 100644
+--- a/net/netfilter/nft_compat.c
++++ b/net/netfilter/nft_compat.c
+@@ -350,6 +350,12 @@ static int nft_target_validate(const struct nft_ctx *ctx,
+ unsigned int hook_mask = 0;
+ int ret;
+
++ if (ctx->family != NFPROTO_IPV4 &&
++ ctx->family != NFPROTO_IPV6 &&
++ ctx->family != NFPROTO_BRIDGE &&
++ ctx->family != NFPROTO_ARP)
++ return -EOPNOTSUPP;
++
+ if (nft_is_base_chain(ctx->chain)) {
+ const struct nft_base_chain *basechain =
+ nft_base_chain(ctx->chain);
+@@ -595,6 +601,12 @@ static int nft_match_validate(const struct nft_ctx *ctx,
+ unsigned int hook_mask = 0;
+ int ret;
+
++ if (ctx->family != NFPROTO_IPV4 &&
++ ctx->family != NFPROTO_IPV6 &&
++ ctx->family != NFPROTO_BRIDGE &&
++ ctx->family != NFPROTO_ARP)
++ return -EOPNOTSUPP;
++
+ if (nft_is_base_chain(ctx->chain)) {
+ const struct nft_base_chain *basechain =
+ nft_base_chain(ctx->chain);
+diff --git a/net/netfilter/nft_flow_offload.c b/net/netfilter/nft_flow_offload.c
+index ab3362c483b4..397351fa4d5f 100644
+--- a/net/netfilter/nft_flow_offload.c
++++ b/net/netfilter/nft_flow_offload.c
+@@ -384,6 +384,11 @@ static int nft_flow_offload_validate(const struct nft_ctx *ctx,
+ {
+ unsigned int hook_mask = (1 << NF_INET_FORWARD);
+
++ if (ctx->family != NFPROTO_IPV4 &&
++ ctx->family != NFPROTO_IPV6 &&
++ ctx->family != NFPROTO_INET)
++ return -EOPNOTSUPP;
++
+ return nft_chain_validate_hooks(ctx->chain, hook_mask);
+ }
+
+diff --git a/net/netfilter/nft_nat.c b/net/netfilter/nft_nat.c
+index 583885ce7232..808f5802c270 100644
+--- a/net/netfilter/nft_nat.c
++++ b/net/netfilter/nft_nat.c
+@@ -143,6 +143,11 @@ static int nft_nat_validate(const struct nft_ctx *ctx,
+ struct nft_nat *priv = nft_expr_priv(expr);
+ int err;
+
++ if (ctx->family != NFPROTO_IPV4 &&
++ ctx->family != NFPROTO_IPV6 &&
++ ctx->family != NFPROTO_INET)
++ return -EOPNOTSUPP;
++
+ err = nft_chain_validate_dependency(ctx->chain, NFT_CHAIN_T_NAT);
+ if (err < 0)
+ return err;
+diff --git a/net/netfilter/nft_rt.c b/net/netfilter/nft_rt.c
+index 35a2c28caa60..24d977138572 100644
+--- a/net/netfilter/nft_rt.c
++++ b/net/netfilter/nft_rt.c
+@@ -166,6 +166,11 @@ static int nft_rt_validate(const struct nft_ctx *ctx, const struct nft_expr *exp
+ const struct nft_rt *priv = nft_expr_priv(expr);
+ unsigned int hooks;
+
++ if (ctx->family != NFPROTO_IPV4 &&
++ ctx->family != NFPROTO_IPV6 &&
++ ctx->family != NFPROTO_INET)
++ return -EOPNOTSUPP;
++
+ switch (priv->key) {
+ case NFT_RT_NEXTHOP4:
+ case NFT_RT_NEXTHOP6:
+diff --git a/net/netfilter/nft_socket.c b/net/netfilter/nft_socket.c
+index 9ed85be79452..f30163e2ca62 100644
+--- a/net/netfilter/nft_socket.c
++++ b/net/netfilter/nft_socket.c
+@@ -242,6 +242,11 @@ static int nft_socket_validate(const struct nft_ctx *ctx,
+ const struct nft_expr *expr,
+ const struct nft_data **data)
+ {
++ if (ctx->family != NFPROTO_IPV4 &&
++ ctx->family != NFPROTO_IPV6 &&
++ ctx->family != NFPROTO_INET)
++ return -EOPNOTSUPP;
++
+ return nft_chain_validate_hooks(ctx->chain,
+ (1 << NF_INET_PRE_ROUTING) |
+ (1 << NF_INET_LOCAL_IN) |
+diff --git a/net/netfilter/nft_synproxy.c b/net/netfilter/nft_synproxy.c
+index 13da882669a4..1d737f89dfc1 100644
+--- a/net/netfilter/nft_synproxy.c
++++ b/net/netfilter/nft_synproxy.c
+@@ -186,7 +186,6 @@ static int nft_synproxy_do_init(const struct nft_ctx *ctx,
+ break;
+ #endif
+ case NFPROTO_INET:
+- case NFPROTO_BRIDGE:
+ err = nf_synproxy_ipv4_init(snet, ctx->net);
+ if (err)
+ goto nf_ct_failure;
+@@ -219,7 +218,6 @@ static void nft_synproxy_do_destroy(const struct nft_ctx *ctx)
+ break;
+ #endif
+ case NFPROTO_INET:
+- case NFPROTO_BRIDGE:
+ nf_synproxy_ipv4_fini(snet, ctx->net);
+ nf_synproxy_ipv6_fini(snet, ctx->net);
+ break;
+@@ -253,6 +251,11 @@ static int nft_synproxy_validate(const struct nft_ctx *ctx,
+ const struct nft_expr *expr,
+ const struct nft_data **data)
+ {
++ if (ctx->family != NFPROTO_IPV4 &&
++ ctx->family != NFPROTO_IPV6 &&
++ ctx->family != NFPROTO_INET)
++ return -EOPNOTSUPP;
++
+ return nft_chain_validate_hooks(ctx->chain, (1 << NF_INET_LOCAL_IN) |
+ (1 << NF_INET_FORWARD));
+ }
+diff --git a/net/netfilter/nft_tproxy.c b/net/netfilter/nft_tproxy.c
+index ae15cd693f0e..71412adb73d4 100644
+--- a/net/netfilter/nft_tproxy.c
++++ b/net/netfilter/nft_tproxy.c
+@@ -316,6 +316,11 @@ static int nft_tproxy_validate(const struct nft_ctx *ctx,
+ const struct nft_expr *expr,
+ const struct nft_data **data)
+ {
++ if (ctx->family != NFPROTO_IPV4 &&
++ ctx->family != NFPROTO_IPV6 &&
++ ctx->family != NFPROTO_INET)
++ return -EOPNOTSUPP;
++
+ return nft_chain_validate_hooks(ctx->chain, 1 << NF_INET_PRE_ROUTING);
+ }
+
+diff --git a/net/netfilter/nft_xfrm.c b/net/netfilter/nft_xfrm.c
+index 452f8587adda..1c866757db55 100644
+--- a/net/netfilter/nft_xfrm.c
++++ b/net/netfilter/nft_xfrm.c
+@@ -235,6 +235,11 @@ static int nft_xfrm_validate(const struct nft_ctx *ctx, const struct nft_expr *e
+ const struct nft_xfrm *priv = nft_expr_priv(expr);
+ unsigned int hooks;
+
++ if (ctx->family != NFPROTO_IPV4 &&
++ ctx->family != NFPROTO_IPV6 &&
++ ctx->family != NFPROTO_INET)
++ return -EOPNOTSUPP;
++
+ switch (priv->dir) {
+ case XFRM_POLICY_IN:
+ hooks = (1 << NF_INET_FORWARD) |
+--
+2.43.0
+
--- /dev/null
+From 03c58469dab167d91526495d6be164f559516dbe Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Fri, 19 Jan 2024 13:11:32 +0100
+Subject: netfilter: nft_limit: reject configurations that cause integer
+ overflow
+
+From: Florian Westphal <fw@strlen.de>
+
+[ Upstream commit c9d9eb9c53d37cdebbad56b91e40baf42d5a97aa ]
+
+Reject bogus configs where the internal token counter wraps around.
+This only occurs with very, very large requests, such as 17 Gbyte/s.
+
+It's better to reject this than to apply an incorrect rate limit.
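+
+Back-of-the-envelope: with unit = 1 second, nsecs = 10^9, and the
+byte-mode bucket size is nsecs * (rate + burst) / rate, so the
+intermediate product wraps once rate + burst exceeds 2^64 / 10^9,
+roughly 1.8e10 bytes/s, i.e. the ~17 Gbyte/s quoted above.  A small
+userspace demonstration of the wrap check (values are illustrative):
+
+  #include <stdint.h>
+  #include <stdio.h>
+
+  int main(void)
+  {
+          uint64_t nsecs = 1000000000ULL;  /* unit = 1 second */
+          uint64_t rate  = 18500000000ULL; /* ~1.85e10 bytes/s */
+          uint64_t prod;
+
+          /* 1e9 * 1.85e10 = 1.85e19 > 2^64 - 1 (~1.84e19): wraps. */
+          if (__builtin_mul_overflow(nsecs, rate, &prod))
+                  puts("overflow: reject this config");
+          return 0;
+  }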
+
+Fixes: d2168e849ebf ("netfilter: nft_limit: add per-byte limiting")
+Signed-off-by: Florian Westphal <fw@strlen.de>
+Signed-off-by: Pablo Neira Ayuso <pablo@netfilter.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ net/netfilter/nft_limit.c | 23 ++++++++++++++++-------
+ 1 file changed, 16 insertions(+), 7 deletions(-)
+
+diff --git a/net/netfilter/nft_limit.c b/net/netfilter/nft_limit.c
+index 79039afde34e..cefa25e0dbb0 100644
+--- a/net/netfilter/nft_limit.c
++++ b/net/netfilter/nft_limit.c
+@@ -58,17 +58,19 @@ static inline bool nft_limit_eval(struct nft_limit_priv *priv, u64 cost)
+ static int nft_limit_init(struct nft_limit_priv *priv,
+ const struct nlattr * const tb[], bool pkts)
+ {
++ u64 unit, tokens, rate_with_burst;
+ bool invert = false;
+- u64 unit, tokens;
+
+ if (tb[NFTA_LIMIT_RATE] == NULL ||
+ tb[NFTA_LIMIT_UNIT] == NULL)
+ return -EINVAL;
+
+ priv->rate = be64_to_cpu(nla_get_be64(tb[NFTA_LIMIT_RATE]));
++ if (priv->rate == 0)
++ return -EINVAL;
++
+ unit = be64_to_cpu(nla_get_be64(tb[NFTA_LIMIT_UNIT]));
+- priv->nsecs = unit * NSEC_PER_SEC;
+- if (priv->rate == 0 || priv->nsecs < unit)
++ if (check_mul_overflow(unit, NSEC_PER_SEC, &priv->nsecs))
+ return -EOVERFLOW;
+
+ if (tb[NFTA_LIMIT_BURST])
+@@ -77,18 +79,25 @@ static int nft_limit_init(struct nft_limit_priv *priv,
+ if (pkts && priv->burst == 0)
+ priv->burst = NFT_LIMIT_PKT_BURST_DEFAULT;
+
+- if (priv->rate + priv->burst < priv->rate)
++ if (check_add_overflow(priv->rate, priv->burst, &rate_with_burst))
+ return -EOVERFLOW;
+
+ if (pkts) {
+- tokens = div64_u64(priv->nsecs, priv->rate) * priv->burst;
++ u64 tmp = div64_u64(priv->nsecs, priv->rate);
++
++ if (check_mul_overflow(tmp, priv->burst, &tokens))
++ return -EOVERFLOW;
+ } else {
++ u64 tmp;
++
+ /* The token bucket size limits the number of tokens can be
+ * accumulated. tokens_max specifies the bucket size.
+ * tokens_max = unit * (rate + burst) / rate.
+ */
+- tokens = div64_u64(priv->nsecs * (priv->rate + priv->burst),
+- priv->rate);
++ if (check_mul_overflow(priv->nsecs, rate_with_burst, &tmp))
++ return -EOVERFLOW;
++
++ tokens = div64_u64(tmp, priv->rate);
+ }
+
+ if (tb[NFTA_LIMIT_FLAGS]) {
+--
+2.43.0
+
--- /dev/null
+From 9b0508de9d77bca0679a689ebf30f1cb59ba3392 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Fri, 12 Jan 2024 09:59:41 +0300
+Subject: netfs, fscache: Prevent Oops in fscache_put_cache()
+
+From: Dan Carpenter <dan.carpenter@linaro.org>
+
+[ Upstream commit 3be0b3ed1d76c6703b9ee482b55f7e01c369cc68 ]
+
+This function dereferences "cache" and only then checks it with
+IS_ERR_OR_NULL(). Check first, then dereference.
+
+Fixes: 9549332df4ed ("fscache: Implement cache registration")
+Signed-off-by: Dan Carpenter <dan.carpenter@linaro.org>
+Signed-off-by: David Howells <dhowells@redhat.com>
+Link: https://lore.kernel.org/r/e84bc740-3502-4f16-982a-a40d5676615c@moroto.mountain/ # v2
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ fs/fscache/cache.c | 3 ++-
+ 1 file changed, 2 insertions(+), 1 deletion(-)
+
+diff --git a/fs/fscache/cache.c b/fs/fscache/cache.c
+index d645f8b302a2..9397ed39b0b4 100644
+--- a/fs/fscache/cache.c
++++ b/fs/fscache/cache.c
+@@ -179,13 +179,14 @@ EXPORT_SYMBOL(fscache_acquire_cache);
+ void fscache_put_cache(struct fscache_cache *cache,
+ enum fscache_cache_trace where)
+ {
+- unsigned int debug_id = cache->debug_id;
++ unsigned int debug_id;
+ bool zero;
+ int ref;
+
+ if (IS_ERR_OR_NULL(cache))
+ return;
+
++ debug_id = cache->debug_id;
+ zero = __refcount_dec_and_test(&cache->ref, &ref);
+ trace_fscache_cache(debug_id, ref - 1, where);
+
+--
+2.43.0
+
--- /dev/null
+From e0e707fa22ae61e59a539bf4dbd7beb2f21590eb Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Mon, 22 Jan 2024 09:18:07 +0800
+Subject: netlink: fix potential sleeping issue in mqueue_flush_file
+
+From: Zhengchao Shao <shaozhengchao@huawei.com>
+
+[ Upstream commit 234ec0b6034b16869d45128b8cd2dc6ffe596f04 ]
+
+I analyzed the potential sleeping issue in the following processes:
+Thread A Thread B
+... netlink_create //ref = 1
+do_mq_notify ...
+ sock = netlink_getsockbyfilp ... //ref = 2
+ info->notify_sock = sock; ...
+... netlink_sendmsg
+... skb = netlink_alloc_large_skb //skb->head is vmalloced
+... netlink_unicast
+... sk = netlink_getsockbyportid //ref = 3
+... netlink_sendskb
+... __netlink_sendskb
+... skb_queue_tail //put skb to sk_receive_queue
+... sock_put //ref = 2
+... ...
+... netlink_release
+... deferred_put_nlk_sk //ref = 1
+mqueue_flush_file
+ spin_lock
+ remove_notification
+ netlink_sendskb
+ sock_put //ref = 0
+ sk_free
+ ...
+ __sk_destruct
+ netlink_sock_destruct
+ skb_queue_purge //get skb from sk_receive_queue
+ ...
+ __skb_queue_purge_reason
+ kfree_skb_reason
+ __kfree_skb
+ ...
+ skb_release_all
+ skb_release_head_state
+ netlink_skb_destructor
+ vfree(skb->head) //sleeping while holding spinlock
+
+In netlink_sendmsg, the memory pointed to by skb->head may be allocated
+by vmalloc, and the skb may sit on sk_receive_queue without having been
+freed. When the mqueue then flushes, the sleeping bug occurs. Use
+vfree_atomic instead of vfree in netlink_skb_destructor to solve the issue.
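+
+The rule being applied, as a minimal hypothetical sketch (the actual
+change is the one-liner below): vfree() may sleep, while vfree_atomic()
+defers the real free to a workqueue and is safe under a spinlock.
+
+  #include <linux/spinlock.h>
+  #include <linux/vmalloc.h>
+
+  static void free_buf_under_lock(spinlock_t *lock, void *vbuf)
+  {
+          spin_lock(lock);
+          /* ... unlink vbuf from the structure the lock protects ... */
+          vfree_atomic(vbuf);     /* never sleeps, safe here */
+          spin_unlock(lock);
+  }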
+
+Fixes: c05cdb1b864f ("netlink: allow large data transfers from user-space")
+Signed-off-by: Zhengchao Shao <shaozhengchao@huawei.com>
+Link: https://lore.kernel.org/r/20240122011807.2110357-1-shaozhengchao@huawei.com
+Signed-off-by: Paolo Abeni <pabeni@redhat.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ net/netlink/af_netlink.c | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+diff --git a/net/netlink/af_netlink.c b/net/netlink/af_netlink.c
+index eb086b06d60d..d9107b545d36 100644
+--- a/net/netlink/af_netlink.c
++++ b/net/netlink/af_netlink.c
+@@ -374,7 +374,7 @@ static void netlink_skb_destructor(struct sk_buff *skb)
+ if (is_vmalloc_addr(skb->head)) {
+ if (!skb->cloned ||
+ !atomic_dec_return(&(skb_shinfo(skb)->dataref)))
+- vfree(skb->head);
++ vfree_atomic(skb->head);
+
+ skb->head = NULL;
+ }
+--
+2.43.0
+
--- /dev/null
+From 3ee2c71d0f6a323db1850cf4f2af474918609467 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Tue, 19 Dec 2023 00:19:15 +0100
+Subject: rcu: Defer RCU kthreads wakeup when CPU is dying
+
+From: Frederic Weisbecker <frederic@kernel.org>
+
+[ Upstream commit e787644caf7628ad3269c1fbd321c3255cf51710 ]
+
+When the CPU goes idle for the last time during the CPU down hotplug
+process, RCU reports a final quiescent state for the current CPU. If
+this quiescent state propagates up to the top, some tasks may then be
+woken up to complete the grace period: the main grace period kthread
+and/or the expedited main workqueue (or kworker).
+
+If those kthreads have a SCHED_FIFO policy, the wake up can indirectly
+arm the RT bandwidth timer on the local offline CPU. Since this happens
+after hrtimers have been migrated at CPUHP_AP_HRTIMERS_DYING stage, the
+timer gets ignored. Therefore if the RCU kthreads are waiting for RT
+bandwidth to be available, they may never be actually scheduled.
+
+This triggers TREE03 rcutorture hangs:
+
+ rcu: INFO: rcu_preempt self-detected stall on CPU
+ rcu: 4-...!: (1 GPs behind) idle=9874/1/0x4000000000000000 softirq=0/0 fqs=20 rcuc=21071 jiffies(starved)
+ rcu: (t=21035 jiffies g=938281 q=40787 ncpus=6)
+ rcu: rcu_preempt kthread starved for 20964 jiffies! g938281 f0x0 RCU_GP_WAIT_FQS(5) ->state=0x0 ->cpu=0
+ rcu: Unless rcu_preempt kthread gets sufficient CPU time, OOM is now expected behavior.
+ rcu: RCU grace-period kthread stack dump:
+ task:rcu_preempt state:R running task stack:14896 pid:14 tgid:14 ppid:2 flags:0x00004000
+ Call Trace:
+ <TASK>
+ __schedule+0x2eb/0xa80
+ schedule+0x1f/0x90
+ schedule_timeout+0x163/0x270
+ ? __pfx_process_timeout+0x10/0x10
+ rcu_gp_fqs_loop+0x37c/0x5b0
+ ? __pfx_rcu_gp_kthread+0x10/0x10
+ rcu_gp_kthread+0x17c/0x200
+ kthread+0xde/0x110
+ ? __pfx_kthread+0x10/0x10
+ ret_from_fork+0x2b/0x40
+ ? __pfx_kthread+0x10/0x10
+ ret_from_fork_asm+0x1b/0x30
+ </TASK>
+
+The situation can't be solved by just unpinning the timer. The hrtimer
+infrastructure and the nohz heuristics involved in finding the best
+remote target for an unpinned timer would then also need to handle
+enqueues from an offline CPU in the most horrendous way.
+
+So fix this on the RCU side instead and defer the wake up to an online
+CPU if it's too late for the local one.
+
+Reported-by: Paul E. McKenney <paulmck@kernel.org>
+Fixes: 5c0930ccaad5 ("hrtimers: Push pending hrtimers away from outgoing CPU earlier")
+Signed-off-by: Frederic Weisbecker <frederic@kernel.org>
+Signed-off-by: Paul E. McKenney <paulmck@kernel.org>
+Signed-off-by: Neeraj Upadhyay (AMD) <neeraj.iitr10@gmail.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ kernel/rcu/tree.c | 34 +++++++++++++++++++++++++++++++++-
+ kernel/rcu/tree_exp.h | 3 +--
+ 2 files changed, 34 insertions(+), 3 deletions(-)
+
+diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
+index 3ac3c846105f..157f3ca2a9b5 100644
+--- a/kernel/rcu/tree.c
++++ b/kernel/rcu/tree.c
+@@ -1013,6 +1013,38 @@ static bool rcu_future_gp_cleanup(struct rcu_node *rnp)
+ return needmore;
+ }
+
++static void swake_up_one_online_ipi(void *arg)
++{
++ struct swait_queue_head *wqh = arg;
++
++ swake_up_one(wqh);
++}
++
++static void swake_up_one_online(struct swait_queue_head *wqh)
++{
++ int cpu = get_cpu();
++
++ /*
++ * If called from rcutree_report_cpu_starting(), wake up
++ * is dangerous that late in the CPU-down hotplug process. The
++ * scheduler might queue an ignored hrtimer. Defer the wake up
++ * to an online CPU instead.
++ */
++ if (unlikely(cpu_is_offline(cpu))) {
++ int target;
++
++ target = cpumask_any_and(housekeeping_cpumask(HK_TYPE_RCU),
++ cpu_online_mask);
++
++ smp_call_function_single(target, swake_up_one_online_ipi,
++ wqh, 0);
++ put_cpu();
++ } else {
++ put_cpu();
++ swake_up_one(wqh);
++ }
++}
++
+ /*
+ * Awaken the grace-period kthread. Don't do a self-awaken (unless in an
+ * interrupt or softirq handler, in which case we just might immediately
+@@ -1037,7 +1069,7 @@ static void rcu_gp_kthread_wake(void)
+ return;
+ WRITE_ONCE(rcu_state.gp_wake_time, jiffies);
+ WRITE_ONCE(rcu_state.gp_wake_seq, READ_ONCE(rcu_state.gp_seq));
+- swake_up_one(&rcu_state.gp_wq);
++ swake_up_one_online(&rcu_state.gp_wq);
+ }
+
+ /*
+diff --git a/kernel/rcu/tree_exp.h b/kernel/rcu/tree_exp.h
+index 6d7cea5d591f..2ac440bc7e10 100644
+--- a/kernel/rcu/tree_exp.h
++++ b/kernel/rcu/tree_exp.h
+@@ -173,7 +173,6 @@ static bool sync_rcu_exp_done_unlocked(struct rcu_node *rnp)
+ return ret;
+ }
+
+-
+ /*
+ * Report the exit from RCU read-side critical section for the last task
+ * that queued itself during or before the current expedited preemptible-RCU
+@@ -201,7 +200,7 @@ static void __rcu_report_exp_rnp(struct rcu_node *rnp,
+ raw_spin_unlock_irqrestore_rcu_node(rnp, flags);
+ if (wake) {
+ smp_mb(); /* EGP done before wake_up(). */
+- swake_up_one(&rcu_state.expedited_wq);
++ swake_up_one_online(&rcu_state.expedited_wq);
+ }
+ break;
+ }
+--
+2.43.0
+
--- /dev/null
+From 5f7f956733145671f4d20cd9caf937583e2b50ad Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Thu, 19 Oct 2023 12:55:11 +0100
+Subject: rxrpc, afs: Allow afs to pin rxrpc_peer objects
+
+From: David Howells <dhowells@redhat.com>
+
+[ Upstream commit 72904d7b9bfbf2dd146254edea93958bc35bbbfe ]
+
+Change rxrpc's API such that:
+
+ (1) A new function, rxrpc_kernel_lookup_peer(), is provided to look up an
+ rxrpc_peer record for a remote address and a corresponding function,
+ rxrpc_kernel_put_peer(), is provided to dispose of it again.
+
+ (2) When setting up a call, the rxrpc_peer object used during a call is
+ now passed in rather than being set up by rxrpc_connect_call(). For
+ afs, this meant passing it to rxrpc_kernel_begin_call() rather than
+ the full address (the service ID then has to be passed in as a
+ separate parameter).
+
+ (3) A new function, rxrpc_kernel_remote_addr(), is added so that afs can
+ get a pointer to the transport address for display purposes, and
+ another, rxrpc_kernel_remote_srx(), to gain a pointer to the full
+ rxrpc address.
+
+ (4) The function to retrieve the RTT from a call, rxrpc_kernel_get_srtt(),
+ is then altered to take a peer. This now returns the RTT or -1 if
+ there are insufficient samples.
+
+ (5) Rename rxrpc_kernel_get_peer() to rxrpc_kernel_call_get_peer().
+
+ (6) Provide a new function, rxrpc_kernel_get_peer(), to get a ref on a
+ peer the caller already has.
+
+This allows the afs filesystem to pin the rxrpc_peer records that it is
+using, allowing faster lookups and pointer comparisons rather than
+comparing sockaddr_rxrpc contents.  It also makes it easier to get hold of
+the RTT (see the usage sketch after the list below).  The following
+changes are made to afs:
+
+ (1) The addr_list struct's addrs[] elements now hold a peer struct pointer
+ and a service ID rather than a sockaddr_rxrpc.
+
+ (2) When displaying the transport address, rxrpc_kernel_remote_addr() is
+ used.
+
+ (3) The port arg is removed from afs_alloc_addrlist() since it's always
+ overridden.
+
+ (4) afs_merge_fs_addr4() and afs_merge_fs_addr6() do peer lookup and may
+ now return an error that must be handled.
+
+ (5) afs_find_server() now takes a peer pointer to specify the address.
+
+ (6) afs_find_server(), afs_compare_fs_alists() and afs_merge_fs_addr[46]{}
+ now do peer pointer comparison rather than address comparison.
+
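+A usage sketch of the reworked API, inferred only from the signatures
+this patch adds to include/net/af_rxrpc.h (the call ID and timeout
+values are placeholders, and the ref-handling comment reflects a
+reading of the diff, not text from the patch):
+
+  #include <linux/err.h>
+  #include <net/af_rxrpc.h>
+
+  static struct rxrpc_call *example_call(struct socket *sock,
+                                         struct sockaddr_rxrpc *srx,
+                                         struct key *key, u16 service_id)
+  {
+          struct rxrpc_peer *peer;
+          struct rxrpc_call *call;
+
+          /* May return NULL (alloc failure) or an ERR_PTR. */
+          peer = rxrpc_kernel_lookup_peer(sock, srx, GFP_KERNEL);
+          if (IS_ERR_OR_NULL(peer))
+                  return ERR_PTR(-ENOMEM);
+
+          call = rxrpc_kernel_begin_call(sock, peer, key,
+                                         1,      /* user_call_ID */
+                                         -1,     /* tx_total_len */
+                                         0,      /* hard_timeout */
+                                         GFP_KERNEL,
+                                         NULL,   /* notify_rx */
+                                         service_id,
+                                         false,  /* upgrade */
+                                         RXRPC_PREINTERRUPTIBLE,
+                                         0);     /* debug_id */
+          /* The call takes its own ref on the peer; a caller that wants
+           * to pin the peer (as afs does in its address lists) would
+           * instead keep this ref and drop it later with
+           * rxrpc_kernel_put_peer(peer).
+           */
+          rxrpc_kernel_put_peer(peer);
+          return call;
+  }
+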
+Signed-off-by: David Howells <dhowells@redhat.com>
+cc: Marc Dionne <marc.dionne@auristor.com>
+cc: linux-afs@lists.infradead.org
+Stable-dep-of: 17ba6f0bd14f ("afs: Fix error handling with lookup via FS.InlineBulkStatus")
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ fs/afs/addr_list.c | 125 ++++++++++++++++++-----------------
+ fs/afs/cmservice.c | 5 +-
+ fs/afs/fs_probe.c | 11 +--
+ fs/afs/internal.h | 26 ++++----
+ fs/afs/proc.c | 9 +--
+ fs/afs/rotate.c | 6 +-
+ fs/afs/rxrpc.c | 10 +--
+ fs/afs/server.c | 41 ++----------
+ fs/afs/vl_alias.c | 55 +--------------
+ fs/afs/vl_list.c | 15 +++--
+ fs/afs/vl_probe.c | 12 ++--
+ fs/afs/vl_rotate.c | 6 +-
+ fs/afs/vlclient.c | 22 ++++--
+ include/net/af_rxrpc.h | 15 +++--
+ include/trace/events/rxrpc.h | 3 +
+ net/rxrpc/af_rxrpc.c | 62 ++++++++++++++---
+ net/rxrpc/ar-internal.h | 2 +-
+ net/rxrpc/call_object.c | 17 ++---
+ net/rxrpc/peer_object.c | 58 ++++++++++------
+ net/rxrpc/sendmsg.c | 11 ++-
+ 20 files changed, 273 insertions(+), 238 deletions(-)
+
+diff --git a/fs/afs/addr_list.c b/fs/afs/addr_list.c
+index ac05a59e9d46..519821f5aedc 100644
+--- a/fs/afs/addr_list.c
++++ b/fs/afs/addr_list.c
+@@ -13,26 +13,33 @@
+ #include "internal.h"
+ #include "afs_fs.h"
+
++static void afs_free_addrlist(struct rcu_head *rcu)
++{
++ struct afs_addr_list *alist = container_of(rcu, struct afs_addr_list, rcu);
++ unsigned int i;
++
++ for (i = 0; i < alist->nr_addrs; i++)
++ rxrpc_kernel_put_peer(alist->addrs[i].peer);
++}
++
+ /*
+ * Release an address list.
+ */
+ void afs_put_addrlist(struct afs_addr_list *alist)
+ {
+ if (alist && refcount_dec_and_test(&alist->usage))
+- kfree_rcu(alist, rcu);
++ call_rcu(&alist->rcu, afs_free_addrlist);
+ }
+
+ /*
+ * Allocate an address list.
+ */
+-struct afs_addr_list *afs_alloc_addrlist(unsigned int nr,
+- unsigned short service,
+- unsigned short port)
++struct afs_addr_list *afs_alloc_addrlist(unsigned int nr, u16 service_id)
+ {
+ struct afs_addr_list *alist;
+ unsigned int i;
+
+- _enter("%u,%u,%u", nr, service, port);
++ _enter("%u,%u", nr, service_id);
+
+ if (nr > AFS_MAX_ADDRESSES)
+ nr = AFS_MAX_ADDRESSES;
+@@ -44,16 +51,8 @@ struct afs_addr_list *afs_alloc_addrlist(unsigned int nr,
+ refcount_set(&alist->usage, 1);
+ alist->max_addrs = nr;
+
+- for (i = 0; i < nr; i++) {
+- struct sockaddr_rxrpc *srx = &alist->addrs[i].srx;
+- srx->srx_family = AF_RXRPC;
+- srx->srx_service = service;
+- srx->transport_type = SOCK_DGRAM;
+- srx->transport_len = sizeof(srx->transport.sin6);
+- srx->transport.sin6.sin6_family = AF_INET6;
+- srx->transport.sin6.sin6_port = htons(port);
+- }
+-
++ for (i = 0; i < nr; i++)
++ alist->addrs[i].service_id = service_id;
+ return alist;
+ }
+
+@@ -126,7 +125,7 @@ struct afs_vlserver_list *afs_parse_text_addrs(struct afs_net *net,
+ if (!vllist->servers[0].server)
+ goto error_vl;
+
+- alist = afs_alloc_addrlist(nr, service, AFS_VL_PORT);
++ alist = afs_alloc_addrlist(nr, service);
+ if (!alist)
+ goto error;
+
+@@ -197,9 +196,11 @@ struct afs_vlserver_list *afs_parse_text_addrs(struct afs_net *net,
+ }
+
+ if (family == AF_INET)
+- afs_merge_fs_addr4(alist, x[0], xport);
++ ret = afs_merge_fs_addr4(net, alist, x[0], xport);
+ else
+- afs_merge_fs_addr6(alist, x, xport);
++ ret = afs_merge_fs_addr6(net, alist, x, xport);
++ if (ret < 0)
++ goto error;
+
+ } while (p < end);
+
+@@ -271,25 +272,33 @@ struct afs_vlserver_list *afs_dns_query(struct afs_cell *cell, time64_t *_expiry
+ /*
+ * Merge an IPv4 entry into a fileserver address list.
+ */
+-void afs_merge_fs_addr4(struct afs_addr_list *alist, __be32 xdr, u16 port)
++int afs_merge_fs_addr4(struct afs_net *net, struct afs_addr_list *alist,
++ __be32 xdr, u16 port)
+ {
+- struct sockaddr_rxrpc *srx;
+- u32 addr = ntohl(xdr);
++ struct sockaddr_rxrpc srx;
++ struct rxrpc_peer *peer;
+ int i;
+
+ if (alist->nr_addrs >= alist->max_addrs)
+- return;
++ return 0;
+
+- for (i = 0; i < alist->nr_ipv4; i++) {
+- struct sockaddr_in *a = &alist->addrs[i].srx.transport.sin;
+- u32 a_addr = ntohl(a->sin_addr.s_addr);
+- u16 a_port = ntohs(a->sin_port);
++ srx.srx_family = AF_RXRPC;
++ srx.transport_type = SOCK_DGRAM;
++ srx.transport_len = sizeof(srx.transport.sin);
++ srx.transport.sin.sin_family = AF_INET;
++ srx.transport.sin.sin_port = htons(port);
++ srx.transport.sin.sin_addr.s_addr = xdr;
+
+- if (addr == a_addr && port == a_port)
+- return;
+- if (addr == a_addr && port < a_port)
+- break;
+- if (addr < a_addr)
++ peer = rxrpc_kernel_lookup_peer(net->socket, &srx, GFP_KERNEL);
++ if (!peer)
++ return -ENOMEM;
++
++ for (i = 0; i < alist->nr_ipv4; i++) {
++ if (peer == alist->addrs[i].peer) {
++ rxrpc_kernel_put_peer(peer);
++ return 0;
++ }
++ if (peer <= alist->addrs[i].peer)
+ break;
+ }
+
+@@ -298,38 +307,42 @@ void afs_merge_fs_addr4(struct afs_addr_list *alist, __be32 xdr, u16 port)
+ alist->addrs + i,
+ sizeof(alist->addrs[0]) * (alist->nr_addrs - i));
+
+- srx = &alist->addrs[i].srx;
+- srx->srx_family = AF_RXRPC;
+- srx->transport_type = SOCK_DGRAM;
+- srx->transport_len = sizeof(srx->transport.sin);
+- srx->transport.sin.sin_family = AF_INET;
+- srx->transport.sin.sin_port = htons(port);
+- srx->transport.sin.sin_addr.s_addr = xdr;
++ alist->addrs[i].peer = peer;
+ alist->nr_ipv4++;
+ alist->nr_addrs++;
++ return 0;
+ }
+
+ /*
+ * Merge an IPv6 entry into a fileserver address list.
+ */
+-void afs_merge_fs_addr6(struct afs_addr_list *alist, __be32 *xdr, u16 port)
++int afs_merge_fs_addr6(struct afs_net *net, struct afs_addr_list *alist,
++ __be32 *xdr, u16 port)
+ {
+- struct sockaddr_rxrpc *srx;
+- int i, diff;
++ struct sockaddr_rxrpc srx;
++ struct rxrpc_peer *peer;
++ int i;
+
+ if (alist->nr_addrs >= alist->max_addrs)
+- return;
++ return 0;
+
+- for (i = alist->nr_ipv4; i < alist->nr_addrs; i++) {
+- struct sockaddr_in6 *a = &alist->addrs[i].srx.transport.sin6;
+- u16 a_port = ntohs(a->sin6_port);
++ srx.srx_family = AF_RXRPC;
++ srx.transport_type = SOCK_DGRAM;
++ srx.transport_len = sizeof(srx.transport.sin6);
++ srx.transport.sin6.sin6_family = AF_INET6;
++ srx.transport.sin6.sin6_port = htons(port);
++ memcpy(&srx.transport.sin6.sin6_addr, xdr, 16);
+
+- diff = memcmp(xdr, &a->sin6_addr, 16);
+- if (diff == 0 && port == a_port)
+- return;
+- if (diff == 0 && port < a_port)
+- break;
+- if (diff < 0)
++ peer = rxrpc_kernel_lookup_peer(net->socket, &srx, GFP_KERNEL);
++ if (!peer)
++ return -ENOMEM;
++
++ for (i = alist->nr_ipv4; i < alist->nr_addrs; i++) {
++ if (peer == alist->addrs[i].peer) {
++ rxrpc_kernel_put_peer(peer);
++ return 0;
++ }
++ if (peer <= alist->addrs[i].peer)
+ break;
+ }
+
+@@ -337,15 +350,9 @@ void afs_merge_fs_addr6(struct afs_addr_list *alist, __be32 *xdr, u16 port)
+ memmove(alist->addrs + i + 1,
+ alist->addrs + i,
+ sizeof(alist->addrs[0]) * (alist->nr_addrs - i));
+-
+- srx = &alist->addrs[i].srx;
+- srx->srx_family = AF_RXRPC;
+- srx->transport_type = SOCK_DGRAM;
+- srx->transport_len = sizeof(srx->transport.sin6);
+- srx->transport.sin6.sin6_family = AF_INET6;
+- srx->transport.sin6.sin6_port = htons(port);
+- memcpy(&srx->transport.sin6.sin6_addr, xdr, 16);
++ alist->addrs[i].peer = peer;
+ alist->nr_addrs++;
++ return 0;
+ }
+
+ /*
+diff --git a/fs/afs/cmservice.c b/fs/afs/cmservice.c
+index d4ddb20d6732..99a3f20bc786 100644
+--- a/fs/afs/cmservice.c
++++ b/fs/afs/cmservice.c
+@@ -146,10 +146,11 @@ static int afs_find_cm_server_by_peer(struct afs_call *call)
+ {
+ struct sockaddr_rxrpc srx;
+ struct afs_server *server;
++ struct rxrpc_peer *peer;
+
+- rxrpc_kernel_get_peer(call->net->socket, call->rxcall, &srx);
++ peer = rxrpc_kernel_get_call_peer(call->net->socket, call->rxcall);
+
+- server = afs_find_server(call->net, &srx);
++ server = afs_find_server(call->net, peer);
+ if (!server) {
+ trace_afs_cm_no_server(call, &srx);
+ return 0;
+diff --git a/fs/afs/fs_probe.c b/fs/afs/fs_probe.c
+index 3dd24842f277..58d28b82571e 100644
+--- a/fs/afs/fs_probe.c
++++ b/fs/afs/fs_probe.c
+@@ -101,6 +101,7 @@ static void afs_fs_probe_not_done(struct afs_net *net,
+ void afs_fileserver_probe_result(struct afs_call *call)
+ {
+ struct afs_addr_list *alist = call->alist;
++ struct afs_address *addr = &alist->addrs[call->addr_ix];
+ struct afs_server *server = call->server;
+ unsigned int index = call->addr_ix;
+ unsigned int rtt_us = 0, cap0;
+@@ -153,12 +154,12 @@ void afs_fileserver_probe_result(struct afs_call *call)
+ if (call->service_id == YFS_FS_SERVICE) {
+ server->probe.is_yfs = true;
+ set_bit(AFS_SERVER_FL_IS_YFS, &server->flags);
+- alist->addrs[index].srx.srx_service = call->service_id;
++ addr->service_id = call->service_id;
+ } else {
+ server->probe.not_yfs = true;
+ if (!server->probe.is_yfs) {
+ clear_bit(AFS_SERVER_FL_IS_YFS, &server->flags);
+- alist->addrs[index].srx.srx_service = call->service_id;
++ addr->service_id = call->service_id;
+ }
+ cap0 = ntohl(call->tmp);
+ if (cap0 & AFS3_VICED_CAPABILITY_64BITFILES)
+@@ -167,7 +168,7 @@ void afs_fileserver_probe_result(struct afs_call *call)
+ clear_bit(AFS_SERVER_FL_HAS_FS64, &server->flags);
+ }
+
+- rxrpc_kernel_get_srtt(call->net->socket, call->rxcall, &rtt_us);
++ rtt_us = rxrpc_kernel_get_srtt(addr->peer);
+ if (rtt_us < server->probe.rtt) {
+ server->probe.rtt = rtt_us;
+ server->rtt = rtt_us;
+@@ -181,8 +182,8 @@ void afs_fileserver_probe_result(struct afs_call *call)
+ out:
+ spin_unlock(&server->probe_lock);
+
+- _debug("probe %pU [%u] %pISpc rtt=%u ret=%d",
+- &server->uuid, index, &alist->addrs[index].srx.transport,
++ _debug("probe %pU [%u] %pISpc rtt=%d ret=%d",
++ &server->uuid, index, rxrpc_kernel_remote_addr(alist->addrs[index].peer),
+ rtt_us, ret);
+
+ return afs_done_one_fs_probe(call->net, server);
+diff --git a/fs/afs/internal.h b/fs/afs/internal.h
+index e2adb314ab6a..ec08b4a7e499 100644
+--- a/fs/afs/internal.h
++++ b/fs/afs/internal.h
+@@ -72,6 +72,11 @@ enum afs_call_state {
+ AFS_CALL_COMPLETE, /* Completed or failed */
+ };
+
++struct afs_address {
++ struct rxrpc_peer *peer;
++ u16 service_id;
++};
++
+ /*
+ * List of server addresses.
+ */
+@@ -87,9 +92,7 @@ struct afs_addr_list {
+ enum dns_lookup_status status:8;
+ unsigned long failed; /* Mask of addrs that failed locally/ICMP */
+ unsigned long responded; /* Mask of addrs that responded */
+- struct {
+- struct sockaddr_rxrpc srx;
+- } addrs[] __counted_by(max_addrs);
++ struct afs_address addrs[] __counted_by(max_addrs);
+ #define AFS_MAX_ADDRESSES ((unsigned int)(sizeof(unsigned long) * 8))
+ };
+
+@@ -420,7 +423,7 @@ struct afs_vlserver {
+ atomic_t probe_outstanding;
+ spinlock_t probe_lock;
+ struct {
+- unsigned int rtt; /* RTT in uS */
++ unsigned int rtt; /* Best RTT in uS (or UINT_MAX) */
+ u32 abort_code;
+ short error;
+ unsigned short flags;
+@@ -537,7 +540,7 @@ struct afs_server {
+ atomic_t probe_outstanding;
+ spinlock_t probe_lock;
+ struct {
+- unsigned int rtt; /* RTT in uS */
++ unsigned int rtt; /* Best RTT in uS (or UINT_MAX) */
+ u32 abort_code;
+ short error;
+ bool responded:1;
+@@ -964,9 +967,7 @@ static inline struct afs_addr_list *afs_get_addrlist(struct afs_addr_list *alist
+ refcount_inc(&alist->usage);
+ return alist;
+ }
+-extern struct afs_addr_list *afs_alloc_addrlist(unsigned int,
+- unsigned short,
+- unsigned short);
++extern struct afs_addr_list *afs_alloc_addrlist(unsigned int nr, u16 service_id);
+ extern void afs_put_addrlist(struct afs_addr_list *);
+ extern struct afs_vlserver_list *afs_parse_text_addrs(struct afs_net *,
+ const char *, size_t, char,
+@@ -977,8 +978,10 @@ extern struct afs_vlserver_list *afs_dns_query(struct afs_cell *, time64_t *);
+ extern bool afs_iterate_addresses(struct afs_addr_cursor *);
+ extern int afs_end_cursor(struct afs_addr_cursor *);
+
+-extern void afs_merge_fs_addr4(struct afs_addr_list *, __be32, u16);
+-extern void afs_merge_fs_addr6(struct afs_addr_list *, __be32 *, u16);
++extern int afs_merge_fs_addr4(struct afs_net *net, struct afs_addr_list *addr,
++ __be32 xdr, u16 port);
++extern int afs_merge_fs_addr6(struct afs_net *net, struct afs_addr_list *addr,
++ __be32 *xdr, u16 port);
+
+ /*
+ * callback.c
+@@ -1405,8 +1408,7 @@ extern void __exit afs_clean_up_permit_cache(void);
+ */
+ extern spinlock_t afs_server_peer_lock;
+
+-extern struct afs_server *afs_find_server(struct afs_net *,
+- const struct sockaddr_rxrpc *);
++extern struct afs_server *afs_find_server(struct afs_net *, const struct rxrpc_peer *);
+ extern struct afs_server *afs_find_server_by_uuid(struct afs_net *, const uuid_t *);
+ extern struct afs_server *afs_lookup_server(struct afs_cell *, struct key *, const uuid_t *, u32);
+ extern struct afs_server *afs_get_server(struct afs_server *, enum afs_server_trace);
+diff --git a/fs/afs/proc.c b/fs/afs/proc.c
+index ab9cd986cfd9..8a65a06908d2 100644
+--- a/fs/afs/proc.c
++++ b/fs/afs/proc.c
+@@ -307,7 +307,7 @@ static int afs_proc_cell_vlservers_show(struct seq_file *m, void *v)
+ for (i = 0; i < alist->nr_addrs; i++)
+ seq_printf(m, " %c %pISpc\n",
+ alist->preferred == i ? '>' : '-',
+- &alist->addrs[i].srx.transport);
++ rxrpc_kernel_remote_addr(alist->addrs[i].peer));
+ }
+ seq_printf(m, " info: fl=%lx rtt=%d\n", vlserver->flags, vlserver->rtt);
+ seq_printf(m, " probe: fl=%x e=%d ac=%d out=%d\n",
+@@ -398,9 +398,10 @@ static int afs_proc_servers_show(struct seq_file *m, void *v)
+ seq_printf(m, " - ALIST v=%u rsp=%lx f=%lx\n",
+ alist->version, alist->responded, alist->failed);
+ for (i = 0; i < alist->nr_addrs; i++)
+- seq_printf(m, " [%x] %pISpc%s\n",
+- i, &alist->addrs[i].srx.transport,
+- alist->preferred == i ? "*" : "");
++ seq_printf(m, " [%x] %pISpc%s rtt=%d\n",
++ i, rxrpc_kernel_remote_addr(alist->addrs[i].peer),
++ alist->preferred == i ? "*" : "",
++ rxrpc_kernel_get_srtt(alist->addrs[i].peer));
+ return 0;
+ }
+
+diff --git a/fs/afs/rotate.c b/fs/afs/rotate.c
+index 46081e5da6f5..59aed7a6dd11 100644
+--- a/fs/afs/rotate.c
++++ b/fs/afs/rotate.c
+@@ -113,7 +113,7 @@ bool afs_select_fileserver(struct afs_operation *op)
+ struct afs_server *server;
+ struct afs_vnode *vnode = op->file[0].vnode;
+ struct afs_error e;
+- u32 rtt;
++ unsigned int rtt;
+ int error = op->ac.error, i;
+
+ _enter("%lx[%d],%lx[%d],%d,%d",
+@@ -420,7 +420,7 @@ bool afs_select_fileserver(struct afs_operation *op)
+ }
+
+ op->index = -1;
+- rtt = U32_MAX;
++ rtt = UINT_MAX;
+ for (i = 0; i < op->server_list->nr_servers; i++) {
+ struct afs_server *s = op->server_list->servers[i].server;
+
+@@ -488,7 +488,7 @@ bool afs_select_fileserver(struct afs_operation *op)
+
+ _debug("address [%u] %u/%u %pISp",
+ op->index, op->ac.index, op->ac.alist->nr_addrs,
+- &op->ac.alist->addrs[op->ac.index].srx.transport);
++ rxrpc_kernel_remote_addr(op->ac.alist->addrs[op->ac.index].peer));
+
+ _leave(" = t");
+ return true;
+diff --git a/fs/afs/rxrpc.c b/fs/afs/rxrpc.c
+index 181317126e43..2603db03b7ff 100644
+--- a/fs/afs/rxrpc.c
++++ b/fs/afs/rxrpc.c
+@@ -296,7 +296,8 @@ static void afs_notify_end_request_tx(struct sock *sock,
+ */
+ void afs_make_call(struct afs_addr_cursor *ac, struct afs_call *call, gfp_t gfp)
+ {
+- struct sockaddr_rxrpc *srx = &ac->alist->addrs[ac->index].srx;
++ struct afs_address *addr = &ac->alist->addrs[ac->index];
++ struct rxrpc_peer *peer = addr->peer;
+ struct rxrpc_call *rxcall;
+ struct msghdr msg;
+ struct kvec iov[1];
+@@ -304,7 +305,7 @@ void afs_make_call(struct afs_addr_cursor *ac, struct afs_call *call, gfp_t gfp)
+ s64 tx_total_len;
+ int ret;
+
+- _enter(",{%pISp},", &srx->transport);
++ _enter(",{%pISp},", rxrpc_kernel_remote_addr(addr->peer));
+
+ ASSERT(call->type != NULL);
+ ASSERT(call->type->name != NULL);
+@@ -333,7 +334,7 @@ void afs_make_call(struct afs_addr_cursor *ac, struct afs_call *call, gfp_t gfp)
+ }
+
+ /* create a call */
+- rxcall = rxrpc_kernel_begin_call(call->net->socket, srx, call->key,
++ rxcall = rxrpc_kernel_begin_call(call->net->socket, peer, call->key,
+ (unsigned long)call,
+ tx_total_len,
+ call->max_lifespan,
+@@ -341,6 +342,7 @@ void afs_make_call(struct afs_addr_cursor *ac, struct afs_call *call, gfp_t gfp)
+ (call->async ?
+ afs_wake_up_async_call :
+ afs_wake_up_call_waiter),
++ addr->service_id,
+ call->upgrade,
+ (call->intr ? RXRPC_PREINTERRUPTIBLE :
+ RXRPC_UNINTERRUPTIBLE),
+@@ -461,7 +463,7 @@ static void afs_log_error(struct afs_call *call, s32 remote_abort)
+ max = m + 1;
+ pr_notice("kAFS: Peer reported %s failure on %s [%pISp]\n",
+ msg, call->type->name,
+- &call->alist->addrs[call->addr_ix].srx.transport);
++ rxrpc_kernel_remote_addr(call->alist->addrs[call->addr_ix].peer));
+ }
+ }
+
+diff --git a/fs/afs/server.c b/fs/afs/server.c
+index b8e2d211d4a1..5b5fa94005c9 100644
+--- a/fs/afs/server.c
++++ b/fs/afs/server.c
+@@ -21,13 +21,12 @@ static void __afs_put_server(struct afs_net *, struct afs_server *);
+ /*
+ * Find a server by one of its addresses.
+ */
+-struct afs_server *afs_find_server(struct afs_net *net,
+- const struct sockaddr_rxrpc *srx)
++struct afs_server *afs_find_server(struct afs_net *net, const struct rxrpc_peer *peer)
+ {
+ const struct afs_addr_list *alist;
+ struct afs_server *server = NULL;
+ unsigned int i;
+- int seq = 1, diff;
++ int seq = 1;
+
+ rcu_read_lock();
+
+@@ -38,37 +37,11 @@ struct afs_server *afs_find_server(struct afs_net *net,
+ seq++; /* 2 on the 1st/lockless path, otherwise odd */
+ read_seqbegin_or_lock(&net->fs_addr_lock, &seq);
+
+- if (srx->transport.family == AF_INET6) {
+- const struct sockaddr_in6 *a = &srx->transport.sin6, *b;
+- hlist_for_each_entry_rcu(server, &net->fs_addresses6, addr6_link) {
+- alist = rcu_dereference(server->addresses);
+- for (i = alist->nr_ipv4; i < alist->nr_addrs; i++) {
+- b = &alist->addrs[i].srx.transport.sin6;
+- diff = ((u16 __force)a->sin6_port -
+- (u16 __force)b->sin6_port);
+- if (diff == 0)
+- diff = memcmp(&a->sin6_addr,
+- &b->sin6_addr,
+- sizeof(struct in6_addr));
+- if (diff == 0)
+- goto found;
+- }
+- }
+- } else {
+- const struct sockaddr_in *a = &srx->transport.sin, *b;
+- hlist_for_each_entry_rcu(server, &net->fs_addresses4, addr4_link) {
+- alist = rcu_dereference(server->addresses);
+- for (i = 0; i < alist->nr_ipv4; i++) {
+- b = &alist->addrs[i].srx.transport.sin;
+- diff = ((u16 __force)a->sin_port -
+- (u16 __force)b->sin_port);
+- if (diff == 0)
+- diff = ((u32 __force)a->sin_addr.s_addr -
+- (u32 __force)b->sin_addr.s_addr);
+- if (diff == 0)
+- goto found;
+- }
+- }
++ hlist_for_each_entry_rcu(server, &net->fs_addresses6, addr6_link) {
++ alist = rcu_dereference(server->addresses);
++ for (i = 0; i < alist->nr_addrs; i++)
++ if (alist->addrs[i].peer == peer)
++ goto found;
+ }
+
+ server = NULL;
+diff --git a/fs/afs/vl_alias.c b/fs/afs/vl_alias.c
+index d3c0df70a1a5..6fdf9f1bedc0 100644
+--- a/fs/afs/vl_alias.c
++++ b/fs/afs/vl_alias.c
+@@ -32,55 +32,6 @@ static struct afs_volume *afs_sample_volume(struct afs_cell *cell, struct key *k
+ return volume;
+ }
+
+-/*
+- * Compare two addresses.
+- */
+-static int afs_compare_addrs(const struct sockaddr_rxrpc *srx_a,
+- const struct sockaddr_rxrpc *srx_b)
+-{
+- short port_a, port_b;
+- int addr_a, addr_b, diff;
+-
+- diff = (short)srx_a->transport_type - (short)srx_b->transport_type;
+- if (diff)
+- goto out;
+-
+- switch (srx_a->transport_type) {
+- case AF_INET: {
+- const struct sockaddr_in *a = &srx_a->transport.sin;
+- const struct sockaddr_in *b = &srx_b->transport.sin;
+- addr_a = ntohl(a->sin_addr.s_addr);
+- addr_b = ntohl(b->sin_addr.s_addr);
+- diff = addr_a - addr_b;
+- if (diff == 0) {
+- port_a = ntohs(a->sin_port);
+- port_b = ntohs(b->sin_port);
+- diff = port_a - port_b;
+- }
+- break;
+- }
+-
+- case AF_INET6: {
+- const struct sockaddr_in6 *a = &srx_a->transport.sin6;
+- const struct sockaddr_in6 *b = &srx_b->transport.sin6;
+- diff = memcmp(&a->sin6_addr, &b->sin6_addr, 16);
+- if (diff == 0) {
+- port_a = ntohs(a->sin6_port);
+- port_b = ntohs(b->sin6_port);
+- diff = port_a - port_b;
+- }
+- break;
+- }
+-
+- default:
+- WARN_ON(1);
+- diff = 1;
+- }
+-
+-out:
+- return diff;
+-}
+-
+ /*
+ * Compare the address lists of a pair of fileservers.
+ */
+@@ -94,9 +45,9 @@ static int afs_compare_fs_alists(const struct afs_server *server_a,
+ lb = rcu_dereference(server_b->addresses);
+
+ while (a < la->nr_addrs && b < lb->nr_addrs) {
+- const struct sockaddr_rxrpc *srx_a = &la->addrs[a].srx;
+- const struct sockaddr_rxrpc *srx_b = &lb->addrs[b].srx;
+- int diff = afs_compare_addrs(srx_a, srx_b);
++ unsigned long pa = (unsigned long)la->addrs[a].peer;
++ unsigned long pb = (unsigned long)lb->addrs[b].peer;
++ long diff = pa - pb;
+
+ if (diff < 0) {
+ a++;
+diff --git a/fs/afs/vl_list.c b/fs/afs/vl_list.c
+index acc48216136a..ba89140eee9e 100644
+--- a/fs/afs/vl_list.c
++++ b/fs/afs/vl_list.c
+@@ -83,14 +83,15 @@ static u16 afs_extract_le16(const u8 **_b)
+ /*
+ * Build a VL server address list from a DNS queried server list.
+ */
+-static struct afs_addr_list *afs_extract_vl_addrs(const u8 **_b, const u8 *end,
++static struct afs_addr_list *afs_extract_vl_addrs(struct afs_net *net,
++ const u8 **_b, const u8 *end,
+ u8 nr_addrs, u16 port)
+ {
+ struct afs_addr_list *alist;
+ const u8 *b = *_b;
+ int ret = -EINVAL;
+
+- alist = afs_alloc_addrlist(nr_addrs, VL_SERVICE, port);
++ alist = afs_alloc_addrlist(nr_addrs, VL_SERVICE);
+ if (!alist)
+ return ERR_PTR(-ENOMEM);
+ if (nr_addrs == 0)
+@@ -109,7 +110,9 @@ static struct afs_addr_list *afs_extract_vl_addrs(const u8 **_b, const u8 *end,
+ goto error;
+ }
+ memcpy(x, b, 4);
+- afs_merge_fs_addr4(alist, x[0], port);
++ ret = afs_merge_fs_addr4(net, alist, x[0], port);
++ if (ret < 0)
++ goto error;
+ b += 4;
+ break;
+
+@@ -119,7 +122,9 @@ static struct afs_addr_list *afs_extract_vl_addrs(const u8 **_b, const u8 *end,
+ goto error;
+ }
+ memcpy(x, b, 16);
+- afs_merge_fs_addr6(alist, x, port);
++ ret = afs_merge_fs_addr6(net, alist, x, port);
++ if (ret < 0)
++ goto error;
+ b += 16;
+ break;
+
+@@ -247,7 +252,7 @@ struct afs_vlserver_list *afs_extract_vlserver_list(struct afs_cell *cell,
+ /* Extract the addresses - note that we can't skip this as we
+ * have to advance the payload pointer.
+ */
+- addrs = afs_extract_vl_addrs(&b, end, bs.nr_addrs, bs.port);
++ addrs = afs_extract_vl_addrs(cell->net, &b, end, bs.nr_addrs, bs.port);
+ if (IS_ERR(addrs)) {
+ ret = PTR_ERR(addrs);
+ goto error_2;
+diff --git a/fs/afs/vl_probe.c b/fs/afs/vl_probe.c
+index bdd9372e3fb2..9551aef07cee 100644
+--- a/fs/afs/vl_probe.c
++++ b/fs/afs/vl_probe.c
+@@ -48,6 +48,7 @@ void afs_vlserver_probe_result(struct afs_call *call)
+ {
+ struct afs_addr_list *alist = call->alist;
+ struct afs_vlserver *server = call->vlserver;
++ struct afs_address *addr = &alist->addrs[call->addr_ix];
+ unsigned int server_index = call->server_index;
+ unsigned int rtt_us = 0;
+ unsigned int index = call->addr_ix;
+@@ -106,16 +107,16 @@ void afs_vlserver_probe_result(struct afs_call *call)
+ if (call->service_id == YFS_VL_SERVICE) {
+ server->probe.flags |= AFS_VLSERVER_PROBE_IS_YFS;
+ set_bit(AFS_VLSERVER_FL_IS_YFS, &server->flags);
+- alist->addrs[index].srx.srx_service = call->service_id;
++ addr->service_id = call->service_id;
+ } else {
+ server->probe.flags |= AFS_VLSERVER_PROBE_NOT_YFS;
+ if (!(server->probe.flags & AFS_VLSERVER_PROBE_IS_YFS)) {
+ clear_bit(AFS_VLSERVER_FL_IS_YFS, &server->flags);
+- alist->addrs[index].srx.srx_service = call->service_id;
++ addr->service_id = call->service_id;
+ }
+ }
+
+- rxrpc_kernel_get_srtt(call->net->socket, call->rxcall, &rtt_us);
++ rtt_us = rxrpc_kernel_get_srtt(addr->peer);
+ if (rtt_us < server->probe.rtt) {
+ server->probe.rtt = rtt_us;
+ server->rtt = rtt_us;
+@@ -130,8 +131,9 @@ void afs_vlserver_probe_result(struct afs_call *call)
+ out:
+ spin_unlock(&server->probe_lock);
+
+- _debug("probe [%u][%u] %pISpc rtt=%u ret=%d",
+- server_index, index, &alist->addrs[index].srx.transport, rtt_us, ret);
++ _debug("probe [%u][%u] %pISpc rtt=%d ret=%d",
++ server_index, index, rxrpc_kernel_remote_addr(addr->peer),
++ rtt_us, ret);
+
+ afs_done_one_vl_probe(server, have_result);
+ }
+diff --git a/fs/afs/vl_rotate.c b/fs/afs/vl_rotate.c
+index e52b9d4c8a0a..f8f255c966ae 100644
+--- a/fs/afs/vl_rotate.c
++++ b/fs/afs/vl_rotate.c
+@@ -92,7 +92,7 @@ bool afs_select_vlserver(struct afs_vl_cursor *vc)
+ struct afs_addr_list *alist;
+ struct afs_vlserver *vlserver;
+ struct afs_error e;
+- u32 rtt;
++ unsigned int rtt;
+ int error = vc->ac.error, i;
+
+ _enter("%lx[%d],%lx[%d],%d,%d",
+@@ -194,7 +194,7 @@ bool afs_select_vlserver(struct afs_vl_cursor *vc)
+ goto selected_server;
+
+ vc->index = -1;
+- rtt = U32_MAX;
++ rtt = UINT_MAX;
+ for (i = 0; i < vc->server_list->nr_servers; i++) {
+ struct afs_vlserver *s = vc->server_list->servers[i].server;
+
+@@ -249,7 +249,7 @@ bool afs_select_vlserver(struct afs_vl_cursor *vc)
+
+ _debug("VL address %d/%d", vc->ac.index, vc->ac.alist->nr_addrs);
+
+- _leave(" = t %pISpc", &vc->ac.alist->addrs[vc->ac.index].srx.transport);
++ _leave(" = t %pISpc", rxrpc_kernel_remote_addr(vc->ac.alist->addrs[vc->ac.index].peer));
+ return true;
+
+ next_server:
+diff --git a/fs/afs/vlclient.c b/fs/afs/vlclient.c
+index 00fca3c66ba6..41e7932d75c6 100644
+--- a/fs/afs/vlclient.c
++++ b/fs/afs/vlclient.c
+@@ -208,7 +208,7 @@ static int afs_deliver_vl_get_addrs_u(struct afs_call *call)
+ count = ntohl(*bp);
+
+ nentries = min(nentries, count);
+- alist = afs_alloc_addrlist(nentries, FS_SERVICE, AFS_FS_PORT);
++ alist = afs_alloc_addrlist(nentries, FS_SERVICE);
+ if (!alist)
+ return -ENOMEM;
+ alist->version = uniquifier;
+@@ -230,9 +230,13 @@ static int afs_deliver_vl_get_addrs_u(struct afs_call *call)
+ alist = call->ret_alist;
+ bp = call->buffer;
+ count = min(call->count, 4U);
+- for (i = 0; i < count; i++)
+- if (alist->nr_addrs < call->count2)
+- afs_merge_fs_addr4(alist, *bp++, AFS_FS_PORT);
++ for (i = 0; i < count; i++) {
++ if (alist->nr_addrs < call->count2) {
++ ret = afs_merge_fs_addr4(call->net, alist, *bp++, AFS_FS_PORT);
++ if (ret < 0)
++ return ret;
++ }
++ }
+
+ call->count -= count;
+ if (call->count > 0)
+@@ -450,7 +454,7 @@ static int afs_deliver_yfsvl_get_endpoints(struct afs_call *call)
+ if (call->count > YFS_MAXENDPOINTS)
+ return afs_protocol_error(call, afs_eproto_yvl_fsendpt_num);
+
+- alist = afs_alloc_addrlist(call->count, FS_SERVICE, AFS_FS_PORT);
++ alist = afs_alloc_addrlist(call->count, FS_SERVICE);
+ if (!alist)
+ return -ENOMEM;
+ alist->version = uniquifier;
+@@ -488,14 +492,18 @@ static int afs_deliver_yfsvl_get_endpoints(struct afs_call *call)
+ if (ntohl(bp[0]) != sizeof(__be32) * 2)
+ return afs_protocol_error(
+ call, afs_eproto_yvl_fsendpt4_len);
+- afs_merge_fs_addr4(alist, bp[1], ntohl(bp[2]));
++ ret = afs_merge_fs_addr4(call->net, alist, bp[1], ntohl(bp[2]));
++ if (ret < 0)
++ return ret;
+ bp += 3;
+ break;
+ case YFS_ENDPOINT_IPV6:
+ if (ntohl(bp[0]) != sizeof(__be32) * 5)
+ return afs_protocol_error(
+ call, afs_eproto_yvl_fsendpt6_len);
+- afs_merge_fs_addr6(alist, bp + 1, ntohl(bp[5]));
++ ret = afs_merge_fs_addr6(call->net, alist, bp + 1, ntohl(bp[5]));
++ if (ret < 0)
++ return ret;
+ bp += 6;
+ break;
+ default:
+diff --git a/include/net/af_rxrpc.h b/include/net/af_rxrpc.h
+index 5531dd08061e..0754c463224a 100644
+--- a/include/net/af_rxrpc.h
++++ b/include/net/af_rxrpc.h
+@@ -15,6 +15,7 @@ struct key;
+ struct sock;
+ struct socket;
+ struct rxrpc_call;
++struct rxrpc_peer;
+ enum rxrpc_abort_reason;
+
+ enum rxrpc_interruptibility {
+@@ -41,13 +42,14 @@ void rxrpc_kernel_new_call_notification(struct socket *,
+ rxrpc_notify_new_call_t,
+ rxrpc_discard_new_call_t);
+ struct rxrpc_call *rxrpc_kernel_begin_call(struct socket *sock,
+- struct sockaddr_rxrpc *srx,
++ struct rxrpc_peer *peer,
+ struct key *key,
+ unsigned long user_call_ID,
+ s64 tx_total_len,
+ u32 hard_timeout,
+ gfp_t gfp,
+ rxrpc_notify_rx_t notify_rx,
++ u16 service_id,
+ bool upgrade,
+ enum rxrpc_interruptibility interruptibility,
+ unsigned int debug_id);
+@@ -60,9 +62,14 @@ bool rxrpc_kernel_abort_call(struct socket *, struct rxrpc_call *,
+ u32, int, enum rxrpc_abort_reason);
+ void rxrpc_kernel_shutdown_call(struct socket *sock, struct rxrpc_call *call);
+ void rxrpc_kernel_put_call(struct socket *sock, struct rxrpc_call *call);
+-void rxrpc_kernel_get_peer(struct socket *, struct rxrpc_call *,
+- struct sockaddr_rxrpc *);
+-bool rxrpc_kernel_get_srtt(struct socket *, struct rxrpc_call *, u32 *);
++struct rxrpc_peer *rxrpc_kernel_lookup_peer(struct socket *sock,
++ struct sockaddr_rxrpc *srx, gfp_t gfp);
++void rxrpc_kernel_put_peer(struct rxrpc_peer *peer);
++struct rxrpc_peer *rxrpc_kernel_get_peer(struct rxrpc_peer *peer);
++struct rxrpc_peer *rxrpc_kernel_get_call_peer(struct socket *sock, struct rxrpc_call *call);
++const struct sockaddr_rxrpc *rxrpc_kernel_remote_srx(const struct rxrpc_peer *peer);
++const struct sockaddr *rxrpc_kernel_remote_addr(const struct rxrpc_peer *peer);
++unsigned int rxrpc_kernel_get_srtt(const struct rxrpc_peer *);
+ int rxrpc_kernel_charge_accept(struct socket *, rxrpc_notify_rx_t,
+ rxrpc_user_attach_call_t, unsigned long, gfp_t,
+ unsigned int);
+diff --git a/include/trace/events/rxrpc.h b/include/trace/events/rxrpc.h
+index f7e537f64db4..4c1ef7b3705c 100644
+--- a/include/trace/events/rxrpc.h
++++ b/include/trace/events/rxrpc.h
+@@ -178,7 +178,9 @@
+ #define rxrpc_peer_traces \
+ EM(rxrpc_peer_free, "FREE ") \
+ EM(rxrpc_peer_get_accept, "GET accept ") \
++ EM(rxrpc_peer_get_application, "GET app ") \
+ EM(rxrpc_peer_get_bundle, "GET bundle ") \
++ EM(rxrpc_peer_get_call, "GET call ") \
+ EM(rxrpc_peer_get_client_conn, "GET cln-conn") \
+ EM(rxrpc_peer_get_input, "GET input ") \
+ EM(rxrpc_peer_get_input_error, "GET inpt-err") \
+@@ -187,6 +189,7 @@
+ EM(rxrpc_peer_get_service_conn, "GET srv-conn") \
+ EM(rxrpc_peer_new_client, "NEW client ") \
+ EM(rxrpc_peer_new_prealloc, "NEW prealloc") \
++ EM(rxrpc_peer_put_application, "PUT app ") \
+ EM(rxrpc_peer_put_bundle, "PUT bundle ") \
+ EM(rxrpc_peer_put_call, "PUT call ") \
+ EM(rxrpc_peer_put_conn, "PUT conn ") \
+diff --git a/net/rxrpc/af_rxrpc.c b/net/rxrpc/af_rxrpc.c
+index fa8aec78f63d..465bfe5eb061 100644
+--- a/net/rxrpc/af_rxrpc.c
++++ b/net/rxrpc/af_rxrpc.c
+@@ -258,16 +258,62 @@ static int rxrpc_listen(struct socket *sock, int backlog)
+ return ret;
+ }
+
++/**
++ * rxrpc_kernel_lookup_peer - Obtain remote transport endpoint for an address
++ * @sock: The socket through which it will be accessed
++ * @srx: The network address
++ * @gfp: Allocation flags
++ *
++ * Lookup or create a remote transport endpoint record for the specified
++ * address and return it with a ref held.
++ */
++struct rxrpc_peer *rxrpc_kernel_lookup_peer(struct socket *sock,
++ struct sockaddr_rxrpc *srx, gfp_t gfp)
++{
++ struct rxrpc_sock *rx = rxrpc_sk(sock->sk);
++ int ret;
++
++ ret = rxrpc_validate_address(rx, srx, sizeof(*srx));
++ if (ret < 0)
++ return ERR_PTR(ret);
++
++ return rxrpc_lookup_peer(rx->local, srx, gfp);
++}
++EXPORT_SYMBOL(rxrpc_kernel_lookup_peer);
++
++/**
++ * rxrpc_kernel_get_peer - Get a reference on a peer
++ * @peer: The peer to get a reference on.
++ *
++ * Get a record for the remote peer in a call.
++ */
++struct rxrpc_peer *rxrpc_kernel_get_peer(struct rxrpc_peer *peer)
++{
++ return peer ? rxrpc_get_peer(peer, rxrpc_peer_get_application) : NULL;
++}
++EXPORT_SYMBOL(rxrpc_kernel_get_peer);
++
++/**
++ * rxrpc_kernel_put_peer - Allow a kernel app to drop a peer reference
++ * @peer: The peer to drop a ref on
++ */
++void rxrpc_kernel_put_peer(struct rxrpc_peer *peer)
++{
++ rxrpc_put_peer(peer, rxrpc_peer_put_application);
++}
++EXPORT_SYMBOL(rxrpc_kernel_put_peer);
++
+ /**
+ * rxrpc_kernel_begin_call - Allow a kernel service to begin a call
+ * @sock: The socket on which to make the call
+- * @srx: The address of the peer to contact
++ * @peer: The peer to contact
+ * @key: The security context to use (defaults to socket setting)
+ * @user_call_ID: The ID to use
+ * @tx_total_len: Total length of data to transmit during the call (or -1)
+ * @hard_timeout: The maximum lifespan of the call in sec
+ * @gfp: The allocation constraints
+ * @notify_rx: Where to send notifications instead of socket queue
++ * @service_id: The ID of the service to contact
+ * @upgrade: Request service upgrade for call
+ * @interruptibility: The call is interruptible, or can be canceled.
+ * @debug_id: The debug ID for tracing to be assigned to the call
+@@ -280,13 +326,14 @@ static int rxrpc_listen(struct socket *sock, int backlog)
+ * supplying @srx and @key.
+ */
+ struct rxrpc_call *rxrpc_kernel_begin_call(struct socket *sock,
+- struct sockaddr_rxrpc *srx,
++ struct rxrpc_peer *peer,
+ struct key *key,
+ unsigned long user_call_ID,
+ s64 tx_total_len,
+ u32 hard_timeout,
+ gfp_t gfp,
+ rxrpc_notify_rx_t notify_rx,
++ u16 service_id,
+ bool upgrade,
+ enum rxrpc_interruptibility interruptibility,
+ unsigned int debug_id)
+@@ -295,13 +342,11 @@ struct rxrpc_call *rxrpc_kernel_begin_call(struct socket *sock,
+ struct rxrpc_call_params p;
+ struct rxrpc_call *call;
+ struct rxrpc_sock *rx = rxrpc_sk(sock->sk);
+- int ret;
+
+ _enter(",,%x,%lx", key_serial(key), user_call_ID);
+
+- ret = rxrpc_validate_address(rx, srx, sizeof(*srx));
+- if (ret < 0)
+- return ERR_PTR(ret);
++ if (WARN_ON_ONCE(peer->local != rx->local))
++ return ERR_PTR(-EIO);
+
+ lock_sock(&rx->sk);
+
+@@ -319,12 +364,13 @@ struct rxrpc_call *rxrpc_kernel_begin_call(struct socket *sock,
+
+ memset(&cp, 0, sizeof(cp));
+ cp.local = rx->local;
++ cp.peer = peer;
+ cp.key = key;
+ cp.security_level = rx->min_sec_level;
+ cp.exclusive = false;
+ cp.upgrade = upgrade;
+- cp.service_id = srx->srx_service;
+- call = rxrpc_new_client_call(rx, &cp, srx, &p, gfp, debug_id);
++ cp.service_id = service_id;
++ call = rxrpc_new_client_call(rx, &cp, &p, gfp, debug_id);
+ /* The socket has been unlocked. */
+ if (!IS_ERR(call)) {
+ call->notify_rx = notify_rx;
+diff --git a/net/rxrpc/ar-internal.h b/net/rxrpc/ar-internal.h
+index e8b43408136a..5d5b19f20d1e 100644
+--- a/net/rxrpc/ar-internal.h
++++ b/net/rxrpc/ar-internal.h
+@@ -364,6 +364,7 @@ struct rxrpc_conn_proto {
+
+ struct rxrpc_conn_parameters {
+ struct rxrpc_local *local; /* Representation of local endpoint */
++ struct rxrpc_peer *peer; /* Representation of remote endpoint */
+ struct key *key; /* Security details */
+ bool exclusive; /* T if conn is exclusive */
+ bool upgrade; /* T if service ID can be upgraded */
+@@ -867,7 +868,6 @@ struct rxrpc_call *rxrpc_find_call_by_user_ID(struct rxrpc_sock *, unsigned long
+ struct rxrpc_call *rxrpc_alloc_call(struct rxrpc_sock *, gfp_t, unsigned int);
+ struct rxrpc_call *rxrpc_new_client_call(struct rxrpc_sock *,
+ struct rxrpc_conn_parameters *,
+- struct sockaddr_rxrpc *,
+ struct rxrpc_call_params *, gfp_t,
+ unsigned int);
+ void rxrpc_start_call_timer(struct rxrpc_call *call);
+diff --git a/net/rxrpc/call_object.c b/net/rxrpc/call_object.c
+index f10b37c14772..0943e54370ba 100644
+--- a/net/rxrpc/call_object.c
++++ b/net/rxrpc/call_object.c
+@@ -193,7 +193,6 @@ struct rxrpc_call *rxrpc_alloc_call(struct rxrpc_sock *rx, gfp_t gfp,
+ * Allocate a new client call.
+ */
+ static struct rxrpc_call *rxrpc_alloc_client_call(struct rxrpc_sock *rx,
+- struct sockaddr_rxrpc *srx,
+ struct rxrpc_conn_parameters *cp,
+ struct rxrpc_call_params *p,
+ gfp_t gfp,
+@@ -211,10 +210,12 @@ static struct rxrpc_call *rxrpc_alloc_client_call(struct rxrpc_sock *rx,
+ now = ktime_get_real();
+ call->acks_latest_ts = now;
+ call->cong_tstamp = now;
+- call->dest_srx = *srx;
++ call->dest_srx = cp->peer->srx;
++ call->dest_srx.srx_service = cp->service_id;
+ call->interruptibility = p->interruptibility;
+ call->tx_total_len = p->tx_total_len;
+ call->key = key_get(cp->key);
++ call->peer = rxrpc_get_peer(cp->peer, rxrpc_peer_get_call);
+ call->local = rxrpc_get_local(cp->local, rxrpc_local_get_call);
+ call->security_level = cp->security_level;
+ if (p->kernel)
+@@ -306,10 +307,6 @@ static int rxrpc_connect_call(struct rxrpc_call *call, gfp_t gfp)
+
+ _enter("{%d,%lx},", call->debug_id, call->user_call_ID);
+
+- call->peer = rxrpc_lookup_peer(local, &call->dest_srx, gfp);
+- if (!call->peer)
+- goto error;
+-
+ ret = rxrpc_look_up_bundle(call, gfp);
+ if (ret < 0)
+ goto error;
+@@ -334,7 +331,6 @@ static int rxrpc_connect_call(struct rxrpc_call *call, gfp_t gfp)
+ */
+ struct rxrpc_call *rxrpc_new_client_call(struct rxrpc_sock *rx,
+ struct rxrpc_conn_parameters *cp,
+- struct sockaddr_rxrpc *srx,
+ struct rxrpc_call_params *p,
+ gfp_t gfp,
+ unsigned int debug_id)
+@@ -349,13 +345,18 @@ struct rxrpc_call *rxrpc_new_client_call(struct rxrpc_sock *rx,
+
+ _enter("%p,%lx", rx, p->user_call_ID);
+
++ if (WARN_ON_ONCE(!cp->peer)) {
++ release_sock(&rx->sk);
++ return ERR_PTR(-EIO);
++ }
++
+ limiter = rxrpc_get_call_slot(p, gfp);
+ if (!limiter) {
+ release_sock(&rx->sk);
+ return ERR_PTR(-ERESTARTSYS);
+ }
+
+- call = rxrpc_alloc_client_call(rx, srx, cp, p, gfp, debug_id);
++ call = rxrpc_alloc_client_call(rx, cp, p, gfp, debug_id);
+ if (IS_ERR(call)) {
+ release_sock(&rx->sk);
+ up(limiter);
+diff --git a/net/rxrpc/peer_object.c b/net/rxrpc/peer_object.c
+index 8d7a715a0bb1..49dcda67a0d5 100644
+--- a/net/rxrpc/peer_object.c
++++ b/net/rxrpc/peer_object.c
+@@ -22,6 +22,8 @@
+ #include <net/ip6_route.h>
+ #include "ar-internal.h"
+
++static const struct sockaddr_rxrpc rxrpc_null_addr;
++
+ /*
+ * Hash a peer key.
+ */
+@@ -457,39 +459,53 @@ void rxrpc_destroy_all_peers(struct rxrpc_net *rxnet)
+ }
+
+ /**
+- * rxrpc_kernel_get_peer - Get the peer address of a call
++ * rxrpc_kernel_get_call_peer - Get the peer address of a call
+ * @sock: The socket on which the call is in progress.
+ * @call: The call to query
+- * @_srx: Where to place the result
+ *
+- * Get the address of the remote peer in a call.
++ * Get a record for the remote peer in a call.
+ */
+-void rxrpc_kernel_get_peer(struct socket *sock, struct rxrpc_call *call,
+- struct sockaddr_rxrpc *_srx)
++struct rxrpc_peer *rxrpc_kernel_get_call_peer(struct socket *sock, struct rxrpc_call *call)
+ {
+- *_srx = call->peer->srx;
++ return call->peer;
+ }
+-EXPORT_SYMBOL(rxrpc_kernel_get_peer);
++EXPORT_SYMBOL(rxrpc_kernel_get_call_peer);
+
+ /**
+ * rxrpc_kernel_get_srtt - Get a call's peer smoothed RTT
+- * @sock: The socket on which the call is in progress.
+- * @call: The call to query
+- * @_srtt: Where to store the SRTT value.
++ * @peer: The peer to query
+ *
+- * Get the call's peer smoothed RTT in uS.
++ * Get the call's peer smoothed RTT in uS or UINT_MAX if we have no samples.
+ */
+-bool rxrpc_kernel_get_srtt(struct socket *sock, struct rxrpc_call *call,
+- u32 *_srtt)
++unsigned int rxrpc_kernel_get_srtt(const struct rxrpc_peer *peer)
+ {
+- struct rxrpc_peer *peer = call->peer;
++ return peer->rtt_count > 0 ? peer->srtt_us >> 3 : UINT_MAX;
++}
++EXPORT_SYMBOL(rxrpc_kernel_get_srtt);
+
+- if (peer->rtt_count == 0) {
+- *_srtt = 1000000; /* 1S */
+- return false;
+- }
++/**
++ * rxrpc_kernel_remote_srx - Get the address of a peer
++ * @peer: The peer to query
++ *
++ * Get a pointer to the address from a peer record. The caller is responsible
++ * for making sure that the address is not deallocated.
++ */
++const struct sockaddr_rxrpc *rxrpc_kernel_remote_srx(const struct rxrpc_peer *peer)
++{
++ return peer ? &peer->srx : &rxrpc_null_addr;
++}
++EXPORT_SYMBOL(rxrpc_kernel_remote_srx);
+
+- *_srtt = call->peer->srtt_us >> 3;
+- return true;
++/**
++ * rxrpc_kernel_remote_addr - Get the peer transport address of a call
++ * @peer: The peer to query
++ *
++ * Get a pointer to the transport address from a peer record. The caller is
++ * responsible for making sure that the address is not deallocated.
++ */
++const struct sockaddr *rxrpc_kernel_remote_addr(const struct rxrpc_peer *peer)
++{
++ return (const struct sockaddr *)
++ (peer ? &peer->srx.transport : &rxrpc_null_addr.transport);
+ }
+-EXPORT_SYMBOL(rxrpc_kernel_get_srtt);
++EXPORT_SYMBOL(rxrpc_kernel_remote_addr);
+diff --git a/net/rxrpc/sendmsg.c b/net/rxrpc/sendmsg.c
+index 8e0b94714e84..5677d5690a02 100644
+--- a/net/rxrpc/sendmsg.c
++++ b/net/rxrpc/sendmsg.c
+@@ -572,6 +572,7 @@ rxrpc_new_client_call_for_sendmsg(struct rxrpc_sock *rx, struct msghdr *msg,
+ __acquires(&call->user_mutex)
+ {
+ struct rxrpc_conn_parameters cp;
++ struct rxrpc_peer *peer;
+ struct rxrpc_call *call;
+ struct key *key;
+
+@@ -584,21 +585,29 @@ rxrpc_new_client_call_for_sendmsg(struct rxrpc_sock *rx, struct msghdr *msg,
+ return ERR_PTR(-EDESTADDRREQ);
+ }
+
++ peer = rxrpc_lookup_peer(rx->local, srx, GFP_KERNEL);
++ if (!peer) {
++ release_sock(&rx->sk);
++ return ERR_PTR(-ENOMEM);
++ }
++
+ key = rx->key;
+ if (key && !rx->key->payload.data[0])
+ key = NULL;
+
+ memset(&cp, 0, sizeof(cp));
+ cp.local = rx->local;
++ cp.peer = peer;
+ cp.key = rx->key;
+ cp.security_level = rx->min_sec_level;
+ cp.exclusive = rx->exclusive | p->exclusive;
+ cp.upgrade = p->upgrade;
+ cp.service_id = srx->srx_service;
+- call = rxrpc_new_client_call(rx, &cp, srx, &p->call, GFP_KERNEL,
++ call = rxrpc_new_client_call(rx, &cp, &p->call, GFP_KERNEL,
+ atomic_inc_return(&rxrpc_debug_id));
+ /* The socket is now unlocked */
+
++ rxrpc_put_peer(peer, rxrpc_peer_put_application);
+ _leave(" = %p\n", call);
+ return call;
+ }
+--
+2.43.0
+
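+With the peer API above, a kernel service now pins the transport
+endpoint before starting calls. A minimal sketch, assuming an already
+set-up AF_RXRPC socket, a filled-in sockaddr_rxrpc srx and AFS's
+FS_SERVICE id; key, notify_rx and debug_id stand in for the caller's
+own values:
+
+    struct rxrpc_peer *peer;
+    struct rxrpc_call *call;
+
+    /* Pin the remote endpoint; the ref outlives any number of calls. */
+    peer = rxrpc_kernel_lookup_peer(sock, &srx, GFP_KERNEL);
+    if (IS_ERR_OR_NULL(peer))
+            return peer ? PTR_ERR(peer) : -ENOMEM;
+
+    call = rxrpc_kernel_begin_call(sock, peer, key, user_call_ID,
+                                   tx_total_len, hard_timeout, GFP_KERNEL,
+                                   notify_rx, FS_SERVICE, false,
+                                   RXRPC_PREINTERRUPTIBLE, debug_id);
+    /* ... transact on the call, then shut it down and put it ... */
+
+    rxrpc_kernel_put_peer(peer);    /* drop the pin when done */
+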
--- /dev/null
+From 7f04c082cab699672ebde045251234ff678693af Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Fri, 19 Jan 2024 19:16:42 -0800
+Subject: selftest: Don't reuse port for SO_INCOMING_CPU test.
+
+From: Kuniyuki Iwashima <kuniyu@amazon.com>
+
+[ Upstream commit 97de5a15edf2d22184f5ff588656030bbb7fa358 ]
+
+Jakub reported that ASSERT_EQ(cpu, i) in so_incoming_cpu.c seems to
+fire somewhat randomly.
+
+ # # RUN so_incoming_cpu.before_reuseport.test3 ...
+ # # so_incoming_cpu.c:191:test3:Expected cpu (32) == i (0)
+ # # test3: Test terminated by assertion
+ # # FAIL so_incoming_cpu.before_reuseport.test3
+ # not ok 3 so_incoming_cpu.before_reuseport.test3
+
+When the test failed, not-yet-accepted CLOSE_WAIT sockets received a SYN
+with a "challenging" SEQ number, which had been sent from an unexpected
+CPU that did not create the receiver.
+
+The test basically does:
+
+ 1. for each cpu:
+ 1-1. create a server
+ 1-2. set SO_INCOMING_CPU
+
+ 2. for each cpu:
+ 2-1. set cpu affinity
+ 2-2. create some clients
+ 2-3. let clients connect() to the server on the same cpu
+ 2-4. close() clients
+
+ 3. for each server:
+ 3-1. accept() all child sockets
+ 3-2. check if all children have the same SO_INCOMING_CPU with the server
+
+The root cause was the close() in 2-4. and net.ipv4.tcp_tw_reuse.
+
+In a loop of 2., close() changed the client state to FIN_WAIT_2, and
+the peer transitioned to CLOSE_WAIT.
+
+In another loop of 2., connect() happened to select the same port as
+the FIN_WAIT_2 socket, and it was reused, as the default value of
+net.ipv4.tcp_tw_reuse is 2.
+
+As a result, the new client sent SYN to the CLOSE_WAIT socket from
+a different CPU, and the receiver's sk_incoming_cpu was overwritten
+with an unexpected CPU ID.
+
+Also, the SYN had a different SEQ number, so the CLOSE_WAIT socket
+responded with Challenge ACK. The new client properly returned RST
+and effectively killed the CLOSE_WAIT socket.
+
+This way, all clients were created successfully, but the error was
+detected later by 3-2., ASSERT_EQ(cpu, i).
+
+To avoid the failure, let's make sure that (i) the number of clients
+is less than the number of available ports and (ii) such reuse never
+happens.
+
+Fixes: 6df96146b202 ("selftest: Add test for SO_INCOMING_CPU.")
+Reported-by: Jakub Kicinski <kuba@kernel.org>
+Signed-off-by: Kuniyuki Iwashima <kuniyu@amazon.com>
+Tested-by: Jakub Kicinski <kuba@kernel.org>
+Link: https://lore.kernel.org/r/20240120031642.67014-1-kuniyu@amazon.com
+Signed-off-by: Paolo Abeni <pabeni@redhat.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ tools/testing/selftests/net/so_incoming_cpu.c | 68 ++++++++++++++-----
+ 1 file changed, 50 insertions(+), 18 deletions(-)
+
+diff --git a/tools/testing/selftests/net/so_incoming_cpu.c b/tools/testing/selftests/net/so_incoming_cpu.c
+index a14818164102..e9fa14e10732 100644
+--- a/tools/testing/selftests/net/so_incoming_cpu.c
++++ b/tools/testing/selftests/net/so_incoming_cpu.c
+@@ -3,19 +3,16 @@
+ #define _GNU_SOURCE
+ #include <sched.h>
+
++#include <fcntl.h>
++
+ #include <netinet/in.h>
+ #include <sys/socket.h>
+ #include <sys/sysinfo.h>
+
+ #include "../kselftest_harness.h"
+
+-#define CLIENT_PER_SERVER 32 /* More sockets, more reliable */
+-#define NR_SERVER self->nproc
+-#define NR_CLIENT (CLIENT_PER_SERVER * NR_SERVER)
+-
+ FIXTURE(so_incoming_cpu)
+ {
+- int nproc;
+ int *servers;
+ union {
+ struct sockaddr addr;
+@@ -56,12 +53,47 @@ FIXTURE_VARIANT_ADD(so_incoming_cpu, after_all_listen)
+ .when_to_set = AFTER_ALL_LISTEN,
+ };
+
++static void write_sysctl(struct __test_metadata *_metadata,
++ char *filename, char *string)
++{
++ int fd, len, ret;
++
++ fd = open(filename, O_WRONLY);
++ ASSERT_NE(fd, -1);
++
++ len = strlen(string);
++ ret = write(fd, string, len);
++ ASSERT_EQ(ret, len);
++}
++
++static void setup_netns(struct __test_metadata *_metadata)
++{
++ ASSERT_EQ(unshare(CLONE_NEWNET), 0);
++ ASSERT_EQ(system("ip link set lo up"), 0);
++
++ write_sysctl(_metadata, "/proc/sys/net/ipv4/ip_local_port_range", "10000 60001");
++ write_sysctl(_metadata, "/proc/sys/net/ipv4/tcp_tw_reuse", "0");
++}
++
++#define NR_PORT (60001 - 10000 - 1)
++#define NR_CLIENT_PER_SERVER_DEFAULT 32
++static int nr_client_per_server, nr_server, nr_client;
++
+ FIXTURE_SETUP(so_incoming_cpu)
+ {
+- self->nproc = get_nprocs();
+- ASSERT_LE(2, self->nproc);
++ setup_netns(_metadata);
++
++ nr_server = get_nprocs();
++ ASSERT_LE(2, nr_server);
++
++ if (NR_CLIENT_PER_SERVER_DEFAULT * nr_server < NR_PORT)
++ nr_client_per_server = NR_CLIENT_PER_SERVER_DEFAULT;
++ else
++ nr_client_per_server = NR_PORT / nr_server;
++
++ nr_client = nr_client_per_server * nr_server;
+
+- self->servers = malloc(sizeof(int) * NR_SERVER);
++ self->servers = malloc(sizeof(int) * nr_server);
+ ASSERT_NE(self->servers, NULL);
+
+ self->in_addr.sin_family = AF_INET;
+@@ -74,7 +106,7 @@ FIXTURE_TEARDOWN(so_incoming_cpu)
+ {
+ int i;
+
+- for (i = 0; i < NR_SERVER; i++)
++ for (i = 0; i < nr_server; i++)
+ close(self->servers[i]);
+
+ free(self->servers);
+@@ -110,10 +142,10 @@ int create_server(struct __test_metadata *_metadata,
+ if (variant->when_to_set == BEFORE_LISTEN)
+ set_so_incoming_cpu(_metadata, fd, cpu);
+
+- /* We don't use CLIENT_PER_SERVER here not to block
++ /* We don't use nr_client_per_server here not to block
+ * this test at connect() if SO_INCOMING_CPU is broken.
+ */
+- ret = listen(fd, NR_CLIENT);
++ ret = listen(fd, nr_client);
+ ASSERT_EQ(ret, 0);
+
+ if (variant->when_to_set == AFTER_LISTEN)
+@@ -128,7 +160,7 @@ void create_servers(struct __test_metadata *_metadata,
+ {
+ int i, ret;
+
+- for (i = 0; i < NR_SERVER; i++) {
++ for (i = 0; i < nr_server; i++) {
+ self->servers[i] = create_server(_metadata, self, variant, i);
+
+ if (i == 0) {
+@@ -138,7 +170,7 @@ void create_servers(struct __test_metadata *_metadata,
+ }
+
+ if (variant->when_to_set == AFTER_ALL_LISTEN) {
+- for (i = 0; i < NR_SERVER; i++)
++ for (i = 0; i < nr_server; i++)
+ set_so_incoming_cpu(_metadata, self->servers[i], i);
+ }
+ }
+@@ -149,7 +181,7 @@ void create_clients(struct __test_metadata *_metadata,
+ cpu_set_t cpu_set;
+ int i, j, fd, ret;
+
+- for (i = 0; i < NR_SERVER; i++) {
++ for (i = 0; i < nr_server; i++) {
+ CPU_ZERO(&cpu_set);
+
+ CPU_SET(i, &cpu_set);
+@@ -162,7 +194,7 @@ void create_clients(struct __test_metadata *_metadata,
+ ret = sched_setaffinity(0, sizeof(cpu_set), &cpu_set);
+ ASSERT_EQ(ret, 0);
+
+- for (j = 0; j < CLIENT_PER_SERVER; j++) {
++ for (j = 0; j < nr_client_per_server; j++) {
+ fd = socket(AF_INET, SOCK_STREAM, 0);
+ ASSERT_NE(fd, -1);
+
+@@ -180,8 +212,8 @@ void verify_incoming_cpu(struct __test_metadata *_metadata,
+ int i, j, fd, cpu, ret, total = 0;
+ socklen_t len = sizeof(int);
+
+- for (i = 0; i < NR_SERVER; i++) {
+- for (j = 0; j < CLIENT_PER_SERVER; j++) {
++ for (i = 0; i < nr_server; i++) {
++ for (j = 0; j < nr_client_per_server; j++) {
+ /* If we see -EAGAIN here, SO_INCOMING_CPU is broken */
+ fd = accept(self->servers[i], &self->addr, &self->addrlen);
+ ASSERT_NE(fd, -1);
+@@ -195,7 +227,7 @@ void verify_incoming_cpu(struct __test_metadata *_metadata,
+ }
+ }
+
+- ASSERT_EQ(total, NR_CLIENT);
++ ASSERT_EQ(total, nr_client);
+ TH_LOG("SO_INCOMING_CPU is very likely to be "
+ "working correctly with %d sockets.", total);
+ }
+--
+2.43.0
+
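+For background, SO_INCOMING_CPU is a plain int socket option. A minimal
+userspace sketch of the set-and-verify steps the harness performs (the
+listener and child descriptors are assumed to come from the usual
+socket()/listen()/accept() sequence):
+
+    #include <assert.h>
+    #include <err.h>
+    #include <sys/socket.h>
+
+    static void pin_and_check(int listener, int child, int cpu)
+    {
+            int got;
+            socklen_t len = sizeof(got);
+
+            /* Steer wakeups for this listener to one CPU. */
+            if (setsockopt(listener, SOL_SOCKET, SO_INCOMING_CPU,
+                           &cpu, sizeof(cpu)) < 0)
+                    err(1, "setsockopt(SO_INCOMING_CPU)");
+
+            /* An accepted child reports the CPU that handled its packets. */
+            if (getsockopt(child, SOL_SOCKET, SO_INCOMING_CPU,
+                           &got, &len) < 0)
+                    err(1, "getsockopt(SO_INCOMING_CPU)");
+            assert(got == cpu);
+    }
+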
--- /dev/null
+From e1c2375e884d125e1bb994d6ca1b503b345d0b2c Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Tue, 23 Jan 2024 15:59:17 +0800
+Subject: selftests: bonding: do not test arp/ns target with mode
+ balance-alb/tlb
+
+From: Hangbin Liu <liuhangbin@gmail.com>
+
+[ Upstream commit a2933a8759a62269754e54733d993b19de870e84 ]
+
+The prio_arp/ns tests hard-code the mode to active-backup. At the same
+time, the balance-alb/tlb modes do not support arp/ns targets. So remove
+the prio_arp/ns tests from the loop and only test the active-backup mode.
+
+Fixes: 481b56e0391e ("selftests: bonding: re-format bond option tests")
+Reported-by: Jay Vosburgh <jay.vosburgh@canonical.com>
+Closes: https://lore.kernel.org/netdev/17415.1705965957@famine/
+Signed-off-by: Hangbin Liu <liuhangbin@gmail.com>
+Acked-by: Jay Vosburgh <jay.vosburgh@canonical.com>
+Link: https://lore.kernel.org/r/20240123075917.1576360-1-liuhangbin@gmail.com
+Signed-off-by: Paolo Abeni <pabeni@redhat.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ .../testing/selftests/drivers/net/bonding/bond_options.sh | 8 ++++----
+ 1 file changed, 4 insertions(+), 4 deletions(-)
+
+diff --git a/tools/testing/selftests/drivers/net/bonding/bond_options.sh b/tools/testing/selftests/drivers/net/bonding/bond_options.sh
+index c54d1697f439..d508486cc0bd 100755
+--- a/tools/testing/selftests/drivers/net/bonding/bond_options.sh
++++ b/tools/testing/selftests/drivers/net/bonding/bond_options.sh
+@@ -162,7 +162,7 @@ prio_arp()
+ local mode=$1
+
+ for primary_reselect in 0 1 2; do
+- prio_test "mode active-backup arp_interval 100 arp_ip_target ${g_ip4} primary eth1 primary_reselect $primary_reselect"
++ prio_test "mode $mode arp_interval 100 arp_ip_target ${g_ip4} primary eth1 primary_reselect $primary_reselect"
+ log_test "prio" "$mode arp_ip_target primary_reselect $primary_reselect"
+ done
+ }
+@@ -178,7 +178,7 @@ prio_ns()
+ fi
+
+ for primary_reselect in 0 1 2; do
+- prio_test "mode active-backup arp_interval 100 ns_ip6_target ${g_ip6} primary eth1 primary_reselect $primary_reselect"
++ prio_test "mode $mode arp_interval 100 ns_ip6_target ${g_ip6} primary eth1 primary_reselect $primary_reselect"
+ log_test "prio" "$mode ns_ip6_target primary_reselect $primary_reselect"
+ done
+ }
+@@ -194,9 +194,9 @@ prio()
+
+ for mode in $modes; do
+ prio_miimon $mode
+- prio_arp $mode
+- prio_ns $mode
+ done
++ prio_arp "active-backup"
++ prio_ns "active-backup"
+ }
+
+ arp_validate_test()
+--
+2.43.0
+
--- /dev/null
+From 7ea5a6e9ccc49ca9a909ce3a6a447474f910df84 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Wed, 17 Jan 2024 19:12:32 -0500
+Subject: selftests: bonding: Increase timeout to 1200s
+
+From: Benjamin Poirier <bpoirier@nvidia.com>
+
+[ Upstream commit b01f15a7571b7aa222458bc9bf26ab59bd84e384 ]
+
+When tests are run by runner.sh, bond_options.sh gets killed before
+it can complete:
+
+make -C tools/testing/selftests run_tests TARGETS="drivers/net/bonding"
+ [...]
+ # timeout set to 120
+ # selftests: drivers/net/bonding: bond_options.sh
+ # TEST: prio (active-backup miimon primary_reselect 0) [ OK ]
+ # TEST: prio (active-backup miimon primary_reselect 1) [ OK ]
+ # TEST: prio (active-backup miimon primary_reselect 2) [ OK ]
+ # TEST: prio (active-backup arp_ip_target primary_reselect 0) [ OK ]
+ # TEST: prio (active-backup arp_ip_target primary_reselect 1) [ OK ]
+ # TEST: prio (active-backup arp_ip_target primary_reselect 2) [ OK ]
+ #
+ not ok 7 selftests: drivers/net/bonding: bond_options.sh # TIMEOUT 120 seconds
+
+This test includes many sleep statements, at least some of which are
+related to timers in the operation of the bonding driver itself. Increase
+the test timeout to allow the test to complete.
+
+I ran the test in slightly different VMs (including one without HW
+virtualization support) and got runtimes of 13m39.760s, 13m31.238s, and
+13m2.956s. Use a ~1.5x "safety factor" and set the timeout to 1200s.
+
+Fixes: 42a8d4aaea84 ("selftests: bonding: add bonding prio option test")
+Reported-by: Jakub Kicinski <kuba@kernel.org>
+Closes: https://lore.kernel.org/netdev/20240116104402.1203850a@kernel.org/#t
+Suggested-by: Jakub Kicinski <kuba@kernel.org>
+Signed-off-by: Benjamin Poirier <bpoirier@nvidia.com>
+Reviewed-by: Hangbin Liu <liuhangbin@gmail.com>
+Link: https://lore.kernel.org/r/20240118001233.304759-1-bpoirier@nvidia.com
+Signed-off-by: Jakub Kicinski <kuba@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ tools/testing/selftests/drivers/net/bonding/settings | 2 +-
+ 1 file changed, 1 insertion(+), 1 deletion(-)
+
+diff --git a/tools/testing/selftests/drivers/net/bonding/settings b/tools/testing/selftests/drivers/net/bonding/settings
+index 6091b45d226b..79b65bdf05db 100644
+--- a/tools/testing/selftests/drivers/net/bonding/settings
++++ b/tools/testing/selftests/drivers/net/bonding/settings
+@@ -1 +1 @@
+-timeout=120
++timeout=1200
+--
+2.43.0
+
--- /dev/null
+From 0e8d9de5247c4b0e9e3d8c0b504d84ca340c109d Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Mon, 22 Jan 2024 12:35:28 -0800
+Subject: selftests: fill in some missing configs for net
+
+From: Jakub Kicinski <kuba@kernel.org>
+
+[ Upstream commit 04fe7c5029cbdbcdb28917f09a958d939a8f19f7 ]
+
+We are missing a lot of config options from net selftests,
+it seems:
+
+tun/tap: CONFIG_TUN, CONFIG_MACVLAN, CONFIG_MACVTAP
+fib_tests: CONFIG_NET_SCH_FQ_CODEL
+l2tp: CONFIG_L2TP, CONFIG_L2TP_V3, CONFIG_L2TP_IP, CONFIG_L2TP_ETH
+sctp-vrf: CONFIG_INET_DIAG
+txtimestamp: CONFIG_NET_CLS_U32
+vxlan_mdb: CONFIG_BRIDGE_VLAN_FILTERING
+gre_gso: CONFIG_NET_IPGRE_DEMUX, CONFIG_IP_GRE, CONFIG_IPV6_GRE
+srv6_end_dt*_l3vpn: CONFIG_IPV6_SEG6_LWTUNNEL
+ip_local_port_range: CONFIG_MPTCP
+fib_test: CONFIG_NET_CLS_BASIC
+rtnetlink: CONFIG_MACSEC, CONFIG_NET_SCH_HTB, CONFIG_XFRM_INTERFACE
+ CONFIG_NET_IPGRE, CONFIG_BONDING
+fib_nexthops: CONFIG_MPLS, CONFIG_MPLS_ROUTING
+vxlan_mdb: CONFIG_NET_ACT_GACT
+tls: CONFIG_TLS, CONFIG_CRYPTO_CHACHA20POLY1305
+psample: CONFIG_PSAMPLE
+fcnal: CONFIG_TCP_MD5SIG
+
+Try to add them in a semi-alphabetical order.
+
+Fixes: 62199e3f1658 ("selftests: net: Add VXLAN MDB test")
+Fixes: c12e0d5f267d ("self-tests: introduce self-tests for RPS default mask")
+Fixes: 122db5e3634b ("selftests/net: add MPTCP coverage for IP_LOCAL_PORT_RANGE")
+Link: https://lore.kernel.org/r/20240122203528.672004-1-kuba@kernel.org
+Signed-off-by: Jakub Kicinski <kuba@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ tools/testing/selftests/net/config | 28 ++++++++++++++++++++++++++++
+ 1 file changed, 28 insertions(+)
+
+diff --git a/tools/testing/selftests/net/config b/tools/testing/selftests/net/config
+index 8da562a9ae87..19ff75051660 100644
+--- a/tools/testing/selftests/net/config
++++ b/tools/testing/selftests/net/config
+@@ -1,5 +1,6 @@
+ CONFIG_USER_NS=y
+ CONFIG_NET_NS=y
++CONFIG_BONDING=m
+ CONFIG_BPF_SYSCALL=y
+ CONFIG_TEST_BPF=m
+ CONFIG_NUMA=y
+@@ -14,9 +15,13 @@ CONFIG_VETH=y
+ CONFIG_NET_IPVTI=y
+ CONFIG_IPV6_VTI=y
+ CONFIG_DUMMY=y
++CONFIG_BRIDGE_VLAN_FILTERING=y
+ CONFIG_BRIDGE=y
++CONFIG_CRYPTO_CHACHA20POLY1305=m
+ CONFIG_VLAN_8021Q=y
+ CONFIG_IFB=y
++CONFIG_INET_DIAG=y
++CONFIG_IP_GRE=m
+ CONFIG_NETFILTER=y
+ CONFIG_NETFILTER_ADVANCED=y
+ CONFIG_NF_CONNTRACK=m
+@@ -25,15 +30,36 @@ CONFIG_IP6_NF_IPTABLES=m
+ CONFIG_IP_NF_IPTABLES=m
+ CONFIG_IP6_NF_NAT=m
+ CONFIG_IP_NF_NAT=m
++CONFIG_IPV6_GRE=m
++CONFIG_IPV6_SEG6_LWTUNNEL=y
++CONFIG_L2TP_ETH=m
++CONFIG_L2TP_IP=m
++CONFIG_L2TP=m
++CONFIG_L2TP_V3=y
++CONFIG_MACSEC=m
++CONFIG_MACVLAN=y
++CONFIG_MACVTAP=y
++CONFIG_MPLS=y
++CONFIG_MPTCP=y
+ CONFIG_NF_TABLES=m
+ CONFIG_NF_TABLES_IPV6=y
+ CONFIG_NF_TABLES_IPV4=y
+ CONFIG_NFT_NAT=m
++CONFIG_NET_ACT_GACT=m
++CONFIG_NET_CLS_BASIC=m
++CONFIG_NET_CLS_U32=m
++CONFIG_NET_IPGRE_DEMUX=m
++CONFIG_NET_IPGRE=m
++CONFIG_NET_SCH_FQ_CODEL=m
++CONFIG_NET_SCH_HTB=m
+ CONFIG_NET_SCH_FQ=m
+ CONFIG_NET_SCH_ETF=m
+ CONFIG_NET_SCH_NETEM=y
++CONFIG_PSAMPLE=m
++CONFIG_TCP_MD5SIG=y
+ CONFIG_TEST_BLACKHOLE_DEV=m
+ CONFIG_KALLSYMS=y
++CONFIG_TLS=m
+ CONFIG_TRACEPOINTS=y
+ CONFIG_NET_DROP_MONITOR=m
+ CONFIG_NETDEVSIM=m
+@@ -48,7 +74,9 @@ CONFIG_BAREUDP=m
+ CONFIG_IPV6_IOAM6_LWTUNNEL=y
+ CONFIG_CRYPTO_SM4_GENERIC=y
+ CONFIG_AMT=m
++CONFIG_TUN=y
+ CONFIG_VXLAN=m
+ CONFIG_IP_SCTP=m
+ CONFIG_NETFILTER_XT_MATCH_POLICY=m
+ CONFIG_CRYPTO_ARIA=y
++CONFIG_XFRM_INTERFACE=m
+--
+2.43.0
+
--- /dev/null
+From 59a87296fd28cff283ab62c5fcad3bbbbda290a4 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Mon, 22 Jan 2024 11:58:15 -0800
+Subject: selftests: net: fix rps_default_mask with >32 CPUs
+
+From: Jakub Kicinski <kuba@kernel.org>
+
+[ Upstream commit 0719b5338a0cbe80d1637a5fb03d8141b5bfc7a1 ]
+
+If there are more than 32 CPUs, the bitmask will start to contain
+commas, leading to:
+
+./rps_default_mask.sh: line 36: [: 00000000,00000000: integer expression expected
+
+Remove the commas; bash doesn't interpret leading zeroes as octal,
+so that should be good enough. Switch to bash, as Simon reports that
+not all shells support this type of substitution.
+
+Fixes: c12e0d5f267d ("self-tests: introduce self-tests for RPS default mask")
+Reviewed-by: Simon Horman <horms@kernel.org>
+Link: https://lore.kernel.org/r/20240122195815.638997-1-kuba@kernel.org
+Signed-off-by: Jakub Kicinski <kuba@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ tools/testing/selftests/net/rps_default_mask.sh | 6 +++++-
+ 1 file changed, 5 insertions(+), 1 deletion(-)
+
+diff --git a/tools/testing/selftests/net/rps_default_mask.sh b/tools/testing/selftests/net/rps_default_mask.sh
+index a26c5624429f..4287a8529890 100755
+--- a/tools/testing/selftests/net/rps_default_mask.sh
++++ b/tools/testing/selftests/net/rps_default_mask.sh
+@@ -1,4 +1,4 @@
+-#!/bin/sh
++#!/bin/bash
+ # SPDX-License-Identifier: GPL-2.0
+
+ readonly ksft_skip=4
+@@ -33,6 +33,10 @@ chk_rps() {
+
+ rps_mask=$($cmd /sys/class/net/$dev_name/queues/rx-0/rps_cpus)
+ printf "%-60s" "$msg"
++
++ # In case there is more than 32 CPUs we need to remove commas from masks
++ rps_mask=${rps_mask//,}
++ expected_rps_mask=${expected_rps_mask//,}
+ if [ $rps_mask -eq $expected_rps_mask ]; then
+ echo "[ ok ]"
+ else
+--
+2.43.0
+
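+The same normalization can be done outside the shell. A small C sketch
+of stripping the comma separators before comparing masks numerically
+(it assumes the mask fits in 64 bits, which the real sysfs format does
+not guarantee):
+
+    #include <stdio.h>
+    #include <stdlib.h>
+
+    /* Drop the "," group separators, then parse the mask as hex. */
+    static unsigned long long parse_rps_mask(char *s)
+    {
+            char *src = s, *dst = s;
+
+            while (*src) {
+                    if (*src != ',')
+                            *dst++ = *src;
+                    src++;
+            }
+            *dst = '\0';
+            return strtoull(s, NULL, 16);
+    }
+
+    int main(void)
+    {
+            char mask[] = "00000000,00000001";
+
+            printf("%llx\n", parse_rps_mask(mask)); /* prints 1 */
+            return 0;
+    }
+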
--- /dev/null
+From dfca3552b6e84cfec030272e59570cf545b3148a Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Mon, 22 Jan 2024 22:05:29 -0800
+Subject: selftests: netdevsim: fix the udp_tunnel_nic test
+
+From: Jakub Kicinski <kuba@kernel.org>
+
+[ Upstream commit 0879020a7817e7ce636372c016b4528f541c9f4d ]
+
+This test is missing a whole bunch of checks for interface
+renaming and one ifup. Presumably it was only used on a system
+with renaming disabled and NetworkManager running.
+
+Fixes: 91f430b2c49d ("selftests: net: add a test for UDP tunnel info infra")
+Acked-by: Paolo Abeni <pabeni@redhat.com>
+Reviewed-by: Simon Horman <horms@kernel.org>
+Link: https://lore.kernel.org/r/20240123060529.1033912-1-kuba@kernel.org
+Signed-off-by: Jakub Kicinski <kuba@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ .../selftests/drivers/net/netdevsim/udp_tunnel_nic.sh | 9 +++++++++
+ 1 file changed, 9 insertions(+)
+
+diff --git a/tools/testing/selftests/drivers/net/netdevsim/udp_tunnel_nic.sh b/tools/testing/selftests/drivers/net/netdevsim/udp_tunnel_nic.sh
+index 1b08e042cf94..185b02d2d4cd 100755
+--- a/tools/testing/selftests/drivers/net/netdevsim/udp_tunnel_nic.sh
++++ b/tools/testing/selftests/drivers/net/netdevsim/udp_tunnel_nic.sh
+@@ -269,6 +269,7 @@ for port in 0 1; do
+ echo 1 > $NSIM_DEV_SYS/new_port
+ fi
+ NSIM_NETDEV=`get_netdev_name old_netdevs`
++ ifconfig $NSIM_NETDEV up
+
+ msg="new NIC device created"
+ exp0=( 0 0 0 0 )
+@@ -430,6 +431,7 @@ for port in 0 1; do
+ fi
+
+ echo $port > $NSIM_DEV_SYS/new_port
++ NSIM_NETDEV=`get_netdev_name old_netdevs`
+ ifconfig $NSIM_NETDEV up
+
+ overflow_table0 "overflow NIC table"
+@@ -487,6 +489,7 @@ for port in 0 1; do
+ fi
+
+ echo $port > $NSIM_DEV_SYS/new_port
++ NSIM_NETDEV=`get_netdev_name old_netdevs`
+ ifconfig $NSIM_NETDEV up
+
+ overflow_table0 "overflow NIC table"
+@@ -543,6 +546,7 @@ for port in 0 1; do
+ fi
+
+ echo $port > $NSIM_DEV_SYS/new_port
++ NSIM_NETDEV=`get_netdev_name old_netdevs`
+ ifconfig $NSIM_NETDEV up
+
+ overflow_table0 "destroy NIC"
+@@ -572,6 +576,7 @@ for port in 0 1; do
+ fi
+
+ echo $port > $NSIM_DEV_SYS/new_port
++ NSIM_NETDEV=`get_netdev_name old_netdevs`
+ ifconfig $NSIM_NETDEV up
+
+ msg="create VxLANs v6"
+@@ -632,6 +637,7 @@ for port in 0 1; do
+ fi
+
+ echo $port > $NSIM_DEV_SYS/new_port
++ NSIM_NETDEV=`get_netdev_name old_netdevs`
+ ifconfig $NSIM_NETDEV up
+
+ echo 110 > $NSIM_DEV_DFS/ports/$port/udp_ports_inject_error
+@@ -687,6 +693,7 @@ for port in 0 1; do
+ fi
+
+ echo $port > $NSIM_DEV_SYS/new_port
++ NSIM_NETDEV=`get_netdev_name old_netdevs`
+ ifconfig $NSIM_NETDEV up
+
+ msg="create VxLANs v6"
+@@ -746,6 +753,7 @@ for port in 0 1; do
+ fi
+
+ echo $port > $NSIM_DEV_SYS/new_port
++ NSIM_NETDEV=`get_netdev_name old_netdevs`
+ ifconfig $NSIM_NETDEV up
+
+ msg="create VxLANs v6"
+@@ -876,6 +884,7 @@ msg="re-add a port"
+
+ echo 2 > $NSIM_DEV_SYS/del_port
+ echo 2 > $NSIM_DEV_SYS/new_port
++NSIM_NETDEV=`get_netdev_name old_netdevs`
+ check_tables
+
+ msg="replace VxLAN in overflow table"
+--
+2.43.0
+
ksmbd-don-t-increment-epoch-if-current-state-and-request-state-are-same.patch
ksmbd-send-lease-break-notification-on-file_rename_information.patch
ksmbd-add-missing-set_freezable-for-freezable-kthread.patch
+sunrpc-use-request-size-to-initialize-bio_vec-in-svc.patch
+wifi-mac80211-fix-potential-sta-link-leak.patch
+btrfs-scrub-avoid-use-after-free-when-chunk-length-i.patch
+net-smc-fix-illegal-rmb_desc-access-in-smc-d-connect.patch
+selftests-bonding-increase-timeout-to-1200s.patch
+tcp-make-sure-init-the-accept_queue-s-spinlocks-once.patch
+bnxt_en-wait-for-flr-to-complete-during-probe.patch
+bnxt_en-prevent-kernel-warning-when-running-offline-.patch
+vlan-skip-nested-type-that-is-not-ifla_vlan_qos_mapp.patch
+llc-make-llc_ui_sendmsg-more-robust-against-bonding-.patch
+llc-drop-support-for-eth_p_tr_802_2.patch
+udp-fix-busy-polling.patch
+idpf-distinguish-vports-by-the-dev_port-attribute.patch
+net-fix-removing-a-namespace-with-conflicting-altnam.patch
+tun-fix-missing-dropped-counter-in-tun_xdp_act.patch
+tun-add-missing-rx-stats-accounting-in-tun_xdp_act.patch
+dpll-fix-broken-error-path-in-dpll_pin_alloc.patch
+dpll-fix-pin-dump-crash-for-rebound-module.patch
+dpll-fix-userspace-availability-of-pins.patch
+dpll-fix-register-pin-with-unregistered-parent-pin.patch
+net-micrel-fix-ptp-frame-parsing-for-lan8814.patch
+net-rds-fix-ubsan-array-index-out-of-bounds-in-rds_c.patch
+netfs-fscache-prevent-oops-in-fscache_put_cache.patch
+tracing-ensure-visibility-when-inserting-an-element-.patch
+afs-hide-silly-rename-files-from-userspace.patch
+afs-fix-the-usage-of-read_seqbegin_or_lock-in-afs_fi.patch
+afs-add-comments-on-abort-handling.patch
+afs-turn-the-afs_addr_list-address-array-into-an-arr.patch
+rxrpc-afs-allow-afs-to-pin-rxrpc_peer-objects.patch
+afs-handle-the-vio-and-uaeio-aborts-explicitly.patch
+afs-use-op-nr_iterations-1-to-indicate-to-begin-file.patch
+afs-wrap-most-op-error-accesses-with-inline-funcs.patch
+afs-don-t-put-afs_call-in-afs_wait_for_call_to_compl.patch
+afs-simplify-error-handling.patch
+afs-fix-error-handling-with-lookup-via-fs.inlinebulk.patch
+tcp-add-memory-barrier-to-tcp_push.patch
+selftest-don-t-reuse-port-for-so_incoming_cpu-test.patch
+netlink-fix-potential-sleeping-issue-in-mqueue_flush.patch
+ipv6-init-the-accept_queue-s-spinlocks-in-inet6_crea.patch
+selftests-fill-in-some-missing-configs-for-net.patch
+net-sched-flower-fix-chain-template-offload.patch
+net-mlx5e-fix-operation-precedence-bug-in-port-times.patch
+net-mlx5e-fix-inconsistent-hairpin-rqt-sizes.patch
+net-mlx5e-fix-peer-flow-lists-handling.patch
+net-mlx5-fix-a-warn-upon-a-callback-command-failure.patch
+net-mlx5-bridge-fix-multicast-packets-sent-to-uplink.patch
+net-mlx5-dr-use-the-right-gvmi-number-for-drop-actio.patch
+net-mlx5-dr-can-t-go-to-uplink-vport-on-rx-rule.patch
+net-mlx5-use-mlx5-device-constant-for-selecting-cq-p.patch
+net-mlx5e-allow-software-parsing-when-ipsec-crypto-i.patch
+net-mlx5e-ignore-ipsec-replay-window-values-on-sende.patch
+net-mlx5e-fix-a-double-free-in-arfs_create_groups.patch
+net-mlx5e-fix-a-potential-double-free-in-fs_any_crea.patch
+rcu-defer-rcu-kthreads-wakeup-when-cpu-is-dying.patch
+netfilter-nft_limit-reject-configurations-that-cause.patch
+netfilter-nf_tables-restrict-anonymous-set-and-map-n.patch
+netfilter-nf_tables-validate-nfproto_-family.patch
+net-stmmac-wait-a-bit-for-the-reset-to-take-effect.patch
+net-mvpp2-clear-bm-pool-before-initialization.patch
+selftests-net-fix-rps_default_mask-with-32-cpus.patch
+selftests-netdevsim-fix-the-udp_tunnel_nic-test.patch
+xsk-recycle-buffer-in-case-rx-queue-was-full.patch
+xsk-make-xsk_buff_pool-responsible-for-clearing-xdp_.patch
+xsk-fix-usage-of-multi-buffer-bpf-helpers-for-zc-xdp.patch
+ice-work-on-pre-xdp-prog-frag-count.patch
+i40e-handle-multi-buffer-packets-that-are-shrunk-by-.patch
+ice-remove-redundant-xdp_rxq_info-registration.patch
+intel-xsk-initialize-skb_frag_t-bv_offset-in-zc-driv.patch
+ice-update-xdp_rxq_info-frag_size-for-zc-enabled-rx-.patch
+xdp-reflect-tail-increase-for-mem_type_xsk_buff_pool.patch
+i40e-set-xdp_rxq_info-frag_size.patch
+i40e-update-xdp_rxq_info-frag_size-for-zc-enabled-rx.patch
+fjes-fix-memleaks-in-fjes_hw_setup.patch
+selftests-bonding-do-not-test-arp-ns-target-with-mod.patch
+net-fec-fix-the-unhandled-context-fault-from-smmu.patch
+tsnep-remove-fcs-for-xdp-data-path.patch
+tsnep-fix-xdp_ring_need_wakeup-for-empty-fill-ring.patch
--- /dev/null
+From 58c1e7163139f42ce19e100c31cfa906196959e8 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Wed, 17 Jan 2024 22:06:28 +0100
+Subject: SUNRPC: use request size to initialize bio_vec in svc_udp_sendto()
+
+From: Lucas Stach <l.stach@pengutronix.de>
+
+[ Upstream commit 1d9cabe2817edd215779dc9c2fe5e7ab9aac0704 ]
+
+Use the proper size when setting up the bio_vec, as otherwise only
+zero-length UDP packets will be sent.
+
+Fixes: baabf59c2414 ("SUNRPC: Convert svc_udp_sendto() to use the per-socket bio_vec array")
+Signed-off-by: Lucas Stach <l.stach@pengutronix.de>
+Signed-off-by: Chuck Lever <chuck.lever@oracle.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ net/sunrpc/svcsock.c | 4 ++--
+ 1 file changed, 2 insertions(+), 2 deletions(-)
+
+diff --git a/net/sunrpc/svcsock.c b/net/sunrpc/svcsock.c
+index 998687421fa6..e0ce4276274b 100644
+--- a/net/sunrpc/svcsock.c
++++ b/net/sunrpc/svcsock.c
+@@ -717,12 +717,12 @@ static int svc_udp_sendto(struct svc_rqst *rqstp)
+ ARRAY_SIZE(rqstp->rq_bvec), xdr);
+
+ iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, rqstp->rq_bvec,
+- count, 0);
++ count, rqstp->rq_res.len);
+ err = sock_sendmsg(svsk->sk_sock, &msg);
+ if (err == -ECONNREFUSED) {
+ /* ICMP error on earlier request. */
+ iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, rqstp->rq_bvec,
+- count, 0);
++ count, rqstp->rq_res.len);
+ err = sock_sendmsg(svsk->sk_sock, &msg);
+ }
+
+--
+2.43.0
+
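+The final argument of iov_iter_bvec() is the total byte count of the
+iterator, not a flags word. A hedged sketch of the intended pattern
+(page, payload_len and offset are illustrative names):
+
+    struct bio_vec bv;
+    struct iov_iter iter;
+
+    bvec_set_page(&bv, page, payload_len, offset);
+    /* nr_segs counts bio_vecs; the last argument sizes the iterator.
+     * Passing 0 there is what produced the empty UDP packets. */
+    iov_iter_bvec(&iter, ITER_SOURCE, &bv, 1, payload_len);
+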
--- /dev/null
+From 38e4f18aaf076cdae4088f54afc329320ce2f0f3 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Fri, 19 Jan 2024 11:01:33 -0800
+Subject: tcp: Add memory barrier to tcp_push()
+
+From: Salvatore Dipietro <dipiets@amazon.com>
+
+[ Upstream commit 7267e8dcad6b2f9fce05a6a06335d7040acbc2b6 ]
+
+On CPUs with weak memory models, reads and updates performed by tcp_push()
+on the sk variables can get reordered, leaving the socket throttled when
+it should not be. The tasklet running tcp_wfree() may also not observe the
+memory updates in time and will skip flushing any packets throttled by
+tcp_push(), delaying the sending. This can pathologically cause 40ms
+extra latency due to bad interactions with delayed acks.
+
+Adding a memory barrier in tcp_push removes the bug, similarly to the
+previous commit bf06200e732d ("tcp: tsq: fix nonagle handling").
+smp_mb__after_atomic() is used to avoid incurring unnecessary overhead
+on x86, which is not affected.
+
+The patch has been tested using an AWS c7g.2xlarge instance with Ubuntu
+22.04 and Apache Tomcat 9.0.83 running the basic servlet below:
+
+import java.io.IOException;
+import java.io.OutputStreamWriter;
+import java.io.PrintWriter;
+import javax.servlet.ServletException;
+import javax.servlet.http.HttpServlet;
+import javax.servlet.http.HttpServletRequest;
+import javax.servlet.http.HttpServletResponse;
+
+public class HelloWorldServlet extends HttpServlet {
+ @Override
+ protected void doGet(HttpServletRequest request, HttpServletResponse response)
+ throws ServletException, IOException {
+ response.setContentType("text/html;charset=utf-8");
+ OutputStreamWriter osw = new OutputStreamWriter(response.getOutputStream(),"UTF-8");
+ String s = "a".repeat(3096);
+ osw.write(s,0,s.length());
+ osw.flush();
+ }
+}
+
+Load was applied using wrk2 (https://github.com/kinvolk/wrk2) from an AWS
+c6i.8xlarge instance. Before the patch an additional 40ms latency from P99.99+
+values is observed while, with the patch, the extra latency disappears.
+
+No patch and tcp_autocorking=1
+./wrk -t32 -c128 -d40s --latency -R10000 http://172.31.60.173:8080/hello/hello
+ ...
+ 50.000% 0.91ms
+ 75.000% 1.13ms
+ 90.000% 1.46ms
+ 99.000% 1.74ms
+ 99.900% 1.89ms
+ 99.990% 41.95ms <<< 40+ ms extra latency
+ 99.999% 48.32ms
+100.000% 48.96ms
+
+With patch and tcp_autocorking=1
+./wrk -t32 -c128 -d40s --latency -R10000 http://172.31.60.173:8080/hello/hello
+ ...
+ 50.000% 0.90ms
+ 75.000% 1.13ms
+ 90.000% 1.45ms
+ 99.000% 1.72ms
+ 99.900% 1.83ms
+ 99.990% 2.11ms <<< no 40+ ms extra latency
+ 99.999% 2.53ms
+100.000% 2.62ms
+
+The patch has also been tested on x86 (m7i.2xlarge instance), which is
+not affected by this issue, and it doesn't introduce any additional
+delay.
+
+Fixes: 7aa5470c2c09 ("tcp: tsq: move tsq_flags close to sk_wmem_alloc")
+Signed-off-by: Salvatore Dipietro <dipiets@amazon.com>
+Reviewed-by: Eric Dumazet <edumazet@google.com>
+Link: https://lore.kernel.org/r/20240119190133.43698-1-dipiets@amazon.com
+Signed-off-by: Paolo Abeni <pabeni@redhat.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ net/ipv4/tcp.c | 1 +
+ 1 file changed, 1 insertion(+)
+
+diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c
+index ff6838ca2e58..7bce79beca2b 100644
+--- a/net/ipv4/tcp.c
++++ b/net/ipv4/tcp.c
+@@ -722,6 +722,7 @@ void tcp_push(struct sock *sk, int flags, int mss_now,
+ if (!test_bit(TSQ_THROTTLED, &sk->sk_tsq_flags)) {
+ NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPAUTOCORKING);
+ set_bit(TSQ_THROTTLED, &sk->sk_tsq_flags);
++ smp_mb__after_atomic();
+ }
+ /* It is possible TX completion already happened
+ * before we set TSQ_THROTTLED.
+--
+2.43.0
+
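+The pairing being fixed is the classic flag-then-recheck pattern
+between the sending path and the TX-completion path; schematically (a
+sketch of the relevant lines, not the complete functions):
+
+    /* tcp_push(): mark throttled, then re-check the wmem count */
+    set_bit(TSQ_THROTTLED, &sk->sk_tsq_flags);
+    smp_mb__after_atomic(); /* order set_bit before the read below */
+    if (refcount_read(&sk->sk_wmem_alloc) > skb->truesize)
+            return;         /* the completion path will flush for us */
+
+    /* tcp_wfree(): drops sk_wmem_alloc first, then tests
+     * TSQ_THROTTLED and flushes any deferred packets if set */
+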
--- /dev/null
+From a96f3a5cb5848f1ff49b6839fc043d33bc94ec3b Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Thu, 18 Jan 2024 09:20:19 +0800
+Subject: tcp: make sure init the accept_queue's spinlocks once
+
+From: Zhengchao Shao <shaozhengchao@huawei.com>
+
+[ Upstream commit 198bc90e0e734e5f98c3d2833e8390cac3df61b2 ]
+
+When I run syz's reproduction C program locally, it causes the following
+issue:
+pvqspinlock: lock 0xffff9d181cd5c660 has corrupted value 0x0!
+WARNING: CPU: 19 PID: 21160 at __pv_queued_spin_unlock_slowpath (kernel/locking/qspinlock_paravirt.h:508)
+Hardware name: Red Hat KVM, BIOS 0.5.1 01/01/2011
+RIP: 0010:__pv_queued_spin_unlock_slowpath (kernel/locking/qspinlock_paravirt.h:508)
+Code: 73 56 3a ff 90 c3 cc cc cc cc 8b 05 bb 1f 48 01 85 c0 74 05 c3 cc cc cc cc 8b 17 48 89 fe 48 c7 c7
+30 20 ce 8f e8 ad 56 42 ff <0f> 0b c3 cc cc cc cc 0f 0b 0f 1f 40 00 90 90 90 90 90 90 90 90 90
+RSP: 0018:ffffa8d200604cb8 EFLAGS: 00010282
+RAX: 0000000000000000 RBX: 0000000000000000 RCX: ffff9d1ef60e0908
+RDX: 00000000ffffffd8 RSI: 0000000000000027 RDI: ffff9d1ef60e0900
+RBP: ffff9d181cd5c280 R08: 0000000000000000 R09: 00000000ffff7fff
+R10: ffffa8d200604b68 R11: ffffffff907dcdc8 R12: 0000000000000000
+R13: ffff9d181cd5c660 R14: ffff9d1813a3f330 R15: 0000000000001000
+FS: 00007fa110184640(0000) GS:ffff9d1ef60c0000(0000) knlGS:0000000000000000
+CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
+CR2: 0000000020000000 CR3: 000000011f65e000 CR4: 00000000000006f0
+Call Trace:
+<IRQ>
+ _raw_spin_unlock (kernel/locking/spinlock.c:186)
+ inet_csk_reqsk_queue_add (net/ipv4/inet_connection_sock.c:1321)
+ inet_csk_complete_hashdance (net/ipv4/inet_connection_sock.c:1358)
+ tcp_check_req (net/ipv4/tcp_minisocks.c:868)
+ tcp_v4_rcv (net/ipv4/tcp_ipv4.c:2260)
+ ip_protocol_deliver_rcu (net/ipv4/ip_input.c:205)
+ ip_local_deliver_finish (net/ipv4/ip_input.c:234)
+ __netif_receive_skb_one_core (net/core/dev.c:5529)
+ process_backlog (./include/linux/rcupdate.h:779)
+ __napi_poll (net/core/dev.c:6533)
+ net_rx_action (net/core/dev.c:6604)
+ __do_softirq (./arch/x86/include/asm/jump_label.h:27)
+ do_softirq (kernel/softirq.c:454 kernel/softirq.c:441)
+</IRQ>
+<TASK>
+ __local_bh_enable_ip (kernel/softirq.c:381)
+ __dev_queue_xmit (net/core/dev.c:4374)
+ ip_finish_output2 (./include/net/neighbour.h:540 net/ipv4/ip_output.c:235)
+ __ip_queue_xmit (net/ipv4/ip_output.c:535)
+ __tcp_transmit_skb (net/ipv4/tcp_output.c:1462)
+ tcp_rcv_synsent_state_process (net/ipv4/tcp_input.c:6469)
+ tcp_rcv_state_process (net/ipv4/tcp_input.c:6657)
+ tcp_v4_do_rcv (net/ipv4/tcp_ipv4.c:1929)
+ __release_sock (./include/net/sock.h:1121 net/core/sock.c:2968)
+ release_sock (net/core/sock.c:3536)
+ inet_wait_for_connect (net/ipv4/af_inet.c:609)
+ __inet_stream_connect (net/ipv4/af_inet.c:702)
+ inet_stream_connect (net/ipv4/af_inet.c:748)
+ __sys_connect (./include/linux/file.h:45 net/socket.c:2064)
+ __x64_sys_connect (net/socket.c:2073 net/socket.c:2070 net/socket.c:2070)
+ do_syscall_64 (arch/x86/entry/common.c:51 arch/x86/entry/common.c:82)
+ entry_SYSCALL_64_after_hwframe (arch/x86/entry/entry_64.S:129)
+ RIP: 0033:0x7fa10ff05a3d
+ Code: 5b 41 5c c3 66 0f 1f 84 00 00 00 00 00 f3 0f 1e fa 48 89 f8 48 89 f7 48 89 d6 48 89 ca 4d 89
+ c2 4d 89 c8 4c 8b 4c 24 08 0f 05 <48> 3d 01 f0 ff ff 73 01 c3 48 8b 0d ab a3 0e 00 f7 d8 64 89 01 48
+ RSP: 002b:00007fa110183de8 EFLAGS: 00000202 ORIG_RAX: 000000000000002a
+ RAX: ffffffffffffffda RBX: 0000000020000054 RCX: 00007fa10ff05a3d
+ RDX: 000000000000001c RSI: 0000000020000040 RDI: 0000000000000003
+ RBP: 00007fa110183e20 R08: 0000000000000000 R09: 0000000000000000
+ R10: 0000000000000000 R11: 0000000000000202 R12: 00007fa110184640
+ R13: 0000000000000000 R14: 00007fa10fe8b060 R15: 00007fff73e23b20
+</TASK>
+
+The issue triggering process is analyzed as follows:
+Thread A Thread B
+tcp_v4_rcv //receive ack TCP packet inet_shutdown
+ tcp_check_req tcp_disconnect //disconnect sock
+ ... tcp_set_state(sk, TCP_CLOSE)
+ inet_csk_complete_hashdance ...
+ inet_csk_reqsk_queue_add inet_listen //start listen
+ spin_lock(&queue->rskq_lock) inet_csk_listen_start
+ ... reqsk_queue_alloc
+ ... spin_lock_init
+ spin_unlock(&queue->rskq_lock) //warning
+
+When the socket receives the ACK packet during the three-way handshake,
+it holds the spinlock. If the user then actively shuts down the socket
+and immediately listens on it again, the spinlock is re-initialized while
+held. When the socket goes on to release the spinlock, a warning is
+generated. The same issue also applies to fastopenq.lock.
+
+Move the spinlock initialization to inet_create and inet_accept to make
+sure the accept_queue's spinlocks are initialized only once.
+
+Fixes: fff1f3001cc5 ("tcp: add a spinlock to protect struct request_sock_queue")
+Fixes: 168a8f58059a ("tcp: TCP Fast Open Server - main code path")
+Reported-by: Ming Shu <sming56@aliyun.com>
+Signed-off-by: Zhengchao Shao <shaozhengchao@huawei.com>
+Reviewed-by: Eric Dumazet <edumazet@google.com>
+Link: https://lore.kernel.org/r/20240118012019.1751966-1-shaozhengchao@huawei.com
+Signed-off-by: Jakub Kicinski <kuba@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ include/net/inet_connection_sock.h | 8 ++++++++
+ net/core/request_sock.c | 3 ---
+ net/ipv4/af_inet.c | 3 +++
+ net/ipv4/inet_connection_sock.c | 4 ++++
+ 4 files changed, 15 insertions(+), 3 deletions(-)
+
+diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h
+index d0a2f827d5f2..9ab4bf704e86 100644
+--- a/include/net/inet_connection_sock.h
++++ b/include/net/inet_connection_sock.h
+@@ -357,4 +357,12 @@ static inline bool inet_csk_has_ulp(const struct sock *sk)
+ return inet_test_bit(IS_ICSK, sk) && !!inet_csk(sk)->icsk_ulp_ops;
+ }
+
++static inline void inet_init_csk_locks(struct sock *sk)
++{
++ struct inet_connection_sock *icsk = inet_csk(sk);
++
++ spin_lock_init(&icsk->icsk_accept_queue.rskq_lock);
++ spin_lock_init(&icsk->icsk_accept_queue.fastopenq.lock);
++}
++
+ #endif /* _INET_CONNECTION_SOCK_H */
+diff --git a/net/core/request_sock.c b/net/core/request_sock.c
+index f35c2e998406..63de5c635842 100644
+--- a/net/core/request_sock.c
++++ b/net/core/request_sock.c
+@@ -33,9 +33,6 @@
+
+ void reqsk_queue_alloc(struct request_sock_queue *queue)
+ {
+- spin_lock_init(&queue->rskq_lock);
+-
+- spin_lock_init(&queue->fastopenq.lock);
+ queue->fastopenq.rskq_rst_head = NULL;
+ queue->fastopenq.rskq_rst_tail = NULL;
+ queue->fastopenq.qlen = 0;
+diff --git a/net/ipv4/af_inet.c b/net/ipv4/af_inet.c
+index ea0b0334a0fb..1c58bd72e124 100644
+--- a/net/ipv4/af_inet.c
++++ b/net/ipv4/af_inet.c
+@@ -330,6 +330,9 @@ static int inet_create(struct net *net, struct socket *sock, int protocol,
+ if (INET_PROTOSW_REUSE & answer_flags)
+ sk->sk_reuse = SK_CAN_REUSE;
+
++ if (INET_PROTOSW_ICSK & answer_flags)
++ inet_init_csk_locks(sk);
++
+ inet = inet_sk(sk);
+ inet_assign_bit(IS_ICSK, sk, INET_PROTOSW_ICSK & answer_flags);
+
+diff --git a/net/ipv4/inet_connection_sock.c b/net/ipv4/inet_connection_sock.c
+index 394a498c2823..762817d6c8d7 100644
+--- a/net/ipv4/inet_connection_sock.c
++++ b/net/ipv4/inet_connection_sock.c
+@@ -730,6 +730,10 @@ struct sock *inet_csk_accept(struct sock *sk, int flags, int *err, bool kern)
+ }
+ if (req)
+ reqsk_put(req);
++
++ if (newsk)
++ inet_init_csk_locks(newsk);
++
+ return newsk;
+ out_err:
+ newsk = NULL;
+--
+2.43.0
+
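+The rule the fix restores: a lock that may be held on one CPU must
+never go through spin_lock_init() again on another, so locks belong in
+the object's one-time construction path. Condensed from the diff above:
+
+    /* inet_create()/inet_accept(): construction, runs once per sock */
+    spin_lock_init(&icsk->icsk_accept_queue.rskq_lock);
+    spin_lock_init(&icsk->icsk_accept_queue.fastopenq.lock);
+
+    /* reqsk_queue_alloc(), called from every listen(): reset only
+     * the queue contents, never the locks themselves */
+    queue->fastopenq.rskq_rst_head = NULL;
+    queue->fastopenq.rskq_rst_tail = NULL;
+    queue->fastopenq.qlen = 0;
+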
--- /dev/null
+From bb9fda497e2b1f6927c979fc39505e3da60e56bb Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Mon, 22 Jan 2024 16:09:28 +0100
+Subject: tracing: Ensure visibility when inserting an element into tracing_map
+
+From: Petr Pavlu <petr.pavlu@suse.com>
+
+[ Upstream commit 2b44760609e9eaafc9d234a6883d042fc21132a7 ]
+
+Running the following two commands in parallel on a multi-processor
+AArch64 machine can sporadically produce an unexpected warning about
+duplicate histogram entries:
+
+ $ while true; do
+ echo hist:key=id.syscall:val=hitcount > \
+ /sys/kernel/debug/tracing/events/raw_syscalls/sys_enter/trigger
+ cat /sys/kernel/debug/tracing/events/raw_syscalls/sys_enter/hist
+ sleep 0.001
+ done
+ $ stress-ng --sysbadaddr $(nproc)
+
+The warning looks as follows:
+
+[ 2911.172474] ------------[ cut here ]------------
+[ 2911.173111] Duplicates detected: 1
+[ 2911.173574] WARNING: CPU: 2 PID: 12247 at kernel/trace/tracing_map.c:983 tracing_map_sort_entries+0x3e0/0x408
+[ 2911.174702] Modules linked in: iscsi_ibft(E) iscsi_boot_sysfs(E) rfkill(E) af_packet(E) nls_iso8859_1(E) nls_cp437(E) vfat(E) fat(E) ena(E) tiny_power_button(E) qemu_fw_cfg(E) button(E) fuse(E) efi_pstore(E) ip_tables(E) x_tables(E) xfs(E) libcrc32c(E) aes_ce_blk(E) aes_ce_cipher(E) crct10dif_ce(E) polyval_ce(E) polyval_generic(E) ghash_ce(E) gf128mul(E) sm4_ce_gcm(E) sm4_ce_ccm(E) sm4_ce(E) sm4_ce_cipher(E) sm4(E) sm3_ce(E) sm3(E) sha3_ce(E) sha512_ce(E) sha512_arm64(E) sha2_ce(E) sha256_arm64(E) nvme(E) sha1_ce(E) nvme_core(E) nvme_auth(E) t10_pi(E) sg(E) scsi_mod(E) scsi_common(E) efivarfs(E)
+[ 2911.174738] Unloaded tainted modules: cppc_cpufreq(E):1
+[ 2911.180985] CPU: 2 PID: 12247 Comm: cat Kdump: loaded Tainted: G E 6.7.0-default #2 1b58bbb22c97e4399dc09f92d309344f69c44a01
+[ 2911.182398] Hardware name: Amazon EC2 c7g.8xlarge/, BIOS 1.0 11/1/2018
+[ 2911.183208] pstate: 61400005 (nZCv daif +PAN -UAO -TCO +DIT -SSBS BTYPE=--)
+[ 2911.184038] pc : tracing_map_sort_entries+0x3e0/0x408
+[ 2911.184667] lr : tracing_map_sort_entries+0x3e0/0x408
+[ 2911.185310] sp : ffff8000a1513900
+[ 2911.185750] x29: ffff8000a1513900 x28: ffff0003f272fe80 x27: 0000000000000001
+[ 2911.186600] x26: ffff0003f272fe80 x25: 0000000000000030 x24: 0000000000000008
+[ 2911.187458] x23: ffff0003c5788000 x22: ffff0003c16710c8 x21: ffff80008017f180
+[ 2911.188310] x20: ffff80008017f000 x19: ffff80008017f180 x18: ffffffffffffffff
+[ 2911.189160] x17: 0000000000000000 x16: 0000000000000000 x15: ffff8000a15134b8
+[ 2911.190015] x14: 0000000000000000 x13: 205d373432323154 x12: 5b5d313131333731
+[ 2911.190844] x11: 00000000fffeffff x10: 00000000fffeffff x9 : ffffd1b78274a13c
+[ 2911.191716] x8 : 000000000017ffe8 x7 : c0000000fffeffff x6 : 000000000057ffa8
+[ 2911.192554] x5 : ffff0012f6c24ec0 x4 : 0000000000000000 x3 : ffff2e5b72b5d000
+[ 2911.193404] x2 : 0000000000000000 x1 : 0000000000000000 x0 : ffff0003ff254480
+[ 2911.194259] Call trace:
+[ 2911.194626] tracing_map_sort_entries+0x3e0/0x408
+[ 2911.195220] hist_show+0x124/0x800
+[ 2911.195692] seq_read_iter+0x1d4/0x4e8
+[ 2911.196193] seq_read+0xe8/0x138
+[ 2911.196638] vfs_read+0xc8/0x300
+[ 2911.197078] ksys_read+0x70/0x108
+[ 2911.197534] __arm64_sys_read+0x24/0x38
+[ 2911.198046] invoke_syscall+0x78/0x108
+[ 2911.198553] el0_svc_common.constprop.0+0xd0/0xf8
+[ 2911.199157] do_el0_svc+0x28/0x40
+[ 2911.199613] el0_svc+0x40/0x178
+[ 2911.200048] el0t_64_sync_handler+0x13c/0x158
+[ 2911.200621] el0t_64_sync+0x1a8/0x1b0
+[ 2911.201115] ---[ end trace 0000000000000000 ]---
+
+The problem appears to be caused by CPU reordering of writes issued from
+__tracing_map_insert().
+
+The check for the presence of an element with a given key in this
+function is:
+
+ val = READ_ONCE(entry->val);
+ if (val && keys_match(key, val->key, map->key_size)) ...
+
+The write of a new entry is:
+
+ elt = get_free_elt(map);
+ memcpy(elt->key, key, map->key_size);
+ entry->val = elt;
+
+The "memcpy(elt->key, key, map->key_size);" and "entry->val = elt;"
+stores may become visible in the reversed order on another CPU. This
+second CPU might then incorrectly determine that a new key doesn't match
+an already present val->key and subsequently insert a new element,
+resulting in a duplicate.
+
+Fix the problem by adding a write barrier between
+"memcpy(elt->key, key, map->key_size);" and "entry->val = elt;", and for
+good measure, also use WRITE_ONCE(entry->val, elt) for publishing the
+element. The sequence pairs with the mentioned "READ_ONCE(entry->val);"
+and the "val->key" check which has an address dependency.
+
+The barrier is placed on a path executed when adding an element for
+a new key. Subsequent updates targeting the same key remain unaffected.
+
+From the user's perspective, the issue was introduced by commit
+c193707dde77 ("tracing: Remove code which merges duplicates"), which
+followed commit cbf4100efb8f ("tracing: Add support to detect and avoid
+duplicates"). The previous code operated differently; it inherently
+expected potential races which result in duplicates but merged them
+later when they occurred.
+
+Link: https://lore.kernel.org/linux-trace-kernel/20240122150928.27725-1-petr.pavlu@suse.com
+
+Fixes: c193707dde77 ("tracing: Remove code which merges duplicates")
+Signed-off-by: Petr Pavlu <petr.pavlu@suse.com>
+Acked-by: Tom Zanussi <tom.zanussi@linux.intel.com>
+Signed-off-by: Steven Rostedt (Google) <rostedt@goodmis.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ kernel/trace/tracing_map.c | 7 ++++++-
+ 1 file changed, 6 insertions(+), 1 deletion(-)
+
+diff --git a/kernel/trace/tracing_map.c b/kernel/trace/tracing_map.c
+index c774e560f2f9..a4dcf0f24352 100644
+--- a/kernel/trace/tracing_map.c
++++ b/kernel/trace/tracing_map.c
+@@ -574,7 +574,12 @@ __tracing_map_insert(struct tracing_map *map, void *key, bool lookup_only)
+ }
+
+ memcpy(elt->key, key, map->key_size);
+- entry->val = elt;
++ /*
++ * Ensure the initialization is visible and
++ * publish the elt.
++ */
++ smp_wmb();
++ WRITE_ONCE(entry->val, elt);
+ atomic64_inc(&map->hits);
+
+ return entry->val;
+--
+2.43.0
+
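+The resulting publish/consume pairing, shown schematically (a condensed
+sketch of the code above and its reader):
+
+    /* writer, __tracing_map_insert(): initialize, then publish */
+    memcpy(elt->key, key, map->key_size);
+    smp_wmb();              /* key bytes visible before the pointer */
+    WRITE_ONCE(entry->val, elt);
+
+    /* reader: the address dependency on 'val' orders the key load */
+    val = READ_ONCE(entry->val);
+    if (val && keys_match(key, val->key, map->key_size))
+            /* found an existing element for this key */;
+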
--- /dev/null
+From 96e806d8a4c5fc2829444a2c28f953e512431242 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Tue, 23 Jan 2024 21:09:18 +0100
+Subject: tsnep: Fix XDP_RING_NEED_WAKEUP for empty fill ring
+
+From: Gerhard Engleder <gerhard@engleder-embedded.com>
+
+[ Upstream commit 9a91c05f4bd6f6bdd6b8f90445e0da92e3ac956c ]
+
+The fill ring of the XDP socket may not contain enough buffers to
+completely fill the RX queue during socket creation. In this case the
+flag XDP_RING_NEED_WAKEUP is not set, as this flag is only set if the RX
+queue is not completely filled during polling.
+
+Set XDP_RING_NEED_WAKEUP flag also if RX queue is not completely filled
+during XDP socket creation.
+
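+For reference, the user-space side of this contract (a sketch using
+libxdp's xsk.h helpers; the fill_q and xsk variable names are
+illustrative):
+
+ /* if the kernel flagged the fill queue, kick the kernel so it
+  * can refill the RX queue after buffers become available */
+ if (xsk_ring_prod__needs_wakeup(&fill_q))
+         recvfrom(xsk_socket__fd(xsk), NULL, 0, MSG_DONTWAIT,
+                  NULL, NULL);
+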
+Fixes: 3fc2333933fd ("tsnep: Add XDP socket zero-copy RX support")
+Signed-off-by: Gerhard Engleder <gerhard@engleder-embedded.com>
+Signed-off-by: Paolo Abeni <pabeni@redhat.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/net/ethernet/engleder/tsnep_main.c | 13 +++++++++++++
+ 1 file changed, 13 insertions(+)
+
+diff --git a/drivers/net/ethernet/engleder/tsnep_main.c b/drivers/net/ethernet/engleder/tsnep_main.c
+index 456e0336f3f6..9aeff2b37a61 100644
+--- a/drivers/net/ethernet/engleder/tsnep_main.c
++++ b/drivers/net/ethernet/engleder/tsnep_main.c
+@@ -1762,6 +1762,19 @@ static void tsnep_rx_reopen_xsk(struct tsnep_rx *rx)
+ allocated--;
+ }
+ }
++
++ /* set need wakeup flag immediately if ring is not filled completely,
++ * first polling would be too late as need wakeup signalisation would
++ * be delayed for an indefinite time
++ */
++ if (xsk_uses_need_wakeup(rx->xsk_pool)) {
++ int desc_available = tsnep_rx_desc_available(rx);
++
++ if (desc_available)
++ xsk_set_rx_need_wakeup(rx->xsk_pool);
++ else
++ xsk_clear_rx_need_wakeup(rx->xsk_pool);
++ }
+ }
+
+ static bool tsnep_pending(struct tsnep_queue *queue)
+--
+2.43.0
+
--- /dev/null
+From 2d21d1e8559b9b89588155510d926751ac77c1ba Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Tue, 23 Jan 2024 21:09:17 +0100
+Subject: tsnep: Remove FCS for XDP data path
+
+From: Gerhard Engleder <gerhard@engleder-embedded.com>
+
+[ Upstream commit 50bad6f797d4d501c5ef416a6f92e1912ab5aa8b ]
+
+The RX data buffer includes the FCS. The FCS is already stripped for
+the normal data path, but for the XDP data path it is still included
+and acts as additional, useless data.
+
+Remove the FCS from the RX data buffer for XDP as well.
+
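+The arithmetic is simply dropping the trailing 4 bytes (a sketch;
+desc_len stands for the hardware-reported descriptor length):
+
+ /* DMA'd buffer = metadata + frame + FCS; XDP must not see the
+  * trailing FCS */
+ length = desc_len - ETH_FCS_LEN; /* ETH_FCS_LEN == 4 */
+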
+Fixes: 65b28c810035 ("tsnep: Add XDP RX support")
+Fixes: 3fc2333933fd ("tsnep: Add XDP socket zero-copy RX support")
+Signed-off-by: Gerhard Engleder <gerhard@engleder-embedded.com>
+Signed-off-by: Paolo Abeni <pabeni@redhat.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/net/ethernet/engleder/tsnep_main.c | 4 ++--
+ 1 file changed, 2 insertions(+), 2 deletions(-)
+
+diff --git a/drivers/net/ethernet/engleder/tsnep_main.c b/drivers/net/ethernet/engleder/tsnep_main.c
+index df40c720e7b2..456e0336f3f6 100644
+--- a/drivers/net/ethernet/engleder/tsnep_main.c
++++ b/drivers/net/ethernet/engleder/tsnep_main.c
+@@ -1485,7 +1485,7 @@ static int tsnep_rx_poll(struct tsnep_rx *rx, struct napi_struct *napi,
+
+ xdp_prepare_buff(&xdp, page_address(entry->page),
+ XDP_PACKET_HEADROOM + TSNEP_RX_INLINE_METADATA_SIZE,
+- length, false);
++ length - ETH_FCS_LEN, false);
+
+ consume = tsnep_xdp_run_prog(rx, prog, &xdp,
+ &xdp_status, tx_nq, tx);
+@@ -1568,7 +1568,7 @@ static int tsnep_rx_poll_zc(struct tsnep_rx *rx, struct napi_struct *napi,
+ prefetch(entry->xdp->data);
+ length = __le32_to_cpu(entry->desc_wb->properties) &
+ TSNEP_DESC_LENGTH_MASK;
+- xsk_buff_set_size(entry->xdp, length);
++ xsk_buff_set_size(entry->xdp, length - ETH_FCS_LEN);
+ xsk_buff_dma_sync_for_cpu(entry->xdp, rx->xsk_pool);
+
+ /* RX metadata with timestamps is in front of actual data,
+--
+2.43.0
+
--- /dev/null
+From 1cc6dc39a13a171888b77d808036b7e7b1013f78 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Fri, 19 Jan 2024 18:22:56 +0800
+Subject: tun: add missing rx stats accounting in tun_xdp_act
+
+From: Yunjian Wang <wangyunjian@huawei.com>
+
+[ Upstream commit f1084c427f55d573fcd5688d9ba7b31b78019716 ]
+
+TUN can be used as a vhost-net backend, and it is necessary to count
+the packets transmitted from TUN to vhost-net/virtio-net. However, some
+places in the receive path were not taken into account when using XDP.
+Add the missing accounting of successfully received bytes using
+dev_sw_netstats_rx_add.
+
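+For context, dev_sw_netstats_rx_add() (include/linux/netdevice.h)
+roughly amounts to the following per-CPU update (paraphrased sketch):
+
+ struct pcpu_sw_netstats *tstats = this_cpu_ptr(dev->tstats);
+
+ u64_stats_update_begin(&tstats->syncp);
+ u64_stats_inc(&tstats->rx_packets);
+ u64_stats_add(&tstats->rx_bytes, len);
+ u64_stats_update_end(&tstats->syncp);
+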
+Fixes: 761876c857cb ("tap: XDP support")
+Signed-off-by: Yunjian Wang <wangyunjian@huawei.com>
+Reviewed-by: Willem de Bruijn <willemb@google.com>
+Acked-by: Jason Wang <jasowang@redhat.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/net/tun.c | 2 ++
+ 1 file changed, 2 insertions(+)
+
+diff --git a/drivers/net/tun.c b/drivers/net/tun.c
+index 237fef557ba5..4a4f8c8e79fa 100644
+--- a/drivers/net/tun.c
++++ b/drivers/net/tun.c
+@@ -1634,6 +1634,7 @@ static int tun_xdp_act(struct tun_struct *tun, struct bpf_prog *xdp_prog,
+ dev_core_stats_rx_dropped_inc(tun->dev);
+ return err;
+ }
++ dev_sw_netstats_rx_add(tun->dev, xdp->data_end - xdp->data);
+ break;
+ case XDP_TX:
+ err = tun_xdp_tx(tun->dev, xdp);
+@@ -1641,6 +1642,7 @@ static int tun_xdp_act(struct tun_struct *tun, struct bpf_prog *xdp_prog,
+ dev_core_stats_rx_dropped_inc(tun->dev);
+ return err;
+ }
++ dev_sw_netstats_rx_add(tun->dev, xdp->data_end - xdp->data);
+ break;
+ case XDP_PASS:
+ break;
+--
+2.43.0
+
--- /dev/null
+From c8a46f874ec240aa00fd746cab6f00c704ef999a Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Fri, 19 Jan 2024 18:22:35 +0800
+Subject: tun: fix missing dropped counter in tun_xdp_act
+
+From: Yunjian Wang <wangyunjian@huawei.com>
+
+[ Upstream commit 5744ba05e7c4bff8fec133dd0f9e51ddffba92f5 ]
+
+Commit 8ae1aff0b331 ("tuntap: split out XDP logic") added a dropped
+counter for the XDP_DROP, XDP_ABORTED, and invalid XDP actions.
+Unfortunately, that commit missed the dropped counter when an error
+occurs during the XDP_TX and XDP_REDIRECT actions. This patch fixes
+that.
+
+Fixes: 8ae1aff0b331 ("tuntap: split out XDP logic")
+Signed-off-by: Yunjian Wang <wangyunjian@huawei.com>
+Reviewed-by: Willem de Bruijn <willemb@google.com>
+Acked-by: Jason Wang <jasowang@redhat.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/net/tun.c | 8 ++++++--
+ 1 file changed, 6 insertions(+), 2 deletions(-)
+
+diff --git a/drivers/net/tun.c b/drivers/net/tun.c
+index afa5497f7c35..237fef557ba5 100644
+--- a/drivers/net/tun.c
++++ b/drivers/net/tun.c
+@@ -1630,13 +1630,17 @@ static int tun_xdp_act(struct tun_struct *tun, struct bpf_prog *xdp_prog,
+ switch (act) {
+ case XDP_REDIRECT:
+ err = xdp_do_redirect(tun->dev, xdp, xdp_prog);
+- if (err)
++ if (err) {
++ dev_core_stats_rx_dropped_inc(tun->dev);
+ return err;
++ }
+ break;
+ case XDP_TX:
+ err = tun_xdp_tx(tun->dev, xdp);
+- if (err < 0)
++ if (err < 0) {
++ dev_core_stats_rx_dropped_inc(tun->dev);
+ return err;
++ }
+ break;
+ case XDP_PASS:
+ break;
+--
+2.43.0
+
--- /dev/null
+From 40295213484304936b40dafc21ab65a5dd7cce8d Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Thu, 18 Jan 2024 20:17:49 +0000
+Subject: udp: fix busy polling
+
+From: Eric Dumazet <edumazet@google.com>
+
+[ Upstream commit a54d51fb2dfb846aedf3751af501e9688db447f5 ]
+
+Generic sk_busy_loop_end() only looks at sk->sk_receive_queue
+for presence of packets.
+
+The problem is that, for UDP sockets, after the blamed commit some
+packets can be present in another queue: udp_sk(sk)->reader_queue.
+
+In some cases, a busy poller could spin until timeout expiration,
+even if some packets are available in udp_sk(sk)->reader_queue.
+
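+For reference, a receiver opts into busy polling with SO_BUSY_POLL
+(illustrative user-space sketch; the 50us budget is arbitrary):
+
+ int usec = 50;
+
+ setsockopt(fd, SOL_SOCKET, SO_BUSY_POLL, &usec, sizeof(usec));
+ /* with the bug, a read on this socket could busy-spin for the
+  * whole budget even though datagrams already sat in
+  * udp_sk(sk)->reader_queue */
+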
+v3: - make sk_busy_loop_end() nicer (Willem)
+
+v2: - add a READ_ONCE(sk->sk_family) in sk_is_inet() to avoid KCSAN splats.
+ - add a sk_is_inet() check in sk_is_udp() (Willem feedback)
+ - add a sk_is_inet() check in sk_is_tcp().
+
+Fixes: 2276f58ac589 ("udp: use a separate rx queue for packet reception")
+Signed-off-by: Eric Dumazet <edumazet@google.com>
+Reviewed-by: Paolo Abeni <pabeni@redhat.com>
+Reviewed-by: Willem de Bruijn <willemb@google.com>
+Reviewed-by: Kuniyuki Iwashima <kuniyu@amazon.com>
+Signed-off-by: David S. Miller <davem@davemloft.net>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ include/linux/skmsg.h | 6 ------
+ include/net/inet_sock.h | 5 -----
+ include/net/sock.h | 18 +++++++++++++++++-
+ net/core/sock.c | 11 +++++++++--
+ 4 files changed, 26 insertions(+), 14 deletions(-)
+
+diff --git a/include/linux/skmsg.h b/include/linux/skmsg.h
+index c953b8c0d2f4..bd4418377bac 100644
+--- a/include/linux/skmsg.h
++++ b/include/linux/skmsg.h
+@@ -500,12 +500,6 @@ static inline bool sk_psock_strp_enabled(struct sk_psock *psock)
+ return !!psock->saved_data_ready;
+ }
+
+-static inline bool sk_is_udp(const struct sock *sk)
+-{
+- return sk->sk_type == SOCK_DGRAM &&
+- sk->sk_protocol == IPPROTO_UDP;
+-}
+-
+ #if IS_ENABLED(CONFIG_NET_SOCK_MSG)
+
+ #define BPF_F_STRPARSER (1UL << 1)
+diff --git a/include/net/inet_sock.h b/include/net/inet_sock.h
+index 74db6d97cae1..8d5fe15b0f6f 100644
+--- a/include/net/inet_sock.h
++++ b/include/net/inet_sock.h
+@@ -310,11 +310,6 @@ static inline unsigned long inet_cmsg_flags(const struct inet_sock *inet)
+ #define inet_assign_bit(nr, sk, val) \
+ assign_bit(INET_FLAGS_##nr, &inet_sk(sk)->inet_flags, val)
+
+-static inline bool sk_is_inet(struct sock *sk)
+-{
+- return sk->sk_family == AF_INET || sk->sk_family == AF_INET6;
+-}
+-
+ /**
+ * sk_to_full_sk - Access to a full socket
+ * @sk: pointer to a socket
+diff --git a/include/net/sock.h b/include/net/sock.h
+index 0201136b0b9c..f9a9f61fa122 100644
+--- a/include/net/sock.h
++++ b/include/net/sock.h
+@@ -2794,9 +2794,25 @@ static inline void skb_setup_tx_timestamp(struct sk_buff *skb, __u16 tsflags)
+ &skb_shinfo(skb)->tskey);
+ }
+
++static inline bool sk_is_inet(const struct sock *sk)
++{
++ int family = READ_ONCE(sk->sk_family);
++
++ return family == AF_INET || family == AF_INET6;
++}
++
+ static inline bool sk_is_tcp(const struct sock *sk)
+ {
+- return sk->sk_type == SOCK_STREAM && sk->sk_protocol == IPPROTO_TCP;
++ return sk_is_inet(sk) &&
++ sk->sk_type == SOCK_STREAM &&
++ sk->sk_protocol == IPPROTO_TCP;
++}
++
++static inline bool sk_is_udp(const struct sock *sk)
++{
++ return sk_is_inet(sk) &&
++ sk->sk_type == SOCK_DGRAM &&
++ sk->sk_protocol == IPPROTO_UDP;
+ }
+
+ static inline bool sk_is_stream_unix(const struct sock *sk)
+diff --git a/net/core/sock.c b/net/core/sock.c
+index d02534c77413..e5d43a068f8e 100644
+--- a/net/core/sock.c
++++ b/net/core/sock.c
+@@ -107,6 +107,7 @@
+ #include <linux/interrupt.h>
+ #include <linux/poll.h>
+ #include <linux/tcp.h>
++#include <linux/udp.h>
+ #include <linux/init.h>
+ #include <linux/highmem.h>
+ #include <linux/user_namespace.h>
+@@ -4148,8 +4149,14 @@ bool sk_busy_loop_end(void *p, unsigned long start_time)
+ {
+ struct sock *sk = p;
+
+- return !skb_queue_empty_lockless(&sk->sk_receive_queue) ||
+- sk_busy_loop_timeout(sk, start_time);
++ if (!skb_queue_empty_lockless(&sk->sk_receive_queue))
++ return true;
++
++ if (sk_is_udp(sk) &&
++ !skb_queue_empty_lockless(&udp_sk(sk)->reader_queue))
++ return true;
++
++ return sk_busy_loop_timeout(sk, start_time);
+ }
+ EXPORT_SYMBOL(sk_busy_loop_end);
+ #endif /* CONFIG_NET_RX_BUSY_POLL */
+--
+2.43.0
+
--- /dev/null
+From b3cc206ba9b0e1375f74191078ca0f84f0e06365 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Thu, 18 Jan 2024 21:03:06 +0800
+Subject: vlan: skip nested type that is not IFLA_VLAN_QOS_MAPPING
+
+From: Lin Ma <linma@zju.edu.cn>
+
+[ Upstream commit 6c21660fe221a15c789dee2bc2fd95516bc5aeaf ]
+
+In the vlan_changelink function, a loop is used to parse the nested
+attributes IFLA_VLAN_EGRESS_QOS and IFLA_VLAN_INGRESS_QOS in order to
+obtain the struct ifla_vlan_qos_mapping. These two nested attributes are
+checked in the vlan_validate_qos_map function, which calls
+nla_validate_nested_deprecated with the vlan_map_policy.
+
+However, this deprecated validator applies a LIBERAL strictness, allowing
+the presence of an attribute with the type IFLA_VLAN_QOS_UNSPEC.
+Consequently, the loop in vlan_changelink may parse an attribute of type
+IFLA_VLAN_QOS_UNSPEC and believe it carries a payload of
+struct ifla_vlan_qos_mapping, which is not necessarily true.
+
+To address this issue and ensure compatibility, this patch introduces two
+type checks that skip attributes whose type is not IFLA_VLAN_QOS_MAPPING.
+
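+The liberal behaviour stems from the policy itself, which only
+describes the one expected attribute type (a sketch matching
+net/8021q/vlan_netlink.c):
+
+ static const struct nla_policy vlan_map_policy[IFLA_VLAN_QOS_MAX + 1] = {
+         [IFLA_VLAN_QOS_MAPPING] = { .len = sizeof(struct ifla_vlan_qos_mapping) },
+ };
+
+Deprecated validation lets attribute types outside this policy (such
+as IFLA_VLAN_QOS_UNSPEC) pass through, hence the explicit nla_type()
+checks.
+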
+Fixes: 07b5b17e157b ("[VLAN]: Use rtnl_link API")
+Signed-off-by: Lin Ma <linma@zju.edu.cn>
+Reviewed-by: Simon Horman <horms@kernel.org>
+Link: https://lore.kernel.org/r/20240118130306.1644001-1-linma@zju.edu.cn
+Signed-off-by: Jakub Kicinski <kuba@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ net/8021q/vlan_netlink.c | 4 ++++
+ 1 file changed, 4 insertions(+)
+
+diff --git a/net/8021q/vlan_netlink.c b/net/8021q/vlan_netlink.c
+index 214532173536..a3b68243fd4b 100644
+--- a/net/8021q/vlan_netlink.c
++++ b/net/8021q/vlan_netlink.c
+@@ -118,12 +118,16 @@ static int vlan_changelink(struct net_device *dev, struct nlattr *tb[],
+ }
+ if (data[IFLA_VLAN_INGRESS_QOS]) {
+ nla_for_each_nested(attr, data[IFLA_VLAN_INGRESS_QOS], rem) {
++ if (nla_type(attr) != IFLA_VLAN_QOS_MAPPING)
++ continue;
+ m = nla_data(attr);
+ vlan_dev_set_ingress_priority(dev, m->to, m->from);
+ }
+ }
+ if (data[IFLA_VLAN_EGRESS_QOS]) {
+ nla_for_each_nested(attr, data[IFLA_VLAN_EGRESS_QOS], rem) {
++ if (nla_type(attr) != IFLA_VLAN_QOS_MAPPING)
++ continue;
+ m = nla_data(attr);
+ err = vlan_dev_set_egress_priority(dev, m->from, m->to);
+ if (err)
+--
+2.43.0
+
--- /dev/null
+From fd2890505f1dd291a5dc74f190d704c71b303d92 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Thu, 11 Jan 2024 18:17:44 +0200
+Subject: wifi: mac80211: fix potential sta-link leak
+
+From: Johannes Berg <johannes.berg@intel.com>
+
+[ Upstream commit b01a74b3ca6fd51b62c67733ba7c3280fa6c5d26 ]
+
+When a station is allocated, links are added but not
+set to valid yet (e.g. during connection to an AP MLD),
+we might remove the station without ever marking links
+valid, and leak them. Fix that.
+
+Fixes: cb71f1d136a6 ("wifi: mac80211: add sta link addition/removal")
+Signed-off-by: Johannes Berg <johannes.berg@intel.com>
+Reviewed-by: Ilan Peer <ilan.peer@intel.com>
+Signed-off-by: Miri Korenblit <miriam.rachel.korenblit@intel.com>
+Link: https://msgid.link/20240111181514.6573998beaf8.I09ac2e1d41c80f82a5a616b8bd1d9d8dd709a6a6@changeid
+Signed-off-by: Johannes Berg <johannes.berg@intel.com>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ net/mac80211/sta_info.c | 5 ++++-
+ 1 file changed, 4 insertions(+), 1 deletion(-)
+
+diff --git a/net/mac80211/sta_info.c b/net/mac80211/sta_info.c
+index 0ba613dd1cc4..c33decbb97f2 100644
+--- a/net/mac80211/sta_info.c
++++ b/net/mac80211/sta_info.c
+@@ -404,7 +404,10 @@ void sta_info_free(struct ieee80211_local *local, struct sta_info *sta)
+ int i;
+
+ for (i = 0; i < ARRAY_SIZE(sta->link); i++) {
+- if (!(sta->sta.valid_links & BIT(i)))
++ struct link_sta_info *link_sta;
++
++ link_sta = rcu_access_pointer(sta->link[i]);
++ if (!link_sta)
+ continue;
+
+ sta_remove_link(sta, i, false);
+--
+2.43.0
+
--- /dev/null
+From ce88e3847c9d2c03cbd9e1a47d10d20adfe0bdc3 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Wed, 24 Jan 2024 20:16:00 +0100
+Subject: xdp: reflect tail increase for MEM_TYPE_XSK_BUFF_POOL
+
+From: Maciej Fijalkowski <maciej.fijalkowski@intel.com>
+
+[ Upstream commit fbadd83a612c3b7aad2987893faca6bd24aaebb3 ]
+
+The XSK ZC Rx path calculates the size of the data that will be posted
+to the XSK Rx queue by subtracting xdp_buff::data from
+xdp_buff::data_end.
+
+In bpf_xdp_frags_increase_tail(), when the underlying memory type of
+xdp_rxq_info is MEM_TYPE_XSK_BUFF_POOL, add the offset to data_end in
+the tail fragment, so that later on user space will be able to take
+into account the amount of bytes added by the XDP program.
+
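+A minimal BPF-side trigger for this path (a sketch; the 16-byte growth
+is arbitrary):
+
+ SEC("xdp.frags")
+ int grow_tail(struct xdp_md *ctx)
+ {
+         /* grow the multi-buffer frame; the added bytes land in the
+          * tail fragment adjusted above */
+         if (bpf_xdp_adjust_tail(ctx, 16))
+                 return XDP_DROP;
+         return XDP_PASS;
+ }
+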
+Fixes: 24ea50127ecf ("xsk: support mbuf on ZC RX")
+Signed-off-by: Maciej Fijalkowski <maciej.fijalkowski@intel.com>
+Link: https://lore.kernel.org/r/20240124191602.566724-10-maciej.fijalkowski@intel.com
+Signed-off-by: Alexei Starovoitov <ast@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ net/core/filter.c | 2 ++
+ 1 file changed, 2 insertions(+)
+
+diff --git a/net/core/filter.c b/net/core/filter.c
+index 6575288b8580..cee53838310f 100644
+--- a/net/core/filter.c
++++ b/net/core/filter.c
+@@ -4091,6 +4091,8 @@ static int bpf_xdp_frags_increase_tail(struct xdp_buff *xdp, int offset)
+ memset(skb_frag_address(frag) + skb_frag_size(frag), 0, offset);
+ skb_frag_size_add(frag, offset);
+ sinfo->xdp_frags_size += offset;
++ if (rxq->mem.type == MEM_TYPE_XSK_BUFF_POOL)
++ xsk_buff_get_tail(xdp)->data_end += offset;
+
+ return 0;
+ }
+--
+2.43.0
+
--- /dev/null
+From b345185d903cd3418f8b01e7cdd56bdcb02fcac4 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Wed, 24 Jan 2024 20:15:54 +0100
+Subject: xsk: fix usage of multi-buffer BPF helpers for ZC XDP
+
+From: Maciej Fijalkowski <maciej.fijalkowski@intel.com>
+
+[ Upstream commit c5114710c8ce86b8317e9b448f4fd15c711c2a82 ]
+
+Currently when packet is shrunk via bpf_xdp_adjust_tail() and memory
+type is set to MEM_TYPE_XSK_BUFF_POOL, null ptr dereference happens:
+
+[1136314.192256] BUG: kernel NULL pointer dereference, address: 0000000000000034
+[1136314.203943] #PF: supervisor read access in kernel mode
+[1136314.213768] #PF: error_code(0x0000) - not-present page
+[1136314.223550] PGD 0 P4D 0
+[1136314.230684] Oops: 0000 [#1] PREEMPT SMP NOPTI
+[1136314.239621] CPU: 8 PID: 54203 Comm: xdpsock Not tainted 6.6.0+ #257
+[1136314.250469] Hardware name: Intel Corporation S2600WFT/S2600WFT, BIOS SE5C620.86B.02.01.0008.031920191559 03/19/2019
+[1136314.265615] RIP: 0010:__xdp_return+0x6c/0x210
+[1136314.274653] Code: ad 00 48 8b 47 08 49 89 f8 a8 01 0f 85 9b 01 00 00 0f 1f 44 00 00 f0 41 ff 48 34 75 32 4c 89 c7 e9 79 cd 80 ff 83 fe 03 75 17 <f6> 41 34 01 0f 85 02 01 00 00 48 89 cf e9 22 cc 1e 00 e9 3d d2 86
+[1136314.302907] RSP: 0018:ffffc900089f8db0 EFLAGS: 00010246
+[1136314.312967] RAX: ffffc9003168aed0 RBX: ffff8881c3300000 RCX: 0000000000000000
+[1136314.324953] RDX: 0000000000000000 RSI: 0000000000000003 RDI: ffffc9003168c000
+[1136314.336929] RBP: 0000000000000ae0 R08: 0000000000000002 R09: 0000000000010000
+[1136314.348844] R10: ffffc9000e495000 R11: 0000000000000040 R12: 0000000000000001
+[1136314.360706] R13: 0000000000000524 R14: ffffc9003168aec0 R15: 0000000000000001
+[1136314.373298] FS: 00007f8df8bbcb80(0000) GS:ffff8897e0e00000(0000) knlGS:0000000000000000
+[1136314.386105] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
+[1136314.396532] CR2: 0000000000000034 CR3: 00000001aa912002 CR4: 00000000007706f0
+[1136314.408377] DR0: 0000000000000000 DR1: 0000000000000000 DR2: 0000000000000000
+[1136314.420173] DR3: 0000000000000000 DR6: 00000000fffe0ff0 DR7: 0000000000000400
+[1136314.431890] PKRU: 55555554
+[1136314.439143] Call Trace:
+[1136314.446058] <IRQ>
+[1136314.452465] ? __die+0x20/0x70
+[1136314.459881] ? page_fault_oops+0x15b/0x440
+[1136314.468305] ? exc_page_fault+0x6a/0x150
+[1136314.476491] ? asm_exc_page_fault+0x22/0x30
+[1136314.484927] ? __xdp_return+0x6c/0x210
+[1136314.492863] bpf_xdp_adjust_tail+0x155/0x1d0
+[1136314.501269] bpf_prog_ccc47ae29d3b6570_xdp_sock_prog+0x15/0x60
+[1136314.511263] ice_clean_rx_irq_zc+0x206/0xc60 [ice]
+[1136314.520222] ? ice_xmit_zc+0x6e/0x150 [ice]
+[1136314.528506] ice_napi_poll+0x467/0x670 [ice]
+[1136314.536858] ? ttwu_do_activate.constprop.0+0x8f/0x1a0
+[1136314.546010] __napi_poll+0x29/0x1b0
+[1136314.553462] net_rx_action+0x133/0x270
+[1136314.561619] __do_softirq+0xbe/0x28e
+[1136314.569303] do_softirq+0x3f/0x60
+
+This comes from a __xdp_return() call whose xdp_buff argument, which
+xsk_buff_free() is supposed to consume, is passed as NULL.
+
+To address this properly, in the ZC case, the node that represents the
+frag being removed has to be pulled out of xskb_list. Introduce
+appropriate xsk helpers to do such a node operation and use them
+accordingly within bpf_xdp_adjust_tail().
+
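+A minimal BPF-side reproducer (a sketch; shrinking by at least the
+tail fragment's size forces the frag-release path):
+
+ SEC("xdp.frags")
+ int shrink_tail(struct xdp_md *ctx)
+ {
+         if (bpf_xdp_adjust_tail(ctx, -128))
+                 return XDP_DROP;
+         return XDP_PASS;
+ }
+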
+Fixes: 24ea50127ecf ("xsk: support mbuf on ZC RX")
+Acked-by: Magnus Karlsson <magnus.karlsson@intel.com> # For the xsk header part
+Signed-off-by: Maciej Fijalkowski <maciej.fijalkowski@intel.com>
+Link: https://lore.kernel.org/r/20240124191602.566724-4-maciej.fijalkowski@intel.com
+Signed-off-by: Alexei Starovoitov <ast@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ include/net/xdp_sock_drv.h | 26 +++++++++++++++++++++++
+ net/core/filter.c | 42 ++++++++++++++++++++++++++++++++------
+ 2 files changed, 62 insertions(+), 6 deletions(-)
+
+diff --git a/include/net/xdp_sock_drv.h b/include/net/xdp_sock_drv.h
+index 7290eb721c07..5425f7ad5ebd 100644
+--- a/include/net/xdp_sock_drv.h
++++ b/include/net/xdp_sock_drv.h
+@@ -147,6 +147,23 @@ static inline struct xdp_buff *xsk_buff_get_frag(struct xdp_buff *first)
+ return ret;
+ }
+
++static inline void xsk_buff_del_tail(struct xdp_buff *tail)
++{
++ struct xdp_buff_xsk *xskb = container_of(tail, struct xdp_buff_xsk, xdp);
++
++ list_del(&xskb->xskb_list_node);
++}
++
++static inline struct xdp_buff *xsk_buff_get_tail(struct xdp_buff *first)
++{
++ struct xdp_buff_xsk *xskb = container_of(first, struct xdp_buff_xsk, xdp);
++ struct xdp_buff_xsk *frag;
++
++ frag = list_last_entry(&xskb->pool->xskb_list, struct xdp_buff_xsk,
++ xskb_list_node);
++ return &frag->xdp;
++}
++
+ static inline void xsk_buff_set_size(struct xdp_buff *xdp, u32 size)
+ {
+ xdp->data = xdp->data_hard_start + XDP_PACKET_HEADROOM;
+@@ -310,6 +327,15 @@ static inline struct xdp_buff *xsk_buff_get_frag(struct xdp_buff *first)
+ return NULL;
+ }
+
++static inline void xsk_buff_del_tail(struct xdp_buff *tail)
++{
++}
++
++static inline struct xdp_buff *xsk_buff_get_tail(struct xdp_buff *first)
++{
++ return NULL;
++}
++
+ static inline void xsk_buff_set_size(struct xdp_buff *xdp, u32 size)
+ {
+ }
+diff --git a/net/core/filter.c b/net/core/filter.c
+index 1737884be52f..6575288b8580 100644
+--- a/net/core/filter.c
++++ b/net/core/filter.c
+@@ -83,6 +83,7 @@
+ #include <net/netfilter/nf_conntrack_bpf.h>
+ #include <net/netkit.h>
+ #include <linux/un.h>
++#include <net/xdp_sock_drv.h>
+
+ #include "dev.h"
+
+@@ -4094,6 +4095,40 @@ static int bpf_xdp_frags_increase_tail(struct xdp_buff *xdp, int offset)
+ return 0;
+ }
+
++static void bpf_xdp_shrink_data_zc(struct xdp_buff *xdp, int shrink,
++ struct xdp_mem_info *mem_info, bool release)
++{
++ struct xdp_buff *zc_frag = xsk_buff_get_tail(xdp);
++
++ if (release) {
++ xsk_buff_del_tail(zc_frag);
++ __xdp_return(NULL, mem_info, false, zc_frag);
++ } else {
++ zc_frag->data_end -= shrink;
++ }
++}
++
++static bool bpf_xdp_shrink_data(struct xdp_buff *xdp, skb_frag_t *frag,
++ int shrink)
++{
++ struct xdp_mem_info *mem_info = &xdp->rxq->mem;
++ bool release = skb_frag_size(frag) == shrink;
++
++ if (mem_info->type == MEM_TYPE_XSK_BUFF_POOL) {
++ bpf_xdp_shrink_data_zc(xdp, shrink, mem_info, release);
++ goto out;
++ }
++
++ if (release) {
++ struct page *page = skb_frag_page(frag);
++
++ __xdp_return(page_address(page), mem_info, false, NULL);
++ }
++
++out:
++ return release;
++}
++
+ static int bpf_xdp_frags_shrink_tail(struct xdp_buff *xdp, int offset)
+ {
+ struct skb_shared_info *sinfo = xdp_get_shared_info_from_buff(xdp);
+@@ -4108,12 +4143,7 @@ static int bpf_xdp_frags_shrink_tail(struct xdp_buff *xdp, int offset)
+
+ len_free += shrink;
+ offset -= shrink;
+-
+- if (skb_frag_size(frag) == shrink) {
+- struct page *page = skb_frag_page(frag);
+-
+- __xdp_return(page_address(page), &xdp->rxq->mem,
+- false, NULL);
++ if (bpf_xdp_shrink_data(xdp, frag, shrink)) {
+ n_frags_free++;
+ } else {
+ skb_frag_size_sub(frag, shrink);
+--
+2.43.0
+
--- /dev/null
+From 4b6c54fb2e4f8e8a3de0a7e6e4fd37f4ccaf58d0 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Wed, 24 Jan 2024 20:15:53 +0100
+Subject: xsk: make xsk_buff_pool responsible for clearing xdp_buff::flags
+
+From: Maciej Fijalkowski <maciej.fijalkowski@intel.com>
+
+[ Upstream commit f7f6aa8e24383fbb11ac55942e66da9660110f80 ]
+
+XDP multi-buffer support introduced the XDP_FLAGS_HAS_FRAGS flag, which
+is used by drivers to tell the data path whether an xdp_buff contains
+fragments or not. The data path looks the flag up on the first buffer,
+the one that occupies the linear part of the xdp_buff, so drivers only
+modify it there. This is sufficient for SKB and XDP_DRV modes, as the
+xdp_buff is usually allocated on the stack or resides within the struct
+representing the driver's queue, and fragments are carried via
+skb_frag_t structs. IOW, we are dealing with only one xdp_buff.
+
+ZC mode, though, relies on a list of xdp_buff structs that is carried
+via xsk_buff_pool::xskb_list, so the ZC data path has to make sure that
+fragments do *not* have XDP_FLAGS_HAS_FRAGS set. Otherwise,
+xsk_buff_free() could misbehave if it were executed against an xdp_buff
+that carries a frag with the XDP_FLAGS_HAS_FRAGS flag set. Such a
+scenario can take place when, within the supplied XDP program,
+bpf_xdp_adjust_tail() is used with a negative offset that in turn
+releases the tail fragment from the multi-buffer frame.
+
+Calling xsk_buff_free() on a tail fragment with XDP_FLAGS_HAS_FRAGS set
+would release all the nodes from xskb_list that were produced by the
+driver before XDP program execution, which is not what is intended:
+only the tail fragment should be deleted from xskb_list and then put
+onto xsk_buff_pool::free_list. Such a multi-buffer frame will never
+make it up to user space, so from the AF_XDP application's POV there
+would be no traffic running; however, because free_list constantly
+gets new nodes, the driver will still be able to feed the HW Rx queue
+with recycled buffers. The bottom line is that instead of traffic being
+redirected to user space, it would be continuously dropped.
+
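+For reference, xsk_buff_free() walks the whole pool list once the buff
+claims to carry frags (paraphrased from include/net/xdp_sock_drv.h):
+
+ if (likely(!xdp_buff_has_frags(xdp)))
+         goto out;
+
+ list_for_each_entry_safe(pos, tmp, &xskb->pool->xskb_list,
+                          xskb_list_node) {
+         list_del(&pos->xskb_list_node);
+         xp_free(pos);
+ }
+
+A stale XDP_FLAGS_HAS_FRAGS on a tail fragment therefore frees every
+node on xskb_list, not just that fragment.
+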
+To fix this, clear the mentioned flag on the xsk_buff_pool side during
+xdp_buff initialization, which is what should have been done right from
+the start of XSK multi-buffer support.
+
+Fixes: 1bbc04de607b ("ice: xsk: add RX multi-buffer support")
+Fixes: 1c9ba9c14658 ("i40e: xsk: add RX multi-buffer support")
+Fixes: 24ea50127ecf ("xsk: support mbuf on ZC RX")
+Signed-off-by: Maciej Fijalkowski <maciej.fijalkowski@intel.com>
+Link: https://lore.kernel.org/r/20240124191602.566724-3-maciej.fijalkowski@intel.com
+Signed-off-by: Alexei Starovoitov <ast@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ drivers/net/ethernet/intel/i40e/i40e_xsk.c | 1 -
+ drivers/net/ethernet/intel/ice/ice_xsk.c | 1 -
+ include/net/xdp_sock_drv.h | 1 +
+ net/xdp/xsk_buff_pool.c | 1 +
+ 4 files changed, 2 insertions(+), 2 deletions(-)
+
+diff --git a/drivers/net/ethernet/intel/i40e/i40e_xsk.c b/drivers/net/ethernet/intel/i40e/i40e_xsk.c
+index e99fa854d17f..fede0bb3e047 100644
+--- a/drivers/net/ethernet/intel/i40e/i40e_xsk.c
++++ b/drivers/net/ethernet/intel/i40e/i40e_xsk.c
+@@ -499,7 +499,6 @@ int i40e_clean_rx_irq_zc(struct i40e_ring *rx_ring, int budget)
+ xdp_res = i40e_run_xdp_zc(rx_ring, first, xdp_prog);
+ i40e_handle_xdp_result_zc(rx_ring, first, rx_desc, &rx_packets,
+ &rx_bytes, xdp_res, &failure);
+- first->flags = 0;
+ next_to_clean = next_to_process;
+ if (failure)
+ break;
+diff --git a/drivers/net/ethernet/intel/ice/ice_xsk.c b/drivers/net/ethernet/intel/ice/ice_xsk.c
+index 99954508184f..951f84bfdf2b 100644
+--- a/drivers/net/ethernet/intel/ice/ice_xsk.c
++++ b/drivers/net/ethernet/intel/ice/ice_xsk.c
+@@ -891,7 +891,6 @@ int ice_clean_rx_irq_zc(struct ice_rx_ring *rx_ring, int budget)
+
+ if (!first) {
+ first = xdp;
+- xdp_buff_clear_frags_flag(first);
+ } else if (ice_add_xsk_frag(rx_ring, first, xdp, size)) {
+ break;
+ }
+diff --git a/include/net/xdp_sock_drv.h b/include/net/xdp_sock_drv.h
+index 1f6fc8c7a84c..7290eb721c07 100644
+--- a/include/net/xdp_sock_drv.h
++++ b/include/net/xdp_sock_drv.h
+@@ -152,6 +152,7 @@ static inline void xsk_buff_set_size(struct xdp_buff *xdp, u32 size)
+ xdp->data = xdp->data_hard_start + XDP_PACKET_HEADROOM;
+ xdp->data_meta = xdp->data;
+ xdp->data_end = xdp->data + size;
++ xdp->flags = 0;
+ }
+
+ static inline dma_addr_t xsk_buff_raw_get_dma(struct xsk_buff_pool *pool,
+diff --git a/net/xdp/xsk_buff_pool.c b/net/xdp/xsk_buff_pool.c
+index 49cb9f9a09be..b0a611677865 100644
+--- a/net/xdp/xsk_buff_pool.c
++++ b/net/xdp/xsk_buff_pool.c
+@@ -541,6 +541,7 @@ struct xdp_buff *xp_alloc(struct xsk_buff_pool *pool)
+
+ xskb->xdp.data = xskb->xdp.data_hard_start + XDP_PACKET_HEADROOM;
+ xskb->xdp.data_meta = xskb->xdp.data;
++ xskb->xdp.flags = 0;
+
+ if (pool->dma_need_sync) {
+ dma_sync_single_range_for_device(pool->dev, xskb->dma, 0,
+--
+2.43.0
+
--- /dev/null
+From c2fe6af64698a43889e90bbda82f4a926d00e464 Mon Sep 17 00:00:00 2001
+From: Sasha Levin <sashal@kernel.org>
+Date: Wed, 24 Jan 2024 20:15:52 +0100
+Subject: xsk: recycle buffer in case Rx queue was full
+
+From: Maciej Fijalkowski <maciej.fijalkowski@intel.com>
+
+[ Upstream commit 269009893146c495f41e9572dd9319e787c2eba9 ]
+
+Add the missing xsk_buff_free() call for when __xsk_rcv_zc() fails to
+produce a descriptor to the XSK Rx queue.
+
+Fixes: 24ea50127ecf ("xsk: support mbuf on ZC RX")
+Acked-by: Magnus Karlsson <magnus.karlsson@intel.com>
+Signed-off-by: Maciej Fijalkowski <maciej.fijalkowski@intel.com>
+Link: https://lore.kernel.org/r/20240124191602.566724-2-maciej.fijalkowski@intel.com
+Signed-off-by: Alexei Starovoitov <ast@kernel.org>
+Signed-off-by: Sasha Levin <sashal@kernel.org>
+---
+ net/xdp/xsk.c | 12 ++++++++----
+ 1 file changed, 8 insertions(+), 4 deletions(-)
+
+diff --git a/net/xdp/xsk.c b/net/xdp/xsk.c
+index 3da0b52f308d..688e641cd278 100644
+--- a/net/xdp/xsk.c
++++ b/net/xdp/xsk.c
+@@ -167,8 +167,10 @@ static int xsk_rcv_zc(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len)
+ contd = XDP_PKT_CONTD;
+
+ err = __xsk_rcv_zc(xs, xskb, len, contd);
+- if (err || likely(!frags))
+- goto out;
++ if (err)
++ goto err;
++ if (likely(!frags))
++ return 0;
+
+ xskb_list = &xskb->pool->xskb_list;
+ list_for_each_entry_safe(pos, tmp, xskb_list, xskb_list_node) {
+@@ -177,11 +179,13 @@ static int xsk_rcv_zc(struct xdp_sock *xs, struct xdp_buff *xdp, u32 len)
+ len = pos->xdp.data_end - pos->xdp.data;
+ err = __xsk_rcv_zc(xs, pos, len, contd);
+ if (err)
+- return err;
++ goto err;
+ list_del(&pos->xskb_list_node);
+ }
+
+-out:
++ return 0;
++err:
++ xsk_buff_free(xdp);
+ return err;
+ }
+
+--
+2.43.0
+