git.ipfire.org Git - thirdparty/kernel/stable-queue.git/commitdiff
6.6-stable patches
author: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Thu, 29 May 2025 12:22:08 +0000 (14:22 +0200)
committer: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
Thu, 29 May 2025 12:22:08 +0000 (14:22 +0200)
added patches:
af_unix-add-dead-flag-to-struct-scm_fp_list.patch
af_unix-allocate-struct-unix_edge-for-each-inflight-af_unix-fd.patch
af_unix-allocate-struct-unix_vertex-for-each-inflight-af_unix-fd.patch
af_unix-assign-a-unique-index-to-scc.patch
af_unix-avoid-tarjan-s-algorithm-if-unnecessary.patch
af_unix-bulk-update-unix_tot_inflight-unix_inflight-when-queuing-skb.patch
af_unix-detect-dead-scc.patch
af_unix-detect-strongly-connected-components.patch
af_unix-don-t-access-successor-in-unix_del_edges-during-gc.patch
af_unix-fix-garbage-collection-of-embryos-carrying-oob-with-scm_rights.patch
af_unix-fix-uninit-value-in-__unix_walk_scc.patch
af_unix-fix-up-unix_edge.successor-for-embryo-socket.patch
af_unix-iterate-all-vertices-by-dfs.patch
af_unix-link-struct-unix_edge-when-queuing-skb.patch
af_unix-remove-config_unix_scm.patch
af_unix-remove-io_uring-code-for-gc.patch
af_unix-remove-lock-dance-in-unix_peek_fds.patch
af_unix-replace-bug_on-with-warn_on_once.patch
af_unix-replace-garbage-collection-algorithm.patch
af_unix-return-struct-unix_sock-from-unix_get_socket.patch
af_unix-run-gc-on-only-one-cpu.patch
af_unix-save-listener-for-embryo-socket.patch
af_unix-save-o-n-setup-of-tarjan-s-algo.patch
af_unix-skip-gc-if-no-cycle-exists.patch
af_unix-try-not-to-hold-unix_gc_lock-during-accept.patch
af_unix-try-to-run-gc-async.patch

27 files changed:
queue-6.6/af_unix-add-dead-flag-to-struct-scm_fp_list.patch [new file with mode: 0644]
queue-6.6/af_unix-allocate-struct-unix_edge-for-each-inflight-af_unix-fd.patch [new file with mode: 0644]
queue-6.6/af_unix-allocate-struct-unix_vertex-for-each-inflight-af_unix-fd.patch [new file with mode: 0644]
queue-6.6/af_unix-assign-a-unique-index-to-scc.patch [new file with mode: 0644]
queue-6.6/af_unix-avoid-tarjan-s-algorithm-if-unnecessary.patch [new file with mode: 0644]
queue-6.6/af_unix-bulk-update-unix_tot_inflight-unix_inflight-when-queuing-skb.patch [new file with mode: 0644]
queue-6.6/af_unix-detect-dead-scc.patch [new file with mode: 0644]
queue-6.6/af_unix-detect-strongly-connected-components.patch [new file with mode: 0644]
queue-6.6/af_unix-don-t-access-successor-in-unix_del_edges-during-gc.patch [new file with mode: 0644]
queue-6.6/af_unix-fix-garbage-collection-of-embryos-carrying-oob-with-scm_rights.patch [new file with mode: 0644]
queue-6.6/af_unix-fix-uninit-value-in-__unix_walk_scc.patch [new file with mode: 0644]
queue-6.6/af_unix-fix-up-unix_edge.successor-for-embryo-socket.patch [new file with mode: 0644]
queue-6.6/af_unix-iterate-all-vertices-by-dfs.patch [new file with mode: 0644]
queue-6.6/af_unix-link-struct-unix_edge-when-queuing-skb.patch [new file with mode: 0644]
queue-6.6/af_unix-remove-config_unix_scm.patch [new file with mode: 0644]
queue-6.6/af_unix-remove-io_uring-code-for-gc.patch [new file with mode: 0644]
queue-6.6/af_unix-remove-lock-dance-in-unix_peek_fds.patch [new file with mode: 0644]
queue-6.6/af_unix-replace-bug_on-with-warn_on_once.patch [new file with mode: 0644]
queue-6.6/af_unix-replace-garbage-collection-algorithm.patch [new file with mode: 0644]
queue-6.6/af_unix-return-struct-unix_sock-from-unix_get_socket.patch [new file with mode: 0644]
queue-6.6/af_unix-run-gc-on-only-one-cpu.patch [new file with mode: 0644]
queue-6.6/af_unix-save-listener-for-embryo-socket.patch [new file with mode: 0644]
queue-6.6/af_unix-save-o-n-setup-of-tarjan-s-algo.patch [new file with mode: 0644]
queue-6.6/af_unix-skip-gc-if-no-cycle-exists.patch [new file with mode: 0644]
queue-6.6/af_unix-try-not-to-hold-unix_gc_lock-during-accept.patch [new file with mode: 0644]
queue-6.6/af_unix-try-to-run-gc-async.patch [new file with mode: 0644]
queue-6.6/series

diff --git a/queue-6.6/af_unix-add-dead-flag-to-struct-scm_fp_list.patch b/queue-6.6/af_unix-add-dead-flag-to-struct-scm_fp_list.patch
new file mode 100644 (file)
index 0000000..cb74c7b
--- /dev/null
@@ -0,0 +1,110 @@
+From stable+bounces-145871-greg=kroah.com@vger.kernel.org Wed May 21 16:58:24 2025
+From: Lee Jones <lee@kernel.org>
+Date: Wed, 21 May 2025 14:45:32 +0000
+Subject: af_unix: Add dead flag to struct scm_fp_list.
+To: lee@kernel.org, "David S. Miller" <davem@davemloft.net>, Eric Dumazet <edumazet@google.com>, Jakub Kicinski <kuba@kernel.org>, Paolo Abeni <pabeni@redhat.com>, Kuniyuki Iwashima <kuniyu@amazon.com>, Jens Axboe <axboe@kernel.dk>, Sasha Levin <sashal@kernel.org>, Michal Luczaj <mhal@rbox.co>, Rao Shoaib <Rao.Shoaib@oracle.com>, Pavel Begunkov <asml.silence@gmail.com>, linux-kernel@vger.kernel.org, netdev@vger.kernel.org
+Cc: stable@vger.kernel.org
+Message-ID: <20250521144803.2050504-25-lee@kernel.org>
+
+From: Kuniyuki Iwashima <kuniyu@amazon.com>
+
+commit 7172dc93d621d5dc302d007e95ddd1311ec64283 upstream.
+
+Commit 1af2dface5d2 ("af_unix: Don't access successor in unix_del_edges()
+during GC.") fixed a use-after-free by not accessing edge->successor while
+GC is in progress.
+
+However, there could be a small race window where another process could
+call unix_del_edges() while gc_in_progress is true and __skb_queue_purge()
+is on the way.
+
+So, we need another marker for struct scm_fp_list which indicates if the
+skb is garbage-collected.
+
+This patch adds a dead flag to struct scm_fp_list and sets it to true
+before calling __skb_queue_purge().
+
+Fixes: 1af2dface5d2 ("af_unix: Don't access successor in unix_del_edges() during GC.")
+Signed-off-by: Kuniyuki Iwashima <kuniyu@amazon.com>
+Acked-by: Paolo Abeni <pabeni@redhat.com>
+Link: https://lore.kernel.org/r/20240508171150.50601-1-kuniyu@amazon.com
+Signed-off-by: Jakub Kicinski <kuba@kernel.org>
+Signed-off-by: Lee Jones <lee@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ include/net/scm.h  |    1 +
+ net/core/scm.c     |    1 +
+ net/unix/garbage.c |   14 ++++++++++----
+ 3 files changed, 12 insertions(+), 4 deletions(-)
+
+--- a/include/net/scm.h
++++ b/include/net/scm.h
+@@ -32,6 +32,7 @@ struct scm_fp_list {
+       short                   max;
+ #ifdef CONFIG_UNIX
+       bool                    inflight;
++      bool                    dead;
+       struct list_head        vertices;
+       struct unix_edge        *edges;
+ #endif
+--- a/net/core/scm.c
++++ b/net/core/scm.c
+@@ -91,6 +91,7 @@ static int scm_fp_copy(struct cmsghdr *c
+               fpl->user = NULL;
+ #if IS_ENABLED(CONFIG_UNIX)
+               fpl->inflight = false;
++              fpl->dead = false;
+               fpl->edges = NULL;
+               INIT_LIST_HEAD(&fpl->vertices);
+ #endif
+--- a/net/unix/garbage.c
++++ b/net/unix/garbage.c
+@@ -158,13 +158,11 @@ static void unix_add_edge(struct scm_fp_
+       unix_update_graph(unix_edge_successor(edge));
+ }
+-static bool gc_in_progress;
+-
+ static void unix_del_edge(struct scm_fp_list *fpl, struct unix_edge *edge)
+ {
+       struct unix_vertex *vertex = edge->predecessor->vertex;
+-      if (!gc_in_progress)
++      if (!fpl->dead)
+               unix_update_graph(unix_edge_successor(edge));
+       list_del(&edge->vertex_entry);
+@@ -240,7 +238,7 @@ void unix_del_edges(struct scm_fp_list *
+               unix_del_edge(fpl, edge);
+       } while (i < fpl->count_unix);
+-      if (!gc_in_progress) {
++      if (!fpl->dead) {
+               receiver = fpl->edges[0].successor;
+               receiver->scm_stat.nr_unix_fds -= fpl->count_unix;
+       }
+@@ -559,9 +557,12 @@ static void unix_walk_scc_fast(struct sk
+       list_replace_init(&unix_visited_vertices, &unix_unvisited_vertices);
+ }
++static bool gc_in_progress;
++
+ static void __unix_gc(struct work_struct *work)
+ {
+       struct sk_buff_head hitlist;
++      struct sk_buff *skb;
+       spin_lock(&unix_gc_lock);
+@@ -579,6 +580,11 @@ static void __unix_gc(struct work_struct
+       spin_unlock(&unix_gc_lock);
++      skb_queue_walk(&hitlist, skb) {
++              if (UNIXCB(skb).fp)
++                      UNIXCB(skb).fp->dead = true;
++      }
++
+       __skb_queue_purge(&hitlist);
+ skip_gc:
+       WRITE_ONCE(gc_in_progress, false);
diff --git a/queue-6.6/af_unix-allocate-struct-unix_edge-for-each-inflight-af_unix-fd.patch b/queue-6.6/af_unix-allocate-struct-unix_edge-for-each-inflight-af_unix-fd.patch
new file mode 100644 (file)
index 0000000..8233e90
--- /dev/null
@@ -0,0 +1,115 @@
+From stable+bounces-145855-greg=kroah.com@vger.kernel.org Wed May 21 16:53:51 2025
+From: Lee Jones <lee@kernel.org>
+Date: Wed, 21 May 2025 14:45:16 +0000
+Subject: af_unix: Allocate struct unix_edge for each inflight AF_UNIX fd.
+To: lee@kernel.org, "David S. Miller" <davem@davemloft.net>, Eric Dumazet <edumazet@google.com>, Jakub Kicinski <kuba@kernel.org>, Paolo Abeni <pabeni@redhat.com>, Kuniyuki Iwashima <kuniyu@amazon.com>, Jens Axboe <axboe@kernel.dk>, Sasha Levin <sashal@kernel.org>, Michal Luczaj <mhal@rbox.co>, Rao Shoaib <Rao.Shoaib@oracle.com>, Pavel Begunkov <asml.silence@gmail.com>, linux-kernel@vger.kernel.org, netdev@vger.kernel.org
+Cc: stable@vger.kernel.org
+Message-ID: <20250521144803.2050504-9-lee@kernel.org>
+
+From: Kuniyuki Iwashima <kuniyu@amazon.com>
+
+commit 29b64e354029cfcf1eea4d91b146c7b769305930 upstream.
+
+As with the previous patch, we preallocate to skb's scm_fp_list an
+array of struct unix_edge in the number of inflight AF_UNIX fds.
+
+There we just preallocate memory and do not use it immediately because
+sendmsg() could fail after this point.  The actual use will be in
+the next patch.
+
+When we queue skb with inflight edges, we will set the inflight
+socket's unix_sock as unix_edge->predecessor and the receiver's
+unix_sock as successor, and then we will link the edge to the
+inflight socket's unix_vertex.edges.
+
+Note that we set NULL to cloned scm_fp_list.edges in scm_fp_dup()
+so that MSG_PEEK does not change the shape of the directed graph.
+
+Signed-off-by: Kuniyuki Iwashima <kuniyu@amazon.com>
+Acked-by: Paolo Abeni <pabeni@redhat.com>
+Link: https://lore.kernel.org/r/20240325202425.60930-3-kuniyu@amazon.com
+Signed-off-by: Jakub Kicinski <kuba@kernel.org>
+Signed-off-by: Lee Jones <lee@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ include/net/af_unix.h |    6 ++++++
+ include/net/scm.h     |    5 +++++
+ net/core/scm.c        |    2 ++
+ net/unix/garbage.c    |    6 ++++++
+ 4 files changed, 19 insertions(+)
+
+--- a/include/net/af_unix.h
++++ b/include/net/af_unix.h
+@@ -33,6 +33,12 @@ struct unix_vertex {
+       unsigned long out_degree;
+ };
++struct unix_edge {
++      struct unix_sock *predecessor;
++      struct unix_sock *successor;
++      struct list_head vertex_entry;
++};
++
+ struct sock *unix_peer_get(struct sock *sk);
+ #define UNIX_HASH_MOD (256 - 1)
+--- a/include/net/scm.h
++++ b/include/net/scm.h
+@@ -22,12 +22,17 @@ struct scm_creds {
+       kgid_t  gid;
+ };
++#ifdef CONFIG_UNIX
++struct unix_edge;
++#endif
++
+ struct scm_fp_list {
+       short                   count;
+       short                   count_unix;
+       short                   max;
+ #ifdef CONFIG_UNIX
+       struct list_head        vertices;
++      struct unix_edge        *edges;
+ #endif
+       struct user_struct      *user;
+       struct file             *fp[SCM_MAX_FD];
+--- a/net/core/scm.c
++++ b/net/core/scm.c
+@@ -90,6 +90,7 @@ static int scm_fp_copy(struct cmsghdr *c
+               fpl->max = SCM_MAX_FD;
+               fpl->user = NULL;
+ #if IS_ENABLED(CONFIG_UNIX)
++              fpl->edges = NULL;
+               INIT_LIST_HEAD(&fpl->vertices);
+ #endif
+       }
+@@ -383,6 +384,7 @@ struct scm_fp_list *scm_fp_dup(struct sc
+               new_fpl->max = new_fpl->count;
+               new_fpl->user = get_uid(fpl->user);
+ #if IS_ENABLED(CONFIG_UNIX)
++              new_fpl->edges = NULL;
+               INIT_LIST_HEAD(&new_fpl->vertices);
+ #endif
+       }
+--- a/net/unix/garbage.c
++++ b/net/unix/garbage.c
+@@ -127,6 +127,11 @@ int unix_prepare_fpl(struct scm_fp_list
+               list_add(&vertex->entry, &fpl->vertices);
+       }
++      fpl->edges = kvmalloc_array(fpl->count_unix, sizeof(*fpl->edges),
++                                  GFP_KERNEL_ACCOUNT);
++      if (!fpl->edges)
++              goto err;
++
+       return 0;
+ err:
+@@ -136,6 +141,7 @@ err:
+ void unix_destroy_fpl(struct scm_fp_list *fpl)
+ {
++      kvfree(fpl->edges);
+       unix_free_vertices(fpl);
+ }
diff --git a/queue-6.6/af_unix-allocate-struct-unix_vertex-for-each-inflight-af_unix-fd.patch b/queue-6.6/af_unix-allocate-struct-unix_vertex-for-each-inflight-af_unix-fd.patch
new file mode 100644 (file)
index 0000000..9767815
--- /dev/null
@@ -0,0 +1,202 @@
+From stable+bounces-145854-greg=kroah.com@vger.kernel.org Wed May 21 16:53:21 2025
+From: Lee Jones <lee@kernel.org>
+Date: Wed, 21 May 2025 14:45:15 +0000
+Subject: af_unix: Allocate struct unix_vertex for each inflight AF_UNIX fd.
+To: lee@kernel.org, "David S. Miller" <davem@davemloft.net>, Eric Dumazet <edumazet@google.com>, Jakub Kicinski <kuba@kernel.org>, Paolo Abeni <pabeni@redhat.com>, Kuniyuki Iwashima <kuniyu@amazon.com>, Jens Axboe <axboe@kernel.dk>, Sasha Levin <sashal@kernel.org>, Michal Luczaj <mhal@rbox.co>, Rao Shoaib <Rao.Shoaib@oracle.com>, Pavel Begunkov <asml.silence@gmail.com>, linux-kernel@vger.kernel.org, netdev@vger.kernel.org
+Cc: stable@vger.kernel.org
+Message-ID: <20250521144803.2050504-8-lee@kernel.org>
+
+From: Kuniyuki Iwashima <kuniyu@amazon.com>
+
+commit 1fbfdfaa590248c1d86407f578e40e5c65136330 upstream.
+
+We will replace the garbage collection algorithm for AF_UNIX, where
+we will consider each inflight AF_UNIX socket as a vertex and its file
+descriptor as an edge in a directed graph.
+
+This patch introduces a new struct unix_vertex representing a vertex
+in the graph and adds its pointer to struct unix_sock.
+
+When we send a fd using the SCM_RIGHTS message, we allocate struct
+scm_fp_list to struct scm_cookie in scm_fp_copy().  Then, we bump
+each refcount of the inflight fds' struct file and save them in
+scm_fp_list.fp.
+
+After that, unix_attach_fds() inexplicably clones scm_fp_list of
+scm_cookie and sets it to skb.  (We will remove this part after
+replacing GC.)
+
+Here, we add a new function call in unix_attach_fds() to preallocate
+struct unix_vertex per inflight AF_UNIX fd and link each vertex to
+skb's scm_fp_list.vertices.
+
+When sendmsg() succeeds later, if the socket of the inflight fd is
+not inflight yet, we will set the preallocated vertex to struct
+unix_sock.vertex and link it to a global list unix_unvisited_vertices
+under spin_lock(&unix_gc_lock).
+
+If the socket is already inflight, we free the preallocated vertex.
+This is to avoid taking the lock unnecessarily when sendmsg() could
+fail later.
+
+In the following patch, we will similarly allocate another struct
+per edge, which will finally be linked to the inflight socket's
+unix_vertex.edges.
+
+And then, we will count the number of edges as unix_vertex.out_degree.
+
+Signed-off-by: Kuniyuki Iwashima <kuniyu@amazon.com>
+Acked-by: Paolo Abeni <pabeni@redhat.com>
+Link: https://lore.kernel.org/r/20240325202425.60930-2-kuniyu@amazon.com
+Signed-off-by: Jakub Kicinski <kuba@kernel.org>
+Signed-off-by: Lee Jones <lee@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ include/net/af_unix.h |    9 +++++++++
+ include/net/scm.h     |    3 +++
+ net/core/scm.c        |    7 +++++++
+ net/unix/af_unix.c    |    6 ++++++
+ net/unix/garbage.c    |   38 ++++++++++++++++++++++++++++++++++++++
+ 5 files changed, 63 insertions(+)
+
+--- a/include/net/af_unix.h
++++ b/include/net/af_unix.h
+@@ -22,9 +22,17 @@ extern unsigned int unix_tot_inflight;
+ void unix_inflight(struct user_struct *user, struct file *fp);
+ void unix_notinflight(struct user_struct *user, struct file *fp);
++int unix_prepare_fpl(struct scm_fp_list *fpl);
++void unix_destroy_fpl(struct scm_fp_list *fpl);
+ void unix_gc(void);
+ void wait_for_unix_gc(struct scm_fp_list *fpl);
++struct unix_vertex {
++      struct list_head edges;
++      struct list_head entry;
++      unsigned long out_degree;
++};
++
+ struct sock *unix_peer_get(struct sock *sk);
+ #define UNIX_HASH_MOD (256 - 1)
+@@ -62,6 +70,7 @@ struct unix_sock {
+       struct path             path;
+       struct mutex            iolock, bindlock;
+       struct sock             *peer;
++      struct unix_vertex      *vertex;
+       struct list_head        link;
+       unsigned long           inflight;
+       spinlock_t              lock;
+--- a/include/net/scm.h
++++ b/include/net/scm.h
+@@ -26,6 +26,9 @@ struct scm_fp_list {
+       short                   count;
+       short                   count_unix;
+       short                   max;
++#ifdef CONFIG_UNIX
++      struct list_head        vertices;
++#endif
+       struct user_struct      *user;
+       struct file             *fp[SCM_MAX_FD];
+ };
+--- a/net/core/scm.c
++++ b/net/core/scm.c
+@@ -89,6 +89,9 @@ static int scm_fp_copy(struct cmsghdr *c
+               fpl->count_unix = 0;
+               fpl->max = SCM_MAX_FD;
+               fpl->user = NULL;
++#if IS_ENABLED(CONFIG_UNIX)
++              INIT_LIST_HEAD(&fpl->vertices);
++#endif
+       }
+       fpp = &fpl->fp[fpl->count];
+@@ -376,8 +379,12 @@ struct scm_fp_list *scm_fp_dup(struct sc
+       if (new_fpl) {
+               for (i = 0; i < fpl->count; i++)
+                       get_file(fpl->fp[i]);
++
+               new_fpl->max = new_fpl->count;
+               new_fpl->user = get_uid(fpl->user);
++#if IS_ENABLED(CONFIG_UNIX)
++              INIT_LIST_HEAD(&new_fpl->vertices);
++#endif
+       }
+       return new_fpl;
+ }
+--- a/net/unix/af_unix.c
++++ b/net/unix/af_unix.c
+@@ -979,6 +979,7 @@ static struct sock *unix_create1(struct
+       sk->sk_destruct         = unix_sock_destructor;
+       u = unix_sk(sk);
+       u->inflight = 0;
++      u->vertex = NULL;
+       u->path.dentry = NULL;
+       u->path.mnt = NULL;
+       spin_lock_init(&u->lock);
+@@ -1782,6 +1783,9 @@ static int unix_attach_fds(struct scm_co
+       for (i = scm->fp->count - 1; i >= 0; i--)
+               unix_inflight(scm->fp->user, scm->fp->fp[i]);
++      if (unix_prepare_fpl(UNIXCB(skb).fp))
++              return -ENOMEM;
++
+       return 0;
+ }
+@@ -1792,6 +1796,8 @@ static void unix_detach_fds(struct scm_c
+       scm->fp = UNIXCB(skb).fp;
+       UNIXCB(skb).fp = NULL;
++      unix_destroy_fpl(scm->fp);
++
+       for (i = scm->fp->count - 1; i >= 0; i--)
+               unix_notinflight(scm->fp->user, scm->fp->fp[i]);
+ }
+--- a/net/unix/garbage.c
++++ b/net/unix/garbage.c
+@@ -101,6 +101,44 @@ struct unix_sock *unix_get_socket(struct
+       return NULL;
+ }
++static void unix_free_vertices(struct scm_fp_list *fpl)
++{
++      struct unix_vertex *vertex, *next_vertex;
++
++      list_for_each_entry_safe(vertex, next_vertex, &fpl->vertices, entry) {
++              list_del(&vertex->entry);
++              kfree(vertex);
++      }
++}
++
++int unix_prepare_fpl(struct scm_fp_list *fpl)
++{
++      struct unix_vertex *vertex;
++      int i;
++
++      if (!fpl->count_unix)
++              return 0;
++
++      for (i = 0; i < fpl->count_unix; i++) {
++              vertex = kmalloc(sizeof(*vertex), GFP_KERNEL);
++              if (!vertex)
++                      goto err;
++
++              list_add(&vertex->entry, &fpl->vertices);
++      }
++
++      return 0;
++
++err:
++      unix_free_vertices(fpl);
++      return -ENOMEM;
++}
++
++void unix_destroy_fpl(struct scm_fp_list *fpl)
++{
++      unix_free_vertices(fpl);
++}
++
+ DEFINE_SPINLOCK(unix_gc_lock);
+ unsigned int unix_tot_inflight;
+ static LIST_HEAD(gc_candidates);
diff --git a/queue-6.6/af_unix-assign-a-unique-index-to-scc.patch b/queue-6.6/af_unix-assign-a-unique-index-to-scc.patch
new file mode 100644 (file)
index 0000000..c542254
--- /dev/null
@@ -0,0 +1,166 @@
+From stable+bounces-145865-greg=kroah.com@vger.kernel.org Wed May 21 16:55:41 2025
+From: Lee Jones <lee@kernel.org>
+Date: Wed, 21 May 2025 14:45:26 +0000
+Subject: af_unix: Assign a unique index to SCC.
+To: lee@kernel.org, "David S. Miller" <davem@davemloft.net>, Eric Dumazet <edumazet@google.com>, Jakub Kicinski <kuba@kernel.org>, Paolo Abeni <pabeni@redhat.com>, Kuniyuki Iwashima <kuniyu@amazon.com>, Jens Axboe <axboe@kernel.dk>, Sasha Levin <sashal@kernel.org>, Michal Luczaj <mhal@rbox.co>, Rao Shoaib <Rao.Shoaib@oracle.com>, Simon Horman <horms@kernel.org>, linux-kernel@vger.kernel.org, netdev@vger.kernel.org
+Cc: stable@vger.kernel.org
+Message-ID: <20250521144803.2050504-19-lee@kernel.org>
+
+From: Kuniyuki Iwashima <kuniyu@amazon.com>
+
+commit bfdb01283ee8f2f3089656c3ff8f62bb072dabb2 upstream.
+
+The definition of the lowlink in Tarjan's algorithm is the
+smallest index of a vertex that is reachable with at most one
+back-edge in SCC.  This is not useful for a cross-edge.
+
+If we start traversing from A in the following graph, the final
+lowlink of D is 3.  The cross-edge here is one between D and C.
+
+  A -> B -> D   D = (4, 3)  (index, lowlink)
+  ^    |    |   C = (3, 1)
+  |    V    |   B = (2, 1)
+  `--- C <--'   A = (1, 1)
+
+This is because the lowlink of D is updated with the index of C.
+
+In the following patch, we detect a dead SCC by checking two
+conditions for each vertex.
+
+  1) vertex has no edge directed to another SCC (no bridge)
+  2) vertex's out_degree is the same as the refcount of its file
+
+If 1) is false, there is a receiver of all fds of the SCC and
+its ancestor SCC.
+
+To evaluate 1), we need to assign a unique index to each SCC and
+assign it to all vertices in the SCC.
+
+This patch changes the lowlink update logic for cross-edge so
+that in the example above, the lowlink of D is updated with the
+lowlink of C.
+
+  A -> B -> D   D = (4, 1)  (index, lowlink)
+  ^    |    |   C = (3, 1)
+  |    V    |   B = (2, 1)
+  `--- C <--'   A = (1, 1)
+
+Then, all vertices in the same SCC have the same lowlink, and we
+can quickly find a bridge connecting to a different SCC if one exists.
+
+However, it is no longer called lowlink, so we rename it to
+scc_index.  (It's sometimes called lowpoint.)
+
+Also, we add a global variable to hold the last index used in DFS
+so that we do not reset the initial index in each DFS.
+
+This patch can be squashed to the SCC detection patch but is
+split deliberately for anyone wondering why lowlink is not used
+as used in the original Tarjan's algorithm and many reference
+implementations.
+
+Signed-off-by: Kuniyuki Iwashima <kuniyu@amazon.com>
+Acked-by: Paolo Abeni <pabeni@redhat.com>
+Link: https://lore.kernel.org/r/20240325202425.60930-13-kuniyu@amazon.com
+Signed-off-by: Jakub Kicinski <kuba@kernel.org>
+Signed-off-by: Lee Jones <lee@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ include/net/af_unix.h |    2 +-
+ net/unix/garbage.c    |   29 +++++++++++++++--------------
+ 2 files changed, 16 insertions(+), 15 deletions(-)
+
+--- a/include/net/af_unix.h
++++ b/include/net/af_unix.h
+@@ -36,7 +36,7 @@ struct unix_vertex {
+       struct list_head scc_entry;
+       unsigned long out_degree;
+       unsigned long index;
+-      unsigned long lowlink;
++      unsigned long scc_index;
+ };
+ struct unix_edge {
+--- a/net/unix/garbage.c
++++ b/net/unix/garbage.c
+@@ -312,9 +312,8 @@ static bool unix_scc_cyclic(struct list_
+ static LIST_HEAD(unix_visited_vertices);
+ static unsigned long unix_vertex_grouped_index = UNIX_VERTEX_INDEX_MARK2;
+-static void __unix_walk_scc(struct unix_vertex *vertex)
++static void __unix_walk_scc(struct unix_vertex *vertex, unsigned long *last_index)
+ {
+-      unsigned long index = UNIX_VERTEX_INDEX_START;
+       LIST_HEAD(vertex_stack);
+       struct unix_edge *edge;
+       LIST_HEAD(edge_stack);
+@@ -326,9 +325,9 @@ next_vertex:
+        */
+       list_add(&vertex->scc_entry, &vertex_stack);
+-      vertex->index = index;
+-      vertex->lowlink = index;
+-      index++;
++      vertex->index = *last_index;
++      vertex->scc_index = *last_index;
++      (*last_index)++;
+       /* Explore neighbour vertices (receivers of the current vertex's fd). */
+       list_for_each_entry(edge, &vertex->edges, vertex_entry) {
+@@ -358,30 +357,30 @@ prev_vertex:
+                       next_vertex = vertex;
+                       vertex = edge->predecessor->vertex;
+-                      /* If the successor has a smaller lowlink, two vertices
+-                       * are in the same SCC, so propagate the smaller lowlink
++                      /* If the successor has a smaller scc_index, two vertices
++                       * are in the same SCC, so propagate the smaller scc_index
+                        * to skip SCC finalisation.
+                        */
+-                      vertex->lowlink = min(vertex->lowlink, next_vertex->lowlink);
++                      vertex->scc_index = min(vertex->scc_index, next_vertex->scc_index);
+               } else if (next_vertex->index != unix_vertex_grouped_index) {
+                       /* Loop detected by a back/cross edge.
+                        *
+-                       * The successor is on vertex_stack, so two vertices are
+-                       * in the same SCC.  If the successor has a smaller index,
++                       * The successor is on vertex_stack, so two vertices are in
++                       * the same SCC.  If the successor has a smaller *scc_index*,
+                        * propagate it to skip SCC finalisation.
+                        */
+-                      vertex->lowlink = min(vertex->lowlink, next_vertex->index);
++                      vertex->scc_index = min(vertex->scc_index, next_vertex->scc_index);
+               } else {
+                       /* The successor was already grouped as another SCC */
+               }
+       }
+-      if (vertex->index == vertex->lowlink) {
++      if (vertex->index == vertex->scc_index) {
+               struct list_head scc;
+               /* SCC finalised.
+                *
+-               * If the lowlink was not updated, all the vertices above on
++               * If the scc_index was not updated, all the vertices above on
+                * vertex_stack are in the same SCC.  Group them using scc_entry.
+                */
+               __list_cut_position(&scc, &vertex_stack, &vertex->scc_entry);
+@@ -407,6 +406,8 @@ prev_vertex:
+ static void unix_walk_scc(void)
+ {
++      unsigned long last_index = UNIX_VERTEX_INDEX_START;
++
+       unix_graph_maybe_cyclic = false;
+       /* Visit every vertex exactly once.
+@@ -416,7 +417,7 @@ static void unix_walk_scc(void)
+               struct unix_vertex *vertex;
+               vertex = list_first_entry(&unix_unvisited_vertices, typeof(*vertex), entry);
+-              __unix_walk_scc(vertex);
++              __unix_walk_scc(vertex, &last_index);
+       }
+       list_replace_init(&unix_visited_vertices, &unix_unvisited_vertices);
diff --git a/queue-6.6/af_unix-avoid-tarjan-s-algorithm-if-unnecessary.patch b/queue-6.6/af_unix-avoid-tarjan-s-algorithm-if-unnecessary.patch
new file mode 100644 (file)
index 0000000..6a20fb0
--- /dev/null
@@ -0,0 +1,106 @@
+From stable+bounces-145864-greg=kroah.com@vger.kernel.org Wed May 21 16:58:08 2025
+From: Lee Jones <lee@kernel.org>
+Date: Wed, 21 May 2025 14:45:25 +0000
+Subject: af_unix: Avoid Tarjan's algorithm if unnecessary.
+To: lee@kernel.org, "David S. Miller" <davem@davemloft.net>, Eric Dumazet <edumazet@google.com>, Jakub Kicinski <kuba@kernel.org>, Paolo Abeni <pabeni@redhat.com>, Kuniyuki Iwashima <kuniyu@amazon.com>, Jens Axboe <axboe@kernel.dk>, Sasha Levin <sashal@kernel.org>, Michal Luczaj <mhal@rbox.co>, Rao Shoaib <Rao.Shoaib@oracle.com>, Simon Horman <horms@kernel.org>, linux-kernel@vger.kernel.org, netdev@vger.kernel.org
+Cc: stable@vger.kernel.org
+Message-ID: <20250521144803.2050504-18-lee@kernel.org>
+
+From: Kuniyuki Iwashima <kuniyu@amazon.com>
+
+commit ad081928a8b0f57f269df999a28087fce6f2b6ce upstream.
+
+Once a cyclic reference is formed, we need to run GC to check if
+there is a dead SCC.
+
+However, we do not need to run Tarjan's algorithm if we know that
+the shape of the inflight graph has not been changed.
+
+If an edge is added/updated/deleted and the edge's successor is
+inflight, we set unix_graph_grouped to false, which means we need
+to re-classify SCCs.
+
+Once we finalise SCCs, we set unix_graph_grouped to true.
+
+While unix_graph_grouped is true, we can iterate the grouped
+SCC using vertex->scc_entry in unix_walk_scc_fast().
+
+The uses of list_add() and list_for_each_entry_reverse() seem weird,
+but they keep the vertex order consistent and make writing tests
+easier.
+
+Signed-off-by: Kuniyuki Iwashima <kuniyu@amazon.com>
+Acked-by: Paolo Abeni <pabeni@redhat.com>
+Link: https://lore.kernel.org/r/20240325202425.60930-12-kuniyu@amazon.com
+Signed-off-by: Jakub Kicinski <kuba@kernel.org>
+Signed-off-by: Lee Jones <lee@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ net/unix/garbage.c |   28 +++++++++++++++++++++++++++-
+ 1 file changed, 27 insertions(+), 1 deletion(-)
+
+--- a/net/unix/garbage.c
++++ b/net/unix/garbage.c
+@@ -113,6 +113,7 @@ static struct unix_vertex *unix_edge_suc
+ }
+ static bool unix_graph_maybe_cyclic;
++static bool unix_graph_grouped;
+ static void unix_update_graph(struct unix_vertex *vertex)
+ {
+@@ -123,6 +124,7 @@ static void unix_update_graph(struct uni
+               return;
+       unix_graph_maybe_cyclic = true;
++      unix_graph_grouped = false;
+ }
+ static LIST_HEAD(unix_unvisited_vertices);
+@@ -144,6 +146,7 @@ static void unix_add_edge(struct scm_fp_
+               vertex->index = unix_vertex_unvisited_index;
+               vertex->out_degree = 0;
+               INIT_LIST_HEAD(&vertex->edges);
++              INIT_LIST_HEAD(&vertex->scc_entry);
+               list_move_tail(&vertex->entry, &unix_unvisited_vertices);
+               edge->predecessor->vertex = vertex;
+@@ -418,6 +421,26 @@ static void unix_walk_scc(void)
+       list_replace_init(&unix_visited_vertices, &unix_unvisited_vertices);
+       swap(unix_vertex_unvisited_index, unix_vertex_grouped_index);
++
++      unix_graph_grouped = true;
++}
++
++static void unix_walk_scc_fast(void)
++{
++      while (!list_empty(&unix_unvisited_vertices)) {
++              struct unix_vertex *vertex;
++              struct list_head scc;
++
++              vertex = list_first_entry(&unix_unvisited_vertices, typeof(*vertex), entry);
++              list_add(&scc, &vertex->scc_entry);
++
++              list_for_each_entry_reverse(vertex, &scc, scc_entry)
++                      list_move_tail(&vertex->entry, &unix_visited_vertices);
++
++              list_del(&scc);
++      }
++
++      list_replace_init(&unix_visited_vertices, &unix_unvisited_vertices);
+ }
+ static LIST_HEAD(gc_candidates);
+@@ -570,7 +593,10 @@ static void __unix_gc(struct work_struct
+       if (!unix_graph_maybe_cyclic)
+               goto skip_gc;
+-      unix_walk_scc();
++      if (unix_graph_grouped)
++              unix_walk_scc_fast();
++      else
++              unix_walk_scc();
+       /* First, select candidates for garbage collection.  Only
+        * in-flight sockets are considered, and from those only ones
diff --git a/queue-6.6/af_unix-bulk-update-unix_tot_inflight-unix_inflight-when-queuing-skb.patch b/queue-6.6/af_unix-bulk-update-unix_tot_inflight-unix_inflight-when-queuing-skb.patch
new file mode 100644 (file)
index 0000000..9732046
--- /dev/null
@@ -0,0 +1,102 @@
+From stable+bounces-145857-greg=kroah.com@vger.kernel.org Wed May 21 16:54:19 2025
+From: Lee Jones <lee@kernel.org>
+Date: Wed, 21 May 2025 14:45:18 +0000
+Subject: af_unix: Bulk update unix_tot_inflight/unix_inflight when queuing skb.
+To: lee@kernel.org, "David S. Miller" <davem@davemloft.net>, Eric Dumazet <edumazet@google.com>, Jakub Kicinski <kuba@kernel.org>, Paolo Abeni <pabeni@redhat.com>, Kuniyuki Iwashima <kuniyu@amazon.com>, Jens Axboe <axboe@kernel.dk>, Sasha Levin <sashal@kernel.org>, Michal Luczaj <mhal@rbox.co>, Rao Shoaib <Rao.Shoaib@oracle.com>, Pavel Begunkov <asml.silence@gmail.com>, linux-kernel@vger.kernel.org, netdev@vger.kernel.org
+Cc: stable@vger.kernel.org
+Message-ID: <20250521144803.2050504-11-lee@kernel.org>
+
+From: Kuniyuki Iwashima <kuniyu@amazon.com>
+
+commit 22c3c0c52d32f41cc38cd936ea0c93f22ced3315 upstream.
+
+Currently, we track the number of inflight sockets in two variables.
+unix_tot_inflight is the total number of inflight AF_UNIX sockets on
+the host, and user->unix_inflight is the number of inflight fds per
+user.
+
+We update them one by one in unix_inflight(), which can be done once
+in batch.  Also, sendmsg() could fail even after unix_inflight(), then
+we need to acquire unix_gc_lock only to decrement the counters.
+
+Let's bulk update the counters in unix_add_edges() and unix_del_edges(),
+which are called only for successfully passed fds.
+
+Signed-off-by: Kuniyuki Iwashima <kuniyu@amazon.com>
+Acked-by: Paolo Abeni <pabeni@redhat.com>
+Link: https://lore.kernel.org/r/20240325202425.60930-5-kuniyu@amazon.com
+Signed-off-by: Jakub Kicinski <kuba@kernel.org>
+Signed-off-by: Lee Jones <lee@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ net/unix/garbage.c |   18 +++++++-----------
+ 1 file changed, 7 insertions(+), 11 deletions(-)
+
+--- a/net/unix/garbage.c
++++ b/net/unix/garbage.c
+@@ -144,6 +144,7 @@ static void unix_free_vertices(struct sc
+ }
+ DEFINE_SPINLOCK(unix_gc_lock);
++unsigned int unix_tot_inflight;
+ void unix_add_edges(struct scm_fp_list *fpl, struct unix_sock *receiver)
+ {
+@@ -168,7 +169,10 @@ void unix_add_edges(struct scm_fp_list *
+               unix_add_edge(fpl, edge);
+       } while (i < fpl->count_unix);
++      WRITE_ONCE(unix_tot_inflight, unix_tot_inflight + fpl->count_unix);
+ out:
++      WRITE_ONCE(fpl->user->unix_inflight, fpl->user->unix_inflight + fpl->count);
++
+       spin_unlock(&unix_gc_lock);
+       fpl->inflight = true;
+@@ -191,7 +195,10 @@ void unix_del_edges(struct scm_fp_list *
+               unix_del_edge(fpl, edge);
+       } while (i < fpl->count_unix);
++      WRITE_ONCE(unix_tot_inflight, unix_tot_inflight - fpl->count_unix);
+ out:
++      WRITE_ONCE(fpl->user->unix_inflight, fpl->user->unix_inflight - fpl->count);
++
+       spin_unlock(&unix_gc_lock);
+       fpl->inflight = false;
+@@ -234,7 +241,6 @@ void unix_destroy_fpl(struct scm_fp_list
+       unix_free_vertices(fpl);
+ }
+-unsigned int unix_tot_inflight;
+ static LIST_HEAD(gc_candidates);
+ static LIST_HEAD(gc_inflight_list);
+@@ -255,13 +261,8 @@ void unix_inflight(struct user_struct *u
+                       WARN_ON_ONCE(list_empty(&u->link));
+               }
+               u->inflight++;
+-
+-              /* Paired with READ_ONCE() in wait_for_unix_gc() */
+-              WRITE_ONCE(unix_tot_inflight, unix_tot_inflight + 1);
+       }
+-      WRITE_ONCE(user->unix_inflight, user->unix_inflight + 1);
+-
+       spin_unlock(&unix_gc_lock);
+ }
+@@ -278,13 +279,8 @@ void unix_notinflight(struct user_struct
+               u->inflight--;
+               if (!u->inflight)
+                       list_del_init(&u->link);
+-
+-              /* Paired with READ_ONCE() in wait_for_unix_gc() */
+-              WRITE_ONCE(unix_tot_inflight, unix_tot_inflight - 1);
+       }
+-      WRITE_ONCE(user->unix_inflight, user->unix_inflight - 1);
+-
+       spin_unlock(&unix_gc_lock);
+ }
diff --git a/queue-6.6/af_unix-detect-dead-scc.patch b/queue-6.6/af_unix-detect-dead-scc.patch
new file mode 100644 (file)
index 0000000..a7c720f
--- /dev/null
@@ -0,0 +1,112 @@
+From stable+bounces-145866-greg=kroah.com@vger.kernel.org Wed May 21 16:59:06 2025
+From: Lee Jones <lee@kernel.org>
+Date: Wed, 21 May 2025 14:45:27 +0000
+Subject: af_unix: Detect dead SCC.
+To: lee@kernel.org, "David S. Miller" <davem@davemloft.net>, Eric Dumazet <edumazet@google.com>, Jakub Kicinski <kuba@kernel.org>, Paolo Abeni <pabeni@redhat.com>, Kuniyuki Iwashima <kuniyu@amazon.com>, Jens Axboe <axboe@kernel.dk>, Sasha Levin <sashal@kernel.org>, Michal Luczaj <mhal@rbox.co>, Rao Shoaib <Rao.Shoaib@oracle.com>, Simon Horman <horms@kernel.org>, linux-kernel@vger.kernel.org, netdev@vger.kernel.org
+Cc: stable@vger.kernel.org
+Message-ID: <20250521144803.2050504-20-lee@kernel.org>
+
+From: Kuniyuki Iwashima <kuniyu@amazon.com>
+
+commit a15702d8b3aad8ce5268c565bd29f0e02fd2db83 upstream.
+
+When iterating SCC, we call unix_vertex_dead() for each vertex
+to check if the vertex is close()d and has no bridge to another
+SCC.
+
+If both conditions are true for every vertex in SCC, we can
+execute garbage collection for all skb in the SCC.
+
+The actual garbage collection is done in the following patch,
+replacing the old implementation.
+
+Signed-off-by: Kuniyuki Iwashima <kuniyu@amazon.com>
+Acked-by: Paolo Abeni <pabeni@redhat.com>
+Link: https://lore.kernel.org/r/20240325202425.60930-14-kuniyu@amazon.com
+Signed-off-by: Jakub Kicinski <kuba@kernel.org>
+Signed-off-by: Lee Jones <lee@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ net/unix/garbage.c |   44 +++++++++++++++++++++++++++++++++++++++++++-
+ 1 file changed, 43 insertions(+), 1 deletion(-)
+
+--- a/net/unix/garbage.c
++++ b/net/unix/garbage.c
+@@ -289,6 +289,39 @@ void unix_destroy_fpl(struct scm_fp_list
+       unix_free_vertices(fpl);
+ }
++static bool unix_vertex_dead(struct unix_vertex *vertex)
++{
++      struct unix_edge *edge;
++      struct unix_sock *u;
++      long total_ref;
++
++      list_for_each_entry(edge, &vertex->edges, vertex_entry) {
++              struct unix_vertex *next_vertex = unix_edge_successor(edge);
++
++              /* The vertex's fd can be received by a non-inflight socket. */
++              if (!next_vertex)
++                      return false;
++
++              /* The vertex's fd can be received by an inflight socket in
++               * another SCC.
++               */
++              if (next_vertex->scc_index != vertex->scc_index)
++                      return false;
++      }
++
++      /* No receiver exists out of the same SCC. */
++
++      edge = list_first_entry(&vertex->edges, typeof(*edge), vertex_entry);
++      u = edge->predecessor;
++      total_ref = file_count(u->sk.sk_socket->file);
++
++      /* If not close()d, total_ref > out_degree. */
++      if (total_ref != vertex->out_degree)
++              return false;
++
++      return true;
++}
++
+ static bool unix_scc_cyclic(struct list_head *scc)
+ {
+       struct unix_vertex *vertex;
+@@ -377,6 +410,7 @@ prev_vertex:
+       if (vertex->index == vertex->scc_index) {
+               struct list_head scc;
++              bool scc_dead = true;
+               /* SCC finalised.
+                *
+@@ -391,6 +425,9 @@ prev_vertex:
+                       /* Mark vertex as off-stack. */
+                       vertex->index = unix_vertex_grouped_index;
++
++                      if (scc_dead)
++                              scc_dead = unix_vertex_dead(vertex);
+               }
+               if (!unix_graph_maybe_cyclic)
+@@ -431,13 +468,18 @@ static void unix_walk_scc_fast(void)
+       while (!list_empty(&unix_unvisited_vertices)) {
+               struct unix_vertex *vertex;
+               struct list_head scc;
++              bool scc_dead = true;
+               vertex = list_first_entry(&unix_unvisited_vertices, typeof(*vertex), entry);
+               list_add(&scc, &vertex->scc_entry);
+-              list_for_each_entry_reverse(vertex, &scc, scc_entry)
++              list_for_each_entry_reverse(vertex, &scc, scc_entry) {
+                       list_move_tail(&vertex->entry, &unix_visited_vertices);
++                      if (scc_dead)
++                              scc_dead = unix_vertex_dead(vertex);
++              }
++
+               list_del(&scc);
+       }
diff --git a/queue-6.6/af_unix-detect-strongly-connected-components.patch b/queue-6.6/af_unix-detect-strongly-connected-components.patch
new file mode 100644 (file)
index 0000000..68ff173
--- /dev/null
@@ -0,0 +1,198 @@
+From stable+bounces-145859-greg=kroah.com@vger.kernel.org Wed May 21 16:55:45 2025
+From: Lee Jones <lee@kernel.org>
+Date: Wed, 21 May 2025 14:45:20 +0000
+Subject: af_unix: Detect Strongly Connected Components.
+To: lee@kernel.org, "David S. Miller" <davem@davemloft.net>, Eric Dumazet <edumazet@google.com>, Jakub Kicinski <kuba@kernel.org>, Paolo Abeni <pabeni@redhat.com>, Kuniyuki Iwashima <kuniyu@amazon.com>, Jens Axboe <axboe@kernel.dk>, Sasha Levin <sashal@kernel.org>, Michal Luczaj <mhal@rbox.co>, Rao Shoaib <Rao.Shoaib@oracle.com>, Simon Horman <horms@kernel.org>, linux-kernel@vger.kernel.org, netdev@vger.kernel.org
+Cc: stable@vger.kernel.org
+Message-ID: <20250521144803.2050504-13-lee@kernel.org>
+
+From: Kuniyuki Iwashima <kuniyu@amazon.com>
+
+commit 3484f063172dd88776b062046d721d7c2ae1af7c upstream.
+
+In the new GC, we use a simple graph algorithm, Tarjan's Strongly
+Connected Components (SCC) algorithm, to find cyclic references.
+
+The algorithm visits every vertex exactly once using depth-first
+search (DFS).
+
+DFS starts by pushing an input vertex to a stack and assigning it
+a unique number.  Two fields, index and lowlink, are initialised
+with the number, but lowlink could be updated later during DFS.
+
+If a vertex has an edge to an unvisited inflight vertex, we visit
+it and do the same processing.  So, we will have vertices in the
+stack in the order they appear and number them consecutively in
+the same order.
+
+If a vertex has a back-edge to a visited vertex in the stack,
+we update the predecessor's lowlink with the successor's index.
+
+After iterating edges from the vertex, we check if its index
+equals its lowlink.
+
+If the lowlink is different from the index, it shows there was a
+back-edge.  Then, we go backtracking and propagate the lowlink to
+its predecessor and resume the previous edge iteration from the
+next edge.
+
+If the lowlink is the same as the index, we pop vertices before
+and including the vertex from the stack.  Then, the set of vertices
+is SCC, possibly forming a cycle.  At the same time, we move the
+vertices to unix_visited_vertices.
+
+When we finish the algorithm, all vertices in each SCC will be
+linked via unix_vertex.scc_entry.
+
+Let's take an example.  We have a graph including five inflight
+vertices (F is not inflight):
+
+  A -> B -> C -> D -> E (-> F)
+       ^         |
+       `---------'
+
+Suppose that we start DFS from C.  We will visit C, D, and B first
+and initialise their index and lowlink.  Then, the stack looks like
+this:
+
+  > B = (3, 3)  (index, lowlink)
+    D = (2, 2)
+    C = (1, 1)
+
+When checking B's edge to C, we update B's lowlink with C's index
+and propagate it to D.
+
+    B = (3, 1)  (index, lowlink)
+  > D = (2, 1)
+    C = (1, 1)
+
+Next, we visit E, which has no edge to an inflight vertex.
+
+  > E = (4, 4)  (index, lowlink)
+    B = (3, 1)
+    D = (2, 1)
+    C = (1, 1)
+
+When we leave from E, its index and lowlink are the same, so we
+pop E from the stack as single-vertex SCC.  Next, we leave from
+B and D but do nothing because their lowlinks are different from
+their indices.
+
+    B = (3, 1)  (index, lowlink)
+    D = (2, 1)
+  > C = (1, 1)
+
+Then, we leave from C, whose index and lowlink are the same, so
+we pop B, D and C as SCC.
+
+Last, we do DFS for the rest of vertices, A, which is also a
+single-vertex SCC.
+
+Finally, each unix_vertex.scc_entry is linked as follows:
+
+  A -.  B -> C -> D  E -.
+  ^  |  ^         |  ^  |
+  `--'  `---------'  `--'
+
+We use SCC later to decide whether we can garbage-collect the
+sockets.
+
+Note that we still cannot detect SCC properly if an edge points
+to an embryo socket.  The following two patches will sort it out.
+
+Signed-off-by: Kuniyuki Iwashima <kuniyu@amazon.com>
+Acked-by: Paolo Abeni <pabeni@redhat.com>
+Link: https://lore.kernel.org/r/20240325202425.60930-7-kuniyu@amazon.com
+Signed-off-by: Jakub Kicinski <kuba@kernel.org>
+Signed-off-by: Lee Jones <lee@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ include/net/af_unix.h |    3 +++
+ net/unix/garbage.c    |   46 ++++++++++++++++++++++++++++++++++++++++++++--
+ 2 files changed, 47 insertions(+), 2 deletions(-)
+
+--- a/include/net/af_unix.h
++++ b/include/net/af_unix.h
+@@ -32,8 +32,11 @@ void wait_for_unix_gc(struct scm_fp_list
+ struct unix_vertex {
+       struct list_head edges;
+       struct list_head entry;
++      struct list_head scc_entry;
+       unsigned long out_degree;
+       unsigned long index;
++      unsigned long lowlink;
++      bool on_stack;
+ };
+ struct unix_edge {
+--- a/net/unix/garbage.c
++++ b/net/unix/garbage.c
+@@ -251,11 +251,19 @@ static LIST_HEAD(unix_visited_vertices);
+ static void __unix_walk_scc(struct unix_vertex *vertex)
+ {
+       unsigned long index = UNIX_VERTEX_INDEX_START;
++      LIST_HEAD(vertex_stack);
+       struct unix_edge *edge;
+       LIST_HEAD(edge_stack);
+ next_vertex:
++      /* Push vertex to vertex_stack.
++       * The vertex will be popped when finalising SCC later.
++       */
++      vertex->on_stack = true;
++      list_add(&vertex->scc_entry, &vertex_stack);
++
+       vertex->index = index;
++      vertex->lowlink = index;
+       index++;
+       /* Explore neighbour vertices (receivers of the current vertex's fd). */
+@@ -283,12 +291,46 @@ prev_vertex:
+                       edge = list_first_entry(&edge_stack, typeof(*edge), stack_entry);
+                       list_del_init(&edge->stack_entry);
++                      next_vertex = vertex;
+                       vertex = edge->predecessor->vertex;
++
++                      /* If the successor has a smaller lowlink, two vertices
++                       * are in the same SCC, so propagate the smaller lowlink
++                       * to skip SCC finalisation.
++                       */
++                      vertex->lowlink = min(vertex->lowlink, next_vertex->lowlink);
++              } else if (next_vertex->on_stack) {
++                      /* Loop detected by a back/cross edge.
++                       *
++                       * The successor is on vertex_stack, so two vertices are
++                       * in the same SCC.  If the successor has a smaller index,
++                       * propagate it to skip SCC finalisation.
++                       */
++                      vertex->lowlink = min(vertex->lowlink, next_vertex->index);
++              } else {
++                      /* The successor was already grouped as another SCC */
+               }
+       }
+-      /* Don't restart DFS from this vertex in unix_walk_scc(). */
+-      list_move_tail(&vertex->entry, &unix_visited_vertices);
++      if (vertex->index == vertex->lowlink) {
++              struct list_head scc;
++
++              /* SCC finalised.
++               *
++               * If the lowlink was not updated, all the vertices above on
++               * vertex_stack are in the same SCC.  Group them using scc_entry.
++               */
++              __list_cut_position(&scc, &vertex_stack, &vertex->scc_entry);
++
++              list_for_each_entry_reverse(vertex, &scc, scc_entry) {
++                      /* Don't restart DFS from this vertex in unix_walk_scc(). */
++                      list_move_tail(&vertex->entry, &unix_visited_vertices);
++
++                      vertex->on_stack = false;
++              }
++
++              list_del(&scc);
++      }
+       /* Need backtracking ? */
+       if (!list_empty(&edge_stack))
diff --git a/queue-6.6/af_unix-don-t-access-successor-in-unix_del_edges-during-gc.patch b/queue-6.6/af_unix-don-t-access-successor-in-unix_del_edges-during-gc.patch
new file mode 100644 (file)
index 0000000..f7ba044
--- /dev/null
@@ -0,0 +1,230 @@
+From stable+bounces-145870-greg=kroah.com@vger.kernel.org Wed May 21 17:01:08 2025
+From: Lee Jones <lee@kernel.org>
+Date: Wed, 21 May 2025 14:45:31 +0000
+Subject: af_unix: Don't access successor in unix_del_edges() during GC.
+To: lee@kernel.org, "David S. Miller" <davem@davemloft.net>, Eric Dumazet <edumazet@google.com>, Jakub Kicinski <kuba@kernel.org>, Paolo Abeni <pabeni@redhat.com>, Kuniyuki Iwashima <kuniyu@amazon.com>, Jens Axboe <axboe@kernel.dk>, Sasha Levin <sashal@kernel.org>, Michal Luczaj <mhal@rbox.co>, Rao Shoaib <Rao.Shoaib@oracle.com>, Simon Horman <horms@kernel.org>, linux-kernel@vger.kernel.org, netdev@vger.kernel.org
+Cc: stable@vger.kernel.org, syzbot+f3f3eef1d2100200e593@syzkaller.appspotmail.com
+Message-ID: <20250521144803.2050504-24-lee@kernel.org>
+
+From: Kuniyuki Iwashima <kuniyu@amazon.com>
+
+commit 1af2dface5d286dd1f2f3405a0d6fa9f2c8fb998 upstream.
+
+syzbot reported use-after-free in unix_del_edges().  [0]
+
+What the repro does is basically repeat the following quickly.
+
+  1. pass a fd of an AF_UNIX socket to itself
+
+    socketpair(AF_UNIX, SOCK_DGRAM, 0, [3, 4]) = 0
+    sendmsg(3, {..., msg_control=[{cmsg_len=20, cmsg_level=SOL_SOCKET,
+                                   cmsg_type=SCM_RIGHTS, cmsg_data=[4]}], ...}, 0) = 0
+
+  2. pass other fds of AF_UNIX sockets to the socket above
+
+    socketpair(AF_UNIX, SOCK_SEQPACKET, 0, [5, 6]) = 0
+    sendmsg(3, {..., msg_control=[{cmsg_len=48, cmsg_level=SOL_SOCKET,
+                                   cmsg_type=SCM_RIGHTS, cmsg_data=[5, 6]}], ...}, 0) = 0
+
+  3. close all sockets
+
+Here, two skb are created, and every unix_edge->successor is the first
+socket.  Then, __unix_gc() will garbage-collect the two skb:
+
+  (a) free skb with self-referencing fd
+  (b) free skb holding other sockets
+
+After (a), the self-referencing socket will be scheduled to be freed
+later by the delayed_fput() task.
+
+syzbot repeated the sequences above (1. ~ 3.) quickly and triggered
+the task concurrently while GC was running.
+
+So, at (b), the socket was already freed, and accessing it was illegal.
+
+unix_del_edges() accesses the receiver socket as edge->successor to
+optimise GC.  However, we should not do it during GC.
+
+Garbage-collecting sockets does not change the shape of the rest
+of the graph, so we need not call unix_update_graph() to update
+unix_graph_grouped when we purge skb.
+
+However, if we clean up all loops in the unix_walk_scc_fast() path,
+unix_graph_maybe_cyclic remains unchanged (true), and __unix_gc()
+will call unix_walk_scc_fast() continuously even though there is no
+socket to garbage-collect.
+
+To keep that optimisation while fixing UAF, let's add the same
+updating logic of unix_graph_maybe_cyclic in unix_walk_scc_fast()
+as done in unix_walk_scc() and __unix_walk_scc().
+
+Note that when unix_del_edges() is called from other places, the
+receiver socket is always alive:
+
+  - sendmsg: the successor's sk_refcnt is bumped by sock_hold()
+             unix_find_other() for SOCK_DGRAM, connect() for SOCK_STREAM
+
+  - recvmsg: the successor is the receiver, and its fd is alive
+
+[0]:
+BUG: KASAN: slab-use-after-free in unix_edge_successor net/unix/garbage.c:109 [inline]
+BUG: KASAN: slab-use-after-free in unix_del_edge net/unix/garbage.c:165 [inline]
+BUG: KASAN: slab-use-after-free in unix_del_edges+0x148/0x630 net/unix/garbage.c:237
+Read of size 8 at addr ffff888079c6e640 by task kworker/u8:6/1099
+
+CPU: 0 PID: 1099 Comm: kworker/u8:6 Not tainted 6.9.0-rc4-next-20240418-syzkaller #0
+Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 03/27/2024
+Workqueue: events_unbound __unix_gc
+Call Trace:
+ <TASK>
+ __dump_stack lib/dump_stack.c:88 [inline]
+ dump_stack_lvl+0x241/0x360 lib/dump_stack.c:114
+ print_address_description mm/kasan/report.c:377 [inline]
+ print_report+0x169/0x550 mm/kasan/report.c:488
+ kasan_report+0x143/0x180 mm/kasan/report.c:601
+ unix_edge_successor net/unix/garbage.c:109 [inline]
+ unix_del_edge net/unix/garbage.c:165 [inline]
+ unix_del_edges+0x148/0x630 net/unix/garbage.c:237
+ unix_destroy_fpl+0x59/0x210 net/unix/garbage.c:298
+ unix_detach_fds net/unix/af_unix.c:1811 [inline]
+ unix_destruct_scm+0x13e/0x210 net/unix/af_unix.c:1826
+ skb_release_head_state+0x100/0x250 net/core/skbuff.c:1127
+ skb_release_all net/core/skbuff.c:1138 [inline]
+ __kfree_skb net/core/skbuff.c:1154 [inline]
+ kfree_skb_reason+0x16d/0x3b0 net/core/skbuff.c:1190
+ __skb_queue_purge_reason include/linux/skbuff.h:3251 [inline]
+ __skb_queue_purge include/linux/skbuff.h:3256 [inline]
+ __unix_gc+0x1732/0x1830 net/unix/garbage.c:575
+ process_one_work kernel/workqueue.c:3218 [inline]
+ process_scheduled_works+0xa2c/0x1830 kernel/workqueue.c:3299
+ worker_thread+0x86d/0xd70 kernel/workqueue.c:3380
+ kthread+0x2f0/0x390 kernel/kthread.c:389
+ ret_from_fork+0x4b/0x80 arch/x86/kernel/process.c:147
+ ret_from_fork_asm+0x1a/0x30 arch/x86/entry/entry_64.S:244
+ </TASK>
+
+Allocated by task 14427:
+ kasan_save_stack mm/kasan/common.c:47 [inline]
+ kasan_save_track+0x3f/0x80 mm/kasan/common.c:68
+ unpoison_slab_object mm/kasan/common.c:312 [inline]
+ __kasan_slab_alloc+0x66/0x80 mm/kasan/common.c:338
+ kasan_slab_alloc include/linux/kasan.h:201 [inline]
+ slab_post_alloc_hook mm/slub.c:3897 [inline]
+ slab_alloc_node mm/slub.c:3957 [inline]
+ kmem_cache_alloc_noprof+0x135/0x290 mm/slub.c:3964
+ sk_prot_alloc+0x58/0x210 net/core/sock.c:2074
+ sk_alloc+0x38/0x370 net/core/sock.c:2133
+ unix_create1+0xb4/0x770
+ unix_create+0x14e/0x200 net/unix/af_unix.c:1034
+ __sock_create+0x490/0x920 net/socket.c:1571
+ sock_create net/socket.c:1622 [inline]
+ __sys_socketpair+0x33e/0x720 net/socket.c:1773
+ __do_sys_socketpair net/socket.c:1822 [inline]
+ __se_sys_socketpair net/socket.c:1819 [inline]
+ __x64_sys_socketpair+0x9b/0xb0 net/socket.c:1819
+ do_syscall_x64 arch/x86/entry/common.c:52 [inline]
+ do_syscall_64+0xf5/0x240 arch/x86/entry/common.c:83
+ entry_SYSCALL_64_after_hwframe+0x77/0x7f
+
+Freed by task 1805:
+ kasan_save_stack mm/kasan/common.c:47 [inline]
+ kasan_save_track+0x3f/0x80 mm/kasan/common.c:68
+ kasan_save_free_info+0x40/0x50 mm/kasan/generic.c:579
+ poison_slab_object+0xe0/0x150 mm/kasan/common.c:240
+ __kasan_slab_free+0x37/0x60 mm/kasan/common.c:256
+ kasan_slab_free include/linux/kasan.h:184 [inline]
+ slab_free_hook mm/slub.c:2190 [inline]
+ slab_free mm/slub.c:4393 [inline]
+ kmem_cache_free+0x145/0x340 mm/slub.c:4468
+ sk_prot_free net/core/sock.c:2114 [inline]
+ __sk_destruct+0x467/0x5f0 net/core/sock.c:2208
+ sock_put include/net/sock.h:1948 [inline]
+ unix_release_sock+0xa8b/0xd20 net/unix/af_unix.c:665
+ unix_release+0x91/0xc0 net/unix/af_unix.c:1049
+ __sock_release net/socket.c:659 [inline]
+ sock_close+0xbc/0x240 net/socket.c:1421
+ __fput+0x406/0x8b0 fs/file_table.c:422
+ delayed_fput+0x59/0x80 fs/file_table.c:445
+ process_one_work kernel/workqueue.c:3218 [inline]
+ process_scheduled_works+0xa2c/0x1830 kernel/workqueue.c:3299
+ worker_thread+0x86d/0xd70 kernel/workqueue.c:3380
+ kthread+0x2f0/0x390 kernel/kthread.c:389
+ ret_from_fork+0x4b/0x80 arch/x86/kernel/process.c:147
+ ret_from_fork_asm+0x1a/0x30 arch/x86/entry/entry_64.S:244
+
+The buggy address belongs to the object at ffff888079c6e000
+ which belongs to the cache UNIX of size 1920
+The buggy address is located 1600 bytes inside of
+ freed 1920-byte region [ffff888079c6e000, ffff888079c6e780)
+
+Reported-by: syzbot+f3f3eef1d2100200e593@syzkaller.appspotmail.com
+Closes: https://syzkaller.appspot.com/bug?extid=f3f3eef1d2100200e593
+Fixes: 77e5593aebba ("af_unix: Skip GC if no cycle exists.")
+Fixes: fd86344823b5 ("af_unix: Try not to hold unix_gc_lock during accept().")
+Signed-off-by: Kuniyuki Iwashima <kuniyu@amazon.com>
+Link: https://lore.kernel.org/r/20240419235102.31707-1-kuniyu@amazon.com
+Signed-off-by: Paolo Abeni <pabeni@redhat.com>
+Signed-off-by: Lee Jones <lee@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ net/unix/garbage.c |   17 ++++++++++++-----
+ 1 file changed, 12 insertions(+), 5 deletions(-)
+
+--- a/net/unix/garbage.c
++++ b/net/unix/garbage.c
+@@ -158,11 +158,14 @@ static void unix_add_edge(struct scm_fp_
+       unix_update_graph(unix_edge_successor(edge));
+ }
++static bool gc_in_progress;
++
+ static void unix_del_edge(struct scm_fp_list *fpl, struct unix_edge *edge)
+ {
+       struct unix_vertex *vertex = edge->predecessor->vertex;
+-      unix_update_graph(unix_edge_successor(edge));
++      if (!gc_in_progress)
++              unix_update_graph(unix_edge_successor(edge));
+       list_del(&edge->vertex_entry);
+       vertex->out_degree--;
+@@ -237,8 +240,10 @@ void unix_del_edges(struct scm_fp_list *
+               unix_del_edge(fpl, edge);
+       } while (i < fpl->count_unix);
+-      receiver = fpl->edges[0].successor;
+-      receiver->scm_stat.nr_unix_fds -= fpl->count_unix;
++      if (!gc_in_progress) {
++              receiver = fpl->edges[0].successor;
++              receiver->scm_stat.nr_unix_fds -= fpl->count_unix;
++      }
+       WRITE_ONCE(unix_tot_inflight, unix_tot_inflight - fpl->count_unix);
+ out:
+       WRITE_ONCE(fpl->user->unix_inflight, fpl->user->unix_inflight - fpl->count);
+@@ -526,6 +531,8 @@ static void unix_walk_scc(struct sk_buff
+ static void unix_walk_scc_fast(struct sk_buff_head *hitlist)
+ {
++      unix_graph_maybe_cyclic = false;
++
+       while (!list_empty(&unix_unvisited_vertices)) {
+               struct unix_vertex *vertex;
+               struct list_head scc;
+@@ -543,6 +550,8 @@ static void unix_walk_scc_fast(struct sk
+               if (scc_dead)
+                       unix_collect_skb(&scc, hitlist);
++              else if (!unix_graph_maybe_cyclic)
++                      unix_graph_maybe_cyclic = unix_scc_cyclic(&scc);
+               list_del(&scc);
+       }
+@@ -550,8 +559,6 @@ static void unix_walk_scc_fast(struct sk
+       list_replace_init(&unix_visited_vertices, &unix_unvisited_vertices);
+ }
+-static bool gc_in_progress;
+-
+ static void __unix_gc(struct work_struct *work)
+ {
+       struct sk_buff_head hitlist;
diff --git a/queue-6.6/af_unix-fix-garbage-collection-of-embryos-carrying-oob-with-scm_rights.patch b/queue-6.6/af_unix-fix-garbage-collection-of-embryos-carrying-oob-with-scm_rights.patch
new file mode 100644 (file)
index 0000000..782f945
--- /dev/null
@@ -0,0 +1,99 @@
+From stable+bounces-145872-greg=kroah.com@vger.kernel.org Wed May 21 17:02:13 2025
+From: Lee Jones <lee@kernel.org>
+Date: Wed, 21 May 2025 14:45:33 +0000
+Subject: af_unix: Fix garbage collection of embryos carrying OOB with SCM_RIGHTS
+To: lee@kernel.org, "David S. Miller" <davem@davemloft.net>, Eric Dumazet <edumazet@google.com>, Jakub Kicinski <kuba@kernel.org>, Paolo Abeni <pabeni@redhat.com>, Kuniyuki Iwashima <kuniyu@amazon.com>, Jens Axboe <axboe@kernel.dk>, Sasha Levin <sashal@kernel.org>, Michal Luczaj <mhal@rbox.co>, Rao Shoaib <Rao.Shoaib@oracle.com>, Pavel Begunkov <asml.silence@gmail.com>, linux-kernel@vger.kernel.org, netdev@vger.kernel.org
+Cc: stable@vger.kernel.org
+Message-ID: <20250521144803.2050504-26-lee@kernel.org>
+
+From: Michal Luczaj <mhal@rbox.co>
+
+commit 041933a1ec7b4173a8e638cae4f8e394331d7e54 upstream.
+
+GC attempts to explicitly drop oob_skb's reference before purging the hit
+list.
+
+The problem is with embryos: kfree_skb(u->oob_skb) is never called on an
+embryo socket.
+
+The python script below [0] sends a listener's fd to its embryo as OOB
+data.  While GC does collect the embryo's queue, it fails to drop the OOB
+skb's refcount.  The skb which was in embryo's receive queue stays as
+unix_sk(sk)->oob_skb and keeps the listener's refcount [1].
+
+Tell GC to dispose of the embryo's oob_skb.
+
+[0]:
+from array import array
+from socket import *
+
+addr = '\x00unix-oob'
+lis = socket(AF_UNIX, SOCK_STREAM)
+lis.bind(addr)
+lis.listen(1)
+
+s = socket(AF_UNIX, SOCK_STREAM)
+s.connect(addr)
+scm = (SOL_SOCKET, SCM_RIGHTS, array('i', [lis.fileno()]))
+s.sendmsg([b'x'], [scm], MSG_OOB)
+lis.close()
+
+[1]
+$ grep unix-oob /proc/net/unix
+$ ./unix-oob.py
+$ grep unix-oob /proc/net/unix
+0000000000000000: 00000002 00000000 00000000 0001 02     0 @unix-oob
+0000000000000000: 00000002 00000000 00010000 0001 01  6072 @unix-oob
+
+Fixes: 4090fa373f0e ("af_unix: Replace garbage collection algorithm.")
+Signed-off-by: Michal Luczaj <mhal@rbox.co>
+Reviewed-by: Kuniyuki Iwashima <kuniyu@amazon.com>
+Signed-off-by: Paolo Abeni <pabeni@redhat.com>
+Signed-off-by: Lee Jones <lee@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ net/unix/garbage.c |   23 ++++++++++++++---------
+ 1 file changed, 14 insertions(+), 9 deletions(-)
+
+--- a/net/unix/garbage.c
++++ b/net/unix/garbage.c
+@@ -342,6 +342,18 @@ enum unix_recv_queue_lock_class {
+       U_RECVQ_LOCK_EMBRYO,
+ };
++static void unix_collect_queue(struct unix_sock *u, struct sk_buff_head *hitlist)
++{
++      skb_queue_splice_init(&u->sk.sk_receive_queue, hitlist);
++
++#if IS_ENABLED(CONFIG_AF_UNIX_OOB)
++      if (u->oob_skb) {
++              WARN_ON_ONCE(skb_unref(u->oob_skb));
++              u->oob_skb = NULL;
++      }
++#endif
++}
++
+ static void unix_collect_skb(struct list_head *scc, struct sk_buff_head *hitlist)
+ {
+       struct unix_vertex *vertex;
+@@ -365,18 +377,11 @@ static void unix_collect_skb(struct list
+                               /* listener -> embryo order, the inversion never happens. */
+                               spin_lock_nested(&embryo_queue->lock, U_RECVQ_LOCK_EMBRYO);
+-                              skb_queue_splice_init(embryo_queue, hitlist);
++                              unix_collect_queue(unix_sk(skb->sk), hitlist);
+                               spin_unlock(&embryo_queue->lock);
+                       }
+               } else {
+-                      skb_queue_splice_init(queue, hitlist);
+-
+-#if IS_ENABLED(CONFIG_AF_UNIX_OOB)
+-                      if (u->oob_skb) {
+-                              kfree_skb(u->oob_skb);
+-                              u->oob_skb = NULL;
+-                      }
+-#endif
++                      unix_collect_queue(u, hitlist);
+               }
+               spin_unlock(&queue->lock);
diff --git a/queue-6.6/af_unix-fix-uninit-value-in-__unix_walk_scc.patch b/queue-6.6/af_unix-fix-uninit-value-in-__unix_walk_scc.patch
new file mode 100644 (file)
index 0000000..8f8d3a7
--- /dev/null
@@ -0,0 +1,106 @@
+From stable+bounces-145873-greg=kroah.com@vger.kernel.org Wed May 21 17:02:42 2025
+From: Lee Jones <lee@kernel.org>
+Date: Wed, 21 May 2025 14:45:34 +0000
+Subject: af_unix: Fix uninit-value in __unix_walk_scc()
+To: lee@kernel.org, "David S. Miller" <davem@davemloft.net>, Eric Dumazet <edumazet@google.com>, Jakub Kicinski <kuba@kernel.org>, Paolo Abeni <pabeni@redhat.com>, Kuniyuki Iwashima <kuniyu@amazon.com>, Jens Axboe <axboe@kernel.dk>, Sasha Levin <sashal@kernel.org>, Michal Luczaj <mhal@rbox.co>, Rao Shoaib <Rao.Shoaib@oracle.com>, Simon Horman <horms@kernel.org>, linux-kernel@vger.kernel.org, netdev@vger.kernel.org
+Cc: stable@vger.kernel.org, Shigeru Yoshida <syoshida@redhat.com>, syzkaller <syzkaller@googlegroups.com>
+Message-ID: <20250521144803.2050504-27-lee@kernel.org>
+
+From: Shigeru Yoshida <syoshida@redhat.com>
+
+commit 927fa5b3e4f52e0967bfc859afc98ad1c523d2d5 upstream.
+
+KMSAN reported uninit-value access in __unix_walk_scc() [1].
+
+In the list_for_each_entry_reverse() loop, when the vertex's index
+equals its scc_index, the loop uses the variable vertex as a
+temporary variable that points to a vertex in scc. And when the loop
+is finished, the variable vertex points to the list head, in this case
+scc, which is a local variable on the stack (more precisely, it's not
+even scc and might underflow the call stack of __unix_walk_scc():
+container_of(&scc, struct unix_vertex, scc_entry)).
+
+However, the variable vertex is used under the label prev_vertex. So
+if the edge_stack is not empty and the function jumps to the
+prev_vertex label, the function will access invalid data on the
+stack. This causes the uninit-value access issue.
+
+Fix this by introducing a new temporary variable for the loop.
+
+[1]
+BUG: KMSAN: uninit-value in __unix_walk_scc net/unix/garbage.c:478 [inline]
+BUG: KMSAN: uninit-value in unix_walk_scc net/unix/garbage.c:526 [inline]
+BUG: KMSAN: uninit-value in __unix_gc+0x2589/0x3c20 net/unix/garbage.c:584
+ __unix_walk_scc net/unix/garbage.c:478 [inline]
+ unix_walk_scc net/unix/garbage.c:526 [inline]
+ __unix_gc+0x2589/0x3c20 net/unix/garbage.c:584
+ process_one_work kernel/workqueue.c:3231 [inline]
+ process_scheduled_works+0xade/0x1bf0 kernel/workqueue.c:3312
+ worker_thread+0xeb6/0x15b0 kernel/workqueue.c:3393
+ kthread+0x3c4/0x530 kernel/kthread.c:389
+ ret_from_fork+0x6e/0x90 arch/x86/kernel/process.c:147
+ ret_from_fork_asm+0x1a/0x30 arch/x86/entry/entry_64.S:244
+
+Uninit was stored to memory at:
+ unix_walk_scc net/unix/garbage.c:526 [inline]
+ __unix_gc+0x2adf/0x3c20 net/unix/garbage.c:584
+ process_one_work kernel/workqueue.c:3231 [inline]
+ process_scheduled_works+0xade/0x1bf0 kernel/workqueue.c:3312
+ worker_thread+0xeb6/0x15b0 kernel/workqueue.c:3393
+ kthread+0x3c4/0x530 kernel/kthread.c:389
+ ret_from_fork+0x6e/0x90 arch/x86/kernel/process.c:147
+ ret_from_fork_asm+0x1a/0x30 arch/x86/entry/entry_64.S:244
+
+Local variable entries created at:
+ ref_tracker_free+0x48/0xf30 lib/ref_tracker.c:222
+ netdev_tracker_free include/linux/netdevice.h:4058 [inline]
+ netdev_put include/linux/netdevice.h:4075 [inline]
+ dev_put include/linux/netdevice.h:4101 [inline]
+ update_gid_event_work_handler+0xaa/0x1b0 drivers/infiniband/core/roce_gid_mgmt.c:813
+
+CPU: 1 PID: 12763 Comm: kworker/u8:31 Not tainted 6.10.0-rc4-00217-g35bb670d65fc #32
+Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.16.3-2.fc40 04/01/2014
+Workqueue: events_unbound __unix_gc
+
+Fixes: 3484f063172d ("af_unix: Detect Strongly Connected Components.")
+Reported-by: syzkaller <syzkaller@googlegroups.com>
+Signed-off-by: Shigeru Yoshida <syoshida@redhat.com>
+Reviewed-by: Kuniyuki Iwashima <kuniyu@amazon.com>
+Link: https://patch.msgid.link/20240702160428.10153-1-syoshida@redhat.com
+Signed-off-by: Jakub Kicinski <kuba@kernel.org>
+Signed-off-by: Lee Jones <lee@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ net/unix/garbage.c |    9 +++++----
+ 1 file changed, 5 insertions(+), 4 deletions(-)
+
+--- a/net/unix/garbage.c
++++ b/net/unix/garbage.c
+@@ -476,6 +476,7 @@ prev_vertex:
+       }
+       if (vertex->index == vertex->scc_index) {
++              struct unix_vertex *v;
+               struct list_head scc;
+               bool scc_dead = true;
+@@ -486,15 +487,15 @@ prev_vertex:
+                */
+               __list_cut_position(&scc, &vertex_stack, &vertex->scc_entry);
+-              list_for_each_entry_reverse(vertex, &scc, scc_entry) {
++              list_for_each_entry_reverse(v, &scc, scc_entry) {
+                       /* Don't restart DFS from this vertex in unix_walk_scc(). */
+-                      list_move_tail(&vertex->entry, &unix_visited_vertices);
++                      list_move_tail(&v->entry, &unix_visited_vertices);
+                       /* Mark vertex as off-stack. */
+-                      vertex->index = unix_vertex_grouped_index;
++                      v->index = unix_vertex_grouped_index;
+                       if (scc_dead)
+-                              scc_dead = unix_vertex_dead(vertex);
++                              scc_dead = unix_vertex_dead(v);
+               }
+               if (scc_dead)
diff --git a/queue-6.6/af_unix-fix-up-unix_edge.successor-for-embryo-socket.patch b/queue-6.6/af_unix-fix-up-unix_edge.successor-for-embryo-socket.patch
new file mode 100644 (file)
index 0000000..1c256c3
--- /dev/null
@@ -0,0 +1,134 @@
+From stable+bounces-145861-greg=kroah.com@vger.kernel.org Wed May 21 16:54:34 2025
+From: Lee Jones <lee@kernel.org>
+Date: Wed, 21 May 2025 14:45:22 +0000
+Subject: af_unix: Fix up unix_edge.successor for embryo socket.
+To: lee@kernel.org, "David S. Miller" <davem@davemloft.net>, Eric Dumazet <edumazet@google.com>, Jakub Kicinski <kuba@kernel.org>, Paolo Abeni <pabeni@redhat.com>, Kuniyuki Iwashima <kuniyu@amazon.com>, Jens Axboe <axboe@kernel.dk>, Sasha Levin <sashal@kernel.org>, Michal Luczaj <mhal@rbox.co>, Rao Shoaib <Rao.Shoaib@oracle.com>, Pavel Begunkov <asml.silence@gmail.com>, linux-kernel@vger.kernel.org, netdev@vger.kernel.org
+Cc: stable@vger.kernel.org
+Message-ID: <20250521144803.2050504-15-lee@kernel.org>
+
+From: Kuniyuki Iwashima <kuniyu@amazon.com>
+
+commit dcf70df2048d27c5d186f013f101a4aefd63aa41 upstream.
+
+To garbage collect inflight AF_UNIX sockets, we must define the
+cyclic reference appropriately.  This is a bit tricky if the loop
+consists of embryo sockets.
+
+Suppose that the fd of AF_UNIX socket A is passed to D and the fd of B
+to C, and that C and D are embryo sockets of A and B, respectively.
+It may appear that there are two separate graphs, A (-> D) and
+B (-> C), but this is not correct.
+
+     A --. .-- B
+          X
+     C <-' `-> D
+
+Now, D holds A's refcount, and C has B's refcount, so unix_release()
+will never be called for A and B when we close() them.  However, no
+one can call close() for D and C to free skbs holding refcounts of A
+and B because C/D is in A/B's receive queue, which should have been
+purged by unix_release() for A and B.
+
+So, here's another type of cyclic reference.  When a fd of an AF_UNIX
+socket is passed to an embryo socket, the reference is indirectly held
+by its parent listening socket.
+
+  .-> A                            .-> B
+  |   `- sk_receive_queue          |   `- sk_receive_queue
+  |      `- skb                    |      `- skb
+  |         `- sk == C             |         `- sk == D
+  |            `- sk_receive_queue |           `- sk_receive_queue
+  |               `- skb +---------'               `- skb +-.
+  |                                                         |
+  `---------------------------------------------------------'
+
+Technically, the graph must be denoted as A <-> B instead of A (-> D)
+and B (-> C) to find such a cyclic reference without touching each
+socket's receive queue.
+
+  .-> A --. .-- B <-.
+  |        X        |  ==  A <-> B
+  `-- C <-' `-> D --'
+
+We apply this fixup during GC by fetching the real successor by
+unix_edge_successor().
+
+When we call accept(), we clear unix_sock.listener under unix_gc_lock
+not to confuse GC.
+
+Signed-off-by: Kuniyuki Iwashima <kuniyu@amazon.com>
+Acked-by: Paolo Abeni <pabeni@redhat.com>
+Link: https://lore.kernel.org/r/20240325202425.60930-9-kuniyu@amazon.com
+Signed-off-by: Jakub Kicinski <kuba@kernel.org>
+Signed-off-by: Lee Jones <lee@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ include/net/af_unix.h |    1 +
+ net/unix/af_unix.c    |    2 +-
+ net/unix/garbage.c    |   20 +++++++++++++++++++-
+ 3 files changed, 21 insertions(+), 2 deletions(-)
+
+--- a/include/net/af_unix.h
++++ b/include/net/af_unix.h
+@@ -24,6 +24,7 @@ void unix_inflight(struct user_struct *u
+ void unix_notinflight(struct user_struct *user, struct file *fp);
+ void unix_add_edges(struct scm_fp_list *fpl, struct unix_sock *receiver);
+ void unix_del_edges(struct scm_fp_list *fpl);
++void unix_update_edges(struct unix_sock *receiver);
+ int unix_prepare_fpl(struct scm_fp_list *fpl);
+ void unix_destroy_fpl(struct scm_fp_list *fpl);
+ void unix_gc(void);
+--- a/net/unix/af_unix.c
++++ b/net/unix/af_unix.c
+@@ -1705,7 +1705,7 @@ static int unix_accept(struct socket *so
+       }
+       tsk = skb->sk;
+-      unix_sk(tsk)->listener = NULL;
++      unix_update_edges(unix_sk(tsk));
+       skb_free_datagram(sk, skb);
+       wake_up_interruptible(&unix_sk(sk)->peer_wait);
+--- a/net/unix/garbage.c
++++ b/net/unix/garbage.c
+@@ -101,6 +101,17 @@ struct unix_sock *unix_get_socket(struct
+       return NULL;
+ }
++static struct unix_vertex *unix_edge_successor(struct unix_edge *edge)
++{
++      /* If an embryo socket has a fd,
++       * the listener indirectly holds the fd's refcnt.
++       */
++      if (edge->successor->listener)
++              return unix_sk(edge->successor->listener)->vertex;
++
++      return edge->successor->vertex;
++}
++
+ static LIST_HEAD(unix_unvisited_vertices);
+ enum unix_vertex_index {
+@@ -209,6 +220,13 @@ out:
+       fpl->inflight = false;
+ }
++void unix_update_edges(struct unix_sock *receiver)
++{
++      spin_lock(&unix_gc_lock);
++      receiver->listener = NULL;
++      spin_unlock(&unix_gc_lock);
++}
++
+ int unix_prepare_fpl(struct scm_fp_list *fpl)
+ {
+       struct unix_vertex *vertex;
+@@ -268,7 +286,7 @@ next_vertex:
+       /* Explore neighbour vertices (receivers of the current vertex's fd). */
+       list_for_each_entry(edge, &vertex->edges, vertex_entry) {
+-              struct unix_vertex *next_vertex = edge->successor->vertex;
++              struct unix_vertex *next_vertex = unix_edge_successor(edge);
+               if (!next_vertex)
+                       continue;
diff --git a/queue-6.6/af_unix-iterate-all-vertices-by-dfs.patch b/queue-6.6/af_unix-iterate-all-vertices-by-dfs.patch
new file mode 100644 (file)
index 0000000..1e05a4c
--- /dev/null
@@ -0,0 +1,155 @@
+From stable+bounces-145858-greg=kroah.com@vger.kernel.org Wed May 21 16:53:24 2025
+From: Lee Jones <lee@kernel.org>
+Date: Wed, 21 May 2025 14:45:19 +0000
+Subject: af_unix: Iterate all vertices by DFS.
+To: lee@kernel.org, "David S. Miller" <davem@davemloft.net>, Eric Dumazet <edumazet@google.com>, Jakub Kicinski <kuba@kernel.org>, Paolo Abeni <pabeni@redhat.com>, Kuniyuki Iwashima <kuniyu@amazon.com>, Jens Axboe <axboe@kernel.dk>, Sasha Levin <sashal@kernel.org>, Michal Luczaj <mhal@rbox.co>, Rao Shoaib <Rao.Shoaib@oracle.com>, Simon Horman <horms@kernel.org>, linux-kernel@vger.kernel.org, netdev@vger.kernel.org
+Cc: stable@vger.kernel.org
+Message-ID: <20250521144803.2050504-12-lee@kernel.org>
+
+From: Kuniyuki Iwashima <kuniyu@amazon.com>
+
+commit 6ba76fd2848e107594ea4f03b737230f74bc23ea upstream.
+
+The new GC will use a depth first search graph algorithm to find
+cyclic references.  The algorithm visits every vertex exactly once.
+
+Here, we implement the DFS part without recursion so that no one
+can abuse it.
+
+unix_walk_scc() marks every vertex unvisited by initialising index
+as UNIX_VERTEX_INDEX_UNVISITED and iterates inflight vertices in
+unix_unvisited_vertices and calls __unix_walk_scc() to start DFS from
+an arbitrary vertex.
+
+__unix_walk_scc() iterates all edges starting from the vertex and
+explores the neighbour vertices with DFS using edge_stack.
+
+After visiting all neighbours, __unix_walk_scc() moves the visited
+vertex to unix_visited_vertices so that unix_walk_scc() will not
+restart DFS from the visited vertex.
+
+Signed-off-by: Kuniyuki Iwashima <kuniyu@amazon.com>
+Acked-by: Paolo Abeni <pabeni@redhat.com>
+Link: https://lore.kernel.org/r/20240325202425.60930-6-kuniyu@amazon.com
+Signed-off-by: Jakub Kicinski <kuba@kernel.org>
+Signed-off-by: Lee Jones <lee@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ include/net/af_unix.h |    2 +
+ net/unix/garbage.c    |   74 ++++++++++++++++++++++++++++++++++++++++++++++++++
+ 2 files changed, 76 insertions(+)
+
+--- a/include/net/af_unix.h
++++ b/include/net/af_unix.h
+@@ -33,12 +33,14 @@ struct unix_vertex {
+       struct list_head edges;
+       struct list_head entry;
+       unsigned long out_degree;
++      unsigned long index;
+ };
+ struct unix_edge {
+       struct unix_sock *predecessor;
+       struct unix_sock *successor;
+       struct list_head vertex_entry;
++      struct list_head stack_entry;
+ };
+ struct sock *unix_peer_get(struct sock *sk);
+--- a/net/unix/garbage.c
++++ b/net/unix/garbage.c
+@@ -103,6 +103,11 @@ struct unix_sock *unix_get_socket(struct
+ static LIST_HEAD(unix_unvisited_vertices);
++enum unix_vertex_index {
++      UNIX_VERTEX_INDEX_UNVISITED,
++      UNIX_VERTEX_INDEX_START,
++};
++
+ static void unix_add_edge(struct scm_fp_list *fpl, struct unix_edge *edge)
+ {
+       struct unix_vertex *vertex = edge->predecessor->vertex;
+@@ -241,6 +246,73 @@ void unix_destroy_fpl(struct scm_fp_list
+       unix_free_vertices(fpl);
+ }
++static LIST_HEAD(unix_visited_vertices);
++
++static void __unix_walk_scc(struct unix_vertex *vertex)
++{
++      unsigned long index = UNIX_VERTEX_INDEX_START;
++      struct unix_edge *edge;
++      LIST_HEAD(edge_stack);
++
++next_vertex:
++      vertex->index = index;
++      index++;
++
++      /* Explore neighbour vertices (receivers of the current vertex's fd). */
++      list_for_each_entry(edge, &vertex->edges, vertex_entry) {
++              struct unix_vertex *next_vertex = edge->successor->vertex;
++
++              if (!next_vertex)
++                      continue;
++
++              if (next_vertex->index == UNIX_VERTEX_INDEX_UNVISITED) {
++                      /* Iterative deepening depth first search
++                       *
++                       *   1. Push a forward edge to edge_stack and set
++                       *      the successor to vertex for the next iteration.
++                       */
++                      list_add(&edge->stack_entry, &edge_stack);
++
++                      vertex = next_vertex;
++                      goto next_vertex;
++
++                      /*   2. Pop the edge directed to the current vertex
++                       *      and restore the ancestor for backtracking.
++                       */
++prev_vertex:
++                      edge = list_first_entry(&edge_stack, typeof(*edge), stack_entry);
++                      list_del_init(&edge->stack_entry);
++
++                      vertex = edge->predecessor->vertex;
++              }
++      }
++
++      /* Don't restart DFS from this vertex in unix_walk_scc(). */
++      list_move_tail(&vertex->entry, &unix_visited_vertices);
++
++      /* Need backtracking ? */
++      if (!list_empty(&edge_stack))
++              goto prev_vertex;
++}
++
++static void unix_walk_scc(void)
++{
++      struct unix_vertex *vertex;
++
++      list_for_each_entry(vertex, &unix_unvisited_vertices, entry)
++              vertex->index = UNIX_VERTEX_INDEX_UNVISITED;
++
++      /* Visit every vertex exactly once.
++       * __unix_walk_scc() moves visited vertices to unix_visited_vertices.
++       */
++      while (!list_empty(&unix_unvisited_vertices)) {
++              vertex = list_first_entry(&unix_unvisited_vertices, typeof(*vertex), entry);
++              __unix_walk_scc(vertex);
++      }
++
++      list_replace_init(&unix_visited_vertices, &unix_unvisited_vertices);
++}
++
+ static LIST_HEAD(gc_candidates);
+ static LIST_HEAD(gc_inflight_list);
+@@ -388,6 +460,8 @@ static void __unix_gc(struct work_struct
+       spin_lock(&unix_gc_lock);
++      unix_walk_scc();
++
+       /* First, select candidates for garbage collection.  Only
+        * in-flight sockets are considered, and from those only ones
+        * which don't have any external reference.
diff --git a/queue-6.6/af_unix-link-struct-unix_edge-when-queuing-skb.patch b/queue-6.6/af_unix-link-struct-unix_edge-when-queuing-skb.patch
new file mode 100644 (file)
index 0000000..7ec2029
--- /dev/null
@@ -0,0 +1,259 @@
+From stable+bounces-145856-greg=kroah.com@vger.kernel.org Wed May 21 16:52:44 2025
+From: Lee Jones <lee@kernel.org>
+Date: Wed, 21 May 2025 14:45:17 +0000
+Subject: af_unix: Link struct unix_edge when queuing skb.
+To: lee@kernel.org, "David S. Miller" <davem@davemloft.net>, Eric Dumazet <edumazet@google.com>, Jakub Kicinski <kuba@kernel.org>, Paolo Abeni <pabeni@redhat.com>, Kuniyuki Iwashima <kuniyu@amazon.com>, Jens Axboe <axboe@kernel.dk>, Sasha Levin <sashal@kernel.org>, Michal Luczaj <mhal@rbox.co>, Rao Shoaib <Rao.Shoaib@oracle.com>, Pavel Begunkov <asml.silence@gmail.com>, linux-kernel@vger.kernel.org, netdev@vger.kernel.org
+Cc: stable@vger.kernel.org
+Message-ID: <20250521144803.2050504-10-lee@kernel.org>
+
+From: Kuniyuki Iwashima <kuniyu@amazon.com>
+
+commit 42f298c06b30bfe0a8cbee5d38644e618699e26e upstream.
+
+Just before queuing skb with inflight fds, we call scm_stat_add(),
+which is a good place to set up the preallocated struct unix_vertex
+and struct unix_edge in UNIXCB(skb).fp.
+
+Then, we call unix_add_edges() and construct the directed graph
+as follows:
+
+  1. Set the inflight socket's unix_sock to unix_edge.predecessor.
+  2. Set the receiver's unix_sock to unix_edge.successor.
+  3. Set the preallocated vertex to inflight socket's unix_sock.vertex.
+  4. Link inflight socket's unix_vertex.entry to unix_unvisited_vertices.
+  5. Link unix_edge.vertex_entry to the inflight socket's unix_vertex.edges.
+
+Let's say we pass the fd of AF_UNIX socket A to B and the fd of B
+to C.  The graph looks like this:
+
+  +-------------------------+
+  | unix_unvisited_vertices | <-------------------------.
+  +-------------------------+                           |
+  +                                                     |
+  |     +--------------+             +--------------+   |         +--------------+
+  |     |  unix_sock A | <---. .---> |  unix_sock B | <-|-. .---> |  unix_sock C |
+  |     +--------------+     | |     +--------------+   | | |     +--------------+
+  | .-+ |    vertex    |     | | .-+ |    vertex    |   | | |     |    vertex    |
+  | |   +--------------+     | | |   +--------------+   | | |     +--------------+
+  | |                        | | |                      | | |
+  | |   +--------------+     | | |   +--------------+   | | |
+  | '-> |  unix_vertex |     | | '-> |  unix_vertex |   | | |
+  |     +--------------+     | |     +--------------+   | | |
+  `---> |    entry     | +---------> |    entry     | +-' | |
+        |--------------|     | |     |--------------|     | |
+        |    edges     | <-. | |     |    edges     | <-. | |
+        +--------------+   | | |     +--------------+   | | |
+                           | | |                        | | |
+    .----------------------' | | .----------------------' | |
+    |                        | | |                        | |
+    |   +--------------+     | | |   +--------------+     | |
+    |   |   unix_edge  |     | | |   |   unix_edge  |     | |
+    |   +--------------+     | | |   +--------------+     | |
+    `-> | vertex_entry |     | | `-> | vertex_entry |     | |
+        |--------------|     | |     |--------------|     | |
+        |  predecessor | +---' |     |  predecessor | +---' |
+        |--------------|       |     |--------------|       |
+        |   successor  | +-----'     |   successor  | +-----'
+        +--------------+             +--------------+
+
+Henceforth, we denote such a graph as A -> B (-> C).
+
+Now, we can express all inflight fd graphs that do not contain
+embryo sockets.  We will support the particular case later.
+
+Signed-off-by: Kuniyuki Iwashima <kuniyu@amazon.com>
+Acked-by: Paolo Abeni <pabeni@redhat.com>
+Link: https://lore.kernel.org/r/20240325202425.60930-4-kuniyu@amazon.com
+Signed-off-by: Jakub Kicinski <kuba@kernel.org>
+Signed-off-by: Lee Jones <lee@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ include/net/af_unix.h |    2 +
+ include/net/scm.h     |    1 
+ net/core/scm.c        |    2 +
+ net/unix/af_unix.c    |    8 +++-
+ net/unix/garbage.c    |   90 +++++++++++++++++++++++++++++++++++++++++++++++++-
+ 5 files changed, 100 insertions(+), 3 deletions(-)
+
+--- a/include/net/af_unix.h
++++ b/include/net/af_unix.h
+@@ -22,6 +22,8 @@ extern unsigned int unix_tot_inflight;
+ void unix_inflight(struct user_struct *user, struct file *fp);
+ void unix_notinflight(struct user_struct *user, struct file *fp);
++void unix_add_edges(struct scm_fp_list *fpl, struct unix_sock *receiver);
++void unix_del_edges(struct scm_fp_list *fpl);
+ int unix_prepare_fpl(struct scm_fp_list *fpl);
+ void unix_destroy_fpl(struct scm_fp_list *fpl);
+ void unix_gc(void);
+--- a/include/net/scm.h
++++ b/include/net/scm.h
+@@ -31,6 +31,7 @@ struct scm_fp_list {
+       short                   count_unix;
+       short                   max;
+ #ifdef CONFIG_UNIX
++      bool                    inflight;
+       struct list_head        vertices;
+       struct unix_edge        *edges;
+ #endif
+--- a/net/core/scm.c
++++ b/net/core/scm.c
+@@ -90,6 +90,7 @@ static int scm_fp_copy(struct cmsghdr *c
+               fpl->max = SCM_MAX_FD;
+               fpl->user = NULL;
+ #if IS_ENABLED(CONFIG_UNIX)
++              fpl->inflight = false;
+               fpl->edges = NULL;
+               INIT_LIST_HEAD(&fpl->vertices);
+ #endif
+@@ -384,6 +385,7 @@ struct scm_fp_list *scm_fp_dup(struct sc
+               new_fpl->max = new_fpl->count;
+               new_fpl->user = get_uid(fpl->user);
+ #if IS_ENABLED(CONFIG_UNIX)
++              new_fpl->inflight = false;
+               new_fpl->edges = NULL;
+               INIT_LIST_HEAD(&new_fpl->vertices);
+ #endif
+--- a/net/unix/af_unix.c
++++ b/net/unix/af_unix.c
+@@ -1920,8 +1920,10 @@ static void scm_stat_add(struct sock *sk
+       struct scm_fp_list *fp = UNIXCB(skb).fp;
+       struct unix_sock *u = unix_sk(sk);
+-      if (unlikely(fp && fp->count))
++      if (unlikely(fp && fp->count)) {
+               atomic_add(fp->count, &u->scm_stat.nr_fds);
++              unix_add_edges(fp, u);
++      }
+ }
+ static void scm_stat_del(struct sock *sk, struct sk_buff *skb)
+@@ -1929,8 +1931,10 @@ static void scm_stat_del(struct sock *sk
+       struct scm_fp_list *fp = UNIXCB(skb).fp;
+       struct unix_sock *u = unix_sk(sk);
+-      if (unlikely(fp && fp->count))
++      if (unlikely(fp && fp->count)) {
+               atomic_sub(fp->count, &u->scm_stat.nr_fds);
++              unix_del_edges(fp);
++      }
+ }
+ /*
+--- a/net/unix/garbage.c
++++ b/net/unix/garbage.c
+@@ -101,6 +101,38 @@ struct unix_sock *unix_get_socket(struct
+       return NULL;
+ }
++static LIST_HEAD(unix_unvisited_vertices);
++
++static void unix_add_edge(struct scm_fp_list *fpl, struct unix_edge *edge)
++{
++      struct unix_vertex *vertex = edge->predecessor->vertex;
++
++      if (!vertex) {
++              vertex = list_first_entry(&fpl->vertices, typeof(*vertex), entry);
++              vertex->out_degree = 0;
++              INIT_LIST_HEAD(&vertex->edges);
++
++              list_move_tail(&vertex->entry, &unix_unvisited_vertices);
++              edge->predecessor->vertex = vertex;
++      }
++
++      vertex->out_degree++;
++      list_add_tail(&edge->vertex_entry, &vertex->edges);
++}
++
++static void unix_del_edge(struct scm_fp_list *fpl, struct unix_edge *edge)
++{
++      struct unix_vertex *vertex = edge->predecessor->vertex;
++
++      list_del(&edge->vertex_entry);
++      vertex->out_degree--;
++
++      if (!vertex->out_degree) {
++              edge->predecessor->vertex = NULL;
++              list_move_tail(&vertex->entry, &fpl->vertices);
++      }
++}
++
+ static void unix_free_vertices(struct scm_fp_list *fpl)
+ {
+       struct unix_vertex *vertex, *next_vertex;
+@@ -111,6 +143,60 @@ static void unix_free_vertices(struct sc
+       }
+ }
++DEFINE_SPINLOCK(unix_gc_lock);
++
++void unix_add_edges(struct scm_fp_list *fpl, struct unix_sock *receiver)
++{
++      int i = 0, j = 0;
++
++      spin_lock(&unix_gc_lock);
++
++      if (!fpl->count_unix)
++              goto out;
++
++      do {
++              struct unix_sock *inflight = unix_get_socket(fpl->fp[j++]);
++              struct unix_edge *edge;
++
++              if (!inflight)
++                      continue;
++
++              edge = fpl->edges + i++;
++              edge->predecessor = inflight;
++              edge->successor = receiver;
++
++              unix_add_edge(fpl, edge);
++      } while (i < fpl->count_unix);
++
++out:
++      spin_unlock(&unix_gc_lock);
++
++      fpl->inflight = true;
++
++      unix_free_vertices(fpl);
++}
++
++void unix_del_edges(struct scm_fp_list *fpl)
++{
++      int i = 0;
++
++      spin_lock(&unix_gc_lock);
++
++      if (!fpl->count_unix)
++              goto out;
++
++      do {
++              struct unix_edge *edge = fpl->edges + i++;
++
++              unix_del_edge(fpl, edge);
++      } while (i < fpl->count_unix);
++
++out:
++      spin_unlock(&unix_gc_lock);
++
++      fpl->inflight = false;
++}
++
+ int unix_prepare_fpl(struct scm_fp_list *fpl)
+ {
+       struct unix_vertex *vertex;
+@@ -141,11 +227,13 @@ err:
+ void unix_destroy_fpl(struct scm_fp_list *fpl)
+ {
++      if (fpl->inflight)
++              unix_del_edges(fpl);
++
+       kvfree(fpl->edges);
+       unix_free_vertices(fpl);
+ }
+-DEFINE_SPINLOCK(unix_gc_lock);
+ unsigned int unix_tot_inflight;
+ static LIST_HEAD(gc_candidates);
+ static LIST_HEAD(gc_inflight_list);
diff --git a/queue-6.6/af_unix-remove-config_unix_scm.patch b/queue-6.6/af_unix-remove-config_unix_scm.patch
new file mode 100644 (file)
index 0000000..826f7f1
--- /dev/null
@@ -0,0 +1,439 @@
+From stable+bounces-145853-greg=kroah.com@vger.kernel.org Wed May 21 16:51:31 2025
+From: Lee Jones <lee@kernel.org>
+Date: Wed, 21 May 2025 14:45:14 +0000
+Subject: af_unix: Remove CONFIG_UNIX_SCM.
+To: lee@kernel.org, "David S. Miller" <davem@davemloft.net>, Eric Dumazet <edumazet@google.com>, Jakub Kicinski <kuba@kernel.org>, Paolo Abeni <pabeni@redhat.com>, Kuniyuki Iwashima <kuniyu@amazon.com>, Jens Axboe <axboe@kernel.dk>, Sasha Levin <sashal@kernel.org>, Michal Luczaj <mhal@rbox.co>, Rao Shoaib <Rao.Shoaib@oracle.com>, Pavel Begunkov <asml.silence@gmail.com>, linux-kernel@vger.kernel.org, netdev@vger.kernel.org
+Cc: stable@vger.kernel.org
+Message-ID: <20250521144803.2050504-7-lee@kernel.org>
+
+From: Kuniyuki Iwashima <kuniyu@amazon.com>
+
+commit 99a7a5b9943ea2d05fb0dee38e4ae2290477ed83 upstream.
+
+Originally, the code related to garbage collection was all in garbage.c.
+
+Commit f4e65870e5ce ("net: split out functions related to registering
+inflight socket files") moved some functions to scm.c for io_uring and
+added CONFIG_UNIX_SCM just in case AF_UNIX was built as a module.
+
+However, since commit 97154bcf4d1b ("af_unix: Kconfig: make CONFIG_UNIX
+bool"), AF_UNIX is no longer built separately.  Also, io_uring does not
+support SCM_RIGHTS now.
+
+Let's move the functions back to garbage.c
+
+Signed-off-by: Kuniyuki Iwashima <kuniyu@amazon.com>
+Acked-by: Jens Axboe <axboe@kernel.dk>
+Link: https://lore.kernel.org/r/20240129190435.57228-4-kuniyu@amazon.com
+Signed-off-by: Jakub Kicinski <kuba@kernel.org>
+Signed-off-by: Lee Jones <lee@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ include/net/af_unix.h |    7 +-
+ net/Makefile          |    2 
+ net/unix/Kconfig      |    5 -
+ net/unix/Makefile     |    2 
+ net/unix/af_unix.c    |   63 ++++++++++++++++++++-
+ net/unix/garbage.c    |   73 +++++++++++++++++++++++-
+ net/unix/scm.c        |  150 --------------------------------------------------
+ net/unix/scm.h        |   10 ---
+ 8 files changed, 137 insertions(+), 175 deletions(-)
+ delete mode 100644 net/unix/scm.c
+ delete mode 100644 net/unix/scm.h
+
+--- a/include/net/af_unix.h
++++ b/include/net/af_unix.h
+@@ -17,19 +17,20 @@ static inline struct unix_sock *unix_get
+ }
+ #endif
++extern spinlock_t unix_gc_lock;
++extern unsigned int unix_tot_inflight;
++
+ void unix_inflight(struct user_struct *user, struct file *fp);
+ void unix_notinflight(struct user_struct *user, struct file *fp);
+-void unix_destruct_scm(struct sk_buff *skb);
+ void unix_gc(void);
+ void wait_for_unix_gc(struct scm_fp_list *fpl);
++
+ struct sock *unix_peer_get(struct sock *sk);
+ #define UNIX_HASH_MOD (256 - 1)
+ #define UNIX_HASH_SIZE        (256 * 2)
+ #define UNIX_HASH_BITS        8
+-extern unsigned int unix_tot_inflight;
+-
+ struct unix_address {
+       refcount_t      refcnt;
+       int             len;
+--- a/net/Makefile
++++ b/net/Makefile
+@@ -17,7 +17,7 @@ obj-$(CONFIG_NETFILTER)              += netfilter/
+ obj-$(CONFIG_INET)            += ipv4/
+ obj-$(CONFIG_TLS)             += tls/
+ obj-$(CONFIG_XFRM)            += xfrm/
+-obj-$(CONFIG_UNIX_SCM)                += unix/
++obj-$(CONFIG_UNIX)            += unix/
+ obj-y                         += ipv6/
+ obj-$(CONFIG_BPFILTER)                += bpfilter/
+ obj-$(CONFIG_PACKET)          += packet/
+--- a/net/unix/Kconfig
++++ b/net/unix/Kconfig
+@@ -16,11 +16,6 @@ config UNIX
+         Say Y unless you know what you are doing.
+-config UNIX_SCM
+-      bool
+-      depends on UNIX
+-      default y
+-
+ config        AF_UNIX_OOB
+       bool
+       depends on UNIX
+--- a/net/unix/Makefile
++++ b/net/unix/Makefile
+@@ -11,5 +11,3 @@ unix-$(CONFIG_BPF_SYSCALL) += unix_bpf.o
+ obj-$(CONFIG_UNIX_DIAG)       += unix_diag.o
+ unix_diag-y           := diag.o
+-
+-obj-$(CONFIG_UNIX_SCM)        += scm.o
+--- a/net/unix/af_unix.c
++++ b/net/unix/af_unix.c
+@@ -117,8 +117,6 @@
+ #include <linux/file.h>
+ #include <linux/btf_ids.h>
+-#include "scm.h"
+-
+ static atomic_long_t unix_nr_socks;
+ static struct hlist_head bsd_socket_buckets[UNIX_HASH_SIZE / 2];
+ static spinlock_t bsd_socket_locks[UNIX_HASH_SIZE / 2];
+@@ -1752,6 +1750,52 @@ out:
+       return err;
+ }
++/* The "user->unix_inflight" variable is protected by the garbage
++ * collection lock, and we just read it locklessly here. If you go
++ * over the limit, there might be a tiny race in actually noticing
++ * it across threads. Tough.
++ */
++static inline bool too_many_unix_fds(struct task_struct *p)
++{
++      struct user_struct *user = current_user();
++
++      if (unlikely(READ_ONCE(user->unix_inflight) > task_rlimit(p, RLIMIT_NOFILE)))
++              return !capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN);
++      return false;
++}
++
++static int unix_attach_fds(struct scm_cookie *scm, struct sk_buff *skb)
++{
++      int i;
++
++      if (too_many_unix_fds(current))
++              return -ETOOMANYREFS;
++
++      /* Need to duplicate file references for the sake of garbage
++       * collection.  Otherwise a socket in the fps might become a
++       * candidate for GC while the skb is not yet queued.
++       */
++      UNIXCB(skb).fp = scm_fp_dup(scm->fp);
++      if (!UNIXCB(skb).fp)
++              return -ENOMEM;
++
++      for (i = scm->fp->count - 1; i >= 0; i--)
++              unix_inflight(scm->fp->user, scm->fp->fp[i]);
++
++      return 0;
++}
++
++static void unix_detach_fds(struct scm_cookie *scm, struct sk_buff *skb)
++{
++      int i;
++
++      scm->fp = UNIXCB(skb).fp;
++      UNIXCB(skb).fp = NULL;
++
++      for (i = scm->fp->count - 1; i >= 0; i--)
++              unix_notinflight(scm->fp->user, scm->fp->fp[i]);
++}
++
+ static void unix_peek_fds(struct scm_cookie *scm, struct sk_buff *skb)
+ {
+       scm->fp = scm_fp_dup(UNIXCB(skb).fp);
+@@ -1799,6 +1843,21 @@ static void unix_peek_fds(struct scm_coo
+       spin_unlock(&unix_gc_lock);
+ }
++static void unix_destruct_scm(struct sk_buff *skb)
++{
++      struct scm_cookie scm;
++
++      memset(&scm, 0, sizeof(scm));
++      scm.pid  = UNIXCB(skb).pid;
++      if (UNIXCB(skb).fp)
++              unix_detach_fds(&scm, skb);
++
++      /* Alas, it calls VFS */
++      /* So fscking what? fput() had been SMP-safe since the last Summer */
++      scm_destroy(&scm);
++      sock_wfree(skb);
++}
++
+ static int unix_scm_to_skb(struct scm_cookie *scm, struct sk_buff *skb, bool send_fds)
+ {
+       int err = 0;
+--- a/net/unix/garbage.c
++++ b/net/unix/garbage.c
+@@ -81,11 +81,80 @@
+ #include <net/scm.h>
+ #include <net/tcp_states.h>
+-#include "scm.h"
++struct unix_sock *unix_get_socket(struct file *filp)
++{
++      struct inode *inode = file_inode(filp);
+-/* Internal data structures and random procedures: */
++      /* Socket ? */
++      if (S_ISSOCK(inode->i_mode) && !(filp->f_mode & FMODE_PATH)) {
++              struct socket *sock = SOCKET_I(inode);
++              const struct proto_ops *ops;
++              struct sock *sk = sock->sk;
++              ops = READ_ONCE(sock->ops);
++
++              /* PF_UNIX ? */
++              if (sk && ops && ops->family == PF_UNIX)
++                      return unix_sk(sk);
++      }
++
++      return NULL;
++}
++
++DEFINE_SPINLOCK(unix_gc_lock);
++unsigned int unix_tot_inflight;
+ static LIST_HEAD(gc_candidates);
++static LIST_HEAD(gc_inflight_list);
++
++/* Keep the number of times in flight count for the file
++ * descriptor if it is for an AF_UNIX socket.
++ */
++void unix_inflight(struct user_struct *user, struct file *filp)
++{
++      struct unix_sock *u = unix_get_socket(filp);
++
++      spin_lock(&unix_gc_lock);
++
++      if (u) {
++              if (!u->inflight) {
++                      WARN_ON_ONCE(!list_empty(&u->link));
++                      list_add_tail(&u->link, &gc_inflight_list);
++              } else {
++                      WARN_ON_ONCE(list_empty(&u->link));
++              }
++              u->inflight++;
++
++              /* Paired with READ_ONCE() in wait_for_unix_gc() */
++              WRITE_ONCE(unix_tot_inflight, unix_tot_inflight + 1);
++      }
++
++      WRITE_ONCE(user->unix_inflight, user->unix_inflight + 1);
++
++      spin_unlock(&unix_gc_lock);
++}
++
++void unix_notinflight(struct user_struct *user, struct file *filp)
++{
++      struct unix_sock *u = unix_get_socket(filp);
++
++      spin_lock(&unix_gc_lock);
++
++      if (u) {
++              WARN_ON_ONCE(!u->inflight);
++              WARN_ON_ONCE(list_empty(&u->link));
++
++              u->inflight--;
++              if (!u->inflight)
++                      list_del_init(&u->link);
++
++              /* Paired with READ_ONCE() in wait_for_unix_gc() */
++              WRITE_ONCE(unix_tot_inflight, unix_tot_inflight - 1);
++      }
++
++      WRITE_ONCE(user->unix_inflight, user->unix_inflight - 1);
++
++      spin_unlock(&unix_gc_lock);
++}
+ static void scan_inflight(struct sock *x, void (*func)(struct unix_sock *),
+                         struct sk_buff_head *hitlist)
+--- a/net/unix/scm.c
++++ /dev/null
+@@ -1,150 +0,0 @@
+-// SPDX-License-Identifier: GPL-2.0
+-#include <linux/module.h>
+-#include <linux/kernel.h>
+-#include <linux/string.h>
+-#include <linux/socket.h>
+-#include <linux/net.h>
+-#include <linux/fs.h>
+-#include <net/af_unix.h>
+-#include <net/scm.h>
+-#include <linux/init.h>
+-#include <linux/io_uring.h>
+-
+-#include "scm.h"
+-
+-unsigned int unix_tot_inflight;
+-EXPORT_SYMBOL(unix_tot_inflight);
+-
+-LIST_HEAD(gc_inflight_list);
+-EXPORT_SYMBOL(gc_inflight_list);
+-
+-DEFINE_SPINLOCK(unix_gc_lock);
+-EXPORT_SYMBOL(unix_gc_lock);
+-
+-struct unix_sock *unix_get_socket(struct file *filp)
+-{
+-      struct inode *inode = file_inode(filp);
+-
+-      /* Socket ? */
+-      if (S_ISSOCK(inode->i_mode) && !(filp->f_mode & FMODE_PATH)) {
+-              struct socket *sock = SOCKET_I(inode);
+-              const struct proto_ops *ops = READ_ONCE(sock->ops);
+-              struct sock *s = sock->sk;
+-
+-              /* PF_UNIX ? */
+-              if (s && ops && ops->family == PF_UNIX)
+-                      return unix_sk(s);
+-      }
+-
+-      return NULL;
+-}
+-EXPORT_SYMBOL(unix_get_socket);
+-
+-/* Keep the number of times in flight count for the file
+- * descriptor if it is for an AF_UNIX socket.
+- */
+-void unix_inflight(struct user_struct *user, struct file *fp)
+-{
+-      struct unix_sock *u = unix_get_socket(fp);
+-
+-      spin_lock(&unix_gc_lock);
+-
+-      if (u) {
+-              if (!u->inflight) {
+-                      WARN_ON_ONCE(!list_empty(&u->link));
+-                      list_add_tail(&u->link, &gc_inflight_list);
+-              } else {
+-                      WARN_ON_ONCE(list_empty(&u->link));
+-              }
+-              u->inflight++;
+-              /* Paired with READ_ONCE() in wait_for_unix_gc() */
+-              WRITE_ONCE(unix_tot_inflight, unix_tot_inflight + 1);
+-      }
+-      WRITE_ONCE(user->unix_inflight, user->unix_inflight + 1);
+-      spin_unlock(&unix_gc_lock);
+-}
+-
+-void unix_notinflight(struct user_struct *user, struct file *fp)
+-{
+-      struct unix_sock *u = unix_get_socket(fp);
+-
+-      spin_lock(&unix_gc_lock);
+-
+-      if (u) {
+-              WARN_ON_ONCE(!u->inflight);
+-              WARN_ON_ONCE(list_empty(&u->link));
+-
+-              u->inflight--;
+-              if (!u->inflight)
+-                      list_del_init(&u->link);
+-              /* Paired with READ_ONCE() in wait_for_unix_gc() */
+-              WRITE_ONCE(unix_tot_inflight, unix_tot_inflight - 1);
+-      }
+-      WRITE_ONCE(user->unix_inflight, user->unix_inflight - 1);
+-      spin_unlock(&unix_gc_lock);
+-}
+-
+-/*
+- * The "user->unix_inflight" variable is protected by the garbage
+- * collection lock, and we just read it locklessly here. If you go
+- * over the limit, there might be a tiny race in actually noticing
+- * it across threads. Tough.
+- */
+-static inline bool too_many_unix_fds(struct task_struct *p)
+-{
+-      struct user_struct *user = current_user();
+-
+-      if (unlikely(READ_ONCE(user->unix_inflight) > task_rlimit(p, RLIMIT_NOFILE)))
+-              return !capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN);
+-      return false;
+-}
+-
+-int unix_attach_fds(struct scm_cookie *scm, struct sk_buff *skb)
+-{
+-      int i;
+-
+-      if (too_many_unix_fds(current))
+-              return -ETOOMANYREFS;
+-
+-      /*
+-       * Need to duplicate file references for the sake of garbage
+-       * collection.  Otherwise a socket in the fps might become a
+-       * candidate for GC while the skb is not yet queued.
+-       */
+-      UNIXCB(skb).fp = scm_fp_dup(scm->fp);
+-      if (!UNIXCB(skb).fp)
+-              return -ENOMEM;
+-
+-      for (i = scm->fp->count - 1; i >= 0; i--)
+-              unix_inflight(scm->fp->user, scm->fp->fp[i]);
+-      return 0;
+-}
+-EXPORT_SYMBOL(unix_attach_fds);
+-
+-void unix_detach_fds(struct scm_cookie *scm, struct sk_buff *skb)
+-{
+-      int i;
+-
+-      scm->fp = UNIXCB(skb).fp;
+-      UNIXCB(skb).fp = NULL;
+-
+-      for (i = scm->fp->count-1; i >= 0; i--)
+-              unix_notinflight(scm->fp->user, scm->fp->fp[i]);
+-}
+-EXPORT_SYMBOL(unix_detach_fds);
+-
+-void unix_destruct_scm(struct sk_buff *skb)
+-{
+-      struct scm_cookie scm;
+-
+-      memset(&scm, 0, sizeof(scm));
+-      scm.pid  = UNIXCB(skb).pid;
+-      if (UNIXCB(skb).fp)
+-              unix_detach_fds(&scm, skb);
+-
+-      /* Alas, it calls VFS */
+-      /* So fscking what? fput() had been SMP-safe since the last Summer */
+-      scm_destroy(&scm);
+-      sock_wfree(skb);
+-}
+-EXPORT_SYMBOL(unix_destruct_scm);
+--- a/net/unix/scm.h
++++ /dev/null
+@@ -1,10 +0,0 @@
+-#ifndef NET_UNIX_SCM_H
+-#define NET_UNIX_SCM_H
+-
+-extern struct list_head gc_inflight_list;
+-extern spinlock_t unix_gc_lock;
+-
+-int unix_attach_fds(struct scm_cookie *scm, struct sk_buff *skb);
+-void unix_detach_fds(struct scm_cookie *scm, struct sk_buff *skb);
+-
+-#endif
diff --git a/queue-6.6/af_unix-remove-io_uring-code-for-gc.patch b/queue-6.6/af_unix-remove-io_uring-code-for-gc.patch
new file mode 100644 (file)
index 0000000..c3f48ef
--- /dev/null
@@ -0,0 +1,106 @@
+From stable+bounces-145852-greg=kroah.com@vger.kernel.org Wed May 21 16:51:43 2025
+From: Lee Jones <lee@kernel.org>
+Date: Wed, 21 May 2025 14:45:13 +0000
+Subject: af_unix: Remove io_uring code for GC.
+To: lee@kernel.org, "David S. Miller" <davem@davemloft.net>, Eric Dumazet <edumazet@google.com>, Jakub Kicinski <kuba@kernel.org>, Paolo Abeni <pabeni@redhat.com>, Kuniyuki Iwashima <kuniyu@amazon.com>, Jens Axboe <axboe@kernel.dk>, Sasha Levin <sashal@kernel.org>, Michal Luczaj <mhal@rbox.co>, Rao Shoaib <Rao.Shoaib@oracle.com>, Pavel Begunkov <asml.silence@gmail.com>, linux-kernel@vger.kernel.org, netdev@vger.kernel.org
+Cc: stable@vger.kernel.org
+Message-ID: <20250521144803.2050504-6-lee@kernel.org>
+
+From: Kuniyuki Iwashima <kuniyu@amazon.com>
+
+commit 11498715f266a3fb4caabba9dd575636cbcaa8f1 upstream.
+
+Since commit 705318a99a13 ("io_uring/af_unix: disable sending
+io_uring over sockets"), io_uring's unix socket cannot be passed
+via SCM_RIGHTS, so it does not contribute to cyclic references and
+is no longer a candidate for garbage collection.
+
+Also, commit 6e5e6d274956 ("io_uring: drop any code related to
+SCM_RIGHTS") cleaned up SCM_RIGHTS code in io_uring.
+
+Let's do it in AF_UNIX as well by reverting commit 0091bfc81741
+("io_uring/af_unix: defer registered files gc to io_uring release")
+and commit 10369080454d ("net: reclaim skb->scm_io_uring bit").
+
+Signed-off-by: Kuniyuki Iwashima <kuniyu@amazon.com>
+Acked-by: Jens Axboe <axboe@kernel.dk>
+Link: https://lore.kernel.org/r/20240129190435.57228-3-kuniyu@amazon.com
+Signed-off-by: Jakub Kicinski <kuba@kernel.org>
+Signed-off-by: Lee Jones <lee@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ include/net/af_unix.h |    1 -
+ net/unix/garbage.c    |   25 ++-----------------------
+ net/unix/scm.c        |    6 ------
+ 3 files changed, 2 insertions(+), 30 deletions(-)
+
+--- a/include/net/af_unix.h
++++ b/include/net/af_unix.h
+@@ -20,7 +20,6 @@ static inline struct unix_sock *unix_get
+ void unix_inflight(struct user_struct *user, struct file *fp);
+ void unix_notinflight(struct user_struct *user, struct file *fp);
+ void unix_destruct_scm(struct sk_buff *skb);
+-void io_uring_destruct_scm(struct sk_buff *skb);
+ void unix_gc(void);
+ void wait_for_unix_gc(struct scm_fp_list *fpl);
+ struct sock *unix_peer_get(struct sock *sk);
+--- a/net/unix/garbage.c
++++ b/net/unix/garbage.c
+@@ -184,12 +184,10 @@ static bool gc_in_progress;
+ static void __unix_gc(struct work_struct *work)
+ {
+-      struct sk_buff *next_skb, *skb;
+-      struct unix_sock *u;
+-      struct unix_sock *next;
+       struct sk_buff_head hitlist;
+-      struct list_head cursor;
++      struct unix_sock *u, *next;
+       LIST_HEAD(not_cycle_list);
++      struct list_head cursor;
+       spin_lock(&unix_gc_lock);
+@@ -293,30 +291,11 @@ static void __unix_gc(struct work_struct
+       spin_unlock(&unix_gc_lock);
+-      /* We need io_uring to clean its registered files, ignore all io_uring
+-       * originated skbs. It's fine as io_uring doesn't keep references to
+-       * other io_uring instances and so killing all other files in the cycle
+-       * will put all io_uring references forcing it to go through normal
+-       * release.path eventually putting registered files.
+-       */
+-      skb_queue_walk_safe(&hitlist, skb, next_skb) {
+-              if (skb->destructor == io_uring_destruct_scm) {
+-                      __skb_unlink(skb, &hitlist);
+-                      skb_queue_tail(&skb->sk->sk_receive_queue, skb);
+-              }
+-      }
+-
+       /* Here we are. Hitlist is filled. Die. */
+       __skb_queue_purge(&hitlist);
+       spin_lock(&unix_gc_lock);
+-      /* There could be io_uring registered files, just push them back to
+-       * the inflight list
+-       */
+-      list_for_each_entry_safe(u, next, &gc_candidates, link)
+-              list_move_tail(&u->link, &gc_inflight_list);
+-
+       /* All candidates should have been detached by now. */
+       WARN_ON_ONCE(!list_empty(&gc_candidates));
+--- a/net/unix/scm.c
++++ b/net/unix/scm.c
+@@ -148,9 +148,3 @@ void unix_destruct_scm(struct sk_buff *s
+       sock_wfree(skb);
+ }
+ EXPORT_SYMBOL(unix_destruct_scm);
+-
+-void io_uring_destruct_scm(struct sk_buff *skb)
+-{
+-      unix_destruct_scm(skb);
+-}
+-EXPORT_SYMBOL(io_uring_destruct_scm);
diff --git a/queue-6.6/af_unix-remove-lock-dance-in-unix_peek_fds.patch b/queue-6.6/af_unix-remove-lock-dance-in-unix_peek_fds.patch
new file mode 100644 (file)
index 0000000..59578d3
--- /dev/null
@@ -0,0 +1,115 @@
+From stable+bounces-145868-greg=kroah.com@vger.kernel.org Wed May 21 16:56:33 2025
+From: Lee Jones <lee@kernel.org>
+Date: Wed, 21 May 2025 14:45:29 +0000
+Subject: af_unix: Remove lock dance in unix_peek_fds().
+To: lee@kernel.org, "David S. Miller" <davem@davemloft.net>, Eric Dumazet <edumazet@google.com>, Jakub Kicinski <kuba@kernel.org>, Paolo Abeni <pabeni@redhat.com>, Kuniyuki Iwashima <kuniyu@amazon.com>, Jens Axboe <axboe@kernel.dk>, Sasha Levin <sashal@kernel.org>, Michal Luczaj <mhal@rbox.co>, Rao Shoaib <Rao.Shoaib@oracle.com>, Pavel Begunkov <asml.silence@gmail.com>, linux-kernel@vger.kernel.org, netdev@vger.kernel.org
+Cc: stable@vger.kernel.org
+Message-ID: <20250521144803.2050504-22-lee@kernel.org>
+
+From: Kuniyuki Iwashima <kuniyu@amazon.com>
+
+commit 118f457da9ed58a79e24b73c2ef0aa1987241f0e upstream.
+
+In the previous GC implementation, the shape of the inflight socket
+graph was not expected to change while GC was in progress.
+
+MSG_PEEK was tricky because it could install an inflight fd silently
+and transform the graph.
+
+Let's say we peeked a fd, which was a listening socket, and accept()ed
+some embryo sockets from it.  The garbage collection algorithm would
+have been confused because the set of sockets visited in scan_inflight()
+would change within the same GC invocation.
+
+That's why we placed spin_lock(&unix_gc_lock) and spin_unlock() in
+unix_peek_fds() with a fat comment.
+
+In the new GC implementation, we no longer garbage-collect the socket
+if it exists in another queue, that is, if it has a bridge to another
+SCC.  Also, accept() will require the lock if it has edges.
+
+Thus, we need not do the complicated lock dance.
+
+Signed-off-by: Kuniyuki Iwashima <kuniyu@amazon.com>
+Link: https://lore.kernel.org/r/20240401173125.92184-3-kuniyu@amazon.com
+Signed-off-by: Jakub Kicinski <kuba@kernel.org>
+Signed-off-by: Lee Jones <lee@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ include/net/af_unix.h |    1 -
+ net/unix/af_unix.c    |   42 ------------------------------------------
+ net/unix/garbage.c    |    2 +-
+ 3 files changed, 1 insertion(+), 44 deletions(-)
+
+--- a/include/net/af_unix.h
++++ b/include/net/af_unix.h
+@@ -17,7 +17,6 @@ static inline struct unix_sock *unix_get
+ }
+ #endif
+-extern spinlock_t unix_gc_lock;
+ extern unsigned int unix_tot_inflight;
+ void unix_add_edges(struct scm_fp_list *fpl, struct unix_sock *receiver);
+ void unix_del_edges(struct scm_fp_list *fpl);
+--- a/net/unix/af_unix.c
++++ b/net/unix/af_unix.c
+@@ -1796,48 +1796,6 @@ static void unix_detach_fds(struct scm_c
+ static void unix_peek_fds(struct scm_cookie *scm, struct sk_buff *skb)
+ {
+       scm->fp = scm_fp_dup(UNIXCB(skb).fp);
+-
+-      /*
+-       * Garbage collection of unix sockets starts by selecting a set of
+-       * candidate sockets which have reference only from being in flight
+-       * (total_refs == inflight_refs).  This condition is checked once during
+-       * the candidate collection phase, and candidates are marked as such, so
+-       * that non-candidates can later be ignored.  While inflight_refs is
+-       * protected by unix_gc_lock, total_refs (file count) is not, hence this
+-       * is an instantaneous decision.
+-       *
+-       * Once a candidate, however, the socket must not be reinstalled into a
+-       * file descriptor while the garbage collection is in progress.
+-       *
+-       * If the above conditions are met, then the directed graph of
+-       * candidates (*) does not change while unix_gc_lock is held.
+-       *
+-       * Any operations that changes the file count through file descriptors
+-       * (dup, close, sendmsg) does not change the graph since candidates are
+-       * not installed in fds.
+-       *
+-       * Dequeing a candidate via recvmsg would install it into an fd, but
+-       * that takes unix_gc_lock to decrement the inflight count, so it's
+-       * serialized with garbage collection.
+-       *
+-       * MSG_PEEK is special in that it does not change the inflight count,
+-       * yet does install the socket into an fd.  The following lock/unlock
+-       * pair is to ensure serialization with garbage collection.  It must be
+-       * done between incrementing the file count and installing the file into
+-       * an fd.
+-       *
+-       * If garbage collection starts after the barrier provided by the
+-       * lock/unlock, then it will see the elevated refcount and not mark this
+-       * as a candidate.  If a garbage collection is already in progress
+-       * before the file count was incremented, then the lock/unlock pair will
+-       * ensure that garbage collection is finished before progressing to
+-       * installing the fd.
+-       *
+-       * (*) A -> B where B is on the queue of A or B is on the queue of C
+-       * which is on the queue of listening socket A.
+-       */
+-      spin_lock(&unix_gc_lock);
+-      spin_unlock(&unix_gc_lock);
+ }
+ static void unix_destruct_scm(struct sk_buff *skb)
+--- a/net/unix/garbage.c
++++ b/net/unix/garbage.c
+@@ -183,7 +183,7 @@ static void unix_free_vertices(struct sc
+       }
+ }
+-DEFINE_SPINLOCK(unix_gc_lock);
++static DEFINE_SPINLOCK(unix_gc_lock);
+ unsigned int unix_tot_inflight;
+ void unix_add_edges(struct scm_fp_list *fpl, struct unix_sock *receiver)
diff --git a/queue-6.6/af_unix-replace-bug_on-with-warn_on_once.patch b/queue-6.6/af_unix-replace-bug_on-with-warn_on_once.patch
new file mode 100644 (file)
index 0000000..9e6c46a
--- /dev/null
@@ -0,0 +1,83 @@
+From stable+bounces-145851-greg=kroah.com@vger.kernel.org Wed May 21 16:51:24 2025
+From: Lee Jones <lee@kernel.org>
+Date: Wed, 21 May 2025 14:45:12 +0000
+Subject: af_unix: Replace BUG_ON() with WARN_ON_ONCE().
+To: lee@kernel.org, "David S. Miller" <davem@davemloft.net>, Eric Dumazet <edumazet@google.com>, Jakub Kicinski <kuba@kernel.org>, Paolo Abeni <pabeni@redhat.com>, Kuniyuki Iwashima <kuniyu@amazon.com>, Jens Axboe <axboe@kernel.dk>, Sasha Levin <sashal@kernel.org>, Michal Luczaj <mhal@rbox.co>, Rao Shoaib <Rao.Shoaib@oracle.com>, Simon Horman <horms@kernel.org>, linux-kernel@vger.kernel.org, netdev@vger.kernel.org
+Cc: stable@vger.kernel.org
+Message-ID: <20250521144803.2050504-5-lee@kernel.org>
+
+From: Kuniyuki Iwashima <kuniyu@amazon.com>
+
+commit d0f6dc26346863e1f4a23117f5468614e54df064 upstream.
+
+This is a prep patch for the last patch in this series so that
+checkpatch will not warn about BUG_ON().
+
+Signed-off-by: Kuniyuki Iwashima <kuniyu@amazon.com>
+Acked-by: Jens Axboe <axboe@kernel.dk>
+Link: https://lore.kernel.org/r/20240129190435.57228-2-kuniyu@amazon.com
+Signed-off-by: Jakub Kicinski <kuba@kernel.org>
+Signed-off-by: Lee Jones <lee@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ net/unix/garbage.c |    8 ++++----
+ net/unix/scm.c     |    8 ++++----
+ 2 files changed, 8 insertions(+), 8 deletions(-)
+
+--- a/net/unix/garbage.c
++++ b/net/unix/garbage.c
+@@ -145,7 +145,7 @@ static void scan_children(struct sock *x
+                       /* An embryo cannot be in-flight, so it's safe
+                        * to use the list link.
+                        */
+-                      BUG_ON(!list_empty(&u->link));
++                      WARN_ON_ONCE(!list_empty(&u->link));
+                       list_add_tail(&u->link, &embryos);
+               }
+               spin_unlock(&x->sk_receive_queue.lock);
+@@ -224,8 +224,8 @@ static void __unix_gc(struct work_struct
+               total_refs = file_count(sk->sk_socket->file);
+-              BUG_ON(!u->inflight);
+-              BUG_ON(total_refs < u->inflight);
++              WARN_ON_ONCE(!u->inflight);
++              WARN_ON_ONCE(total_refs < u->inflight);
+               if (total_refs == u->inflight) {
+                       list_move_tail(&u->link, &gc_candidates);
+                       __set_bit(UNIX_GC_CANDIDATE, &u->gc_flags);
+@@ -318,7 +318,7 @@ static void __unix_gc(struct work_struct
+               list_move_tail(&u->link, &gc_inflight_list);
+       /* All candidates should have been detached by now. */
+-      BUG_ON(!list_empty(&gc_candidates));
++      WARN_ON_ONCE(!list_empty(&gc_candidates));
+       /* Paired with READ_ONCE() in wait_for_unix_gc(). */
+       WRITE_ONCE(gc_in_progress, false);
+--- a/net/unix/scm.c
++++ b/net/unix/scm.c
+@@ -51,10 +51,10 @@ void unix_inflight(struct user_struct *u
+       if (u) {
+               if (!u->inflight) {
+-                      BUG_ON(!list_empty(&u->link));
++                      WARN_ON_ONCE(!list_empty(&u->link));
+                       list_add_tail(&u->link, &gc_inflight_list);
+               } else {
+-                      BUG_ON(list_empty(&u->link));
++                      WARN_ON_ONCE(list_empty(&u->link));
+               }
+               u->inflight++;
+               /* Paired with READ_ONCE() in wait_for_unix_gc() */
+@@ -71,8 +71,8 @@ void unix_notinflight(struct user_struct
+       spin_lock(&unix_gc_lock);
+       if (u) {
+-              BUG_ON(!u->inflight);
+-              BUG_ON(list_empty(&u->link));
++              WARN_ON_ONCE(!u->inflight);
++              WARN_ON_ONCE(list_empty(&u->link));
+               u->inflight--;
+               if (!u->inflight)
diff --git a/queue-6.6/af_unix-replace-garbage-collection-algorithm.patch b/queue-6.6/af_unix-replace-garbage-collection-algorithm.patch
new file mode 100644 (file)
index 0000000..dbbecab
--- /dev/null
@@ -0,0 +1,497 @@
+From stable+bounces-145867-greg=kroah.com@vger.kernel.org Wed May 21 16:59:26 2025
+From: Lee Jones <lee@kernel.org>
+Date: Wed, 21 May 2025 14:45:28 +0000
+Subject: af_unix: Replace garbage collection algorithm.
+To: lee@kernel.org, "David S. Miller" <davem@davemloft.net>, Eric Dumazet <edumazet@google.com>, Jakub Kicinski <kuba@kernel.org>, Paolo Abeni <pabeni@redhat.com>, Kuniyuki Iwashima <kuniyu@amazon.com>, Jens Axboe <axboe@kernel.dk>, Sasha Levin <sashal@kernel.org>, Michal Luczaj <mhal@rbox.co>, Rao Shoaib <Rao.Shoaib@oracle.com>, Pavel Begunkov <asml.silence@gmail.com>, linux-kernel@vger.kernel.org, netdev@vger.kernel.org
+Cc: stable@vger.kernel.org
+Message-ID: <20250521144803.2050504-21-lee@kernel.org>
+
+From: Kuniyuki Iwashima <kuniyu@amazon.com>
+
+commit 4090fa373f0e763c43610853d2774b5979915959 upstream.
+
+If we find a dead SCC during iteration, we call unix_collect_skb()
+to splice all skb in the SCC to the global sk_buff_head, hitlist.
+
+After iterating all SCC, we unlock unix_gc_lock and purge the queue.
+
+Signed-off-by: Kuniyuki Iwashima <kuniyu@amazon.com>
+Acked-by: Paolo Abeni <pabeni@redhat.com>
+Link: https://lore.kernel.org/r/20240325202425.60930-15-kuniyu@amazon.com
+Signed-off-by: Jakub Kicinski <kuba@kernel.org>
+Signed-off-by: Lee Jones <lee@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ include/net/af_unix.h |    8 -
+ net/unix/af_unix.c    |   12 -
+ net/unix/garbage.c    |  318 ++++++++++----------------------------------------
+ 3 files changed, 64 insertions(+), 274 deletions(-)
+
+--- a/include/net/af_unix.h
++++ b/include/net/af_unix.h
+@@ -19,9 +19,6 @@ static inline struct unix_sock *unix_get
+ extern spinlock_t unix_gc_lock;
+ extern unsigned int unix_tot_inflight;
+-
+-void unix_inflight(struct user_struct *user, struct file *fp);
+-void unix_notinflight(struct user_struct *user, struct file *fp);
+ void unix_add_edges(struct scm_fp_list *fpl, struct unix_sock *receiver);
+ void unix_del_edges(struct scm_fp_list *fpl);
+ void unix_update_edges(struct unix_sock *receiver);
+@@ -85,12 +82,7 @@ struct unix_sock {
+       struct sock             *peer;
+       struct sock             *listener;
+       struct unix_vertex      *vertex;
+-      struct list_head        link;
+-      unsigned long           inflight;
+       spinlock_t              lock;
+-      unsigned long           gc_flags;
+-#define UNIX_GC_CANDIDATE     0
+-#define UNIX_GC_MAYBE_CYCLE   1
+       struct socket_wq        peer_wq;
+       wait_queue_entry_t      peer_wake;
+       struct scm_stat         scm_stat;
+--- a/net/unix/af_unix.c
++++ b/net/unix/af_unix.c
+@@ -979,12 +979,10 @@ static struct sock *unix_create1(struct
+       sk->sk_destruct         = unix_sock_destructor;
+       u = unix_sk(sk);
+       u->listener = NULL;
+-      u->inflight = 0;
+       u->vertex = NULL;
+       u->path.dentry = NULL;
+       u->path.mnt = NULL;
+       spin_lock_init(&u->lock);
+-      INIT_LIST_HEAD(&u->link);
+       mutex_init(&u->iolock); /* single task reading lock */
+       mutex_init(&u->bindlock); /* single task binding lock */
+       init_waitqueue_head(&u->peer_wait);
+@@ -1770,8 +1768,6 @@ static inline bool too_many_unix_fds(str
+ static int unix_attach_fds(struct scm_cookie *scm, struct sk_buff *skb)
+ {
+-      int i;
+-
+       if (too_many_unix_fds(current))
+               return -ETOOMANYREFS;
+@@ -1783,9 +1779,6 @@ static int unix_attach_fds(struct scm_co
+       if (!UNIXCB(skb).fp)
+               return -ENOMEM;
+-      for (i = scm->fp->count - 1; i >= 0; i--)
+-              unix_inflight(scm->fp->user, scm->fp->fp[i]);
+-
+       if (unix_prepare_fpl(UNIXCB(skb).fp))
+               return -ENOMEM;
+@@ -1794,15 +1787,10 @@ static int unix_attach_fds(struct scm_co
+ static void unix_detach_fds(struct scm_cookie *scm, struct sk_buff *skb)
+ {
+-      int i;
+-
+       scm->fp = UNIXCB(skb).fp;
+       UNIXCB(skb).fp = NULL;
+       unix_destroy_fpl(scm->fp);
+-
+-      for (i = scm->fp->count - 1; i >= 0; i--)
+-              unix_notinflight(scm->fp->user, scm->fp->fp[i]);
+ }
+ static void unix_peek_fds(struct scm_cookie *scm, struct sk_buff *skb)
+--- a/net/unix/garbage.c
++++ b/net/unix/garbage.c
+@@ -322,6 +322,52 @@ static bool unix_vertex_dead(struct unix
+       return true;
+ }
++enum unix_recv_queue_lock_class {
++      U_RECVQ_LOCK_NORMAL,
++      U_RECVQ_LOCK_EMBRYO,
++};
++
++static void unix_collect_skb(struct list_head *scc, struct sk_buff_head *hitlist)
++{
++      struct unix_vertex *vertex;
++
++      list_for_each_entry_reverse(vertex, scc, scc_entry) {
++              struct sk_buff_head *queue;
++              struct unix_edge *edge;
++              struct unix_sock *u;
++
++              edge = list_first_entry(&vertex->edges, typeof(*edge), vertex_entry);
++              u = edge->predecessor;
++              queue = &u->sk.sk_receive_queue;
++
++              spin_lock(&queue->lock);
++
++              if (u->sk.sk_state == TCP_LISTEN) {
++                      struct sk_buff *skb;
++
++                      skb_queue_walk(queue, skb) {
++                              struct sk_buff_head *embryo_queue = &skb->sk->sk_receive_queue;
++
++                              /* listener -> embryo order, the inversion never happens. */
++                              spin_lock_nested(&embryo_queue->lock, U_RECVQ_LOCK_EMBRYO);
++                              skb_queue_splice_init(embryo_queue, hitlist);
++                              spin_unlock(&embryo_queue->lock);
++                      }
++              } else {
++                      skb_queue_splice_init(queue, hitlist);
++
++#if IS_ENABLED(CONFIG_AF_UNIX_OOB)
++                      if (u->oob_skb) {
++                              kfree_skb(u->oob_skb);
++                              u->oob_skb = NULL;
++                      }
++#endif
++              }
++
++              spin_unlock(&queue->lock);
++      }
++}
++
+ static bool unix_scc_cyclic(struct list_head *scc)
+ {
+       struct unix_vertex *vertex;
+@@ -345,7 +391,8 @@ static bool unix_scc_cyclic(struct list_
+ static LIST_HEAD(unix_visited_vertices);
+ static unsigned long unix_vertex_grouped_index = UNIX_VERTEX_INDEX_MARK2;
+-static void __unix_walk_scc(struct unix_vertex *vertex, unsigned long *last_index)
++static void __unix_walk_scc(struct unix_vertex *vertex, unsigned long *last_index,
++                          struct sk_buff_head *hitlist)
+ {
+       LIST_HEAD(vertex_stack);
+       struct unix_edge *edge;
+@@ -430,7 +477,9 @@ prev_vertex:
+                               scc_dead = unix_vertex_dead(vertex);
+               }
+-              if (!unix_graph_maybe_cyclic)
++              if (scc_dead)
++                      unix_collect_skb(&scc, hitlist);
++              else if (!unix_graph_maybe_cyclic)
+                       unix_graph_maybe_cyclic = unix_scc_cyclic(&scc);
+               list_del(&scc);
+@@ -441,7 +490,7 @@ prev_vertex:
+               goto prev_vertex;
+ }
+-static void unix_walk_scc(void)
++static void unix_walk_scc(struct sk_buff_head *hitlist)
+ {
+       unsigned long last_index = UNIX_VERTEX_INDEX_START;
+@@ -454,7 +503,7 @@ static void unix_walk_scc(void)
+               struct unix_vertex *vertex;
+               vertex = list_first_entry(&unix_unvisited_vertices, typeof(*vertex), entry);
+-              __unix_walk_scc(vertex, &last_index);
++              __unix_walk_scc(vertex, &last_index, hitlist);
+       }
+       list_replace_init(&unix_visited_vertices, &unix_unvisited_vertices);
+@@ -463,7 +512,7 @@ static void unix_walk_scc(void)
+       unix_graph_grouped = true;
+ }
+-static void unix_walk_scc_fast(void)
++static void unix_walk_scc_fast(struct sk_buff_head *hitlist)
+ {
+       while (!list_empty(&unix_unvisited_vertices)) {
+               struct unix_vertex *vertex;
+@@ -480,279 +529,40 @@ static void unix_walk_scc_fast(void)
+                               scc_dead = unix_vertex_dead(vertex);
+               }
++              if (scc_dead)
++                      unix_collect_skb(&scc, hitlist);
++
+               list_del(&scc);
+       }
+       list_replace_init(&unix_visited_vertices, &unix_unvisited_vertices);
+ }
+-static LIST_HEAD(gc_candidates);
+-static LIST_HEAD(gc_inflight_list);
+-
+-/* Keep the number of times in flight count for the file
+- * descriptor if it is for an AF_UNIX socket.
+- */
+-void unix_inflight(struct user_struct *user, struct file *filp)
+-{
+-      struct unix_sock *u = unix_get_socket(filp);
+-
+-      spin_lock(&unix_gc_lock);
+-
+-      if (u) {
+-              if (!u->inflight) {
+-                      WARN_ON_ONCE(!list_empty(&u->link));
+-                      list_add_tail(&u->link, &gc_inflight_list);
+-              } else {
+-                      WARN_ON_ONCE(list_empty(&u->link));
+-              }
+-              u->inflight++;
+-      }
+-
+-      spin_unlock(&unix_gc_lock);
+-}
+-
+-void unix_notinflight(struct user_struct *user, struct file *filp)
+-{
+-      struct unix_sock *u = unix_get_socket(filp);
+-
+-      spin_lock(&unix_gc_lock);
+-
+-      if (u) {
+-              WARN_ON_ONCE(!u->inflight);
+-              WARN_ON_ONCE(list_empty(&u->link));
+-
+-              u->inflight--;
+-              if (!u->inflight)
+-                      list_del_init(&u->link);
+-      }
+-
+-      spin_unlock(&unix_gc_lock);
+-}
+-
+-static void scan_inflight(struct sock *x, void (*func)(struct unix_sock *),
+-                        struct sk_buff_head *hitlist)
+-{
+-      struct sk_buff *skb;
+-      struct sk_buff *next;
+-
+-      spin_lock(&x->sk_receive_queue.lock);
+-      skb_queue_walk_safe(&x->sk_receive_queue, skb, next) {
+-              /* Do we have file descriptors ? */
+-              if (UNIXCB(skb).fp) {
+-                      bool hit = false;
+-                      /* Process the descriptors of this socket */
+-                      int nfd = UNIXCB(skb).fp->count;
+-                      struct file **fp = UNIXCB(skb).fp->fp;
+-
+-                      while (nfd--) {
+-                              /* Get the socket the fd matches if it indeed does so */
+-                              struct unix_sock *u = unix_get_socket(*fp++);
+-
+-                              /* Ignore non-candidates, they could have been added
+-                               * to the queues after starting the garbage collection
+-                               */
+-                              if (u && test_bit(UNIX_GC_CANDIDATE, &u->gc_flags)) {
+-                                      hit = true;
+-
+-                                      func(u);
+-                              }
+-                      }
+-                      if (hit && hitlist != NULL) {
+-                              __skb_unlink(skb, &x->sk_receive_queue);
+-                              __skb_queue_tail(hitlist, skb);
+-                      }
+-              }
+-      }
+-      spin_unlock(&x->sk_receive_queue.lock);
+-}
+-
+-static void scan_children(struct sock *x, void (*func)(struct unix_sock *),
+-                        struct sk_buff_head *hitlist)
+-{
+-      if (x->sk_state != TCP_LISTEN) {
+-              scan_inflight(x, func, hitlist);
+-      } else {
+-              struct sk_buff *skb;
+-              struct sk_buff *next;
+-              struct unix_sock *u;
+-              LIST_HEAD(embryos);
+-
+-              /* For a listening socket collect the queued embryos
+-               * and perform a scan on them as well.
+-               */
+-              spin_lock(&x->sk_receive_queue.lock);
+-              skb_queue_walk_safe(&x->sk_receive_queue, skb, next) {
+-                      u = unix_sk(skb->sk);
+-
+-                      /* An embryo cannot be in-flight, so it's safe
+-                       * to use the list link.
+-                       */
+-                      WARN_ON_ONCE(!list_empty(&u->link));
+-                      list_add_tail(&u->link, &embryos);
+-              }
+-              spin_unlock(&x->sk_receive_queue.lock);
+-
+-              while (!list_empty(&embryos)) {
+-                      u = list_entry(embryos.next, struct unix_sock, link);
+-                      scan_inflight(&u->sk, func, hitlist);
+-                      list_del_init(&u->link);
+-              }
+-      }
+-}
+-
+-static void dec_inflight(struct unix_sock *usk)
+-{
+-      usk->inflight--;
+-}
+-
+-static void inc_inflight(struct unix_sock *usk)
+-{
+-      usk->inflight++;
+-}
+-
+-static void inc_inflight_move_tail(struct unix_sock *u)
+-{
+-      u->inflight++;
+-
+-      /* If this still might be part of a cycle, move it to the end
+-       * of the list, so that it's checked even if it was already
+-       * passed over
+-       */
+-      if (test_bit(UNIX_GC_MAYBE_CYCLE, &u->gc_flags))
+-              list_move_tail(&u->link, &gc_candidates);
+-}
+-
+ static bool gc_in_progress;
+ static void __unix_gc(struct work_struct *work)
+ {
+       struct sk_buff_head hitlist;
+-      struct unix_sock *u, *next;
+-      LIST_HEAD(not_cycle_list);
+-      struct list_head cursor;
+       spin_lock(&unix_gc_lock);
+-      if (!unix_graph_maybe_cyclic)
++      if (!unix_graph_maybe_cyclic) {
++              spin_unlock(&unix_gc_lock);
+               goto skip_gc;
+-
+-      if (unix_graph_grouped)
+-              unix_walk_scc_fast();
+-      else
+-              unix_walk_scc();
+-
+-      /* First, select candidates for garbage collection.  Only
+-       * in-flight sockets are considered, and from those only ones
+-       * which don't have any external reference.
+-       *
+-       * Holding unix_gc_lock will protect these candidates from
+-       * being detached, and hence from gaining an external
+-       * reference.  Since there are no possible receivers, all
+-       * buffers currently on the candidates' queues stay there
+-       * during the garbage collection.
+-       *
+-       * We also know that no new candidate can be added onto the
+-       * receive queues.  Other, non candidate sockets _can_ be
+-       * added to queue, so we must make sure only to touch
+-       * candidates.
+-       *
+-       * Embryos, though never candidates themselves, affect which
+-       * candidates are reachable by the garbage collector.  Before
+-       * being added to a listener's queue, an embryo may already
+-       * receive data carrying SCM_RIGHTS, potentially making the
+-       * passed socket a candidate that is not yet reachable by the
+-       * collector.  It becomes reachable once the embryo is
+-       * enqueued.  Therefore, we must ensure that no SCM-laden
+-       * embryo appears in a (candidate) listener's queue between
+-       * consecutive scan_children() calls.
+-       */
+-      list_for_each_entry_safe(u, next, &gc_inflight_list, link) {
+-              struct sock *sk = &u->sk;
+-              long total_refs;
+-
+-              total_refs = file_count(sk->sk_socket->file);
+-
+-              WARN_ON_ONCE(!u->inflight);
+-              WARN_ON_ONCE(total_refs < u->inflight);
+-              if (total_refs == u->inflight) {
+-                      list_move_tail(&u->link, &gc_candidates);
+-                      __set_bit(UNIX_GC_CANDIDATE, &u->gc_flags);
+-                      __set_bit(UNIX_GC_MAYBE_CYCLE, &u->gc_flags);
+-
+-                      if (sk->sk_state == TCP_LISTEN) {
+-                              unix_state_lock_nested(sk, U_LOCK_GC_LISTENER);
+-                              unix_state_unlock(sk);
+-                      }
+-              }
+-      }
+-
+-      /* Now remove all internal in-flight reference to children of
+-       * the candidates.
+-       */
+-      list_for_each_entry(u, &gc_candidates, link)
+-              scan_children(&u->sk, dec_inflight, NULL);
+-
+-      /* Restore the references for children of all candidates,
+-       * which have remaining references.  Do this recursively, so
+-       * only those remain, which form cyclic references.
+-       *
+-       * Use a "cursor" link, to make the list traversal safe, even
+-       * though elements might be moved about.
+-       */
+-      list_add(&cursor, &gc_candidates);
+-      while (cursor.next != &gc_candidates) {
+-              u = list_entry(cursor.next, struct unix_sock, link);
+-
+-              /* Move cursor to after the current position. */
+-              list_move(&cursor, &u->link);
+-
+-              if (u->inflight) {
+-                      list_move_tail(&u->link, &not_cycle_list);
+-                      __clear_bit(UNIX_GC_MAYBE_CYCLE, &u->gc_flags);
+-                      scan_children(&u->sk, inc_inflight_move_tail, NULL);
+-              }
+       }
+-      list_del(&cursor);
+-      /* Now gc_candidates contains only garbage.  Restore original
+-       * inflight counters for these as well, and remove the skbuffs
+-       * which are creating the cycle(s).
+-       */
+-      skb_queue_head_init(&hitlist);
+-      list_for_each_entry(u, &gc_candidates, link) {
+-              scan_children(&u->sk, inc_inflight, &hitlist);
+-
+-#if IS_ENABLED(CONFIG_AF_UNIX_OOB)
+-              if (u->oob_skb) {
+-                      kfree_skb(u->oob_skb);
+-                      u->oob_skb = NULL;
+-              }
+-#endif
+-      }
++      __skb_queue_head_init(&hitlist);
+-      /* not_cycle_list contains those sockets which do not make up a
+-       * cycle.  Restore these to the inflight list.
+-       */
+-      while (!list_empty(&not_cycle_list)) {
+-              u = list_entry(not_cycle_list.next, struct unix_sock, link);
+-              __clear_bit(UNIX_GC_CANDIDATE, &u->gc_flags);
+-              list_move_tail(&u->link, &gc_inflight_list);
+-      }
++      if (unix_graph_grouped)
++              unix_walk_scc_fast(&hitlist);
++      else
++              unix_walk_scc(&hitlist);
+       spin_unlock(&unix_gc_lock);
+-      /* Here we are. Hitlist is filled. Die. */
+       __skb_queue_purge(&hitlist);
+-
+-      spin_lock(&unix_gc_lock);
+-
+-      /* All candidates should have been detached by now. */
+-      WARN_ON_ONCE(!list_empty(&gc_candidates));
+ skip_gc:
+-      /* Paired with READ_ONCE() in wait_for_unix_gc(). */
+       WRITE_ONCE(gc_in_progress, false);
+-
+-      spin_unlock(&unix_gc_lock);
+ }
+ static DECLARE_WORK(unix_gc_work, __unix_gc);
diff --git a/queue-6.6/af_unix-return-struct-unix_sock-from-unix_get_socket.patch b/queue-6.6/af_unix-return-struct-unix_sock-from-unix_get_socket.patch
new file mode 100644 (file)
index 0000000..d669ef5
--- /dev/null
@@ -0,0 +1,129 @@
+From stable+bounces-145848-greg=kroah.com@vger.kernel.org Wed May 21 16:50:06 2025
+From: Lee Jones <lee@kernel.org>
+Date: Wed, 21 May 2025 14:45:09 +0000
+Subject: af_unix: Return struct unix_sock from unix_get_socket().
+To: lee@kernel.org, "David S. Miller" <davem@davemloft.net>, Eric Dumazet <edumazet@google.com>, Jakub Kicinski <kuba@kernel.org>, Paolo Abeni <pabeni@redhat.com>, Kuniyuki Iwashima <kuniyu@amazon.com>, Jens Axboe <axboe@kernel.dk>, Sasha Levin <sashal@kernel.org>, Michal Luczaj <mhal@rbox.co>, Rao Shoaib <Rao.Shoaib@oracle.com>, Pavel Begunkov <asml.silence@gmail.com>, linux-kernel@vger.kernel.org, netdev@vger.kernel.org
+Cc: stable@vger.kernel.org, Simon Horman <horms@kernel.org>
+Message-ID: <20250521144803.2050504-2-lee@kernel.org>
+
+From: Kuniyuki Iwashima <kuniyu@amazon.com>
+
+commit 5b17307bd0789edea0675d524a2b277b93bbde62 upstream.
+
+Currently, unix_get_socket() returns struct sock, but after calling
+it, we always cast it to unix_sk().
+
+Let's return struct unix_sock from unix_get_socket().
+
+Signed-off-by: Kuniyuki Iwashima <kuniyu@amazon.com>
+Acked-by: Pavel Begunkov <asml.silence@gmail.com>
+Reviewed-by: Simon Horman <horms@kernel.org>
+Link: https://lore.kernel.org/r/20240123170856.41348-4-kuniyu@amazon.com
+Signed-off-by: Jakub Kicinski <kuba@kernel.org>
+Signed-off-by: Lee Jones <lee@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ include/net/af_unix.h |    2 +-
+ net/unix/garbage.c    |   19 +++++++------------
+ net/unix/scm.c        |   19 +++++++------------
+ 3 files changed, 15 insertions(+), 25 deletions(-)
+
+--- a/include/net/af_unix.h
++++ b/include/net/af_unix.h
+@@ -14,7 +14,7 @@ void unix_destruct_scm(struct sk_buff *s
+ void io_uring_destruct_scm(struct sk_buff *skb);
+ void unix_gc(void);
+ void wait_for_unix_gc(void);
+-struct sock *unix_get_socket(struct file *filp);
++struct unix_sock *unix_get_socket(struct file *filp);
+ struct sock *unix_peer_get(struct sock *sk);
+ #define UNIX_HASH_MOD (256 - 1)
+--- a/net/unix/garbage.c
++++ b/net/unix/garbage.c
+@@ -105,20 +105,15 @@ static void scan_inflight(struct sock *x
+                       while (nfd--) {
+                               /* Get the socket the fd matches if it indeed does so */
+-                              struct sock *sk = unix_get_socket(*fp++);
++                              struct unix_sock *u = unix_get_socket(*fp++);
+-                              if (sk) {
+-                                      struct unix_sock *u = unix_sk(sk);
++                              /* Ignore non-candidates, they could have been added
++                               * to the queues after starting the garbage collection
++                               */
++                              if (u && test_bit(UNIX_GC_CANDIDATE, &u->gc_flags)) {
++                                      hit = true;
+-                                      /* Ignore non-candidates, they could
+-                                       * have been added to the queues after
+-                                       * starting the garbage collection
+-                                       */
+-                                      if (test_bit(UNIX_GC_CANDIDATE, &u->gc_flags)) {
+-                                              hit = true;
+-
+-                                              func(u);
+-                                      }
++                                      func(u);
+                               }
+                       }
+                       if (hit && hitlist != NULL) {
+--- a/net/unix/scm.c
++++ b/net/unix/scm.c
+@@ -21,9 +21,8 @@ EXPORT_SYMBOL(gc_inflight_list);
+ DEFINE_SPINLOCK(unix_gc_lock);
+ EXPORT_SYMBOL(unix_gc_lock);
+-struct sock *unix_get_socket(struct file *filp)
++struct unix_sock *unix_get_socket(struct file *filp)
+ {
+-      struct sock *u_sock = NULL;
+       struct inode *inode = file_inode(filp);
+       /* Socket ? */
+@@ -34,10 +33,10 @@ struct sock *unix_get_socket(struct file
+               /* PF_UNIX ? */
+               if (s && ops && ops->family == PF_UNIX)
+-                      u_sock = s;
++                      return unix_sk(s);
+       }
+-      return u_sock;
++      return NULL;
+ }
+ EXPORT_SYMBOL(unix_get_socket);
+@@ -46,13 +45,11 @@ EXPORT_SYMBOL(unix_get_socket);
+  */
+ void unix_inflight(struct user_struct *user, struct file *fp)
+ {
+-      struct sock *s = unix_get_socket(fp);
++      struct unix_sock *u = unix_get_socket(fp);
+       spin_lock(&unix_gc_lock);
+-      if (s) {
+-              struct unix_sock *u = unix_sk(s);
+-
++      if (u) {
+               if (!u->inflight) {
+                       BUG_ON(!list_empty(&u->link));
+                       list_add_tail(&u->link, &gc_inflight_list);
+@@ -69,13 +66,11 @@ void unix_inflight(struct user_struct *u
+ void unix_notinflight(struct user_struct *user, struct file *fp)
+ {
+-      struct sock *s = unix_get_socket(fp);
++      struct unix_sock *u = unix_get_socket(fp);
+       spin_lock(&unix_gc_lock);
+-      if (s) {
+-              struct unix_sock *u = unix_sk(s);
+-
++      if (u) {
+               BUG_ON(!u->inflight);
+               BUG_ON(list_empty(&u->link));
diff --git a/queue-6.6/af_unix-run-gc-on-only-one-cpu.patch b/queue-6.6/af_unix-run-gc-on-only-one-cpu.patch
new file mode 100644 (file)
index 0000000..e9610ea
--- /dev/null
@@ -0,0 +1,131 @@
+From stable+bounces-145849-greg=kroah.com@vger.kernel.org Wed May 21 16:50:23 2025
+From: Lee Jones <lee@kernel.org>
+Date: Wed, 21 May 2025 14:45:10 +0000
+Subject: af_unix: Run GC on only one CPU.
+To: lee@kernel.org, "David S. Miller" <davem@davemloft.net>, Eric Dumazet <edumazet@google.com>, Jakub Kicinski <kuba@kernel.org>, Paolo Abeni <pabeni@redhat.com>, Kuniyuki Iwashima <kuniyu@amazon.com>, Jens Axboe <axboe@kernel.dk>, Sasha Levin <sashal@kernel.org>, Michal Luczaj <mhal@rbox.co>, Rao Shoaib <Rao.Shoaib@oracle.com>, Simon Horman <horms@kernel.org>, linux-kernel@vger.kernel.org, netdev@vger.kernel.org
+Cc: stable@vger.kernel.org
+Message-ID: <20250521144803.2050504-3-lee@kernel.org>
+
+From: Kuniyuki Iwashima <kuniyu@amazon.com>
+
+commit 8b90a9f819dc2a06baae4ec1a64d875e53b824ec upstream.
+
+If more than 16000 inflight AF_UNIX sockets exist and the garbage
+collector is not running, unix_(dgram|stream)_sendmsg() call unix_gc().
+Also, they wait for unix_gc() to complete.
+
+In unix_gc(), all inflight AF_UNIX sockets are traversed at least once,
+and more if they are the GC candidate.  Thus, sendmsg() significantly
+slows down with too many inflight AF_UNIX sockets.
+
+There is a small window to invoke multiple unix_gc() instances, which
+will then be blocked by the same spinlock except for one.
+
+Let's convert unix_gc() to use struct work so that it will not consume
+CPUs unnecessarily.
+
+Note WRITE_ONCE(gc_in_progress, true) is moved before running GC.
+If we leave the WRITE_ONCE() as is and use the following test to
+call flush_work(), a process might not call it.
+
+    CPU 0                                     CPU 1
+    ---                                       ---
+                                              start work and call __unix_gc()
+    if (work_pending(&unix_gc_work) ||        <-- false
+        READ_ONCE(gc_in_progress))            <-- false
+            flush_work();                     <-- missed!
+                                             WRITE_ONCE(gc_in_progress, true)
+
+Signed-off-by: Kuniyuki Iwashima <kuniyu@amazon.com>
+Link: https://lore.kernel.org/r/20240123170856.41348-5-kuniyu@amazon.com
+Signed-off-by: Jakub Kicinski <kuba@kernel.org>
+Signed-off-by: Lee Jones <lee@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ net/unix/garbage.c |   54 ++++++++++++++++++++++++++---------------------------
+ 1 file changed, 27 insertions(+), 27 deletions(-)
+
+--- a/net/unix/garbage.c
++++ b/net/unix/garbage.c
+@@ -86,7 +86,6 @@
+ /* Internal data structures and random procedures: */
+ static LIST_HEAD(gc_candidates);
+-static DECLARE_WAIT_QUEUE_HEAD(unix_gc_wait);
+ static void scan_inflight(struct sock *x, void (*func)(struct unix_sock *),
+                         struct sk_buff_head *hitlist)
+@@ -182,23 +181,8 @@ static void inc_inflight_move_tail(struc
+ }
+ static bool gc_in_progress;
+-#define UNIX_INFLIGHT_TRIGGER_GC 16000
+-
+-void wait_for_unix_gc(void)
+-{
+-      /* If number of inflight sockets is insane,
+-       * force a garbage collect right now.
+-       * Paired with the WRITE_ONCE() in unix_inflight(),
+-       * unix_notinflight() and gc_in_progress().
+-       */
+-      if (READ_ONCE(unix_tot_inflight) > UNIX_INFLIGHT_TRIGGER_GC &&
+-          !READ_ONCE(gc_in_progress))
+-              unix_gc();
+-      wait_event(unix_gc_wait, !READ_ONCE(gc_in_progress));
+-}
+-/* The external entry point: unix_gc() */
+-void unix_gc(void)
++static void __unix_gc(struct work_struct *work)
+ {
+       struct sk_buff *next_skb, *skb;
+       struct unix_sock *u;
+@@ -209,13 +193,6 @@ void unix_gc(void)
+       spin_lock(&unix_gc_lock);
+-      /* Avoid a recursive GC. */
+-      if (gc_in_progress)
+-              goto out;
+-
+-      /* Paired with READ_ONCE() in wait_for_unix_gc(). */
+-      WRITE_ONCE(gc_in_progress, true);
+-
+       /* First, select candidates for garbage collection.  Only
+        * in-flight sockets are considered, and from those only ones
+        * which don't have any external reference.
+@@ -346,8 +323,31 @@ void unix_gc(void)
+       /* Paired with READ_ONCE() in wait_for_unix_gc(). */
+       WRITE_ONCE(gc_in_progress, false);
+-      wake_up(&unix_gc_wait);
+-
+- out:
+       spin_unlock(&unix_gc_lock);
+ }
++
++static DECLARE_WORK(unix_gc_work, __unix_gc);
++
++void unix_gc(void)
++{
++      WRITE_ONCE(gc_in_progress, true);
++      queue_work(system_unbound_wq, &unix_gc_work);
++}
++
++#define UNIX_INFLIGHT_TRIGGER_GC 16000
++
++void wait_for_unix_gc(void)
++{
++      /* If number of inflight sockets is insane,
++       * force a garbage collect right now.
++       *
++       * Paired with the WRITE_ONCE() in unix_inflight(),
++       * unix_notinflight(), and __unix_gc().
++       */
++      if (READ_ONCE(unix_tot_inflight) > UNIX_INFLIGHT_TRIGGER_GC &&
++          !READ_ONCE(gc_in_progress))
++              unix_gc();
++
++      if (READ_ONCE(gc_in_progress))
++              flush_work(&unix_gc_work);
++}
diff --git a/queue-6.6/af_unix-save-listener-for-embryo-socket.patch b/queue-6.6/af_unix-save-listener-for-embryo-socket.patch
new file mode 100644 (file)
index 0000000..3e449bd
--- /dev/null
@@ -0,0 +1,79 @@
+From stable+bounces-145860-greg=kroah.com@vger.kernel.org Wed May 21 16:55:46 2025
+From: Lee Jones <lee@kernel.org>
+Date: Wed, 21 May 2025 14:45:21 +0000
+Subject: af_unix: Save listener for embryo socket.
+To: lee@kernel.org, "David S. Miller" <davem@davemloft.net>, Eric Dumazet <edumazet@google.com>, Jakub Kicinski <kuba@kernel.org>, Paolo Abeni <pabeni@redhat.com>, Kuniyuki Iwashima <kuniyu@amazon.com>, Jens Axboe <axboe@kernel.dk>, Sasha Levin <sashal@kernel.org>, Michal Luczaj <mhal@rbox.co>, Rao Shoaib <Rao.Shoaib@oracle.com>, Pavel Begunkov <asml.silence@gmail.com>, linux-kernel@vger.kernel.org, netdev@vger.kernel.org
+Cc: stable@vger.kernel.org
+Message-ID: <20250521144803.2050504-14-lee@kernel.org>
+
+From: Kuniyuki Iwashima <kuniyu@amazon.com>
+
+commit aed6ecef55d70de3762ce41c561b7f547dbaf107 upstream.
+
+This is a prep patch for the following change, where we need to
+fetch the listening socket from the successor embryo socket
+during GC.
+
+We add a new field to struct unix_sock to save a pointer to a
+listening socket.
+
+We set it when connect() creates a new socket, and clear it when
+accept() is called.
+
+Signed-off-by: Kuniyuki Iwashima <kuniyu@amazon.com>
+Acked-by: Paolo Abeni <pabeni@redhat.com>
+Link: https://lore.kernel.org/r/20240325202425.60930-8-kuniyu@amazon.com
+Signed-off-by: Jakub Kicinski <kuba@kernel.org>
+Signed-off-by: Lee Jones <lee@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ include/net/af_unix.h |    1 +
+ net/unix/af_unix.c    |    5 ++++-
+ 2 files changed, 5 insertions(+), 1 deletion(-)
+
+--- a/include/net/af_unix.h
++++ b/include/net/af_unix.h
+@@ -83,6 +83,7 @@ struct unix_sock {
+       struct path             path;
+       struct mutex            iolock, bindlock;
+       struct sock             *peer;
++      struct sock             *listener;
+       struct unix_vertex      *vertex;
+       struct list_head        link;
+       unsigned long           inflight;
+--- a/net/unix/af_unix.c
++++ b/net/unix/af_unix.c
+@@ -978,6 +978,7 @@ static struct sock *unix_create1(struct
+       sk->sk_max_ack_backlog  = READ_ONCE(net->unx.sysctl_max_dgram_qlen);
+       sk->sk_destruct         = unix_sock_destructor;
+       u = unix_sk(sk);
++      u->listener = NULL;
+       u->inflight = 0;
+       u->vertex = NULL;
+       u->path.dentry = NULL;
+@@ -1582,6 +1583,7 @@ restart:
+       newsk->sk_type          = sk->sk_type;
+       init_peercred(newsk);
+       newu = unix_sk(newsk);
++      newu->listener = other;
+       RCU_INIT_POINTER(newsk->sk_wq, &newu->peer_wq);
+       otheru = unix_sk(other);
+@@ -1677,8 +1679,8 @@ static int unix_accept(struct socket *so
+                      bool kern)
+ {
+       struct sock *sk = sock->sk;
+-      struct sock *tsk;
+       struct sk_buff *skb;
++      struct sock *tsk;
+       int err;
+       err = -EOPNOTSUPP;
+@@ -1703,6 +1705,7 @@ static int unix_accept(struct socket *so
+       }
+       tsk = skb->sk;
++      unix_sk(tsk)->listener = NULL;
+       skb_free_datagram(sk, skb);
+       wake_up_interruptible(&unix_sk(sk)->peer_wait);
diff --git a/queue-6.6/af_unix-save-o-n-setup-of-tarjan-s-algo.patch b/queue-6.6/af_unix-save-o-n-setup-of-tarjan-s-algo.patch
new file mode 100644 (file)
index 0000000..64a019f
--- /dev/null
@@ -0,0 +1,158 @@
+From stable+bounces-145862-greg=kroah.com@vger.kernel.org Wed May 21 16:54:49 2025
+From: Lee Jones <lee@kernel.org>
+Date: Wed, 21 May 2025 14:45:23 +0000
+Subject: af_unix: Save O(n) setup of Tarjan's algo.
+To: lee@kernel.org, "David S. Miller" <davem@davemloft.net>, Eric Dumazet <edumazet@google.com>, Jakub Kicinski <kuba@kernel.org>, Paolo Abeni <pabeni@redhat.com>, Kuniyuki Iwashima <kuniyu@amazon.com>, Jens Axboe <axboe@kernel.dk>, Sasha Levin <sashal@kernel.org>, Michal Luczaj <mhal@rbox.co>, Rao Shoaib <Rao.Shoaib@oracle.com>, Simon Horman <horms@kernel.org>, linux-kernel@vger.kernel.org, netdev@vger.kernel.org
+Cc: stable@vger.kernel.org
+Message-ID: <20250521144803.2050504-16-lee@kernel.org>
+
+From: Kuniyuki Iwashima <kuniyu@amazon.com>
+
+commit ba31b4a4e1018f5844c6eb31734976e2184f2f9a upstream.
+
+Before starting Tarjan's algorithm, we need to mark all vertices
+as unvisited.  We can save this O(n) setup by reserving two special
+indices (0, 1) and using two variables.
+
+The first time we link a vertex to unix_unvisited_vertices, we set
+the vertex's index to unix_vertex_unvisited_index.
+
+During DFS, we can see that the index of unvisited vertices is the
+same as unix_vertex_unvisited_index.
+
+When we finalise SCC later, we set each vertex's index to
+unix_vertex_grouped_index.
+
+Then, we can know (i) that the vertex is on the stack if the index
+of a visited vertex is >= 2 and (ii) that it is not on the stack and
+belongs to a different SCC if the index is unix_vertex_grouped_index.
+
+After the whole algorithm, all indices of vertices are set as
+unix_vertex_grouped_index.
+
+Next time we start DFS, we know that all unvisited vertices have
+unix_vertex_grouped_index, and we can use unix_vertex_unvisited_index
+as the not-on-stack marker.
+
+To use the same variable in __unix_walk_scc(), we can swap
+unix_vertex_(grouped|unvisited)_index at the end of Tarjan's
+algorithm.
+
+Signed-off-by: Kuniyuki Iwashima <kuniyu@amazon.com>
+Acked-by: Paolo Abeni <pabeni@redhat.com>
+Link: https://lore.kernel.org/r/20240325202425.60930-10-kuniyu@amazon.com
+Signed-off-by: Jakub Kicinski <kuba@kernel.org>
+Signed-off-by: Lee Jones <lee@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ include/net/af_unix.h |    1 -
+ net/unix/garbage.c    |   26 +++++++++++++++-----------
+ 2 files changed, 15 insertions(+), 12 deletions(-)
+
+--- a/include/net/af_unix.h
++++ b/include/net/af_unix.h
+@@ -37,7 +37,6 @@ struct unix_vertex {
+       unsigned long out_degree;
+       unsigned long index;
+       unsigned long lowlink;
+-      bool on_stack;
+ };
+ struct unix_edge {
+--- a/net/unix/garbage.c
++++ b/net/unix/garbage.c
+@@ -115,16 +115,20 @@ static struct unix_vertex *unix_edge_suc
+ static LIST_HEAD(unix_unvisited_vertices);
+ enum unix_vertex_index {
+-      UNIX_VERTEX_INDEX_UNVISITED,
++      UNIX_VERTEX_INDEX_MARK1,
++      UNIX_VERTEX_INDEX_MARK2,
+       UNIX_VERTEX_INDEX_START,
+ };
++static unsigned long unix_vertex_unvisited_index = UNIX_VERTEX_INDEX_MARK1;
++
+ static void unix_add_edge(struct scm_fp_list *fpl, struct unix_edge *edge)
+ {
+       struct unix_vertex *vertex = edge->predecessor->vertex;
+       if (!vertex) {
+               vertex = list_first_entry(&fpl->vertices, typeof(*vertex), entry);
++              vertex->index = unix_vertex_unvisited_index;
+               vertex->out_degree = 0;
+               INIT_LIST_HEAD(&vertex->edges);
+@@ -265,6 +269,7 @@ void unix_destroy_fpl(struct scm_fp_list
+ }
+ static LIST_HEAD(unix_visited_vertices);
++static unsigned long unix_vertex_grouped_index = UNIX_VERTEX_INDEX_MARK2;
+ static void __unix_walk_scc(struct unix_vertex *vertex)
+ {
+@@ -274,10 +279,10 @@ static void __unix_walk_scc(struct unix_
+       LIST_HEAD(edge_stack);
+ next_vertex:
+-      /* Push vertex to vertex_stack.
++      /* Push vertex to vertex_stack and mark it as on-stack
++       * (index >= UNIX_VERTEX_INDEX_START).
+        * The vertex will be popped when finalising SCC later.
+        */
+-      vertex->on_stack = true;
+       list_add(&vertex->scc_entry, &vertex_stack);
+       vertex->index = index;
+@@ -291,7 +296,7 @@ next_vertex:
+               if (!next_vertex)
+                       continue;
+-              if (next_vertex->index == UNIX_VERTEX_INDEX_UNVISITED) {
++              if (next_vertex->index == unix_vertex_unvisited_index) {
+                       /* Iterative deepening depth first search
+                        *
+                        *   1. Push a forward edge to edge_stack and set
+@@ -317,7 +322,7 @@ prev_vertex:
+                        * to skip SCC finalisation.
+                        */
+                       vertex->lowlink = min(vertex->lowlink, next_vertex->lowlink);
+-              } else if (next_vertex->on_stack) {
++              } else if (next_vertex->index != unix_vertex_grouped_index) {
+                       /* Loop detected by a back/cross edge.
+                        *
+                        * The successor is on vertex_stack, so two vertices are
+@@ -344,7 +349,8 @@ prev_vertex:
+                       /* Don't restart DFS from this vertex in unix_walk_scc(). */
+                       list_move_tail(&vertex->entry, &unix_visited_vertices);
+-                      vertex->on_stack = false;
++                      /* Mark vertex as off-stack. */
++                      vertex->index = unix_vertex_grouped_index;
+               }
+               list_del(&scc);
+@@ -357,20 +363,18 @@ prev_vertex:
+ static void unix_walk_scc(void)
+ {
+-      struct unix_vertex *vertex;
+-
+-      list_for_each_entry(vertex, &unix_unvisited_vertices, entry)
+-              vertex->index = UNIX_VERTEX_INDEX_UNVISITED;
+-
+       /* Visit every vertex exactly once.
+        * __unix_walk_scc() moves visited vertices to unix_visited_vertices.
+        */
+       while (!list_empty(&unix_unvisited_vertices)) {
++              struct unix_vertex *vertex;
++
+               vertex = list_first_entry(&unix_unvisited_vertices, typeof(*vertex), entry);
+               __unix_walk_scc(vertex);
+       }
+       list_replace_init(&unix_visited_vertices, &unix_unvisited_vertices);
++      swap(unix_vertex_unvisited_index, unix_vertex_grouped_index);
+ }
+ static LIST_HEAD(gc_candidates);
diff --git a/queue-6.6/af_unix-skip-gc-if-no-cycle-exists.patch b/queue-6.6/af_unix-skip-gc-if-no-cycle-exists.patch
new file mode 100644 (file)
index 0000000..8cc410d
--- /dev/null
@@ -0,0 +1,157 @@
+From stable+bounces-145863-greg=kroah.com@vger.kernel.org Wed May 21 16:57:29 2025
+From: Lee Jones <lee@kernel.org>
+Date: Wed, 21 May 2025 14:45:24 +0000
+Subject: af_unix: Skip GC if no cycle exists.
+To: lee@kernel.org, "David S. Miller" <davem@davemloft.net>, Eric Dumazet <edumazet@google.com>, Jakub Kicinski <kuba@kernel.org>, Paolo Abeni <pabeni@redhat.com>, Kuniyuki Iwashima <kuniyu@amazon.com>, Jens Axboe <axboe@kernel.dk>, Sasha Levin <sashal@kernel.org>, Michal Luczaj <mhal@rbox.co>, Rao Shoaib <Rao.Shoaib@oracle.com>, Pavel Begunkov <asml.silence@gmail.com>, linux-kernel@vger.kernel.org, netdev@vger.kernel.org
+Cc: stable@vger.kernel.org
+Message-ID: <20250521144803.2050504-17-lee@kernel.org>
+
+From: Kuniyuki Iwashima <kuniyu@amazon.com>
+
+commit 77e5593aebba823bcbcf2c4b58b07efcd63933b8 upstream.
+
+We do not need to run GC if there is no possible cyclic reference.
+We use unix_graph_maybe_cyclic to decide if we should run GC.
+
+If a fd of an AF_UNIX socket is passed to an already inflight AF_UNIX
+socket, they could form a cyclic reference.  Then, we set
+unix_graph_maybe_cyclic to true and later run Tarjan's algorithm to
+group them into SCC.
+
+Once we run Tarjan's algorithm, we are 100% sure whether cyclic
+references exist or not.  If there is no cycle, we set
+unix_graph_maybe_cyclic to false and can skip the entire garbage
+collection next time.
+
+When finalising SCC, we set unix_graph_maybe_cyclic to true if SCC
+consists of multiple vertices.
+
+Even if SCC is a single vertex, a cycle might exist as self-fd passing.
+Given the corner case is rare, we detect it by checking all edges of
+the vertex and set unix_graph_maybe_cyclic to true.
+
+With this change, __unix_gc() is just a spin_lock() dance in the normal
+usage.
+
+Signed-off-by: Kuniyuki Iwashima <kuniyu@amazon.com>
+Acked-by: Paolo Abeni <pabeni@redhat.com>
+Link: https://lore.kernel.org/r/20240325202425.60930-11-kuniyu@amazon.com
+Signed-off-by: Jakub Kicinski <kuba@kernel.org>
+Signed-off-by: Lee Jones <lee@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ net/unix/garbage.c |   48 +++++++++++++++++++++++++++++++++++++++++++++++-
+ 1 file changed, 47 insertions(+), 1 deletion(-)
+
+--- a/net/unix/garbage.c
++++ b/net/unix/garbage.c
+@@ -112,6 +112,19 @@ static struct unix_vertex *unix_edge_suc
+       return edge->successor->vertex;
+ }
++static bool unix_graph_maybe_cyclic;
++
++static void unix_update_graph(struct unix_vertex *vertex)
++{
++      /* If the receiver socket is not inflight, no cyclic
++       * reference could be formed.
++       */
++      if (!vertex)
++              return;
++
++      unix_graph_maybe_cyclic = true;
++}
++
+ static LIST_HEAD(unix_unvisited_vertices);
+ enum unix_vertex_index {
+@@ -138,12 +151,16 @@ static void unix_add_edge(struct scm_fp_
+       vertex->out_degree++;
+       list_add_tail(&edge->vertex_entry, &vertex->edges);
++
++      unix_update_graph(unix_edge_successor(edge));
+ }
+ static void unix_del_edge(struct scm_fp_list *fpl, struct unix_edge *edge)
+ {
+       struct unix_vertex *vertex = edge->predecessor->vertex;
++      unix_update_graph(unix_edge_successor(edge));
++
+       list_del(&edge->vertex_entry);
+       vertex->out_degree--;
+@@ -227,6 +244,7 @@ out:
+ void unix_update_edges(struct unix_sock *receiver)
+ {
+       spin_lock(&unix_gc_lock);
++      unix_update_graph(unix_sk(receiver->listener)->vertex);
+       receiver->listener = NULL;
+       spin_unlock(&unix_gc_lock);
+ }
+@@ -268,6 +286,26 @@ void unix_destroy_fpl(struct scm_fp_list
+       unix_free_vertices(fpl);
+ }
++static bool unix_scc_cyclic(struct list_head *scc)
++{
++      struct unix_vertex *vertex;
++      struct unix_edge *edge;
++
++      /* SCC containing multiple vertices ? */
++      if (!list_is_singular(scc))
++              return true;
++
++      vertex = list_first_entry(scc, typeof(*vertex), scc_entry);
++
++      /* Self-reference or a embryo-listener circle ? */
++      list_for_each_entry(edge, &vertex->edges, vertex_entry) {
++              if (unix_edge_successor(edge) == vertex)
++                      return true;
++      }
++
++      return false;
++}
++
+ static LIST_HEAD(unix_visited_vertices);
+ static unsigned long unix_vertex_grouped_index = UNIX_VERTEX_INDEX_MARK2;
+@@ -353,6 +391,9 @@ prev_vertex:
+                       vertex->index = unix_vertex_grouped_index;
+               }
++              if (!unix_graph_maybe_cyclic)
++                      unix_graph_maybe_cyclic = unix_scc_cyclic(&scc);
++
+               list_del(&scc);
+       }
+@@ -363,6 +404,8 @@ prev_vertex:
+ static void unix_walk_scc(void)
+ {
++      unix_graph_maybe_cyclic = false;
++
+       /* Visit every vertex exactly once.
+        * __unix_walk_scc() moves visited vertices to unix_visited_vertices.
+        */
+@@ -524,6 +567,9 @@ static void __unix_gc(struct work_struct
+       spin_lock(&unix_gc_lock);
++      if (!unix_graph_maybe_cyclic)
++              goto skip_gc;
++
+       unix_walk_scc();
+       /* First, select candidates for garbage collection.  Only
+@@ -633,7 +679,7 @@ static void __unix_gc(struct work_struct
+       /* All candidates should have been detached by now. */
+       WARN_ON_ONCE(!list_empty(&gc_candidates));
+-
++skip_gc:
+       /* Paired with READ_ONCE() in wait_for_unix_gc(). */
+       WRITE_ONCE(gc_in_progress, false);
diff --git a/queue-6.6/af_unix-try-not-to-hold-unix_gc_lock-during-accept.patch b/queue-6.6/af_unix-try-not-to-hold-unix_gc_lock-during-accept.patch
new file mode 100644 (file)
index 0000000..6b0fc63
--- /dev/null
@@ -0,0 +1,120 @@
+From stable+bounces-145869-greg=kroah.com@vger.kernel.org Wed May 21 17:00:36 2025
+From: Lee Jones <lee@kernel.org>
+Date: Wed, 21 May 2025 14:45:30 +0000
+Subject: af_unix: Try not to hold unix_gc_lock during accept().
+To: lee@kernel.org, "David S. Miller" <davem@davemloft.net>, Eric Dumazet <edumazet@google.com>, Jakub Kicinski <kuba@kernel.org>, Paolo Abeni <pabeni@redhat.com>, Kuniyuki Iwashima <kuniyu@amazon.com>, Jens Axboe <axboe@kernel.dk>, Sasha Levin <sashal@kernel.org>, Michal Luczaj <mhal@rbox.co>, Rao Shoaib <Rao.Shoaib@oracle.com>, Pavel Begunkov <asml.silence@gmail.com>, linux-kernel@vger.kernel.org, netdev@vger.kernel.org
+Cc: stable@vger.kernel.org, kernel test robot <oliver.sang@intel.com>
+Message-ID: <20250521144803.2050504-23-lee@kernel.org>
+
+From: Kuniyuki Iwashima <kuniyu@amazon.com>
+
+commit fd86344823b521149bb31d91eba900ba3525efa6 upstream.
+
+Commit dcf70df2048d ("af_unix: Fix up unix_edge.successor for embryo
+socket.") added spin_lock(&unix_gc_lock) in accept() path, and it
+caused regression in a stress test as reported by kernel test robot.
+
+If the embryo socket is not part of the inflight graph, we need not
+hold the lock.
+
+To decide that in O(1) time and avoid the regression in the normal
+use case,
+
+  1. add a new stat unix_sk(sk)->scm_stat.nr_unix_fds
+
+  2. count the number of inflight AF_UNIX sockets in the receive
+     queue under unix_state_lock()
+
+  3. move unix_update_edges() call under unix_state_lock()
+
+  4. avoid locking if nr_unix_fds is 0 in unix_update_edges()
+
+Reported-by: kernel test robot <oliver.sang@intel.com>
+Closes: https://lore.kernel.org/oe-lkp/202404101427.92a08551-oliver.sang@intel.com
+Signed-off-by: Kuniyuki Iwashima <kuniyu@amazon.com>
+Link: https://lore.kernel.org/r/20240413021928.20946-1-kuniyu@amazon.com
+Signed-off-by: Paolo Abeni <pabeni@redhat.com>
+Signed-off-by: Lee Jones <lee@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ include/net/af_unix.h |    1 +
+ net/unix/af_unix.c    |    2 +-
+ net/unix/garbage.c    |   20 ++++++++++++++++----
+ 3 files changed, 18 insertions(+), 5 deletions(-)
+
+--- a/include/net/af_unix.h
++++ b/include/net/af_unix.h
+@@ -67,6 +67,7 @@ struct unix_skb_parms {
+ struct scm_stat {
+       atomic_t nr_fds;
++      unsigned long nr_unix_fds;
+ };
+ #define UNIXCB(skb)   (*(struct unix_skb_parms *)&((skb)->cb))
+--- a/net/unix/af_unix.c
++++ b/net/unix/af_unix.c
+@@ -1703,12 +1703,12 @@ static int unix_accept(struct socket *so
+       }
+       tsk = skb->sk;
+-      unix_update_edges(unix_sk(tsk));
+       skb_free_datagram(sk, skb);
+       wake_up_interruptible(&unix_sk(sk)->peer_wait);
+       /* attach accepted sock to socket */
+       unix_state_lock(tsk);
++      unix_update_edges(unix_sk(tsk));
+       newsock->state = SS_CONNECTED;
+       unix_sock_inherit_flags(sock, newsock);
+       sock_graft(tsk, newsock);
+--- a/net/unix/garbage.c
++++ b/net/unix/garbage.c
+@@ -209,6 +209,7 @@ void unix_add_edges(struct scm_fp_list *
+               unix_add_edge(fpl, edge);
+       } while (i < fpl->count_unix);
++      receiver->scm_stat.nr_unix_fds += fpl->count_unix;
+       WRITE_ONCE(unix_tot_inflight, unix_tot_inflight + fpl->count_unix);
+ out:
+       WRITE_ONCE(fpl->user->unix_inflight, fpl->user->unix_inflight + fpl->count);
+@@ -222,6 +223,7 @@ out:
+ void unix_del_edges(struct scm_fp_list *fpl)
+ {
++      struct unix_sock *receiver;
+       int i = 0;
+       spin_lock(&unix_gc_lock);
+@@ -235,6 +237,8 @@ void unix_del_edges(struct scm_fp_list *
+               unix_del_edge(fpl, edge);
+       } while (i < fpl->count_unix);
++      receiver = fpl->edges[0].successor;
++      receiver->scm_stat.nr_unix_fds -= fpl->count_unix;
+       WRITE_ONCE(unix_tot_inflight, unix_tot_inflight - fpl->count_unix);
+ out:
+       WRITE_ONCE(fpl->user->unix_inflight, fpl->user->unix_inflight - fpl->count);
+@@ -246,10 +250,18 @@ out:
+ void unix_update_edges(struct unix_sock *receiver)
+ {
+-      spin_lock(&unix_gc_lock);
+-      unix_update_graph(unix_sk(receiver->listener)->vertex);
+-      receiver->listener = NULL;
+-      spin_unlock(&unix_gc_lock);
++      /* nr_unix_fds is only updated under unix_state_lock().
++       * If it's 0 here, the embryo socket is not part of the
++       * inflight graph, and GC will not see it, so no lock needed.
++       */
++      if (!receiver->scm_stat.nr_unix_fds) {
++              receiver->listener = NULL;
++      } else {
++              spin_lock(&unix_gc_lock);
++              unix_update_graph(unix_sk(receiver->listener)->vertex);
++              receiver->listener = NULL;
++              spin_unlock(&unix_gc_lock);
++      }
+ }
+ int unix_prepare_fpl(struct scm_fp_list *fpl)
diff --git a/queue-6.6/af_unix-try-to-run-gc-async.patch b/queue-6.6/af_unix-try-to-run-gc-async.patch
new file mode 100644 (file)
index 0000000..ecdf887
--- /dev/null
@@ -0,0 +1,200 @@
+From stable+bounces-145850-greg=kroah.com@vger.kernel.org Wed May 21 16:50:41 2025
+From: Lee Jones <lee@kernel.org>
+Date: Wed, 21 May 2025 14:45:11 +0000
+Subject: af_unix: Try to run GC async.
+To: lee@kernel.org, "David S. Miller" <davem@davemloft.net>, Eric Dumazet <edumazet@google.com>, Jakub Kicinski <kuba@kernel.org>, Paolo Abeni <pabeni@redhat.com>, Kuniyuki Iwashima <kuniyu@amazon.com>, Jens Axboe <axboe@kernel.dk>, Sasha Levin <sashal@kernel.org>, Michal Luczaj <mhal@rbox.co>, Rao Shoaib <Rao.Shoaib@oracle.com>, Simon Horman <horms@kernel.org>, linux-kernel@vger.kernel.org, netdev@vger.kernel.org
+Cc: stable@vger.kernel.org
+Message-ID: <20250521144803.2050504-4-lee@kernel.org>
+
+From: Kuniyuki Iwashima <kuniyu@amazon.com>
+
+commit d9f21b3613337b55cc9d4a6ead484dca68475143 upstream.
+
+If more than 16000 inflight AF_UNIX sockets exist and the garbage
+collector is not running, unix_(dgram|stream)_sendmsg() call unix_gc().
+Also, they wait for unix_gc() to complete.
+
+In unix_gc(), all inflight AF_UNIX sockets are traversed at least once,
+and more if they are the GC candidate.  Thus, sendmsg() significantly
+slows down with too many inflight AF_UNIX sockets.
+
+However, if a process sends data with no AF_UNIX FD, the sendmsg() call
+does not need to wait for GC.  After this change, only the process that
+meets the condition below will be blocked under such a situation.
+
+  1) cmsg contains AF_UNIX socket
+  2) more than 32 AF_UNIX sent by the same user are still inflight
+
+Note that even a sendmsg() call that does not meet the condition but has
+AF_UNIX FD will be blocked later in unix_scm_to_skb() by the spinlock,
+but we allow that as a bonus for sane users.
+
+The results below are the time spent in unix_dgram_sendmsg() sending 1
+byte of data with no FD 4096 times on a host where 32K inflight AF_UNIX
+sockets exist.
+
+Without series: the sane sendmsg() needs to wait for GC unreasonably.
+
+  $ sudo /usr/share/bcc/tools/funclatency -p 11165 unix_dgram_sendmsg
+  Tracing 1 functions for "unix_dgram_sendmsg"... Hit Ctrl-C to end.
+  ^C
+       nsecs               : count     distribution
+  [...]
+      524288 -> 1048575    : 0        |                                        |
+     1048576 -> 2097151    : 3881     |****************************************|
+     2097152 -> 4194303    : 214      |**                                      |
+     4194304 -> 8388607    : 1        |                                        |
+
+  avg = 1825567 nsecs, total: 7477526027 nsecs, count: 4096
+
+With series: the sane sendmsg() can finish much faster.
+
+  $ sudo /usr/share/bcc/tools/funclatency -p 8702  unix_dgram_sendmsg
+  Tracing 1 functions for "unix_dgram_sendmsg"... Hit Ctrl-C to end.
+  ^C
+       nsecs               : count     distribution
+  [...]
+         128 -> 255        : 0        |                                        |
+         256 -> 511        : 4092     |****************************************|
+         512 -> 1023       : 2        |                                        |
+        1024 -> 2047       : 0        |                                        |
+        2048 -> 4095       : 0        |                                        |
+        4096 -> 8191       : 1        |                                        |
+        8192 -> 16383      : 1        |                                        |
+
+  avg = 410 nsecs, total: 1680510 nsecs, count: 4096
+
+Signed-off-by: Kuniyuki Iwashima <kuniyu@amazon.com>
+Link: https://lore.kernel.org/r/20240123170856.41348-6-kuniyu@amazon.com
+Signed-off-by: Jakub Kicinski <kuba@kernel.org>
+Signed-off-by: Lee Jones <lee@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ include/net/af_unix.h |   12 ++++++++++--
+ include/net/scm.h     |    1 +
+ net/core/scm.c        |    5 +++++
+ net/unix/af_unix.c    |    6 ++++--
+ net/unix/garbage.c    |   10 +++++++++-
+ 5 files changed, 29 insertions(+), 5 deletions(-)
+
+--- a/include/net/af_unix.h
++++ b/include/net/af_unix.h
+@@ -8,13 +8,21 @@
+ #include <linux/refcount.h>
+ #include <net/sock.h>
++#if IS_ENABLED(CONFIG_UNIX)
++struct unix_sock *unix_get_socket(struct file *filp);
++#else
++static inline struct unix_sock *unix_get_socket(struct file *filp)
++{
++      return NULL;
++}
++#endif
++
+ void unix_inflight(struct user_struct *user, struct file *fp);
+ void unix_notinflight(struct user_struct *user, struct file *fp);
+ void unix_destruct_scm(struct sk_buff *skb);
+ void io_uring_destruct_scm(struct sk_buff *skb);
+ void unix_gc(void);
+-void wait_for_unix_gc(void);
+-struct unix_sock *unix_get_socket(struct file *filp);
++void wait_for_unix_gc(struct scm_fp_list *fpl);
+ struct sock *unix_peer_get(struct sock *sk);
+ #define UNIX_HASH_MOD (256 - 1)
+--- a/include/net/scm.h
++++ b/include/net/scm.h
+@@ -24,6 +24,7 @@ struct scm_creds {
+ struct scm_fp_list {
+       short                   count;
++      short                   count_unix;
+       short                   max;
+       struct user_struct      *user;
+       struct file             *fp[SCM_MAX_FD];
+--- a/net/core/scm.c
++++ b/net/core/scm.c
+@@ -36,6 +36,7 @@
+ #include <net/compat.h>
+ #include <net/scm.h>
+ #include <net/cls_cgroup.h>
++#include <net/af_unix.h>
+ /*
+@@ -85,6 +86,7 @@ static int scm_fp_copy(struct cmsghdr *c
+                       return -ENOMEM;
+               *fplp = fpl;
+               fpl->count = 0;
++              fpl->count_unix = 0;
+               fpl->max = SCM_MAX_FD;
+               fpl->user = NULL;
+       }
+@@ -109,6 +111,9 @@ static int scm_fp_copy(struct cmsghdr *c
+                       fput(file);
+                       return -EINVAL;
+               }
++              if (unix_get_socket(file))
++                      fpl->count_unix++;
++
+               *fpp++ = file;
+               fpl->count++;
+       }
+--- a/net/unix/af_unix.c
++++ b/net/unix/af_unix.c
+@@ -1885,11 +1885,12 @@ static int unix_dgram_sendmsg(struct soc
+       long timeo;
+       int err;
+-      wait_for_unix_gc();
+       err = scm_send(sock, msg, &scm, false);
+       if (err < 0)
+               return err;
++      wait_for_unix_gc(scm.fp);
++
+       err = -EOPNOTSUPP;
+       if (msg->msg_flags&MSG_OOB)
+               goto out;
+@@ -2157,11 +2158,12 @@ static int unix_stream_sendmsg(struct so
+       bool fds_sent = false;
+       int data_len;
+-      wait_for_unix_gc();
+       err = scm_send(sock, msg, &scm, false);
+       if (err < 0)
+               return err;
++      wait_for_unix_gc(scm.fp);
++
+       err = -EOPNOTSUPP;
+       if (msg->msg_flags & MSG_OOB) {
+ #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
+--- a/net/unix/garbage.c
++++ b/net/unix/garbage.c
+@@ -335,8 +335,9 @@ void unix_gc(void)
+ }
+ #define UNIX_INFLIGHT_TRIGGER_GC 16000
++#define UNIX_INFLIGHT_SANE_USER (SCM_MAX_FD * 8)
+-void wait_for_unix_gc(void)
++void wait_for_unix_gc(struct scm_fp_list *fpl)
+ {
+       /* If number of inflight sockets is insane,
+        * force a garbage collect right now.
+@@ -348,6 +349,13 @@ void wait_for_unix_gc(void)
+           !READ_ONCE(gc_in_progress))
+               unix_gc();
++      /* Penalise users who want to send AF_UNIX sockets
++       * but whose sockets have not been received yet.
++       */
++      if (!fpl || !fpl->count_unix ||
++          READ_ONCE(fpl->user->unix_inflight) < UNIX_INFLIGHT_SANE_USER)
++              return;
++
+       if (READ_ONCE(gc_in_progress))
+               flush_work(&unix_gc_work);
+ }
index 19f974e8dd3292b9a215cca2447d6e284fec79dc..d42c7b4d037bae55a7697e5b1a4620f92515a3e8 100644 (file)
@@ -392,3 +392,29 @@ drm-gem-internally-test-import_attach-for-imported-objects.patch
 can-kvaser_pciefd-force-irq-edge-in-case-of-nested-irq.patch
 hrtimers-force-migrate-away-hrtimers-queued-after-cpuhp_ap_hrtimers_dying.patch
 btrfs-check-folio-mapping-after-unlock-in-relocate_one_folio.patch
+af_unix-return-struct-unix_sock-from-unix_get_socket.patch
+af_unix-run-gc-on-only-one-cpu.patch
+af_unix-try-to-run-gc-async.patch
+af_unix-replace-bug_on-with-warn_on_once.patch
+af_unix-remove-io_uring-code-for-gc.patch
+af_unix-remove-config_unix_scm.patch
+af_unix-allocate-struct-unix_vertex-for-each-inflight-af_unix-fd.patch
+af_unix-allocate-struct-unix_edge-for-each-inflight-af_unix-fd.patch
+af_unix-link-struct-unix_edge-when-queuing-skb.patch
+af_unix-bulk-update-unix_tot_inflight-unix_inflight-when-queuing-skb.patch
+af_unix-iterate-all-vertices-by-dfs.patch
+af_unix-detect-strongly-connected-components.patch
+af_unix-save-listener-for-embryo-socket.patch
+af_unix-fix-up-unix_edge.successor-for-embryo-socket.patch
+af_unix-save-o-n-setup-of-tarjan-s-algo.patch
+af_unix-skip-gc-if-no-cycle-exists.patch
+af_unix-avoid-tarjan-s-algorithm-if-unnecessary.patch
+af_unix-assign-a-unique-index-to-scc.patch
+af_unix-detect-dead-scc.patch
+af_unix-replace-garbage-collection-algorithm.patch
+af_unix-remove-lock-dance-in-unix_peek_fds.patch
+af_unix-try-not-to-hold-unix_gc_lock-during-accept.patch
+af_unix-don-t-access-successor-in-unix_del_edges-during-gc.patch
+af_unix-add-dead-flag-to-struct-scm_fp_list.patch
+af_unix-fix-garbage-collection-of-embryos-carrying-oob-with-scm_rights.patch
+af_unix-fix-uninit-value-in-__unix_walk_scc.patch