--- /dev/null
+From stable+bounces-145871-greg=kroah.com@vger.kernel.org Wed May 21 16:58:24 2025
+From: Lee Jones <lee@kernel.org>
+Date: Wed, 21 May 2025 14:45:32 +0000
+Subject: af_unix: Add dead flag to struct scm_fp_list.
+To: lee@kernel.org, "David S. Miller" <davem@davemloft.net>, Eric Dumazet <edumazet@google.com>, Jakub Kicinski <kuba@kernel.org>, Paolo Abeni <pabeni@redhat.com>, Kuniyuki Iwashima <kuniyu@amazon.com>, Jens Axboe <axboe@kernel.dk>, Sasha Levin <sashal@kernel.org>, Michal Luczaj <mhal@rbox.co>, Rao Shoaib <Rao.Shoaib@oracle.com>, Pavel Begunkov <asml.silence@gmail.com>, linux-kernel@vger.kernel.org, netdev@vger.kernel.org
+Cc: stable@vger.kernel.org
+Message-ID: <20250521144803.2050504-25-lee@kernel.org>
+
+From: Kuniyuki Iwashima <kuniyu@amazon.com>
+
+commit 7172dc93d621d5dc302d007e95ddd1311ec64283 upstream.
+
+Commit 1af2dface5d2 ("af_unix: Don't access successor in unix_del_edges()
+during GC.") fixed a use-after-free by avoiding access to edge->successor
+while GC is in progress.
+
+However, there is a small race window where another process could call
+unix_del_edges() while gc_in_progress is true and __skb_queue_purge()
+is underway.
+
+So, we need another marker in struct scm_fp_list that indicates whether
+the skb is being garbage-collected.
+
+This patch adds a dead flag to struct scm_fp_list and sets it to true
+before calling __skb_queue_purge().
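+
+The effect on unix_del_edge() is a one-line switch from the global flag
+to the per-list marker (a sketch condensed from the hunks below):
+
+	if (!fpl->dead)			/* was: if (!gc_in_progress) */
+		unix_update_graph(unix_edge_successor(edge));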
+
+Fixes: 1af2dface5d2 ("af_unix: Don't access successor in unix_del_edges() during GC.")
+Signed-off-by: Kuniyuki Iwashima <kuniyu@amazon.com>
+Acked-by: Paolo Abeni <pabeni@redhat.com>
+Link: https://lore.kernel.org/r/20240508171150.50601-1-kuniyu@amazon.com
+Signed-off-by: Jakub Kicinski <kuba@kernel.org>
+Signed-off-by: Lee Jones <lee@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ include/net/scm.h | 1 +
+ net/core/scm.c | 1 +
+ net/unix/garbage.c | 14 ++++++++++----
+ 3 files changed, 12 insertions(+), 4 deletions(-)
+
+--- a/include/net/scm.h
++++ b/include/net/scm.h
+@@ -32,6 +32,7 @@ struct scm_fp_list {
+ short max;
+ #ifdef CONFIG_UNIX
+ bool inflight;
++ bool dead;
+ struct list_head vertices;
+ struct unix_edge *edges;
+ #endif
+--- a/net/core/scm.c
++++ b/net/core/scm.c
+@@ -91,6 +91,7 @@ static int scm_fp_copy(struct cmsghdr *c
+ fpl->user = NULL;
+ #if IS_ENABLED(CONFIG_UNIX)
+ fpl->inflight = false;
++ fpl->dead = false;
+ fpl->edges = NULL;
+ INIT_LIST_HEAD(&fpl->vertices);
+ #endif
+--- a/net/unix/garbage.c
++++ b/net/unix/garbage.c
+@@ -158,13 +158,11 @@ static void unix_add_edge(struct scm_fp_
+ unix_update_graph(unix_edge_successor(edge));
+ }
+
+-static bool gc_in_progress;
+-
+ static void unix_del_edge(struct scm_fp_list *fpl, struct unix_edge *edge)
+ {
+ struct unix_vertex *vertex = edge->predecessor->vertex;
+
+- if (!gc_in_progress)
++ if (!fpl->dead)
+ unix_update_graph(unix_edge_successor(edge));
+
+ list_del(&edge->vertex_entry);
+@@ -240,7 +238,7 @@ void unix_del_edges(struct scm_fp_list *
+ unix_del_edge(fpl, edge);
+ } while (i < fpl->count_unix);
+
+- if (!gc_in_progress) {
++ if (!fpl->dead) {
+ receiver = fpl->edges[0].successor;
+ receiver->scm_stat.nr_unix_fds -= fpl->count_unix;
+ }
+@@ -559,9 +557,12 @@ static void unix_walk_scc_fast(struct sk
+ list_replace_init(&unix_visited_vertices, &unix_unvisited_vertices);
+ }
+
++static bool gc_in_progress;
++
+ static void __unix_gc(struct work_struct *work)
+ {
+ struct sk_buff_head hitlist;
++ struct sk_buff *skb;
+
+ spin_lock(&unix_gc_lock);
+
+@@ -579,6 +580,11 @@ static void __unix_gc(struct work_struct
+
+ spin_unlock(&unix_gc_lock);
+
++ skb_queue_walk(&hitlist, skb) {
++ if (UNIXCB(skb).fp)
++ UNIXCB(skb).fp->dead = true;
++ }
++
+ __skb_queue_purge(&hitlist);
+ skip_gc:
+ WRITE_ONCE(gc_in_progress, false);
--- /dev/null
+From stable+bounces-145855-greg=kroah.com@vger.kernel.org Wed May 21 16:53:51 2025
+From: Lee Jones <lee@kernel.org>
+Date: Wed, 21 May 2025 14:45:16 +0000
+Subject: af_unix: Allocate struct unix_edge for each inflight AF_UNIX fd.
+To: lee@kernel.org, "David S. Miller" <davem@davemloft.net>, Eric Dumazet <edumazet@google.com>, Jakub Kicinski <kuba@kernel.org>, Paolo Abeni <pabeni@redhat.com>, Kuniyuki Iwashima <kuniyu@amazon.com>, Jens Axboe <axboe@kernel.dk>, Sasha Levin <sashal@kernel.org>, Michal Luczaj <mhal@rbox.co>, Rao Shoaib <Rao.Shoaib@oracle.com>, Pavel Begunkov <asml.silence@gmail.com>, linux-kernel@vger.kernel.org, netdev@vger.kernel.org
+Cc: stable@vger.kernel.org
+Message-ID: <20250521144803.2050504-9-lee@kernel.org>
+
+From: Kuniyuki Iwashima <kuniyu@amazon.com>
+
+commit 29b64e354029cfcf1eea4d91b146c7b769305930 upstream.
+
+As with the previous patch, we preallocate in skb's scm_fp_list an
+array of struct unix_edge, one entry per inflight AF_UNIX fd.
+
+Here we only preallocate the memory and do not use it immediately
+because sendmsg() could still fail after this point. The actual use
+will come in the next patch.
+
+When we queue an skb with inflight edges, we will set the inflight
+socket's unix_sock as unix_edge->predecessor and the receiver's
+unix_sock as successor, and then we will link the edge to the inflight
+socket's unix_vertex.edges, as sketched below.
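+
+A sketch of that wiring (an assumption condensed from a later patch in
+the series; inflight_unix_sk() is a hypothetical stand-in for looking
+up the i-th inflight unix_sock, not a real kernel helper):
+
+	for (i = 0; i < fpl->count_unix; i++) {
+		struct unix_edge *edge = fpl->edges + i;
+
+		edge->predecessor = inflight_unix_sk(fpl, i);	/* hypothetical */
+		edge->successor = receiver;
+		list_add_tail(&edge->vertex_entry,
+			      &edge->predecessor->vertex->edges);
+	}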
+
+Note that we set the cloned scm_fp_list.edges to NULL in scm_fp_dup()
+so that MSG_PEEK does not change the shape of the directed graph.
+
+Signed-off-by: Kuniyuki Iwashima <kuniyu@amazon.com>
+Acked-by: Paolo Abeni <pabeni@redhat.com>
+Link: https://lore.kernel.org/r/20240325202425.60930-3-kuniyu@amazon.com
+Signed-off-by: Jakub Kicinski <kuba@kernel.org>
+Signed-off-by: Lee Jones <lee@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ include/net/af_unix.h | 6 ++++++
+ include/net/scm.h | 5 +++++
+ net/core/scm.c | 2 ++
+ net/unix/garbage.c | 6 ++++++
+ 4 files changed, 19 insertions(+)
+
+--- a/include/net/af_unix.h
++++ b/include/net/af_unix.h
+@@ -33,6 +33,12 @@ struct unix_vertex {
+ unsigned long out_degree;
+ };
+
++struct unix_edge {
++ struct unix_sock *predecessor;
++ struct unix_sock *successor;
++ struct list_head vertex_entry;
++};
++
+ struct sock *unix_peer_get(struct sock *sk);
+
+ #define UNIX_HASH_MOD (256 - 1)
+--- a/include/net/scm.h
++++ b/include/net/scm.h
+@@ -22,12 +22,17 @@ struct scm_creds {
+ kgid_t gid;
+ };
+
++#ifdef CONFIG_UNIX
++struct unix_edge;
++#endif
++
+ struct scm_fp_list {
+ short count;
+ short count_unix;
+ short max;
+ #ifdef CONFIG_UNIX
+ struct list_head vertices;
++ struct unix_edge *edges;
+ #endif
+ struct user_struct *user;
+ struct file *fp[SCM_MAX_FD];
+--- a/net/core/scm.c
++++ b/net/core/scm.c
+@@ -90,6 +90,7 @@ static int scm_fp_copy(struct cmsghdr *c
+ fpl->max = SCM_MAX_FD;
+ fpl->user = NULL;
+ #if IS_ENABLED(CONFIG_UNIX)
++ fpl->edges = NULL;
+ INIT_LIST_HEAD(&fpl->vertices);
+ #endif
+ }
+@@ -383,6 +384,7 @@ struct scm_fp_list *scm_fp_dup(struct sc
+ new_fpl->max = new_fpl->count;
+ new_fpl->user = get_uid(fpl->user);
+ #if IS_ENABLED(CONFIG_UNIX)
++ new_fpl->edges = NULL;
+ INIT_LIST_HEAD(&new_fpl->vertices);
+ #endif
+ }
+--- a/net/unix/garbage.c
++++ b/net/unix/garbage.c
+@@ -127,6 +127,11 @@ int unix_prepare_fpl(struct scm_fp_list
+ list_add(&vertex->entry, &fpl->vertices);
+ }
+
++ fpl->edges = kvmalloc_array(fpl->count_unix, sizeof(*fpl->edges),
++ GFP_KERNEL_ACCOUNT);
++ if (!fpl->edges)
++ goto err;
++
+ return 0;
+
+ err:
+@@ -136,6 +141,7 @@ err:
+
+ void unix_destroy_fpl(struct scm_fp_list *fpl)
+ {
++ kvfree(fpl->edges);
+ unix_free_vertices(fpl);
+ }
+
--- /dev/null
+From stable+bounces-145854-greg=kroah.com@vger.kernel.org Wed May 21 16:53:21 2025
+From: Lee Jones <lee@kernel.org>
+Date: Wed, 21 May 2025 14:45:15 +0000
+Subject: af_unix: Allocate struct unix_vertex for each inflight AF_UNIX fd.
+To: lee@kernel.org, "David S. Miller" <davem@davemloft.net>, Eric Dumazet <edumazet@google.com>, Jakub Kicinski <kuba@kernel.org>, Paolo Abeni <pabeni@redhat.com>, Kuniyuki Iwashima <kuniyu@amazon.com>, Jens Axboe <axboe@kernel.dk>, Sasha Levin <sashal@kernel.org>, Michal Luczaj <mhal@rbox.co>, Rao Shoaib <Rao.Shoaib@oracle.com>, Pavel Begunkov <asml.silence@gmail.com>, linux-kernel@vger.kernel.org, netdev@vger.kernel.org
+Cc: stable@vger.kernel.org
+Message-ID: <20250521144803.2050504-8-lee@kernel.org>
+
+From: Kuniyuki Iwashima <kuniyu@amazon.com>
+
+commit 1fbfdfaa590248c1d86407f578e40e5c65136330 upstream.
+
+We will replace the garbage collection algorithm for AF_UNIX, where
+we will consider each inflight AF_UNIX socket as a vertex and its file
+descriptor as an edge in a directed graph.
+
+This patch introduces a new struct unix_vertex representing a vertex
+in the graph and adds its pointer to struct unix_sock.
+
+When we send a fd using the SCM_RIGHTS message, we allocate struct
+scm_fp_list to struct scm_cookie in scm_fp_copy(). Then, we bump
+each refcount of the inflight fds' struct file and save them in
+scm_fp_list.fp.
+
+After that, unix_attach_fds() inexplicably clones the scm_fp_list of
+scm_cookie and attaches the clone to the skb. (We will remove this
+part after replacing GC.)
+
+Here, we add a new function call in unix_attach_fds() to preallocate
+struct unix_vertex per inflight AF_UNIX fd and link each vertex to
+skb's scm_fp_list.vertices.
+
+When sendmsg() succeeds later, if the socket of the inflight fd is
+not yet inflight, we will assign the preallocated vertex to struct
+unix_sock.vertex and link it to the global list unix_unvisited_vertices
+under spin_lock(&unix_gc_lock).
+
+If the socket is already inflight, we free the preallocated vertex.
+This is to avoid taking the lock unnecessarily when sendmsg() could
+fail later.
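+
+A sketch of that attach-or-free step (an assumption condensed from the
+later unix_add_edges() patch; the lock and field names are the real
+ones):
+
+	spin_lock(&unix_gc_lock);
+
+	if (!u->vertex) {			/* not inflight yet */
+		struct unix_vertex *vertex;
+
+		vertex = list_first_entry(&fpl->vertices,
+					  typeof(*vertex), entry);
+		u->vertex = vertex;
+		list_move_tail(&vertex->entry, &unix_unvisited_vertices);
+	}					/* else: the leftover vertex
+						 * is freed later by
+						 * unix_destroy_fpl() */
+
+	spin_unlock(&unix_gc_lock);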
+
+In the following patch, we will similarly allocate another struct
+per edge, which will finally be linked to the inflight socket's
+unix_vertex.edges.
+
+And then, we will count the number of edges as unix_vertex.out_degree.
+
+Signed-off-by: Kuniyuki Iwashima <kuniyu@amazon.com>
+Acked-by: Paolo Abeni <pabeni@redhat.com>
+Link: https://lore.kernel.org/r/20240325202425.60930-2-kuniyu@amazon.com
+Signed-off-by: Jakub Kicinski <kuba@kernel.org>
+Signed-off-by: Lee Jones <lee@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ include/net/af_unix.h | 9 +++++++++
+ include/net/scm.h | 3 +++
+ net/core/scm.c | 7 +++++++
+ net/unix/af_unix.c | 6 ++++++
+ net/unix/garbage.c | 38 ++++++++++++++++++++++++++++++++++++++
+ 5 files changed, 63 insertions(+)
+
+--- a/include/net/af_unix.h
++++ b/include/net/af_unix.h
+@@ -22,9 +22,17 @@ extern unsigned int unix_tot_inflight;
+
+ void unix_inflight(struct user_struct *user, struct file *fp);
+ void unix_notinflight(struct user_struct *user, struct file *fp);
++int unix_prepare_fpl(struct scm_fp_list *fpl);
++void unix_destroy_fpl(struct scm_fp_list *fpl);
+ void unix_gc(void);
+ void wait_for_unix_gc(struct scm_fp_list *fpl);
+
++struct unix_vertex {
++ struct list_head edges;
++ struct list_head entry;
++ unsigned long out_degree;
++};
++
+ struct sock *unix_peer_get(struct sock *sk);
+
+ #define UNIX_HASH_MOD (256 - 1)
+@@ -62,6 +70,7 @@ struct unix_sock {
+ struct path path;
+ struct mutex iolock, bindlock;
+ struct sock *peer;
++ struct unix_vertex *vertex;
+ struct list_head link;
+ unsigned long inflight;
+ spinlock_t lock;
+--- a/include/net/scm.h
++++ b/include/net/scm.h
+@@ -26,6 +26,9 @@ struct scm_fp_list {
+ short count;
+ short count_unix;
+ short max;
++#ifdef CONFIG_UNIX
++ struct list_head vertices;
++#endif
+ struct user_struct *user;
+ struct file *fp[SCM_MAX_FD];
+ };
+--- a/net/core/scm.c
++++ b/net/core/scm.c
+@@ -89,6 +89,9 @@ static int scm_fp_copy(struct cmsghdr *c
+ fpl->count_unix = 0;
+ fpl->max = SCM_MAX_FD;
+ fpl->user = NULL;
++#if IS_ENABLED(CONFIG_UNIX)
++ INIT_LIST_HEAD(&fpl->vertices);
++#endif
+ }
+ fpp = &fpl->fp[fpl->count];
+
+@@ -376,8 +379,12 @@ struct scm_fp_list *scm_fp_dup(struct sc
+ if (new_fpl) {
+ for (i = 0; i < fpl->count; i++)
+ get_file(fpl->fp[i]);
++
+ new_fpl->max = new_fpl->count;
+ new_fpl->user = get_uid(fpl->user);
++#if IS_ENABLED(CONFIG_UNIX)
++ INIT_LIST_HEAD(&new_fpl->vertices);
++#endif
+ }
+ return new_fpl;
+ }
+--- a/net/unix/af_unix.c
++++ b/net/unix/af_unix.c
+@@ -979,6 +979,7 @@ static struct sock *unix_create1(struct
+ sk->sk_destruct = unix_sock_destructor;
+ u = unix_sk(sk);
+ u->inflight = 0;
++ u->vertex = NULL;
+ u->path.dentry = NULL;
+ u->path.mnt = NULL;
+ spin_lock_init(&u->lock);
+@@ -1782,6 +1783,9 @@ static int unix_attach_fds(struct scm_co
+ for (i = scm->fp->count - 1; i >= 0; i--)
+ unix_inflight(scm->fp->user, scm->fp->fp[i]);
+
++ if (unix_prepare_fpl(UNIXCB(skb).fp))
++ return -ENOMEM;
++
+ return 0;
+ }
+
+@@ -1792,6 +1796,8 @@ static void unix_detach_fds(struct scm_c
+ scm->fp = UNIXCB(skb).fp;
+ UNIXCB(skb).fp = NULL;
+
++ unix_destroy_fpl(scm->fp);
++
+ for (i = scm->fp->count - 1; i >= 0; i--)
+ unix_notinflight(scm->fp->user, scm->fp->fp[i]);
+ }
+--- a/net/unix/garbage.c
++++ b/net/unix/garbage.c
+@@ -101,6 +101,44 @@ struct unix_sock *unix_get_socket(struct
+ return NULL;
+ }
+
++static void unix_free_vertices(struct scm_fp_list *fpl)
++{
++ struct unix_vertex *vertex, *next_vertex;
++
++ list_for_each_entry_safe(vertex, next_vertex, &fpl->vertices, entry) {
++ list_del(&vertex->entry);
++ kfree(vertex);
++ }
++}
++
++int unix_prepare_fpl(struct scm_fp_list *fpl)
++{
++ struct unix_vertex *vertex;
++ int i;
++
++ if (!fpl->count_unix)
++ return 0;
++
++ for (i = 0; i < fpl->count_unix; i++) {
++ vertex = kmalloc(sizeof(*vertex), GFP_KERNEL);
++ if (!vertex)
++ goto err;
++
++ list_add(&vertex->entry, &fpl->vertices);
++ }
++
++ return 0;
++
++err:
++ unix_free_vertices(fpl);
++ return -ENOMEM;
++}
++
++void unix_destroy_fpl(struct scm_fp_list *fpl)
++{
++ unix_free_vertices(fpl);
++}
++
+ DEFINE_SPINLOCK(unix_gc_lock);
+ unsigned int unix_tot_inflight;
+ static LIST_HEAD(gc_candidates);
--- /dev/null
+From stable+bounces-145865-greg=kroah.com@vger.kernel.org Wed May 21 16:55:41 2025
+From: Lee Jones <lee@kernel.org>
+Date: Wed, 21 May 2025 14:45:26 +0000
+Subject: af_unix: Assign a unique index to SCC.
+To: lee@kernel.org, "David S. Miller" <davem@davemloft.net>, Eric Dumazet <edumazet@google.com>, Jakub Kicinski <kuba@kernel.org>, Paolo Abeni <pabeni@redhat.com>, Kuniyuki Iwashima <kuniyu@amazon.com>, Jens Axboe <axboe@kernel.dk>, Sasha Levin <sashal@kernel.org>, Michal Luczaj <mhal@rbox.co>, Rao Shoaib <Rao.Shoaib@oracle.com>, Simon Horman <horms@kernel.org>, linux-kernel@vger.kernel.org, netdev@vger.kernel.org
+Cc: stable@vger.kernel.org
+Message-ID: <20250521144803.2050504-19-lee@kernel.org>
+
+From: Kuniyuki Iwashima <kuniyu@amazon.com>
+
+commit bfdb01283ee8f2f3089656c3ff8f62bb072dabb2 upstream.
+
+In Tarjan's algorithm, the lowlink of a vertex is the smallest index
+of any vertex reachable from it with at most one back-edge within the
+SCC. This is not useful for a cross-edge.
+
+If we start traversing from A in the following graph, the final
+lowlink of D is 3. The cross-edge here is the one between D and C.
+
+ A -> B -> D D = (4, 3) (index, lowlink)
+ ^ | | C = (3, 1)
+ | V | B = (2, 1)
+ `--- C <--' A = (1, 1)
+
+This is because the lowlink of D is updated with the index of C.
+
+In the following patch, we detect a dead SCC by checking two
+conditions for each vertex.
+
+ 1) vertex has no edge directed to another SCC (no bridge)
+ 2) vertex's out_degree is the same as the refcount of its file
+
+If 1) is false, there is a receiver of all fds of the SCC and
+its ancestor SCCs.
+
+To evaluate 1), we need to assign a unique index to each SCC and
+assign it to all vertices in the SCC.
+
+This patch changes the lowlink update logic for cross-edge so
+that in the example above, the lowlink of D is updated with the
+lowlink of C.
+
+ A -> B -> D D = (4, 1) (index, lowlink)
+ ^ | | C = (3, 1)
+ | V | B = (2, 1)
+ `--- C <--' A = (1, 1)
+
+Then, all vertices in the same SCC have the same lowlink, and we can
+quickly find the bridge connecting to a different SCC, if one exists.
+
+However, it is no longer called lowlink, so we rename it to
+scc_index. (It's sometimes called lowpoint.)
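+
+The behavioural change boils down to which field the back/cross-edge
+case reads (a sketch matching the hunk below):
+
+	/* before: original Tarjan lowlink update */
+	vertex->lowlink = min(vertex->lowlink, next_vertex->index);
+
+	/* after: propagate the successor's scc_index instead */
+	vertex->scc_index = min(vertex->scc_index, next_vertex->scc_index);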
+
+Also, we add a global variable to hold the last index used in DFS
+so that we do not reset the initial index in each DFS.
+
+This patch could be squashed into the SCC detection patch but is split
+out deliberately for anyone wondering why lowlink is not used as in
+the original Tarjan's algorithm and many reference implementations.
+
+Signed-off-by: Kuniyuki Iwashima <kuniyu@amazon.com>
+Acked-by: Paolo Abeni <pabeni@redhat.com>
+Link: https://lore.kernel.org/r/20240325202425.60930-13-kuniyu@amazon.com
+Signed-off-by: Jakub Kicinski <kuba@kernel.org>
+Signed-off-by: Lee Jones <lee@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ include/net/af_unix.h | 2 +-
+ net/unix/garbage.c | 29 +++++++++++++++--------------
+ 2 files changed, 16 insertions(+), 15 deletions(-)
+
+--- a/include/net/af_unix.h
++++ b/include/net/af_unix.h
+@@ -36,7 +36,7 @@ struct unix_vertex {
+ struct list_head scc_entry;
+ unsigned long out_degree;
+ unsigned long index;
+- unsigned long lowlink;
++ unsigned long scc_index;
+ };
+
+ struct unix_edge {
+--- a/net/unix/garbage.c
++++ b/net/unix/garbage.c
+@@ -312,9 +312,8 @@ static bool unix_scc_cyclic(struct list_
+ static LIST_HEAD(unix_visited_vertices);
+ static unsigned long unix_vertex_grouped_index = UNIX_VERTEX_INDEX_MARK2;
+
+-static void __unix_walk_scc(struct unix_vertex *vertex)
++static void __unix_walk_scc(struct unix_vertex *vertex, unsigned long *last_index)
+ {
+- unsigned long index = UNIX_VERTEX_INDEX_START;
+ LIST_HEAD(vertex_stack);
+ struct unix_edge *edge;
+ LIST_HEAD(edge_stack);
+@@ -326,9 +325,9 @@ next_vertex:
+ */
+ list_add(&vertex->scc_entry, &vertex_stack);
+
+- vertex->index = index;
+- vertex->lowlink = index;
+- index++;
++ vertex->index = *last_index;
++ vertex->scc_index = *last_index;
++ (*last_index)++;
+
+ /* Explore neighbour vertices (receivers of the current vertex's fd). */
+ list_for_each_entry(edge, &vertex->edges, vertex_entry) {
+@@ -358,30 +357,30 @@ prev_vertex:
+ next_vertex = vertex;
+ vertex = edge->predecessor->vertex;
+
+- /* If the successor has a smaller lowlink, two vertices
+- * are in the same SCC, so propagate the smaller lowlink
++ /* If the successor has a smaller scc_index, two vertices
++ * are in the same SCC, so propagate the smaller scc_index
+ * to skip SCC finalisation.
+ */
+- vertex->lowlink = min(vertex->lowlink, next_vertex->lowlink);
++ vertex->scc_index = min(vertex->scc_index, next_vertex->scc_index);
+ } else if (next_vertex->index != unix_vertex_grouped_index) {
+ /* Loop detected by a back/cross edge.
+ *
+- * The successor is on vertex_stack, so two vertices are
+- * in the same SCC. If the successor has a smaller index,
++ * The successor is on vertex_stack, so two vertices are in
++ * the same SCC. If the successor has a smaller *scc_index*,
+ * propagate it to skip SCC finalisation.
+ */
+- vertex->lowlink = min(vertex->lowlink, next_vertex->index);
++ vertex->scc_index = min(vertex->scc_index, next_vertex->scc_index);
+ } else {
+ /* The successor was already grouped as another SCC */
+ }
+ }
+
+- if (vertex->index == vertex->lowlink) {
++ if (vertex->index == vertex->scc_index) {
+ struct list_head scc;
+
+ /* SCC finalised.
+ *
+- * If the lowlink was not updated, all the vertices above on
++ * If the scc_index was not updated, all the vertices above on
+ * vertex_stack are in the same SCC. Group them using scc_entry.
+ */
+ __list_cut_position(&scc, &vertex_stack, &vertex->scc_entry);
+@@ -407,6 +406,8 @@ prev_vertex:
+
+ static void unix_walk_scc(void)
+ {
++ unsigned long last_index = UNIX_VERTEX_INDEX_START;
++
+ unix_graph_maybe_cyclic = false;
+
+ /* Visit every vertex exactly once.
+@@ -416,7 +417,7 @@ static void unix_walk_scc(void)
+ struct unix_vertex *vertex;
+
+ vertex = list_first_entry(&unix_unvisited_vertices, typeof(*vertex), entry);
+- __unix_walk_scc(vertex);
++ __unix_walk_scc(vertex, &last_index);
+ }
+
+ list_replace_init(&unix_visited_vertices, &unix_unvisited_vertices);
--- /dev/null
+From stable+bounces-145864-greg=kroah.com@vger.kernel.org Wed May 21 16:58:08 2025
+From: Lee Jones <lee@kernel.org>
+Date: Wed, 21 May 2025 14:45:25 +0000
+Subject: af_unix: Avoid Tarjan's algorithm if unnecessary.
+To: lee@kernel.org, "David S. Miller" <davem@davemloft.net>, Eric Dumazet <edumazet@google.com>, Jakub Kicinski <kuba@kernel.org>, Paolo Abeni <pabeni@redhat.com>, Kuniyuki Iwashima <kuniyu@amazon.com>, Jens Axboe <axboe@kernel.dk>, Sasha Levin <sashal@kernel.org>, Michal Luczaj <mhal@rbox.co>, Rao Shoaib <Rao.Shoaib@oracle.com>, Simon Horman <horms@kernel.org>, linux-kernel@vger.kernel.org, netdev@vger.kernel.org
+Cc: stable@vger.kernel.org
+Message-ID: <20250521144803.2050504-18-lee@kernel.org>
+
+From: Kuniyuki Iwashima <kuniyu@amazon.com>
+
+commit ad081928a8b0f57f269df999a28087fce6f2b6ce upstream.
+
+Once a cyclic reference is formed, we need to run GC to check if
+there is a dead SCC.
+
+However, we do not need to run Tarjan's algorithm if we know that
+the shape of the inflight graph has not been changed.
+
+If an edge is added/updated/deleted and the edge's successor is
+inflight, we set unix_graph_grouped to false, which means we need
+to re-classify the SCCs.
+
+Once we finalise the SCCs, we set unix_graph_grouped to true.
+
+While unix_graph_grouped is true, we can iterate the grouped SCCs
+using vertex->scc_entry in unix_walk_scc_fast().
+
+The uses of list_add() and list_for_each_entry_reverse() may seem
+odd, but they keep the vertex order consistent and make writing
+tests easier.
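+
+The resulting flag protocol is small (a sketch condensed from the
+hunks below):
+
+	unix_update_graph():			/* edge add/update/delete */
+		unix_graph_maybe_cyclic = true;
+		unix_graph_grouped = false;	/* shape changed: re-group */
+
+	unix_walk_scc():			/* full Tarjan pass */
+		unix_graph_grouped = true;	/* grouping now valid */
+
+	__unix_gc():
+		if (unix_graph_grouped)
+			unix_walk_scc_fast();	/* reuse scc_entry lists */
+		else
+			unix_walk_scc();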
+
+Signed-off-by: Kuniyuki Iwashima <kuniyu@amazon.com>
+Acked-by: Paolo Abeni <pabeni@redhat.com>
+Link: https://lore.kernel.org/r/20240325202425.60930-12-kuniyu@amazon.com
+Signed-off-by: Jakub Kicinski <kuba@kernel.org>
+Signed-off-by: Lee Jones <lee@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ net/unix/garbage.c | 28 +++++++++++++++++++++++++++-
+ 1 file changed, 27 insertions(+), 1 deletion(-)
+
+--- a/net/unix/garbage.c
++++ b/net/unix/garbage.c
+@@ -113,6 +113,7 @@ static struct unix_vertex *unix_edge_suc
+ }
+
+ static bool unix_graph_maybe_cyclic;
++static bool unix_graph_grouped;
+
+ static void unix_update_graph(struct unix_vertex *vertex)
+ {
+@@ -123,6 +124,7 @@ static void unix_update_graph(struct uni
+ return;
+
+ unix_graph_maybe_cyclic = true;
++ unix_graph_grouped = false;
+ }
+
+ static LIST_HEAD(unix_unvisited_vertices);
+@@ -144,6 +146,7 @@ static void unix_add_edge(struct scm_fp_
+ vertex->index = unix_vertex_unvisited_index;
+ vertex->out_degree = 0;
+ INIT_LIST_HEAD(&vertex->edges);
++ INIT_LIST_HEAD(&vertex->scc_entry);
+
+ list_move_tail(&vertex->entry, &unix_unvisited_vertices);
+ edge->predecessor->vertex = vertex;
+@@ -418,6 +421,26 @@ static void unix_walk_scc(void)
+
+ list_replace_init(&unix_visited_vertices, &unix_unvisited_vertices);
+ swap(unix_vertex_unvisited_index, unix_vertex_grouped_index);
++
++ unix_graph_grouped = true;
++}
++
++static void unix_walk_scc_fast(void)
++{
++ while (!list_empty(&unix_unvisited_vertices)) {
++ struct unix_vertex *vertex;
++ struct list_head scc;
++
++ vertex = list_first_entry(&unix_unvisited_vertices, typeof(*vertex), entry);
++ list_add(&scc, &vertex->scc_entry);
++
++ list_for_each_entry_reverse(vertex, &scc, scc_entry)
++ list_move_tail(&vertex->entry, &unix_visited_vertices);
++
++ list_del(&scc);
++ }
++
++ list_replace_init(&unix_visited_vertices, &unix_unvisited_vertices);
+ }
+
+ static LIST_HEAD(gc_candidates);
+@@ -570,7 +593,10 @@ static void __unix_gc(struct work_struct
+ if (!unix_graph_maybe_cyclic)
+ goto skip_gc;
+
+- unix_walk_scc();
++ if (unix_graph_grouped)
++ unix_walk_scc_fast();
++ else
++ unix_walk_scc();
+
+ /* First, select candidates for garbage collection. Only
+ * in-flight sockets are considered, and from those only ones
--- /dev/null
+From stable+bounces-145857-greg=kroah.com@vger.kernel.org Wed May 21 16:54:19 2025
+From: Lee Jones <lee@kernel.org>
+Date: Wed, 21 May 2025 14:45:18 +0000
+Subject: af_unix: Bulk update unix_tot_inflight/unix_inflight when queuing skb.
+To: lee@kernel.org, "David S. Miller" <davem@davemloft.net>, Eric Dumazet <edumazet@google.com>, Jakub Kicinski <kuba@kernel.org>, Paolo Abeni <pabeni@redhat.com>, Kuniyuki Iwashima <kuniyu@amazon.com>, Jens Axboe <axboe@kernel.dk>, Sasha Levin <sashal@kernel.org>, Michal Luczaj <mhal@rbox.co>, Rao Shoaib <Rao.Shoaib@oracle.com>, Pavel Begunkov <asml.silence@gmail.com>, linux-kernel@vger.kernel.org, netdev@vger.kernel.org
+Cc: stable@vger.kernel.org
+Message-ID: <20250521144803.2050504-11-lee@kernel.org>
+
+From: Kuniyuki Iwashima <kuniyu@amazon.com>
+
+commit 22c3c0c52d32f41cc38cd936ea0c93f22ced3315 upstream.
+
+Currently, we track the number of inflight sockets in two variables.
+unix_tot_inflight is the total number of inflight AF_UNIX sockets on
+the host, and user->unix_inflight is the number of inflight fds per
+user.
+
+We update them one by one in unix_inflight(), but this can be done in
+a single batch. Also, sendmsg() could fail even after unix_inflight(),
+in which case we need to acquire unix_gc_lock only to decrement the
+counters.
+
+Let's bulk update the counters in unix_add_edges() and unix_del_edges(),
+which are called only for successfully passed fds.
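+
+Batched, the add side reduces to two stores under unix_gc_lock (as in
+the hunks below):
+
+	WRITE_ONCE(unix_tot_inflight, unix_tot_inflight + fpl->count_unix);
+	WRITE_ONCE(fpl->user->unix_inflight, fpl->user->unix_inflight + fpl->count);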
+
+Signed-off-by: Kuniyuki Iwashima <kuniyu@amazon.com>
+Acked-by: Paolo Abeni <pabeni@redhat.com>
+Link: https://lore.kernel.org/r/20240325202425.60930-5-kuniyu@amazon.com
+Signed-off-by: Jakub Kicinski <kuba@kernel.org>
+Signed-off-by: Lee Jones <lee@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ net/unix/garbage.c | 18 +++++++-----------
+ 1 file changed, 7 insertions(+), 11 deletions(-)
+
+--- a/net/unix/garbage.c
++++ b/net/unix/garbage.c
+@@ -144,6 +144,7 @@ static void unix_free_vertices(struct sc
+ }
+
+ DEFINE_SPINLOCK(unix_gc_lock);
++unsigned int unix_tot_inflight;
+
+ void unix_add_edges(struct scm_fp_list *fpl, struct unix_sock *receiver)
+ {
+@@ -168,7 +169,10 @@ void unix_add_edges(struct scm_fp_list *
+ unix_add_edge(fpl, edge);
+ } while (i < fpl->count_unix);
+
++ WRITE_ONCE(unix_tot_inflight, unix_tot_inflight + fpl->count_unix);
+ out:
++ WRITE_ONCE(fpl->user->unix_inflight, fpl->user->unix_inflight + fpl->count);
++
+ spin_unlock(&unix_gc_lock);
+
+ fpl->inflight = true;
+@@ -191,7 +195,10 @@ void unix_del_edges(struct scm_fp_list *
+ unix_del_edge(fpl, edge);
+ } while (i < fpl->count_unix);
+
++ WRITE_ONCE(unix_tot_inflight, unix_tot_inflight - fpl->count_unix);
+ out:
++ WRITE_ONCE(fpl->user->unix_inflight, fpl->user->unix_inflight - fpl->count);
++
+ spin_unlock(&unix_gc_lock);
+
+ fpl->inflight = false;
+@@ -234,7 +241,6 @@ void unix_destroy_fpl(struct scm_fp_list
+ unix_free_vertices(fpl);
+ }
+
+-unsigned int unix_tot_inflight;
+ static LIST_HEAD(gc_candidates);
+ static LIST_HEAD(gc_inflight_list);
+
+@@ -255,13 +261,8 @@ void unix_inflight(struct user_struct *u
+ WARN_ON_ONCE(list_empty(&u->link));
+ }
+ u->inflight++;
+-
+- /* Paired with READ_ONCE() in wait_for_unix_gc() */
+- WRITE_ONCE(unix_tot_inflight, unix_tot_inflight + 1);
+ }
+
+- WRITE_ONCE(user->unix_inflight, user->unix_inflight + 1);
+-
+ spin_unlock(&unix_gc_lock);
+ }
+
+@@ -278,13 +279,8 @@ void unix_notinflight(struct user_struct
+ u->inflight--;
+ if (!u->inflight)
+ list_del_init(&u->link);
+-
+- /* Paired with READ_ONCE() in wait_for_unix_gc() */
+- WRITE_ONCE(unix_tot_inflight, unix_tot_inflight - 1);
+ }
+
+- WRITE_ONCE(user->unix_inflight, user->unix_inflight - 1);
+-
+ spin_unlock(&unix_gc_lock);
+ }
+
--- /dev/null
+From stable+bounces-145866-greg=kroah.com@vger.kernel.org Wed May 21 16:59:06 2025
+From: Lee Jones <lee@kernel.org>
+Date: Wed, 21 May 2025 14:45:27 +0000
+Subject: af_unix: Detect dead SCC.
+To: lee@kernel.org, "David S. Miller" <davem@davemloft.net>, Eric Dumazet <edumazet@google.com>, Jakub Kicinski <kuba@kernel.org>, Paolo Abeni <pabeni@redhat.com>, Kuniyuki Iwashima <kuniyu@amazon.com>, Jens Axboe <axboe@kernel.dk>, Sasha Levin <sashal@kernel.org>, Michal Luczaj <mhal@rbox.co>, Rao Shoaib <Rao.Shoaib@oracle.com>, Simon Horman <horms@kernel.org>, linux-kernel@vger.kernel.org, netdev@vger.kernel.org
+Cc: stable@vger.kernel.org
+Message-ID: <20250521144803.2050504-20-lee@kernel.org>
+
+From: Kuniyuki Iwashima <kuniyu@amazon.com>
+
+commit a15702d8b3aad8ce5268c565bd29f0e02fd2db83 upstream.
+
+When iterating an SCC, we call unix_vertex_dead() for each vertex
+to check whether the vertex is close()d and has no bridge to another
+SCC.
+
+If both conditions are true for every vertex in the SCC, we can
+garbage-collect all skb in the SCC.
+
+The actual garbage collection is done in the following patch,
+replacing the old implementation.
+
+Signed-off-by: Kuniyuki Iwashima <kuniyu@amazon.com>
+Acked-by: Paolo Abeni <pabeni@redhat.com>
+Link: https://lore.kernel.org/r/20240325202425.60930-14-kuniyu@amazon.com
+Signed-off-by: Jakub Kicinski <kuba@kernel.org>
+Signed-off-by: Lee Jones <lee@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ net/unix/garbage.c | 44 +++++++++++++++++++++++++++++++++++++++++++-
+ 1 file changed, 43 insertions(+), 1 deletion(-)
+
+--- a/net/unix/garbage.c
++++ b/net/unix/garbage.c
+@@ -289,6 +289,39 @@ void unix_destroy_fpl(struct scm_fp_list
+ unix_free_vertices(fpl);
+ }
+
++static bool unix_vertex_dead(struct unix_vertex *vertex)
++{
++ struct unix_edge *edge;
++ struct unix_sock *u;
++ long total_ref;
++
++ list_for_each_entry(edge, &vertex->edges, vertex_entry) {
++ struct unix_vertex *next_vertex = unix_edge_successor(edge);
++
++ /* The vertex's fd can be received by a non-inflight socket. */
++ if (!next_vertex)
++ return false;
++
++ /* The vertex's fd can be received by an inflight socket in
++ * another SCC.
++ */
++ if (next_vertex->scc_index != vertex->scc_index)
++ return false;
++ }
++
++ /* No receiver exists out of the same SCC. */
++
++ edge = list_first_entry(&vertex->edges, typeof(*edge), vertex_entry);
++ u = edge->predecessor;
++ total_ref = file_count(u->sk.sk_socket->file);
++
++ /* If not close()d, total_ref > out_degree. */
++ if (total_ref != vertex->out_degree)
++ return false;
++
++ return true;
++}
++
+ static bool unix_scc_cyclic(struct list_head *scc)
+ {
+ struct unix_vertex *vertex;
+@@ -377,6 +410,7 @@ prev_vertex:
+
+ if (vertex->index == vertex->scc_index) {
+ struct list_head scc;
++ bool scc_dead = true;
+
+ /* SCC finalised.
+ *
+@@ -391,6 +425,9 @@ prev_vertex:
+
+ /* Mark vertex as off-stack. */
+ vertex->index = unix_vertex_grouped_index;
++
++ if (scc_dead)
++ scc_dead = unix_vertex_dead(vertex);
+ }
+
+ if (!unix_graph_maybe_cyclic)
+@@ -431,13 +468,18 @@ static void unix_walk_scc_fast(void)
+ while (!list_empty(&unix_unvisited_vertices)) {
+ struct unix_vertex *vertex;
+ struct list_head scc;
++ bool scc_dead = true;
+
+ vertex = list_first_entry(&unix_unvisited_vertices, typeof(*vertex), entry);
+ list_add(&scc, &vertex->scc_entry);
+
+- list_for_each_entry_reverse(vertex, &scc, scc_entry)
++ list_for_each_entry_reverse(vertex, &scc, scc_entry) {
+ list_move_tail(&vertex->entry, &unix_visited_vertices);
+
++ if (scc_dead)
++ scc_dead = unix_vertex_dead(vertex);
++ }
++
+ list_del(&scc);
+ }
+
--- /dev/null
+From stable+bounces-145859-greg=kroah.com@vger.kernel.org Wed May 21 16:55:45 2025
+From: Lee Jones <lee@kernel.org>
+Date: Wed, 21 May 2025 14:45:20 +0000
+Subject: af_unix: Detect Strongly Connected Components.
+To: lee@kernel.org, "David S. Miller" <davem@davemloft.net>, Eric Dumazet <edumazet@google.com>, Jakub Kicinski <kuba@kernel.org>, Paolo Abeni <pabeni@redhat.com>, Kuniyuki Iwashima <kuniyu@amazon.com>, Jens Axboe <axboe@kernel.dk>, Sasha Levin <sashal@kernel.org>, Michal Luczaj <mhal@rbox.co>, Rao Shoaib <Rao.Shoaib@oracle.com>, Simon Horman <horms@kernel.org>, linux-kernel@vger.kernel.org, netdev@vger.kernel.org
+Cc: stable@vger.kernel.org
+Message-ID: <20250521144803.2050504-13-lee@kernel.org>
+
+From: Kuniyuki Iwashima <kuniyu@amazon.com>
+
+commit 3484f063172dd88776b062046d721d7c2ae1af7c upstream.
+
+In the new GC, we use a simple graph algorithm, Tarjan's Strongly
+Connected Components (SCC) algorithm, to find cyclic references.
+
+The algorithm visits every vertex exactly once using depth-first
+search (DFS).
+
+DFS starts by pushing an input vertex to a stack and assigning it
+a unique number. Two fields, index and lowlink, are initialised
+with the number, but lowlink could be updated later during DFS.
+
+If a vertex has an edge to an unvisited inflight vertex, we visit
+it and do the same processing. So, we will have vertices in the
+stack in the order they appear and number them consecutively in
+the same order.
+
+If a vertex has a back-edge to a visited vertex in the stack,
+we update the predecessor's lowlink with the successor's index.
+
+After iterating edges from the vertex, we check if its index
+equals its lowlink.
+
+If the lowlink is different from the index, it shows there was a
+back-edge. Then, we backtrack, propagate the lowlink to the
+predecessor, and resume the previous edge iteration from the
+next edge.
+
+If the lowlink is the same as the index, we pop the vertices from the
+stack down to and including that vertex. The popped set of vertices
+is an SCC, possibly forming a cycle. At the same time, we move the
+vertices to unix_visited_vertices.
+
+When we finish the algorithm, all vertices in each SCC will be
+linked via unix_vertex.scc_entry.
+
+Let's take an example. We have a graph including five inflight
+vertices (F is not inflight):
+
+ A -> B -> C -> D -> E (-> F)
+ ^ |
+ `---------'
+
+Suppose that we start DFS from C. We will visit C, D, and B first
+and initialise their index and lowlink. Then, the stack looks like
+this:
+
+ > B = (3, 3) (index, lowlink)
+ D = (2, 2)
+ C = (1, 1)
+
+When checking B's edge to C, we update B's lowlink with C's index
+and propagate it to D.
+
+ B = (3, 1) (index, lowlink)
+ > D = (2, 1)
+ C = (1, 1)
+
+Next, we visit E, which has no edge to an inflight vertex.
+
+ > E = (4, 4) (index, lowlink)
+ B = (3, 1)
+ D = (2, 1)
+ C = (1, 1)
+
+When we leave E, its index and lowlink are the same, so we pop E from
+the stack as a single-vertex SCC. Next, we leave B and D but do
+nothing because their lowlink differs from their index.
+
+ B = (3, 1) (index, lowlink)
+ D = (2, 1)
+ > C = (1, 1)
+
+Then, we leave C, whose index and lowlink are the same, so we pop B,
+D and C as one SCC.
+
+Last, we run DFS for the remaining vertex, A, which is also a
+single-vertex SCC.
+
+Finally, each unix_vertex.scc_entry is linked as follows:
+
+ A -. B -> C -> D E -.
+ ^ | ^ | ^ |
+ `--' `---------' `--'
+
+We use SCC later to decide whether we can garbage-collect the
+sockets.
+
+Note that we still cannot detect SCCs properly if an edge points
+to an embryo socket. The following two patches will sort this out.
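+
+For readers who want to poke at the walk outside the kernel, here is a
+minimal userspace C sketch of the same algorithm (the classic recursive
+form rather than the kernel's edge_stack-based iteration; the graph and
+numbering follow the example above):
+
+/* tarjan.c - build with: gcc -Wall -o tarjan tarjan.c */
+#include <stdio.h>
+
+#define NV 5				/* vertices A..E */
+
+static int adj[NV][NV];			/* adjacency matrix */
+static int idx[NV], low[NV], on_stack[NV];
+static int stack[NV], sp;
+static int next_index = 1;		/* cf. UNIX_VERTEX_INDEX_START */
+
+static int min(int a, int b) { return a < b ? a : b; }
+
+static void visit(int v)
+{
+	int w;
+
+	idx[v] = low[v] = next_index++;	/* push and number the vertex */
+	stack[sp++] = v;
+	on_stack[v] = 1;
+
+	for (w = 0; w < NV; w++) {
+		if (!adj[v][w])
+			continue;
+		if (!idx[w]) {			/* unvisited: DFS deeper */
+			visit(w);
+			low[v] = min(low[v], low[w]);
+		} else if (on_stack[w]) {	/* back/cross edge */
+			low[v] = min(low[v], idx[w]);
+		}				/* else: already grouped */
+	}
+
+	if (idx[v] == low[v]) {			/* SCC finalised */
+		printf("SCC:");
+		do {
+			w = stack[--sp];
+			on_stack[w] = 0;
+			printf(" %c", 'A' + w);
+		} while (w != v);
+		printf("\n");
+	}
+}
+
+int main(void)
+{
+	int v;
+
+	/* A -> B -> C -> D -> E, plus the back-edge D -> B */
+	adj[0][1] = adj[1][2] = adj[2][3] = adj[3][4] = 1;
+	adj[3][1] = 1;
+
+	for (v = 0; v < NV; v++)
+		if (!idx[v])
+			visit(v);	/* prints {E}, {D C B}, {A} */
+
+	return 0;
+}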
+
+Signed-off-by: Kuniyuki Iwashima <kuniyu@amazon.com>
+Acked-by: Paolo Abeni <pabeni@redhat.com>
+Link: https://lore.kernel.org/r/20240325202425.60930-7-kuniyu@amazon.com
+Signed-off-by: Jakub Kicinski <kuba@kernel.org>
+Signed-off-by: Lee Jones <lee@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ include/net/af_unix.h | 3 +++
+ net/unix/garbage.c | 46 ++++++++++++++++++++++++++++++++++++++++++++--
+ 2 files changed, 47 insertions(+), 2 deletions(-)
+
+--- a/include/net/af_unix.h
++++ b/include/net/af_unix.h
+@@ -32,8 +32,11 @@ void wait_for_unix_gc(struct scm_fp_list
+ struct unix_vertex {
+ struct list_head edges;
+ struct list_head entry;
++ struct list_head scc_entry;
+ unsigned long out_degree;
+ unsigned long index;
++ unsigned long lowlink;
++ bool on_stack;
+ };
+
+ struct unix_edge {
+--- a/net/unix/garbage.c
++++ b/net/unix/garbage.c
+@@ -251,11 +251,19 @@ static LIST_HEAD(unix_visited_vertices);
+ static void __unix_walk_scc(struct unix_vertex *vertex)
+ {
+ unsigned long index = UNIX_VERTEX_INDEX_START;
++ LIST_HEAD(vertex_stack);
+ struct unix_edge *edge;
+ LIST_HEAD(edge_stack);
+
+ next_vertex:
++ /* Push vertex to vertex_stack.
++ * The vertex will be popped when finalising SCC later.
++ */
++ vertex->on_stack = true;
++ list_add(&vertex->scc_entry, &vertex_stack);
++
+ vertex->index = index;
++ vertex->lowlink = index;
+ index++;
+
+ /* Explore neighbour vertices (receivers of the current vertex's fd). */
+@@ -283,12 +291,46 @@ prev_vertex:
+ edge = list_first_entry(&edge_stack, typeof(*edge), stack_entry);
+ list_del_init(&edge->stack_entry);
+
++ next_vertex = vertex;
+ vertex = edge->predecessor->vertex;
++
++ /* If the successor has a smaller lowlink, two vertices
++ * are in the same SCC, so propagate the smaller lowlink
++ * to skip SCC finalisation.
++ */
++ vertex->lowlink = min(vertex->lowlink, next_vertex->lowlink);
++ } else if (next_vertex->on_stack) {
++ /* Loop detected by a back/cross edge.
++ *
++ * The successor is on vertex_stack, so two vertices are
++ * in the same SCC. If the successor has a smaller index,
++ * propagate it to skip SCC finalisation.
++ */
++ vertex->lowlink = min(vertex->lowlink, next_vertex->index);
++ } else {
++ /* The successor was already grouped as another SCC */
+ }
+ }
+
+- /* Don't restart DFS from this vertex in unix_walk_scc(). */
+- list_move_tail(&vertex->entry, &unix_visited_vertices);
++ if (vertex->index == vertex->lowlink) {
++ struct list_head scc;
++
++ /* SCC finalised.
++ *
++ * If the lowlink was not updated, all the vertices above on
++ * vertex_stack are in the same SCC. Group them using scc_entry.
++ */
++ __list_cut_position(&scc, &vertex_stack, &vertex->scc_entry);
++
++ list_for_each_entry_reverse(vertex, &scc, scc_entry) {
++ /* Don't restart DFS from this vertex in unix_walk_scc(). */
++ list_move_tail(&vertex->entry, &unix_visited_vertices);
++
++ vertex->on_stack = false;
++ }
++
++ list_del(&scc);
++ }
+
+ /* Need backtracking ? */
+ if (!list_empty(&edge_stack))
--- /dev/null
+From stable+bounces-145870-greg=kroah.com@vger.kernel.org Wed May 21 17:01:08 2025
+From: Lee Jones <lee@kernel.org>
+Date: Wed, 21 May 2025 14:45:31 +0000
+Subject: af_unix: Don't access successor in unix_del_edges() during GC.
+To: lee@kernel.org, "David S. Miller" <davem@davemloft.net>, Eric Dumazet <edumazet@google.com>, Jakub Kicinski <kuba@kernel.org>, Paolo Abeni <pabeni@redhat.com>, Kuniyuki Iwashima <kuniyu@amazon.com>, Jens Axboe <axboe@kernel.dk>, Sasha Levin <sashal@kernel.org>, Michal Luczaj <mhal@rbox.co>, Rao Shoaib <Rao.Shoaib@oracle.com>, Simon Horman <horms@kernel.org>, linux-kernel@vger.kernel.org, netdev@vger.kernel.org
+Cc: stable@vger.kernel.org, syzbot+f3f3eef1d2100200e593@syzkaller.appspotmail.com
+Message-ID: <20250521144803.2050504-24-lee@kernel.org>
+
+From: Kuniyuki Iwashima <kuniyu@amazon.com>
+
+commit 1af2dface5d286dd1f2f3405a0d6fa9f2c8fb998 upstream.
+
+syzbot reported use-after-free in unix_del_edges(). [0]
+
+What the repro does is basically repeat the following quickly.
+
+ 1. pass a fd of an AF_UNIX socket to itself
+
+ socketpair(AF_UNIX, SOCK_DGRAM, 0, [3, 4]) = 0
+ sendmsg(3, {..., msg_control=[{cmsg_len=20, cmsg_level=SOL_SOCKET,
+ cmsg_type=SCM_RIGHTS, cmsg_data=[4]}], ...}, 0) = 0
+
+ 2. pass other fds of AF_UNIX sockets to the socket above
+
+ socketpair(AF_UNIX, SOCK_SEQPACKET, 0, [5, 6]) = 0
+ sendmsg(3, {..., msg_control=[{cmsg_len=48, cmsg_level=SOL_SOCKET,
+ cmsg_type=SCM_RIGHTS, cmsg_data=[5, 6]}], ...}, 0) = 0
+
+ 3. close all sockets
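+
+Condensed into C, the repro sequence looks roughly like this (a
+sketch: error handling is omitted, fd numbers differ from the strace
+above, and the bug needs the sequence repeated quickly to race GC):
+
+#include <string.h>
+#include <sys/socket.h>
+#include <sys/uio.h>
+#include <unistd.h>
+
+static void send_fds(int via, int *fds, int n)
+{
+	char buf[CMSG_SPACE(sizeof(int) * 2)] = { 0 };
+	char dummy = 'x';
+	struct iovec iov = { .iov_base = &dummy, .iov_len = 1 };
+	struct msghdr msg = {
+		.msg_iov = &iov, .msg_iovlen = 1,
+		.msg_control = buf,
+		.msg_controllen = CMSG_SPACE(sizeof(int) * n),
+	};
+	struct cmsghdr *cmsg = CMSG_FIRSTHDR(&msg);
+
+	cmsg->cmsg_level = SOL_SOCKET;
+	cmsg->cmsg_type = SCM_RIGHTS;
+	cmsg->cmsg_len = CMSG_LEN(sizeof(int) * n);
+	memcpy(CMSG_DATA(cmsg), fds, sizeof(int) * n);
+	sendmsg(via, &msg, 0);
+}
+
+int main(void)
+{
+	int a[2], b[2];
+
+	socketpair(AF_UNIX, SOCK_DGRAM, 0, a);		/* step 1 */
+	send_fds(a[0], &a[1], 1);	/* pass the peer's fd to itself */
+	socketpair(AF_UNIX, SOCK_SEQPACKET, 0, b);	/* step 2 */
+	send_fds(a[0], b, 2);		/* pass both fds into a[0] */
+
+	close(a[0]);					/* step 3 */
+	close(a[1]);
+	close(b[0]);
+	close(b[1]);
+	return 0;
+}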
+
+Here, two skb are created, and every unix_edge->successor is the first
+socket. Then, __unix_gc() will garbage-collect the two skb:
+
+ (a) free skb with self-referencing fd
+ (b) free skb holding other sockets
+
+After (a), the self-referencing socket will be scheduled to be freed
+later by the delayed_fput() task.
+
+syzbot repeated the sequences above (1. ~ 3.) quickly and triggered
+the task concurrently while GC was running.
+
+So, at (b), the socket was already freed, and accessing it was illegal.
+
+unix_del_edges() accesses the receiver socket as edge->successor to
+optimise GC. However, we should not do it during GC.
+
+Garbage-collecting sockets does not change the shape of the rest
+of the graph, so we need not call unix_update_graph() to update
+unix_graph_grouped when we purge skb.
+
+However, if we clean up all loops in the unix_walk_scc_fast() path,
+unix_graph_maybe_cyclic remains unchanged (true), and __unix_gc()
+will call unix_walk_scc_fast() continuously even though there is no
+socket to garbage-collect.
+
+To keep that optimisation while fixing UAF, let's add the same
+updating logic of unix_graph_maybe_cyclic in unix_walk_scc_fast()
+as done in unix_walk_scc() and __unix_walk_scc().
+
+Note that when unix_del_edges() is called from other places, the
+receiver socket is always alive:
+
+ - sendmsg: the successor's sk_refcnt is bumped by sock_hold()
+ unix_find_other() for SOCK_DGRAM, connect() for SOCK_STREAM
+
+ - recvmsg: the successor is the receiver, and its fd is alive
+
+[0]:
+BUG: KASAN: slab-use-after-free in unix_edge_successor net/unix/garbage.c:109 [inline]
+BUG: KASAN: slab-use-after-free in unix_del_edge net/unix/garbage.c:165 [inline]
+BUG: KASAN: slab-use-after-free in unix_del_edges+0x148/0x630 net/unix/garbage.c:237
+Read of size 8 at addr ffff888079c6e640 by task kworker/u8:6/1099
+
+CPU: 0 PID: 1099 Comm: kworker/u8:6 Not tainted 6.9.0-rc4-next-20240418-syzkaller #0
+Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 03/27/2024
+Workqueue: events_unbound __unix_gc
+Call Trace:
+ <TASK>
+ __dump_stack lib/dump_stack.c:88 [inline]
+ dump_stack_lvl+0x241/0x360 lib/dump_stack.c:114
+ print_address_description mm/kasan/report.c:377 [inline]
+ print_report+0x169/0x550 mm/kasan/report.c:488
+ kasan_report+0x143/0x180 mm/kasan/report.c:601
+ unix_edge_successor net/unix/garbage.c:109 [inline]
+ unix_del_edge net/unix/garbage.c:165 [inline]
+ unix_del_edges+0x148/0x630 net/unix/garbage.c:237
+ unix_destroy_fpl+0x59/0x210 net/unix/garbage.c:298
+ unix_detach_fds net/unix/af_unix.c:1811 [inline]
+ unix_destruct_scm+0x13e/0x210 net/unix/af_unix.c:1826
+ skb_release_head_state+0x100/0x250 net/core/skbuff.c:1127
+ skb_release_all net/core/skbuff.c:1138 [inline]
+ __kfree_skb net/core/skbuff.c:1154 [inline]
+ kfree_skb_reason+0x16d/0x3b0 net/core/skbuff.c:1190
+ __skb_queue_purge_reason include/linux/skbuff.h:3251 [inline]
+ __skb_queue_purge include/linux/skbuff.h:3256 [inline]
+ __unix_gc+0x1732/0x1830 net/unix/garbage.c:575
+ process_one_work kernel/workqueue.c:3218 [inline]
+ process_scheduled_works+0xa2c/0x1830 kernel/workqueue.c:3299
+ worker_thread+0x86d/0xd70 kernel/workqueue.c:3380
+ kthread+0x2f0/0x390 kernel/kthread.c:389
+ ret_from_fork+0x4b/0x80 arch/x86/kernel/process.c:147
+ ret_from_fork_asm+0x1a/0x30 arch/x86/entry/entry_64.S:244
+ </TASK>
+
+Allocated by task 14427:
+ kasan_save_stack mm/kasan/common.c:47 [inline]
+ kasan_save_track+0x3f/0x80 mm/kasan/common.c:68
+ unpoison_slab_object mm/kasan/common.c:312 [inline]
+ __kasan_slab_alloc+0x66/0x80 mm/kasan/common.c:338
+ kasan_slab_alloc include/linux/kasan.h:201 [inline]
+ slab_post_alloc_hook mm/slub.c:3897 [inline]
+ slab_alloc_node mm/slub.c:3957 [inline]
+ kmem_cache_alloc_noprof+0x135/0x290 mm/slub.c:3964
+ sk_prot_alloc+0x58/0x210 net/core/sock.c:2074
+ sk_alloc+0x38/0x370 net/core/sock.c:2133
+ unix_create1+0xb4/0x770
+ unix_create+0x14e/0x200 net/unix/af_unix.c:1034
+ __sock_create+0x490/0x920 net/socket.c:1571
+ sock_create net/socket.c:1622 [inline]
+ __sys_socketpair+0x33e/0x720 net/socket.c:1773
+ __do_sys_socketpair net/socket.c:1822 [inline]
+ __se_sys_socketpair net/socket.c:1819 [inline]
+ __x64_sys_socketpair+0x9b/0xb0 net/socket.c:1819
+ do_syscall_x64 arch/x86/entry/common.c:52 [inline]
+ do_syscall_64+0xf5/0x240 arch/x86/entry/common.c:83
+ entry_SYSCALL_64_after_hwframe+0x77/0x7f
+
+Freed by task 1805:
+ kasan_save_stack mm/kasan/common.c:47 [inline]
+ kasan_save_track+0x3f/0x80 mm/kasan/common.c:68
+ kasan_save_free_info+0x40/0x50 mm/kasan/generic.c:579
+ poison_slab_object+0xe0/0x150 mm/kasan/common.c:240
+ __kasan_slab_free+0x37/0x60 mm/kasan/common.c:256
+ kasan_slab_free include/linux/kasan.h:184 [inline]
+ slab_free_hook mm/slub.c:2190 [inline]
+ slab_free mm/slub.c:4393 [inline]
+ kmem_cache_free+0x145/0x340 mm/slub.c:4468
+ sk_prot_free net/core/sock.c:2114 [inline]
+ __sk_destruct+0x467/0x5f0 net/core/sock.c:2208
+ sock_put include/net/sock.h:1948 [inline]
+ unix_release_sock+0xa8b/0xd20 net/unix/af_unix.c:665
+ unix_release+0x91/0xc0 net/unix/af_unix.c:1049
+ __sock_release net/socket.c:659 [inline]
+ sock_close+0xbc/0x240 net/socket.c:1421
+ __fput+0x406/0x8b0 fs/file_table.c:422
+ delayed_fput+0x59/0x80 fs/file_table.c:445
+ process_one_work kernel/workqueue.c:3218 [inline]
+ process_scheduled_works+0xa2c/0x1830 kernel/workqueue.c:3299
+ worker_thread+0x86d/0xd70 kernel/workqueue.c:3380
+ kthread+0x2f0/0x390 kernel/kthread.c:389
+ ret_from_fork+0x4b/0x80 arch/x86/kernel/process.c:147
+ ret_from_fork_asm+0x1a/0x30 arch/x86/entry/entry_64.S:244
+
+The buggy address belongs to the object at ffff888079c6e000
+ which belongs to the cache UNIX of size 1920
+The buggy address is located 1600 bytes inside of
+ freed 1920-byte region [ffff888079c6e000, ffff888079c6e780)
+
+Reported-by: syzbot+f3f3eef1d2100200e593@syzkaller.appspotmail.com
+Closes: https://syzkaller.appspot.com/bug?extid=f3f3eef1d2100200e593
+Fixes: 77e5593aebba ("af_unix: Skip GC if no cycle exists.")
+Fixes: fd86344823b5 ("af_unix: Try not to hold unix_gc_lock during accept().")
+Signed-off-by: Kuniyuki Iwashima <kuniyu@amazon.com>
+Link: https://lore.kernel.org/r/20240419235102.31707-1-kuniyu@amazon.com
+Signed-off-by: Paolo Abeni <pabeni@redhat.com>
+Signed-off-by: Lee Jones <lee@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ net/unix/garbage.c | 17 ++++++++++++-----
+ 1 file changed, 12 insertions(+), 5 deletions(-)
+
+--- a/net/unix/garbage.c
++++ b/net/unix/garbage.c
+@@ -158,11 +158,14 @@ static void unix_add_edge(struct scm_fp_
+ unix_update_graph(unix_edge_successor(edge));
+ }
+
++static bool gc_in_progress;
++
+ static void unix_del_edge(struct scm_fp_list *fpl, struct unix_edge *edge)
+ {
+ struct unix_vertex *vertex = edge->predecessor->vertex;
+
+- unix_update_graph(unix_edge_successor(edge));
++ if (!gc_in_progress)
++ unix_update_graph(unix_edge_successor(edge));
+
+ list_del(&edge->vertex_entry);
+ vertex->out_degree--;
+@@ -237,8 +240,10 @@ void unix_del_edges(struct scm_fp_list *
+ unix_del_edge(fpl, edge);
+ } while (i < fpl->count_unix);
+
+- receiver = fpl->edges[0].successor;
+- receiver->scm_stat.nr_unix_fds -= fpl->count_unix;
++ if (!gc_in_progress) {
++ receiver = fpl->edges[0].successor;
++ receiver->scm_stat.nr_unix_fds -= fpl->count_unix;
++ }
+ WRITE_ONCE(unix_tot_inflight, unix_tot_inflight - fpl->count_unix);
+ out:
+ WRITE_ONCE(fpl->user->unix_inflight, fpl->user->unix_inflight - fpl->count);
+@@ -526,6 +531,8 @@ static void unix_walk_scc(struct sk_buff
+
+ static void unix_walk_scc_fast(struct sk_buff_head *hitlist)
+ {
++ unix_graph_maybe_cyclic = false;
++
+ while (!list_empty(&unix_unvisited_vertices)) {
+ struct unix_vertex *vertex;
+ struct list_head scc;
+@@ -543,6 +550,8 @@ static void unix_walk_scc_fast(struct sk
+
+ if (scc_dead)
+ unix_collect_skb(&scc, hitlist);
++ else if (!unix_graph_maybe_cyclic)
++ unix_graph_maybe_cyclic = unix_scc_cyclic(&scc);
+
+ list_del(&scc);
+ }
+@@ -550,8 +559,6 @@ static void unix_walk_scc_fast(struct sk
+ list_replace_init(&unix_visited_vertices, &unix_unvisited_vertices);
+ }
+
+-static bool gc_in_progress;
+-
+ static void __unix_gc(struct work_struct *work)
+ {
+ struct sk_buff_head hitlist;
--- /dev/null
+From stable+bounces-145872-greg=kroah.com@vger.kernel.org Wed May 21 17:02:13 2025
+From: Lee Jones <lee@kernel.org>
+Date: Wed, 21 May 2025 14:45:33 +0000
+Subject: af_unix: Fix garbage collection of embryos carrying OOB with SCM_RIGHTS
+To: lee@kernel.org, "David S. Miller" <davem@davemloft.net>, Eric Dumazet <edumazet@google.com>, Jakub Kicinski <kuba@kernel.org>, Paolo Abeni <pabeni@redhat.com>, Kuniyuki Iwashima <kuniyu@amazon.com>, Jens Axboe <axboe@kernel.dk>, Sasha Levin <sashal@kernel.org>, Michal Luczaj <mhal@rbox.co>, Rao Shoaib <Rao.Shoaib@oracle.com>, Pavel Begunkov <asml.silence@gmail.com>, linux-kernel@vger.kernel.org, netdev@vger.kernel.org
+Cc: stable@vger.kernel.org
+Message-ID: <20250521144803.2050504-26-lee@kernel.org>
+
+From: Michal Luczaj <mhal@rbox.co>
+
+commit 041933a1ec7b4173a8e638cae4f8e394331d7e54 upstream.
+
+GC attempts to explicitly drop oob_skb's reference before purging the hit
+list.
+
+The problem is with embryos: kfree_skb(u->oob_skb) is never called on an
+embryo socket.
+
+The python script below [0] sends a listener's fd to its embryo as OOB
+data. While GC does collect the embryo's queue, it fails to drop the OOB
+skb's refcount. The skb which was in embryo's receive queue stays as
+unix_sk(sk)->oob_skb and keeps the listener's refcount [1].
+
+Tell GC to dispose embryo's oob_skb.
+
+[0]:
+from array import array
+from socket import *
+
+addr = '\x00unix-oob'
+lis = socket(AF_UNIX, SOCK_STREAM)
+lis.bind(addr)
+lis.listen(1)
+
+s = socket(AF_UNIX, SOCK_STREAM)
+s.connect(addr)
+scm = (SOL_SOCKET, SCM_RIGHTS, array('i', [lis.fileno()]))
+s.sendmsg([b'x'], [scm], MSG_OOB)
+lis.close()
+
+[1]
+$ grep unix-oob /proc/net/unix
+$ ./unix-oob.py
+$ grep unix-oob /proc/net/unix
+0000000000000000: 00000002 00000000 00000000 0001 02 0 @unix-oob
+0000000000000000: 00000002 00000000 00010000 0001 01 6072 @unix-oob
+
+Fixes: 4090fa373f0e ("af_unix: Replace garbage collection algorithm.")
+Signed-off-by: Michal Luczaj <mhal@rbox.co>
+Reviewed-by: Kuniyuki Iwashima <kuniyu@amazon.com>
+Signed-off-by: Paolo Abeni <pabeni@redhat.com>
+Signed-off-by: Lee Jones <lee@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ net/unix/garbage.c | 23 ++++++++++++++---------
+ 1 file changed, 14 insertions(+), 9 deletions(-)
+
+--- a/net/unix/garbage.c
++++ b/net/unix/garbage.c
+@@ -342,6 +342,18 @@ enum unix_recv_queue_lock_class {
+ U_RECVQ_LOCK_EMBRYO,
+ };
+
++static void unix_collect_queue(struct unix_sock *u, struct sk_buff_head *hitlist)
++{
++ skb_queue_splice_init(&u->sk.sk_receive_queue, hitlist);
++
++#if IS_ENABLED(CONFIG_AF_UNIX_OOB)
++ if (u->oob_skb) {
++ WARN_ON_ONCE(skb_unref(u->oob_skb));
++ u->oob_skb = NULL;
++ }
++#endif
++}
++
+ static void unix_collect_skb(struct list_head *scc, struct sk_buff_head *hitlist)
+ {
+ struct unix_vertex *vertex;
+@@ -365,18 +377,11 @@ static void unix_collect_skb(struct list
+
+ /* listener -> embryo order, the inversion never happens. */
+ spin_lock_nested(&embryo_queue->lock, U_RECVQ_LOCK_EMBRYO);
+- skb_queue_splice_init(embryo_queue, hitlist);
++ unix_collect_queue(unix_sk(skb->sk), hitlist);
+ spin_unlock(&embryo_queue->lock);
+ }
+ } else {
+- skb_queue_splice_init(queue, hitlist);
+-
+-#if IS_ENABLED(CONFIG_AF_UNIX_OOB)
+- if (u->oob_skb) {
+- kfree_skb(u->oob_skb);
+- u->oob_skb = NULL;
+- }
+-#endif
++ unix_collect_queue(u, hitlist);
+ }
+
+ spin_unlock(&queue->lock);
--- /dev/null
+From stable+bounces-145873-greg=kroah.com@vger.kernel.org Wed May 21 17:02:42 2025
+From: Lee Jones <lee@kernel.org>
+Date: Wed, 21 May 2025 14:45:34 +0000
+Subject: af_unix: Fix uninit-value in __unix_walk_scc()
+To: lee@kernel.org, "David S. Miller" <davem@davemloft.net>, Eric Dumazet <edumazet@google.com>, Jakub Kicinski <kuba@kernel.org>, Paolo Abeni <pabeni@redhat.com>, Kuniyuki Iwashima <kuniyu@amazon.com>, Jens Axboe <axboe@kernel.dk>, Sasha Levin <sashal@kernel.org>, Michal Luczaj <mhal@rbox.co>, Rao Shoaib <Rao.Shoaib@oracle.com>, Simon Horman <horms@kernel.org>, linux-kernel@vger.kernel.org, netdev@vger.kernel.org
+Cc: stable@vger.kernel.org, Shigeru Yoshida <syoshida@redhat.com>, syzkaller <syzkaller@googlegroups.com>
+Message-ID: <20250521144803.2050504-27-lee@kernel.org>
+
+From: Shigeru Yoshida <syoshida@redhat.com>
+
+commit 927fa5b3e4f52e0967bfc859afc98ad1c523d2d5 upstream.
+
+KMSAN reported uninit-value access in __unix_walk_scc() [1].
+
+In the list_for_each_entry_reverse() loop, when the vertex's index
+equals its scc_index, the loop uses the variable vertex as a
+temporary cursor that points to a vertex in scc. When the loop
+finishes, the variable vertex points to the list head, in this case
+scc, which is a local variable on the stack (more precisely, it's not
+even scc and might underflow the call stack of __unix_walk_scc():
+container_of(&scc, struct unix_vertex, scc_entry)).
+
+However, the variable vertex is used under the label prev_vertex. So
+if the edge_stack is not empty and the function jumps to the
+prev_vertex label, the function will access invalid data on the
+stack. This causes the uninit-value access issue.
+
+Fix this by introducing a new temporary variable for the loop.
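+
+The pitfall is easy to demonstrate in userspace (a sketch with minimal
+list helpers standing in for <linux/list.h>; the forward iterator is
+shown, but the _reverse variant leaves the cursor the same way):
+
+#include <stdio.h>
+#include <stddef.h>
+
+struct list_head { struct list_head *next, *prev; };
+
+#define container_of(ptr, type, member) \
+	((type *)((char *)(ptr) - offsetof(type, member)))
+
+#define list_for_each_entry(pos, head, member)				\
+	for (pos = container_of((head)->next, typeof(*pos), member);	\
+	     &pos->member != (head);					\
+	     pos = container_of(pos->member.next, typeof(*pos), member))
+
+struct vertex {
+	int index;
+	struct list_head scc_entry;
+};
+
+int main(void)
+{
+	struct list_head scc;			/* on-stack list head */
+	struct vertex a = { .index = 1 };
+	struct vertex *v;
+
+	scc.next = scc.prev = &a.scc_entry;
+	a.scc_entry.next = a.scc_entry.prev = &scc;
+
+	list_for_each_entry(v, &scc, scc_entry)
+		printf("visiting vertex %d\n", v->index);
+
+	/* After the loop, v is container_of(&scc, ...): a pointer into
+	 * the surrounding stack frame, not a real vertex. Jumping to a
+	 * label that reuses v (as prev_vertex did) reads stack garbage.
+	 */
+	printf("stale cursor %p vs. head %p\n", (void *)v, (void *)&scc);
+	return 0;
+}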
+
+[1]
+BUG: KMSAN: uninit-value in __unix_walk_scc net/unix/garbage.c:478 [inline]
+BUG: KMSAN: uninit-value in unix_walk_scc net/unix/garbage.c:526 [inline]
+BUG: KMSAN: uninit-value in __unix_gc+0x2589/0x3c20 net/unix/garbage.c:584
+ __unix_walk_scc net/unix/garbage.c:478 [inline]
+ unix_walk_scc net/unix/garbage.c:526 [inline]
+ __unix_gc+0x2589/0x3c20 net/unix/garbage.c:584
+ process_one_work kernel/workqueue.c:3231 [inline]
+ process_scheduled_works+0xade/0x1bf0 kernel/workqueue.c:3312
+ worker_thread+0xeb6/0x15b0 kernel/workqueue.c:3393
+ kthread+0x3c4/0x530 kernel/kthread.c:389
+ ret_from_fork+0x6e/0x90 arch/x86/kernel/process.c:147
+ ret_from_fork_asm+0x1a/0x30 arch/x86/entry/entry_64.S:244
+
+Uninit was stored to memory at:
+ unix_walk_scc net/unix/garbage.c:526 [inline]
+ __unix_gc+0x2adf/0x3c20 net/unix/garbage.c:584
+ process_one_work kernel/workqueue.c:3231 [inline]
+ process_scheduled_works+0xade/0x1bf0 kernel/workqueue.c:3312
+ worker_thread+0xeb6/0x15b0 kernel/workqueue.c:3393
+ kthread+0x3c4/0x530 kernel/kthread.c:389
+ ret_from_fork+0x6e/0x90 arch/x86/kernel/process.c:147
+ ret_from_fork_asm+0x1a/0x30 arch/x86/entry/entry_64.S:244
+
+Local variable entries created at:
+ ref_tracker_free+0x48/0xf30 lib/ref_tracker.c:222
+ netdev_tracker_free include/linux/netdevice.h:4058 [inline]
+ netdev_put include/linux/netdevice.h:4075 [inline]
+ dev_put include/linux/netdevice.h:4101 [inline]
+ update_gid_event_work_handler+0xaa/0x1b0 drivers/infiniband/core/roce_gid_mgmt.c:813
+
+CPU: 1 PID: 12763 Comm: kworker/u8:31 Not tainted 6.10.0-rc4-00217-g35bb670d65fc #32
+Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.16.3-2.fc40 04/01/2014
+Workqueue: events_unbound __unix_gc
+
+Fixes: 3484f063172d ("af_unix: Detect Strongly Connected Components.")
+Reported-by: syzkaller <syzkaller@googlegroups.com>
+Signed-off-by: Shigeru Yoshida <syoshida@redhat.com>
+Reviewed-by: Kuniyuki Iwashima <kuniyu@amazon.com>
+Link: https://patch.msgid.link/20240702160428.10153-1-syoshida@redhat.com
+Signed-off-by: Jakub Kicinski <kuba@kernel.org>
+Signed-off-by: Lee Jones <lee@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ net/unix/garbage.c | 9 +++++----
+ 1 file changed, 5 insertions(+), 4 deletions(-)
+
+--- a/net/unix/garbage.c
++++ b/net/unix/garbage.c
+@@ -476,6 +476,7 @@ prev_vertex:
+ }
+
+ if (vertex->index == vertex->scc_index) {
++ struct unix_vertex *v;
+ struct list_head scc;
+ bool scc_dead = true;
+
+@@ -486,15 +487,15 @@ prev_vertex:
+ */
+ __list_cut_position(&scc, &vertex_stack, &vertex->scc_entry);
+
+- list_for_each_entry_reverse(vertex, &scc, scc_entry) {
++ list_for_each_entry_reverse(v, &scc, scc_entry) {
+ /* Don't restart DFS from this vertex in unix_walk_scc(). */
+- list_move_tail(&vertex->entry, &unix_visited_vertices);
++ list_move_tail(&v->entry, &unix_visited_vertices);
+
+ /* Mark vertex as off-stack. */
+- vertex->index = unix_vertex_grouped_index;
++ v->index = unix_vertex_grouped_index;
+
+ if (scc_dead)
+- scc_dead = unix_vertex_dead(vertex);
++ scc_dead = unix_vertex_dead(v);
+ }
+
+ if (scc_dead)
--- /dev/null
+From stable+bounces-145861-greg=kroah.com@vger.kernel.org Wed May 21 16:54:34 2025
+From: Lee Jones <lee@kernel.org>
+Date: Wed, 21 May 2025 14:45:22 +0000
+Subject: af_unix: Fix up unix_edge.successor for embryo socket.
+To: lee@kernel.org, "David S. Miller" <davem@davemloft.net>, Eric Dumazet <edumazet@google.com>, Jakub Kicinski <kuba@kernel.org>, Paolo Abeni <pabeni@redhat.com>, Kuniyuki Iwashima <kuniyu@amazon.com>, Jens Axboe <axboe@kernel.dk>, Sasha Levin <sashal@kernel.org>, Michal Luczaj <mhal@rbox.co>, Rao Shoaib <Rao.Shoaib@oracle.com>, Pavel Begunkov <asml.silence@gmail.com>, linux-kernel@vger.kernel.org, netdev@vger.kernel.org
+Cc: stable@vger.kernel.org
+Message-ID: <20250521144803.2050504-15-lee@kernel.org>
+
+From: Kuniyuki Iwashima <kuniyu@amazon.com>
+
+commit dcf70df2048d27c5d186f013f101a4aefd63aa41 upstream.
+
+To garbage collect inflight AF_UNIX sockets, we must define the
+cyclic reference appropriately. This is a bit tricky if the loop
+consists of embryo sockets.
+
+Suppose that the fd of AF_UNIX socket A is passed to D and the fd of
+B to C and that C and D are embryo sockets of A and B, respectively.
+It may appear that there are two separate graphs, A (-> D) and
+B (-> C), but this is not correct.
+
+ A --. .-- B
+ X
+ C <-' `-> D
+
+Now, D holds A's refcount, and C holds B's refcount, so unix_release()
+will never be called for A and B when we close() them. However, no one
+can call close() for D and C to free the skbs holding refcounts of A
+and B, because C/D sits in A/B's receive queue, which should have been
+purged by unix_release() for A and B.
+
+So, here's another type of cyclic reference. When an fd of an AF_UNIX
+socket is passed to an embryo socket, the reference is indirectly held
+by its parent listening socket.
+
+ .-> A .-> B
+ | `- sk_receive_queue | `- sk_receive_queue
+ | `- skb | `- skb
+ | `- sk == C | `- sk == D
+ | `- sk_receive_queue | `- sk_receive_queue
+ | `- skb +---------' `- skb +-.
+ | |
+ `---------------------------------------------------------'
+
+Technically, the graph must be denoted as A <-> B instead of A (-> D)
+and B (-> C) to find such a cyclic reference without touching each
+socket's receive queue.
+
+ .-> A --. .-- B <-.
+ | X | == A <-> B
+ `-- C <-' `-> D --'
+
+We apply this fixup during GC by fetching the real successor via
+unix_edge_successor().
+
+When we call accept(), we clear unix_sock.listener under unix_gc_lock
+so as not to confuse the GC.
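+
+The cycle above can be reproduced entirely from userspace. Below is a
+minimal sketch (illustrative only, not part of this patch: the socket
+paths are made up and all error handling is omitted):
+
+  #include <string.h>
+  #include <unistd.h>
+  #include <sys/socket.h>
+  #include <sys/uio.h>
+  #include <sys/un.h>
+
+  static int listener(const char *path)
+  {
+          struct sockaddr_un sa = { .sun_family = AF_UNIX };
+          int fd = socket(AF_UNIX, SOCK_STREAM, 0);
+
+          strcpy(sa.sun_path, path);
+          unlink(path);
+          bind(fd, (struct sockaddr *)&sa, sizeof(sa));
+          listen(fd, 1);
+          return fd;
+  }
+
+  static int client(const char *path)
+  {
+          struct sockaddr_un sa = { .sun_family = AF_UNIX };
+          int fd = socket(AF_UNIX, SOCK_STREAM, 0);
+
+          strcpy(sa.sun_path, path);
+          connect(fd, (struct sockaddr *)&sa, sizeof(sa));
+          return fd;
+  }
+
+  /* Put fd in flight by sending it over 'via' with SCM_RIGHTS. */
+  static void pass_fd(int via, int fd)
+  {
+          char data = 'x', cbuf[CMSG_SPACE(sizeof(int))];
+          struct iovec iov = { .iov_base = &data, .iov_len = 1 };
+          struct msghdr mh = {
+                  .msg_iov = &iov, .msg_iovlen = 1,
+                  .msg_control = cbuf, .msg_controllen = sizeof(cbuf),
+          };
+          struct cmsghdr *cm = CMSG_FIRSTHDR(&mh);
+
+          cm->cmsg_level = SOL_SOCKET;
+          cm->cmsg_type = SCM_RIGHTS;
+          cm->cmsg_len = CMSG_LEN(sizeof(int));
+          memcpy(CMSG_DATA(cm), &fd, sizeof(int));
+          sendmsg(via, &mh, 0);
+  }
+
+  int main(void)
+  {
+          int a = listener("/tmp/gc-a");  /* A */
+          int b = listener("/tmp/gc-b");  /* B */
+          int ca = client("/tmp/gc-a");   /* peer of embryo C in A's queue */
+          int cb = client("/tmp/gc-b");   /* peer of embryo D in B's queue */
+
+          pass_fd(cb, a);   /* A's fd lands in embryo D's receive queue */
+          pass_fd(ca, b);   /* B's fd lands in embryo C's receive queue */
+
+          /* Never accept(). After these close()s, A, B, C and D are
+           * only reachable through the A <-> B cycle and must be
+           * reclaimed by the GC.
+           */
+          close(ca);
+          close(cb);
+          close(a);
+          close(b);
+          return 0;
+  }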
+
+Signed-off-by: Kuniyuki Iwashima <kuniyu@amazon.com>
+Acked-by: Paolo Abeni <pabeni@redhat.com>
+Link: https://lore.kernel.org/r/20240325202425.60930-9-kuniyu@amazon.com
+Signed-off-by: Jakub Kicinski <kuba@kernel.org>
+Signed-off-by: Lee Jones <lee@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ include/net/af_unix.h | 1 +
+ net/unix/af_unix.c | 2 +-
+ net/unix/garbage.c | 20 +++++++++++++++++++-
+ 3 files changed, 21 insertions(+), 2 deletions(-)
+
+--- a/include/net/af_unix.h
++++ b/include/net/af_unix.h
+@@ -24,6 +24,7 @@ void unix_inflight(struct user_struct *u
+ void unix_notinflight(struct user_struct *user, struct file *fp);
+ void unix_add_edges(struct scm_fp_list *fpl, struct unix_sock *receiver);
+ void unix_del_edges(struct scm_fp_list *fpl);
++void unix_update_edges(struct unix_sock *receiver);
+ int unix_prepare_fpl(struct scm_fp_list *fpl);
+ void unix_destroy_fpl(struct scm_fp_list *fpl);
+ void unix_gc(void);
+--- a/net/unix/af_unix.c
++++ b/net/unix/af_unix.c
+@@ -1705,7 +1705,7 @@ static int unix_accept(struct socket *so
+ }
+
+ tsk = skb->sk;
+- unix_sk(tsk)->listener = NULL;
++ unix_update_edges(unix_sk(tsk));
+ skb_free_datagram(sk, skb);
+ wake_up_interruptible(&unix_sk(sk)->peer_wait);
+
+--- a/net/unix/garbage.c
++++ b/net/unix/garbage.c
+@@ -101,6 +101,17 @@ struct unix_sock *unix_get_socket(struct
+ return NULL;
+ }
+
++static struct unix_vertex *unix_edge_successor(struct unix_edge *edge)
++{
++ /* If an embryo socket has a fd,
++ * the listener indirectly holds the fd's refcnt.
++ */
++ if (edge->successor->listener)
++ return unix_sk(edge->successor->listener)->vertex;
++
++ return edge->successor->vertex;
++}
++
+ static LIST_HEAD(unix_unvisited_vertices);
+
+ enum unix_vertex_index {
+@@ -209,6 +220,13 @@ out:
+ fpl->inflight = false;
+ }
+
++void unix_update_edges(struct unix_sock *receiver)
++{
++ spin_lock(&unix_gc_lock);
++ receiver->listener = NULL;
++ spin_unlock(&unix_gc_lock);
++}
++
+ int unix_prepare_fpl(struct scm_fp_list *fpl)
+ {
+ struct unix_vertex *vertex;
+@@ -268,7 +286,7 @@ next_vertex:
+
+ /* Explore neighbour vertices (receivers of the current vertex's fd). */
+ list_for_each_entry(edge, &vertex->edges, vertex_entry) {
+- struct unix_vertex *next_vertex = edge->successor->vertex;
++ struct unix_vertex *next_vertex = unix_edge_successor(edge);
+
+ if (!next_vertex)
+ continue;
--- /dev/null
+From stable+bounces-145858-greg=kroah.com@vger.kernel.org Wed May 21 16:53:24 2025
+From: Lee Jones <lee@kernel.org>
+Date: Wed, 21 May 2025 14:45:19 +0000
+Subject: af_unix: Iterate all vertices by DFS.
+To: lee@kernel.org, "David S. Miller" <davem@davemloft.net>, Eric Dumazet <edumazet@google.com>, Jakub Kicinski <kuba@kernel.org>, Paolo Abeni <pabeni@redhat.com>, Kuniyuki Iwashima <kuniyu@amazon.com>, Jens Axboe <axboe@kernel.dk>, Sasha Levin <sashal@kernel.org>, Michal Luczaj <mhal@rbox.co>, Rao Shoaib <Rao.Shoaib@oracle.com>, Simon Horman <horms@kernel.org>, linux-kernel@vger.kernel.org, netdev@vger.kernel.org
+Cc: stable@vger.kernel.org
+Message-ID: <20250521144803.2050504-12-lee@kernel.org>
+
+From: Kuniyuki Iwashima <kuniyu@amazon.com>
+
+commit 6ba76fd2848e107594ea4f03b737230f74bc23ea upstream.
+
+The new GC will use a depth-first search graph algorithm to find
+cyclic references. The algorithm visits every vertex exactly once.
+
+Here, we implement the DFS part without recursion so that a deeply
+nested graph cannot be abused to overflow the kernel stack.
+
+unix_walk_scc() marks every vertex as unvisited by initialising its
+index to UNIX_VERTEX_INDEX_UNVISITED, then iterates over the inflight
+vertices in unix_unvisited_vertices and calls __unix_walk_scc() to
+start DFS from an arbitrary vertex.
+
+__unix_walk_scc() iterates over all edges starting from the vertex
+and explores the neighbour vertices with DFS using edge_stack.
+
+After visiting all neighbours, __unix_walk_scc() moves the visited
+vertex to unix_visited_vertices so that unix_walk_scc() will not
+restart DFS from the visited vertex.
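+
+For clarity, the goto-based loop below is equivalent to the following
+conventional explicit-stack DFS (a simplified sketch with made-up
+vertex types, not the kernel implementation; the kernel stacks edges
+instead of vertices so it can restore the predecessor while
+backtracking):
+
+  #define MAX_VERTICES 128
+
+  struct vertex {
+          unsigned long index;                /* 0 == unvisited */
+          int nr_edges;
+          struct vertex *edges[MAX_VERTICES]; /* successors */
+  };
+
+  static void dfs(struct vertex *start)
+  {
+          struct vertex *stack[MAX_VERTICES];
+          unsigned long index = 1;
+          int top = 0;
+
+          stack[top++] = start;
+
+          while (top) {
+                  struct vertex *v = stack[--top];
+                  int i;
+
+                  if (v->index)           /* already visited */
+                          continue;
+
+                  v->index = index++;     /* mark visited */
+
+                  for (i = 0; i < v->nr_edges; i++)
+                          if (!v->edges[i]->index)
+                                  stack[top++] = v->edges[i];
+          }
+  }
+
+Both variants visit each vertex exactly once; the edge stack is what
+later patches in this series build on to detect strongly connected
+components.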
+
+Signed-off-by: Kuniyuki Iwashima <kuniyu@amazon.com>
+Acked-by: Paolo Abeni <pabeni@redhat.com>
+Link: https://lore.kernel.org/r/20240325202425.60930-6-kuniyu@amazon.com
+Signed-off-by: Jakub Kicinski <kuba@kernel.org>
+Signed-off-by: Lee Jones <lee@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ include/net/af_unix.h | 2 +
+ net/unix/garbage.c | 74 ++++++++++++++++++++++++++++++++++++++++++++++++++
+ 2 files changed, 76 insertions(+)
+
+--- a/include/net/af_unix.h
++++ b/include/net/af_unix.h
+@@ -33,12 +33,14 @@ struct unix_vertex {
+ struct list_head edges;
+ struct list_head entry;
+ unsigned long out_degree;
++ unsigned long index;
+ };
+
+ struct unix_edge {
+ struct unix_sock *predecessor;
+ struct unix_sock *successor;
+ struct list_head vertex_entry;
++ struct list_head stack_entry;
+ };
+
+ struct sock *unix_peer_get(struct sock *sk);
+--- a/net/unix/garbage.c
++++ b/net/unix/garbage.c
+@@ -103,6 +103,11 @@ struct unix_sock *unix_get_socket(struct
+
+ static LIST_HEAD(unix_unvisited_vertices);
+
++enum unix_vertex_index {
++ UNIX_VERTEX_INDEX_UNVISITED,
++ UNIX_VERTEX_INDEX_START,
++};
++
+ static void unix_add_edge(struct scm_fp_list *fpl, struct unix_edge *edge)
+ {
+ struct unix_vertex *vertex = edge->predecessor->vertex;
+@@ -241,6 +246,73 @@ void unix_destroy_fpl(struct scm_fp_list
+ unix_free_vertices(fpl);
+ }
+
++static LIST_HEAD(unix_visited_vertices);
++
++static void __unix_walk_scc(struct unix_vertex *vertex)
++{
++ unsigned long index = UNIX_VERTEX_INDEX_START;
++ struct unix_edge *edge;
++ LIST_HEAD(edge_stack);
++
++next_vertex:
++ vertex->index = index;
++ index++;
++
++ /* Explore neighbour vertices (receivers of the current vertex's fd). */
++ list_for_each_entry(edge, &vertex->edges, vertex_entry) {
++ struct unix_vertex *next_vertex = edge->successor->vertex;
++
++ if (!next_vertex)
++ continue;
++
++ if (next_vertex->index == UNIX_VERTEX_INDEX_UNVISITED) {
++ /* Iterative deepening depth first search
++ *
++ * 1. Push a forward edge to edge_stack and set
++ * the successor to vertex for the next iteration.
++ */
++ list_add(&edge->stack_entry, &edge_stack);
++
++ vertex = next_vertex;
++ goto next_vertex;
++
++ /* 2. Pop the edge directed to the current vertex
++ * and restore the ancestor for backtracking.
++ */
++prev_vertex:
++ edge = list_first_entry(&edge_stack, typeof(*edge), stack_entry);
++ list_del_init(&edge->stack_entry);
++
++ vertex = edge->predecessor->vertex;
++ }
++ }
++
++ /* Don't restart DFS from this vertex in unix_walk_scc(). */
++ list_move_tail(&vertex->entry, &unix_visited_vertices);
++
++ /* Need backtracking ? */
++ if (!list_empty(&edge_stack))
++ goto prev_vertex;
++}
++
++static void unix_walk_scc(void)
++{
++ struct unix_vertex *vertex;
++
++ list_for_each_entry(vertex, &unix_unvisited_vertices, entry)
++ vertex->index = UNIX_VERTEX_INDEX_UNVISITED;
++
++ /* Visit every vertex exactly once.
++ * __unix_walk_scc() moves visited vertices to unix_visited_vertices.
++ */
++ while (!list_empty(&unix_unvisited_vertices)) {
++ vertex = list_first_entry(&unix_unvisited_vertices, typeof(*vertex), entry);
++ __unix_walk_scc(vertex);
++ }
++
++ list_replace_init(&unix_visited_vertices, &unix_unvisited_vertices);
++}
++
+ static LIST_HEAD(gc_candidates);
+ static LIST_HEAD(gc_inflight_list);
+
+@@ -388,6 +460,8 @@ static void __unix_gc(struct work_struct
+
+ spin_lock(&unix_gc_lock);
+
++ unix_walk_scc();
++
+ /* First, select candidates for garbage collection. Only
+ * in-flight sockets are considered, and from those only ones
+ * which don't have any external reference.
--- /dev/null
+From stable+bounces-145856-greg=kroah.com@vger.kernel.org Wed May 21 16:52:44 2025
+From: Lee Jones <lee@kernel.org>
+Date: Wed, 21 May 2025 14:45:17 +0000
+Subject: af_unix: Link struct unix_edge when queuing skb.
+To: lee@kernel.org, "David S. Miller" <davem@davemloft.net>, Eric Dumazet <edumazet@google.com>, Jakub Kicinski <kuba@kernel.org>, Paolo Abeni <pabeni@redhat.com>, Kuniyuki Iwashima <kuniyu@amazon.com>, Jens Axboe <axboe@kernel.dk>, Sasha Levin <sashal@kernel.org>, Michal Luczaj <mhal@rbox.co>, Rao Shoaib <Rao.Shoaib@oracle.com>, Pavel Begunkov <asml.silence@gmail.com>, linux-kernel@vger.kernel.org, netdev@vger.kernel.org
+Cc: stable@vger.kernel.org
+Message-ID: <20250521144803.2050504-10-lee@kernel.org>
+
+From: Kuniyuki Iwashima <kuniyu@amazon.com>
+
+commit 42f298c06b30bfe0a8cbee5d38644e618699e26e upstream.
+
+Just before queuing an skb with inflight fds, we call scm_stat_add(),
+which is a good place to set up the preallocated struct unix_vertex
+and struct unix_edge in UNIXCB(skb).fp.
+
+Then, we call unix_add_edges() and construct the directed graph
+as follows:
+
+ 1. Set the inflight socket's unix_sock to unix_edge.predecessor.
+ 2. Set the receiver's unix_sock to unix_edge.successor.
+ 3. Set the preallocated vertex to inflight socket's unix_sock.vertex.
+ 4. Link inflight socket's unix_vertex.entry to unix_unvisited_vertices.
+ 5. Link unix_edge.vertex_entry to the inflight socket's unix_vertex.edges.
+
+Let's say we pass the fd of AF_UNIX socket A to B and the fd of B
+to C. The graph looks like this:
+
+ +-------------------------+
+ | unix_unvisited_vertices | <-------------------------.
+ +-------------------------+ |
+ + |
+ | +--------------+ +--------------+ | +--------------+
+ | | unix_sock A | <---. .---> | unix_sock B | <-|-. .---> | unix_sock C |
+ | +--------------+ | | +--------------+ | | | +--------------+
+ | .-+ | vertex | | | .-+ | vertex | | | | | vertex |
+ | | +--------------+ | | | +--------------+ | | | +--------------+
+ | | | | | | | |
+ | | +--------------+ | | | +--------------+ | | |
+ | '-> | unix_vertex | | | '-> | unix_vertex | | | |
+ | +--------------+ | | +--------------+ | | |
+ `---> | entry | +---------> | entry | +-' | |
+ |--------------| | | |--------------| | |
+ | edges | <-. | | | edges | <-. | |
+ +--------------+ | | | +--------------+ | | |
+ | | | | | |
+ .----------------------' | | .----------------------' | |
+ | | | | | |
+ | +--------------+ | | | +--------------+ | |
+ | | unix_edge | | | | | unix_edge | | |
+ | +--------------+ | | | +--------------+ | |
+ `-> | vertex_entry | | | `-> | vertex_entry | | |
+ |--------------| | | |--------------| | |
+ | predecessor | +---' | | predecessor | +---' |
+ |--------------| | |--------------| |
+ | successor | +-----' | successor | +-----'
+ +--------------+ +--------------+
+
+Henceforth, we denote such a graph as A -> B (-> C).
+
+Now, we can express all inflight fd graphs that do not contain
+embryo sockets. We will support that particular case later.
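+
+From userspace, a graph like A -> B -> C is what the following
+sequence creates (sketch only; pass_fd() is an SCM_RIGHTS helper like
+the one sketched elsewhere in this series, error handling omitted):
+
+  int a[2], b[2], c[2];
+
+  socketpair(AF_UNIX, SOCK_STREAM, 0, a);
+  socketpair(AF_UNIX, SOCK_STREAM, 0, b);
+  socketpair(AF_UNIX, SOCK_STREAM, 0, c);
+
+  pass_fd(b[0], a[0]);  /* A's fd queued on b[1]: edge A -> B */
+  pass_fd(c[0], b[0]);  /* B's fd queued on c[1]: edge B -> C */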
+
+Signed-off-by: Kuniyuki Iwashima <kuniyu@amazon.com>
+Acked-by: Paolo Abeni <pabeni@redhat.com>
+Link: https://lore.kernel.org/r/20240325202425.60930-4-kuniyu@amazon.com
+Signed-off-by: Jakub Kicinski <kuba@kernel.org>
+Signed-off-by: Lee Jones <lee@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ include/net/af_unix.h | 2 +
+ include/net/scm.h | 1
+ net/core/scm.c | 2 +
+ net/unix/af_unix.c | 8 +++-
+ net/unix/garbage.c | 90 +++++++++++++++++++++++++++++++++++++++++++++++++-
+ 5 files changed, 100 insertions(+), 3 deletions(-)
+
+--- a/include/net/af_unix.h
++++ b/include/net/af_unix.h
+@@ -22,6 +22,8 @@ extern unsigned int unix_tot_inflight;
+
+ void unix_inflight(struct user_struct *user, struct file *fp);
+ void unix_notinflight(struct user_struct *user, struct file *fp);
++void unix_add_edges(struct scm_fp_list *fpl, struct unix_sock *receiver);
++void unix_del_edges(struct scm_fp_list *fpl);
+ int unix_prepare_fpl(struct scm_fp_list *fpl);
+ void unix_destroy_fpl(struct scm_fp_list *fpl);
+ void unix_gc(void);
+--- a/include/net/scm.h
++++ b/include/net/scm.h
+@@ -31,6 +31,7 @@ struct scm_fp_list {
+ short count_unix;
+ short max;
+ #ifdef CONFIG_UNIX
++ bool inflight;
+ struct list_head vertices;
+ struct unix_edge *edges;
+ #endif
+--- a/net/core/scm.c
++++ b/net/core/scm.c
+@@ -90,6 +90,7 @@ static int scm_fp_copy(struct cmsghdr *c
+ fpl->max = SCM_MAX_FD;
+ fpl->user = NULL;
+ #if IS_ENABLED(CONFIG_UNIX)
++ fpl->inflight = false;
+ fpl->edges = NULL;
+ INIT_LIST_HEAD(&fpl->vertices);
+ #endif
+@@ -384,6 +385,7 @@ struct scm_fp_list *scm_fp_dup(struct sc
+ new_fpl->max = new_fpl->count;
+ new_fpl->user = get_uid(fpl->user);
+ #if IS_ENABLED(CONFIG_UNIX)
++ new_fpl->inflight = false;
+ new_fpl->edges = NULL;
+ INIT_LIST_HEAD(&new_fpl->vertices);
+ #endif
+--- a/net/unix/af_unix.c
++++ b/net/unix/af_unix.c
+@@ -1920,8 +1920,10 @@ static void scm_stat_add(struct sock *sk
+ struct scm_fp_list *fp = UNIXCB(skb).fp;
+ struct unix_sock *u = unix_sk(sk);
+
+- if (unlikely(fp && fp->count))
++ if (unlikely(fp && fp->count)) {
+ atomic_add(fp->count, &u->scm_stat.nr_fds);
++ unix_add_edges(fp, u);
++ }
+ }
+
+ static void scm_stat_del(struct sock *sk, struct sk_buff *skb)
+@@ -1929,8 +1931,10 @@ static void scm_stat_del(struct sock *sk
+ struct scm_fp_list *fp = UNIXCB(skb).fp;
+ struct unix_sock *u = unix_sk(sk);
+
+- if (unlikely(fp && fp->count))
++ if (unlikely(fp && fp->count)) {
+ atomic_sub(fp->count, &u->scm_stat.nr_fds);
++ unix_del_edges(fp);
++ }
+ }
+
+ /*
+--- a/net/unix/garbage.c
++++ b/net/unix/garbage.c
+@@ -101,6 +101,38 @@ struct unix_sock *unix_get_socket(struct
+ return NULL;
+ }
+
++static LIST_HEAD(unix_unvisited_vertices);
++
++static void unix_add_edge(struct scm_fp_list *fpl, struct unix_edge *edge)
++{
++ struct unix_vertex *vertex = edge->predecessor->vertex;
++
++ if (!vertex) {
++ vertex = list_first_entry(&fpl->vertices, typeof(*vertex), entry);
++ vertex->out_degree = 0;
++ INIT_LIST_HEAD(&vertex->edges);
++
++ list_move_tail(&vertex->entry, &unix_unvisited_vertices);
++ edge->predecessor->vertex = vertex;
++ }
++
++ vertex->out_degree++;
++ list_add_tail(&edge->vertex_entry, &vertex->edges);
++}
++
++static void unix_del_edge(struct scm_fp_list *fpl, struct unix_edge *edge)
++{
++ struct unix_vertex *vertex = edge->predecessor->vertex;
++
++ list_del(&edge->vertex_entry);
++ vertex->out_degree--;
++
++ if (!vertex->out_degree) {
++ edge->predecessor->vertex = NULL;
++ list_move_tail(&vertex->entry, &fpl->vertices);
++ }
++}
++
+ static void unix_free_vertices(struct scm_fp_list *fpl)
+ {
+ struct unix_vertex *vertex, *next_vertex;
+@@ -111,6 +143,60 @@ static void unix_free_vertices(struct sc
+ }
+ }
+
++DEFINE_SPINLOCK(unix_gc_lock);
++
++void unix_add_edges(struct scm_fp_list *fpl, struct unix_sock *receiver)
++{
++ int i = 0, j = 0;
++
++ spin_lock(&unix_gc_lock);
++
++ if (!fpl->count_unix)
++ goto out;
++
++ do {
++ struct unix_sock *inflight = unix_get_socket(fpl->fp[j++]);
++ struct unix_edge *edge;
++
++ if (!inflight)
++ continue;
++
++ edge = fpl->edges + i++;
++ edge->predecessor = inflight;
++ edge->successor = receiver;
++
++ unix_add_edge(fpl, edge);
++ } while (i < fpl->count_unix);
++
++out:
++ spin_unlock(&unix_gc_lock);
++
++ fpl->inflight = true;
++
++ unix_free_vertices(fpl);
++}
++
++void unix_del_edges(struct scm_fp_list *fpl)
++{
++ int i = 0;
++
++ spin_lock(&unix_gc_lock);
++
++ if (!fpl->count_unix)
++ goto out;
++
++ do {
++ struct unix_edge *edge = fpl->edges + i++;
++
++ unix_del_edge(fpl, edge);
++ } while (i < fpl->count_unix);
++
++out:
++ spin_unlock(&unix_gc_lock);
++
++ fpl->inflight = false;
++}
++
+ int unix_prepare_fpl(struct scm_fp_list *fpl)
+ {
+ struct unix_vertex *vertex;
+@@ -141,11 +227,13 @@ err:
+
+ void unix_destroy_fpl(struct scm_fp_list *fpl)
+ {
++ if (fpl->inflight)
++ unix_del_edges(fpl);
++
+ kvfree(fpl->edges);
+ unix_free_vertices(fpl);
+ }
+
+-DEFINE_SPINLOCK(unix_gc_lock);
+ unsigned int unix_tot_inflight;
+ static LIST_HEAD(gc_candidates);
+ static LIST_HEAD(gc_inflight_list);
--- /dev/null
+From stable+bounces-145853-greg=kroah.com@vger.kernel.org Wed May 21 16:51:31 2025
+From: Lee Jones <lee@kernel.org>
+Date: Wed, 21 May 2025 14:45:14 +0000
+Subject: af_unix: Remove CONFIG_UNIX_SCM.
+To: lee@kernel.org, "David S. Miller" <davem@davemloft.net>, Eric Dumazet <edumazet@google.com>, Jakub Kicinski <kuba@kernel.org>, Paolo Abeni <pabeni@redhat.com>, Kuniyuki Iwashima <kuniyu@amazon.com>, Jens Axboe <axboe@kernel.dk>, Sasha Levin <sashal@kernel.org>, Michal Luczaj <mhal@rbox.co>, Rao Shoaib <Rao.Shoaib@oracle.com>, Pavel Begunkov <asml.silence@gmail.com>, linux-kernel@vger.kernel.org, netdev@vger.kernel.org
+Cc: stable@vger.kernel.org
+Message-ID: <20250521144803.2050504-7-lee@kernel.org>
+
+From: Kuniyuki Iwashima <kuniyu@amazon.com>
+
+commit 99a7a5b9943ea2d05fb0dee38e4ae2290477ed83 upstream.
+
+Originally, the code related to garbage collection was all in garbage.c.
+
+Commit f4e65870e5ce ("net: split out functions related to registering
+inflight socket files") moved some functions to scm.c for io_uring and
+added CONFIG_UNIX_SCM just in case AF_UNIX was built as a module.
+
+However, since commit 97154bcf4d1b ("af_unix: Kconfig: make CONFIG_UNIX
+bool"), AF_UNIX is no longer built separately. Also, io_uring no longer
+supports SCM_RIGHTS.
+
+Let's move the functions back to garbage.c.
+
+Signed-off-by: Kuniyuki Iwashima <kuniyu@amazon.com>
+Acked-by: Jens Axboe <axboe@kernel.dk>
+Link: https://lore.kernel.org/r/20240129190435.57228-4-kuniyu@amazon.com
+Signed-off-by: Jakub Kicinski <kuba@kernel.org>
+Signed-off-by: Lee Jones <lee@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ include/net/af_unix.h | 7 +-
+ net/Makefile | 2
+ net/unix/Kconfig | 5 -
+ net/unix/Makefile | 2
+ net/unix/af_unix.c | 63 ++++++++++++++++++++-
+ net/unix/garbage.c | 73 +++++++++++++++++++++++-
+ net/unix/scm.c | 150 --------------------------------------------------
+ net/unix/scm.h | 10 ---
+ 8 files changed, 137 insertions(+), 175 deletions(-)
+ delete mode 100644 net/unix/scm.c
+ delete mode 100644 net/unix/scm.h
+
+--- a/include/net/af_unix.h
++++ b/include/net/af_unix.h
+@@ -17,19 +17,20 @@ static inline struct unix_sock *unix_get
+ }
+ #endif
+
++extern spinlock_t unix_gc_lock;
++extern unsigned int unix_tot_inflight;
++
+ void unix_inflight(struct user_struct *user, struct file *fp);
+ void unix_notinflight(struct user_struct *user, struct file *fp);
+-void unix_destruct_scm(struct sk_buff *skb);
+ void unix_gc(void);
+ void wait_for_unix_gc(struct scm_fp_list *fpl);
++
+ struct sock *unix_peer_get(struct sock *sk);
+
+ #define UNIX_HASH_MOD (256 - 1)
+ #define UNIX_HASH_SIZE (256 * 2)
+ #define UNIX_HASH_BITS 8
+
+-extern unsigned int unix_tot_inflight;
+-
+ struct unix_address {
+ refcount_t refcnt;
+ int len;
+--- a/net/Makefile
++++ b/net/Makefile
+@@ -17,7 +17,7 @@ obj-$(CONFIG_NETFILTER) += netfilter/
+ obj-$(CONFIG_INET) += ipv4/
+ obj-$(CONFIG_TLS) += tls/
+ obj-$(CONFIG_XFRM) += xfrm/
+-obj-$(CONFIG_UNIX_SCM) += unix/
++obj-$(CONFIG_UNIX) += unix/
+ obj-y += ipv6/
+ obj-$(CONFIG_BPFILTER) += bpfilter/
+ obj-$(CONFIG_PACKET) += packet/
+--- a/net/unix/Kconfig
++++ b/net/unix/Kconfig
+@@ -16,11 +16,6 @@ config UNIX
+
+ Say Y unless you know what you are doing.
+
+-config UNIX_SCM
+- bool
+- depends on UNIX
+- default y
+-
+ config AF_UNIX_OOB
+ bool
+ depends on UNIX
+--- a/net/unix/Makefile
++++ b/net/unix/Makefile
+@@ -11,5 +11,3 @@ unix-$(CONFIG_BPF_SYSCALL) += unix_bpf.o
+
+ obj-$(CONFIG_UNIX_DIAG) += unix_diag.o
+ unix_diag-y := diag.o
+-
+-obj-$(CONFIG_UNIX_SCM) += scm.o
+--- a/net/unix/af_unix.c
++++ b/net/unix/af_unix.c
+@@ -117,8 +117,6 @@
+ #include <linux/file.h>
+ #include <linux/btf_ids.h>
+
+-#include "scm.h"
+-
+ static atomic_long_t unix_nr_socks;
+ static struct hlist_head bsd_socket_buckets[UNIX_HASH_SIZE / 2];
+ static spinlock_t bsd_socket_locks[UNIX_HASH_SIZE / 2];
+@@ -1752,6 +1750,52 @@ out:
+ return err;
+ }
+
++/* The "user->unix_inflight" variable is protected by the garbage
++ * collection lock, and we just read it locklessly here. If you go
++ * over the limit, there might be a tiny race in actually noticing
++ * it across threads. Tough.
++ */
++static inline bool too_many_unix_fds(struct task_struct *p)
++{
++ struct user_struct *user = current_user();
++
++ if (unlikely(READ_ONCE(user->unix_inflight) > task_rlimit(p, RLIMIT_NOFILE)))
++ return !capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN);
++ return false;
++}
++
++static int unix_attach_fds(struct scm_cookie *scm, struct sk_buff *skb)
++{
++ int i;
++
++ if (too_many_unix_fds(current))
++ return -ETOOMANYREFS;
++
++ /* Need to duplicate file references for the sake of garbage
++ * collection. Otherwise a socket in the fps might become a
++ * candidate for GC while the skb is not yet queued.
++ */
++ UNIXCB(skb).fp = scm_fp_dup(scm->fp);
++ if (!UNIXCB(skb).fp)
++ return -ENOMEM;
++
++ for (i = scm->fp->count - 1; i >= 0; i--)
++ unix_inflight(scm->fp->user, scm->fp->fp[i]);
++
++ return 0;
++}
++
++static void unix_detach_fds(struct scm_cookie *scm, struct sk_buff *skb)
++{
++ int i;
++
++ scm->fp = UNIXCB(skb).fp;
++ UNIXCB(skb).fp = NULL;
++
++ for (i = scm->fp->count - 1; i >= 0; i--)
++ unix_notinflight(scm->fp->user, scm->fp->fp[i]);
++}
++
+ static void unix_peek_fds(struct scm_cookie *scm, struct sk_buff *skb)
+ {
+ scm->fp = scm_fp_dup(UNIXCB(skb).fp);
+@@ -1799,6 +1843,21 @@ static void unix_peek_fds(struct scm_coo
+ spin_unlock(&unix_gc_lock);
+ }
+
++static void unix_destruct_scm(struct sk_buff *skb)
++{
++ struct scm_cookie scm;
++
++ memset(&scm, 0, sizeof(scm));
++ scm.pid = UNIXCB(skb).pid;
++ if (UNIXCB(skb).fp)
++ unix_detach_fds(&scm, skb);
++
++ /* Alas, it calls VFS */
++ /* So fscking what? fput() had been SMP-safe since the last Summer */
++ scm_destroy(&scm);
++ sock_wfree(skb);
++}
++
+ static int unix_scm_to_skb(struct scm_cookie *scm, struct sk_buff *skb, bool send_fds)
+ {
+ int err = 0;
+--- a/net/unix/garbage.c
++++ b/net/unix/garbage.c
+@@ -81,11 +81,80 @@
+ #include <net/scm.h>
+ #include <net/tcp_states.h>
+
+-#include "scm.h"
++struct unix_sock *unix_get_socket(struct file *filp)
++{
++ struct inode *inode = file_inode(filp);
+
+-/* Internal data structures and random procedures: */
++ /* Socket ? */
++ if (S_ISSOCK(inode->i_mode) && !(filp->f_mode & FMODE_PATH)) {
++ struct socket *sock = SOCKET_I(inode);
++ const struct proto_ops *ops;
++ struct sock *sk = sock->sk;
+
++ ops = READ_ONCE(sock->ops);
++
++ /* PF_UNIX ? */
++ if (sk && ops && ops->family == PF_UNIX)
++ return unix_sk(sk);
++ }
++
++ return NULL;
++}
++
++DEFINE_SPINLOCK(unix_gc_lock);
++unsigned int unix_tot_inflight;
+ static LIST_HEAD(gc_candidates);
++static LIST_HEAD(gc_inflight_list);
++
++/* Keep the number of times in flight count for the file
++ * descriptor if it is for an AF_UNIX socket.
++ */
++void unix_inflight(struct user_struct *user, struct file *filp)
++{
++ struct unix_sock *u = unix_get_socket(filp);
++
++ spin_lock(&unix_gc_lock);
++
++ if (u) {
++ if (!u->inflight) {
++ WARN_ON_ONCE(!list_empty(&u->link));
++ list_add_tail(&u->link, &gc_inflight_list);
++ } else {
++ WARN_ON_ONCE(list_empty(&u->link));
++ }
++ u->inflight++;
++
++ /* Paired with READ_ONCE() in wait_for_unix_gc() */
++ WRITE_ONCE(unix_tot_inflight, unix_tot_inflight + 1);
++ }
++
++ WRITE_ONCE(user->unix_inflight, user->unix_inflight + 1);
++
++ spin_unlock(&unix_gc_lock);
++}
++
++void unix_notinflight(struct user_struct *user, struct file *filp)
++{
++ struct unix_sock *u = unix_get_socket(filp);
++
++ spin_lock(&unix_gc_lock);
++
++ if (u) {
++ WARN_ON_ONCE(!u->inflight);
++ WARN_ON_ONCE(list_empty(&u->link));
++
++ u->inflight--;
++ if (!u->inflight)
++ list_del_init(&u->link);
++
++ /* Paired with READ_ONCE() in wait_for_unix_gc() */
++ WRITE_ONCE(unix_tot_inflight, unix_tot_inflight - 1);
++ }
++
++ WRITE_ONCE(user->unix_inflight, user->unix_inflight - 1);
++
++ spin_unlock(&unix_gc_lock);
++}
+
+ static void scan_inflight(struct sock *x, void (*func)(struct unix_sock *),
+ struct sk_buff_head *hitlist)
+--- a/net/unix/scm.c
++++ /dev/null
+@@ -1,150 +0,0 @@
+-// SPDX-License-Identifier: GPL-2.0
+-#include <linux/module.h>
+-#include <linux/kernel.h>
+-#include <linux/string.h>
+-#include <linux/socket.h>
+-#include <linux/net.h>
+-#include <linux/fs.h>
+-#include <net/af_unix.h>
+-#include <net/scm.h>
+-#include <linux/init.h>
+-#include <linux/io_uring.h>
+-
+-#include "scm.h"
+-
+-unsigned int unix_tot_inflight;
+-EXPORT_SYMBOL(unix_tot_inflight);
+-
+-LIST_HEAD(gc_inflight_list);
+-EXPORT_SYMBOL(gc_inflight_list);
+-
+-DEFINE_SPINLOCK(unix_gc_lock);
+-EXPORT_SYMBOL(unix_gc_lock);
+-
+-struct unix_sock *unix_get_socket(struct file *filp)
+-{
+- struct inode *inode = file_inode(filp);
+-
+- /* Socket ? */
+- if (S_ISSOCK(inode->i_mode) && !(filp->f_mode & FMODE_PATH)) {
+- struct socket *sock = SOCKET_I(inode);
+- const struct proto_ops *ops = READ_ONCE(sock->ops);
+- struct sock *s = sock->sk;
+-
+- /* PF_UNIX ? */
+- if (s && ops && ops->family == PF_UNIX)
+- return unix_sk(s);
+- }
+-
+- return NULL;
+-}
+-EXPORT_SYMBOL(unix_get_socket);
+-
+-/* Keep the number of times in flight count for the file
+- * descriptor if it is for an AF_UNIX socket.
+- */
+-void unix_inflight(struct user_struct *user, struct file *fp)
+-{
+- struct unix_sock *u = unix_get_socket(fp);
+-
+- spin_lock(&unix_gc_lock);
+-
+- if (u) {
+- if (!u->inflight) {
+- WARN_ON_ONCE(!list_empty(&u->link));
+- list_add_tail(&u->link, &gc_inflight_list);
+- } else {
+- WARN_ON_ONCE(list_empty(&u->link));
+- }
+- u->inflight++;
+- /* Paired with READ_ONCE() in wait_for_unix_gc() */
+- WRITE_ONCE(unix_tot_inflight, unix_tot_inflight + 1);
+- }
+- WRITE_ONCE(user->unix_inflight, user->unix_inflight + 1);
+- spin_unlock(&unix_gc_lock);
+-}
+-
+-void unix_notinflight(struct user_struct *user, struct file *fp)
+-{
+- struct unix_sock *u = unix_get_socket(fp);
+-
+- spin_lock(&unix_gc_lock);
+-
+- if (u) {
+- WARN_ON_ONCE(!u->inflight);
+- WARN_ON_ONCE(list_empty(&u->link));
+-
+- u->inflight--;
+- if (!u->inflight)
+- list_del_init(&u->link);
+- /* Paired with READ_ONCE() in wait_for_unix_gc() */
+- WRITE_ONCE(unix_tot_inflight, unix_tot_inflight - 1);
+- }
+- WRITE_ONCE(user->unix_inflight, user->unix_inflight - 1);
+- spin_unlock(&unix_gc_lock);
+-}
+-
+-/*
+- * The "user->unix_inflight" variable is protected by the garbage
+- * collection lock, and we just read it locklessly here. If you go
+- * over the limit, there might be a tiny race in actually noticing
+- * it across threads. Tough.
+- */
+-static inline bool too_many_unix_fds(struct task_struct *p)
+-{
+- struct user_struct *user = current_user();
+-
+- if (unlikely(READ_ONCE(user->unix_inflight) > task_rlimit(p, RLIMIT_NOFILE)))
+- return !capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN);
+- return false;
+-}
+-
+-int unix_attach_fds(struct scm_cookie *scm, struct sk_buff *skb)
+-{
+- int i;
+-
+- if (too_many_unix_fds(current))
+- return -ETOOMANYREFS;
+-
+- /*
+- * Need to duplicate file references for the sake of garbage
+- * collection. Otherwise a socket in the fps might become a
+- * candidate for GC while the skb is not yet queued.
+- */
+- UNIXCB(skb).fp = scm_fp_dup(scm->fp);
+- if (!UNIXCB(skb).fp)
+- return -ENOMEM;
+-
+- for (i = scm->fp->count - 1; i >= 0; i--)
+- unix_inflight(scm->fp->user, scm->fp->fp[i]);
+- return 0;
+-}
+-EXPORT_SYMBOL(unix_attach_fds);
+-
+-void unix_detach_fds(struct scm_cookie *scm, struct sk_buff *skb)
+-{
+- int i;
+-
+- scm->fp = UNIXCB(skb).fp;
+- UNIXCB(skb).fp = NULL;
+-
+- for (i = scm->fp->count-1; i >= 0; i--)
+- unix_notinflight(scm->fp->user, scm->fp->fp[i]);
+-}
+-EXPORT_SYMBOL(unix_detach_fds);
+-
+-void unix_destruct_scm(struct sk_buff *skb)
+-{
+- struct scm_cookie scm;
+-
+- memset(&scm, 0, sizeof(scm));
+- scm.pid = UNIXCB(skb).pid;
+- if (UNIXCB(skb).fp)
+- unix_detach_fds(&scm, skb);
+-
+- /* Alas, it calls VFS */
+- /* So fscking what? fput() had been SMP-safe since the last Summer */
+- scm_destroy(&scm);
+- sock_wfree(skb);
+-}
+-EXPORT_SYMBOL(unix_destruct_scm);
+--- a/net/unix/scm.h
++++ /dev/null
+@@ -1,10 +0,0 @@
+-#ifndef NET_UNIX_SCM_H
+-#define NET_UNIX_SCM_H
+-
+-extern struct list_head gc_inflight_list;
+-extern spinlock_t unix_gc_lock;
+-
+-int unix_attach_fds(struct scm_cookie *scm, struct sk_buff *skb);
+-void unix_detach_fds(struct scm_cookie *scm, struct sk_buff *skb);
+-
+-#endif
--- /dev/null
+From stable+bounces-145852-greg=kroah.com@vger.kernel.org Wed May 21 16:51:43 2025
+From: Lee Jones <lee@kernel.org>
+Date: Wed, 21 May 2025 14:45:13 +0000
+Subject: af_unix: Remove io_uring code for GC.
+To: lee@kernel.org, "David S. Miller" <davem@davemloft.net>, Eric Dumazet <edumazet@google.com>, Jakub Kicinski <kuba@kernel.org>, Paolo Abeni <pabeni@redhat.com>, Kuniyuki Iwashima <kuniyu@amazon.com>, Jens Axboe <axboe@kernel.dk>, Sasha Levin <sashal@kernel.org>, Michal Luczaj <mhal@rbox.co>, Rao Shoaib <Rao.Shoaib@oracle.com>, Pavel Begunkov <asml.silence@gmail.com>, linux-kernel@vger.kernel.org, netdev@vger.kernel.org
+Cc: stable@vger.kernel.org
+Message-ID: <20250521144803.2050504-6-lee@kernel.org>
+
+From: Kuniyuki Iwashima <kuniyu@amazon.com>
+
+commit 11498715f266a3fb4caabba9dd575636cbcaa8f1 upstream.
+
+Since commit 705318a99a13 ("io_uring/af_unix: disable sending
+io_uring over sockets"), io_uring's unix socket cannot be passed
+via SCM_RIGHTS, so it no longer contributes to cyclic references
+and is no longer a candidate for garbage collection.
+
+Also, commit 6e5e6d274956 ("io_uring: drop any code related to
+SCM_RIGHTS") cleaned up SCM_RIGHTS code in io_uring.
+
+Let's do it in AF_UNIX as well by reverting commit 0091bfc81741
+("io_uring/af_unix: defer registered files gc to io_uring release")
+and commit 10369080454d ("net: reclaim skb->scm_io_uring bit").
+
+Signed-off-by: Kuniyuki Iwashima <kuniyu@amazon.com>
+Acked-by: Jens Axboe <axboe@kernel.dk>
+Link: https://lore.kernel.org/r/20240129190435.57228-3-kuniyu@amazon.com
+Signed-off-by: Jakub Kicinski <kuba@kernel.org>
+Signed-off-by: Lee Jones <lee@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ include/net/af_unix.h | 1 -
+ net/unix/garbage.c | 25 ++-----------------------
+ net/unix/scm.c | 6 ------
+ 3 files changed, 2 insertions(+), 30 deletions(-)
+
+--- a/include/net/af_unix.h
++++ b/include/net/af_unix.h
+@@ -20,7 +20,6 @@ static inline struct unix_sock *unix_get
+ void unix_inflight(struct user_struct *user, struct file *fp);
+ void unix_notinflight(struct user_struct *user, struct file *fp);
+ void unix_destruct_scm(struct sk_buff *skb);
+-void io_uring_destruct_scm(struct sk_buff *skb);
+ void unix_gc(void);
+ void wait_for_unix_gc(struct scm_fp_list *fpl);
+ struct sock *unix_peer_get(struct sock *sk);
+--- a/net/unix/garbage.c
++++ b/net/unix/garbage.c
+@@ -184,12 +184,10 @@ static bool gc_in_progress;
+
+ static void __unix_gc(struct work_struct *work)
+ {
+- struct sk_buff *next_skb, *skb;
+- struct unix_sock *u;
+- struct unix_sock *next;
+ struct sk_buff_head hitlist;
+- struct list_head cursor;
++ struct unix_sock *u, *next;
+ LIST_HEAD(not_cycle_list);
++ struct list_head cursor;
+
+ spin_lock(&unix_gc_lock);
+
+@@ -293,30 +291,11 @@ static void __unix_gc(struct work_struct
+
+ spin_unlock(&unix_gc_lock);
+
+- /* We need io_uring to clean its registered files, ignore all io_uring
+- * originated skbs. It's fine as io_uring doesn't keep references to
+- * other io_uring instances and so killing all other files in the cycle
+- * will put all io_uring references forcing it to go through normal
+- * release.path eventually putting registered files.
+- */
+- skb_queue_walk_safe(&hitlist, skb, next_skb) {
+- if (skb->destructor == io_uring_destruct_scm) {
+- __skb_unlink(skb, &hitlist);
+- skb_queue_tail(&skb->sk->sk_receive_queue, skb);
+- }
+- }
+-
+ /* Here we are. Hitlist is filled. Die. */
+ __skb_queue_purge(&hitlist);
+
+ spin_lock(&unix_gc_lock);
+
+- /* There could be io_uring registered files, just push them back to
+- * the inflight list
+- */
+- list_for_each_entry_safe(u, next, &gc_candidates, link)
+- list_move_tail(&u->link, &gc_inflight_list);
+-
+ /* All candidates should have been detached by now. */
+ WARN_ON_ONCE(!list_empty(&gc_candidates));
+
+--- a/net/unix/scm.c
++++ b/net/unix/scm.c
+@@ -148,9 +148,3 @@ void unix_destruct_scm(struct sk_buff *s
+ sock_wfree(skb);
+ }
+ EXPORT_SYMBOL(unix_destruct_scm);
+-
+-void io_uring_destruct_scm(struct sk_buff *skb)
+-{
+- unix_destruct_scm(skb);
+-}
+-EXPORT_SYMBOL(io_uring_destruct_scm);
--- /dev/null
+From stable+bounces-145868-greg=kroah.com@vger.kernel.org Wed May 21 16:56:33 2025
+From: Lee Jones <lee@kernel.org>
+Date: Wed, 21 May 2025 14:45:29 +0000
+Subject: af_unix: Remove lock dance in unix_peek_fds().
+To: lee@kernel.org, "David S. Miller" <davem@davemloft.net>, Eric Dumazet <edumazet@google.com>, Jakub Kicinski <kuba@kernel.org>, Paolo Abeni <pabeni@redhat.com>, Kuniyuki Iwashima <kuniyu@amazon.com>, Jens Axboe <axboe@kernel.dk>, Sasha Levin <sashal@kernel.org>, Michal Luczaj <mhal@rbox.co>, Rao Shoaib <Rao.Shoaib@oracle.com>, Pavel Begunkov <asml.silence@gmail.com>, linux-kernel@vger.kernel.org, netdev@vger.kernel.org
+Cc: stable@vger.kernel.org
+Message-ID: <20250521144803.2050504-22-lee@kernel.org>
+
+From: Kuniyuki Iwashima <kuniyu@amazon.com>
+
+commit 118f457da9ed58a79e24b73c2ef0aa1987241f0e upstream.
+
+In the previous GC implementation, the shape of the inflight socket
+graph was not expected to change while GC was in progress.
+
+MSG_PEEK was tricky because it could install an inflight fd silently
+and transform the graph.
+
+Let's say we peeked an fd, which was a listening socket, and accept()ed
+some embryo sockets from it. The garbage collection algorithm would
+have been confused because the set of sockets visited in scan_inflight()
+would change within the same GC invocation.
+
+That's why we placed spin_lock(&unix_gc_lock) and spin_unlock() in
+unix_peek_fds() with a fat comment.
+
+In the new GC implementation, we no longer garbage-collect the socket
+if it exists in another queue, that is, if it has a bridge to another
+SCC. Also, accept() will require the lock if it has edges.
+
+Thus, we need not do the complicated lock dance.
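+
+For context, a peek that installs fds looks like this from userspace
+(sketch, error handling omitted; 'sock' is assumed to be a connected
+AF_UNIX socket):
+
+  char data, cbuf[CMSG_SPACE(sizeof(int))];
+  struct iovec iov = { .iov_base = &data, .iov_len = 1 };
+  struct msghdr mh = {
+          .msg_iov = &iov, .msg_iovlen = 1,
+          .msg_control = cbuf, .msg_controllen = sizeof(cbuf),
+  };
+  struct cmsghdr *cm;
+  int fd = -1;
+
+  /* The skb stays on the receive queue, so the inflight count is
+   * unchanged, yet a new reference is installed into our fd table.
+   */
+  recvmsg(sock, &mh, MSG_PEEK);
+
+  cm = CMSG_FIRSTHDR(&mh);
+  if (cm && cm->cmsg_level == SOL_SOCKET && cm->cmsg_type == SCM_RIGHTS)
+          memcpy(&fd, CMSG_DATA(cm), sizeof(int));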
+
+Signed-off-by: Kuniyuki Iwashima <kuniyu@amazon.com>
+Link: https://lore.kernel.org/r/20240401173125.92184-3-kuniyu@amazon.com
+Signed-off-by: Jakub Kicinski <kuba@kernel.org>
+Signed-off-by: Lee Jones <lee@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ include/net/af_unix.h | 1 -
+ net/unix/af_unix.c | 42 ------------------------------------------
+ net/unix/garbage.c | 2 +-
+ 3 files changed, 1 insertion(+), 44 deletions(-)
+
+--- a/include/net/af_unix.h
++++ b/include/net/af_unix.h
+@@ -17,7 +17,6 @@ static inline struct unix_sock *unix_get
+ }
+ #endif
+
+-extern spinlock_t unix_gc_lock;
+ extern unsigned int unix_tot_inflight;
+ void unix_add_edges(struct scm_fp_list *fpl, struct unix_sock *receiver);
+ void unix_del_edges(struct scm_fp_list *fpl);
+--- a/net/unix/af_unix.c
++++ b/net/unix/af_unix.c
+@@ -1796,48 +1796,6 @@ static void unix_detach_fds(struct scm_c
+ static void unix_peek_fds(struct scm_cookie *scm, struct sk_buff *skb)
+ {
+ scm->fp = scm_fp_dup(UNIXCB(skb).fp);
+-
+- /*
+- * Garbage collection of unix sockets starts by selecting a set of
+- * candidate sockets which have reference only from being in flight
+- * (total_refs == inflight_refs). This condition is checked once during
+- * the candidate collection phase, and candidates are marked as such, so
+- * that non-candidates can later be ignored. While inflight_refs is
+- * protected by unix_gc_lock, total_refs (file count) is not, hence this
+- * is an instantaneous decision.
+- *
+- * Once a candidate, however, the socket must not be reinstalled into a
+- * file descriptor while the garbage collection is in progress.
+- *
+- * If the above conditions are met, then the directed graph of
+- * candidates (*) does not change while unix_gc_lock is held.
+- *
+- * Any operations that changes the file count through file descriptors
+- * (dup, close, sendmsg) does not change the graph since candidates are
+- * not installed in fds.
+- *
+- * Dequeing a candidate via recvmsg would install it into an fd, but
+- * that takes unix_gc_lock to decrement the inflight count, so it's
+- * serialized with garbage collection.
+- *
+- * MSG_PEEK is special in that it does not change the inflight count,
+- * yet does install the socket into an fd. The following lock/unlock
+- * pair is to ensure serialization with garbage collection. It must be
+- * done between incrementing the file count and installing the file into
+- * an fd.
+- *
+- * If garbage collection starts after the barrier provided by the
+- * lock/unlock, then it will see the elevated refcount and not mark this
+- * as a candidate. If a garbage collection is already in progress
+- * before the file count was incremented, then the lock/unlock pair will
+- * ensure that garbage collection is finished before progressing to
+- * installing the fd.
+- *
+- * (*) A -> B where B is on the queue of A or B is on the queue of C
+- * which is on the queue of listening socket A.
+- */
+- spin_lock(&unix_gc_lock);
+- spin_unlock(&unix_gc_lock);
+ }
+
+ static void unix_destruct_scm(struct sk_buff *skb)
+--- a/net/unix/garbage.c
++++ b/net/unix/garbage.c
+@@ -183,7 +183,7 @@ static void unix_free_vertices(struct sc
+ }
+ }
+
+-DEFINE_SPINLOCK(unix_gc_lock);
++static DEFINE_SPINLOCK(unix_gc_lock);
+ unsigned int unix_tot_inflight;
+
+ void unix_add_edges(struct scm_fp_list *fpl, struct unix_sock *receiver)
--- /dev/null
+From stable+bounces-145851-greg=kroah.com@vger.kernel.org Wed May 21 16:51:24 2025
+From: Lee Jones <lee@kernel.org>
+Date: Wed, 21 May 2025 14:45:12 +0000
+Subject: af_unix: Replace BUG_ON() with WARN_ON_ONCE().
+To: lee@kernel.org, "David S. Miller" <davem@davemloft.net>, Eric Dumazet <edumazet@google.com>, Jakub Kicinski <kuba@kernel.org>, Paolo Abeni <pabeni@redhat.com>, Kuniyuki Iwashima <kuniyu@amazon.com>, Jens Axboe <axboe@kernel.dk>, Sasha Levin <sashal@kernel.org>, Michal Luczaj <mhal@rbox.co>, Rao Shoaib <Rao.Shoaib@oracle.com>, Simon Horman <horms@kernel.org>, linux-kernel@vger.kernel.org, netdev@vger.kernel.org
+Cc: stable@vger.kernel.org
+Message-ID: <20250521144803.2050504-5-lee@kernel.org>
+
+From: Kuniyuki Iwashima <kuniyu@amazon.com>
+
+commit d0f6dc26346863e1f4a23117f5468614e54df064 upstream.
+
+This is a prep patch for the last patch in this series so that
+checkpatch will not warn about BUG_ON().
+
+Signed-off-by: Kuniyuki Iwashima <kuniyu@amazon.com>
+Acked-by: Jens Axboe <axboe@kernel.dk>
+Link: https://lore.kernel.org/r/20240129190435.57228-2-kuniyu@amazon.com
+Signed-off-by: Jakub Kicinski <kuba@kernel.org>
+Signed-off-by: Lee Jones <lee@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ net/unix/garbage.c | 8 ++++----
+ net/unix/scm.c | 8 ++++----
+ 2 files changed, 8 insertions(+), 8 deletions(-)
+
+--- a/net/unix/garbage.c
++++ b/net/unix/garbage.c
+@@ -145,7 +145,7 @@ static void scan_children(struct sock *x
+ /* An embryo cannot be in-flight, so it's safe
+ * to use the list link.
+ */
+- BUG_ON(!list_empty(&u->link));
++ WARN_ON_ONCE(!list_empty(&u->link));
+ list_add_tail(&u->link, &embryos);
+ }
+ spin_unlock(&x->sk_receive_queue.lock);
+@@ -224,8 +224,8 @@ static void __unix_gc(struct work_struct
+
+ total_refs = file_count(sk->sk_socket->file);
+
+- BUG_ON(!u->inflight);
+- BUG_ON(total_refs < u->inflight);
++ WARN_ON_ONCE(!u->inflight);
++ WARN_ON_ONCE(total_refs < u->inflight);
+ if (total_refs == u->inflight) {
+ list_move_tail(&u->link, &gc_candidates);
+ __set_bit(UNIX_GC_CANDIDATE, &u->gc_flags);
+@@ -318,7 +318,7 @@ static void __unix_gc(struct work_struct
+ list_move_tail(&u->link, &gc_inflight_list);
+
+ /* All candidates should have been detached by now. */
+- BUG_ON(!list_empty(&gc_candidates));
++ WARN_ON_ONCE(!list_empty(&gc_candidates));
+
+ /* Paired with READ_ONCE() in wait_for_unix_gc(). */
+ WRITE_ONCE(gc_in_progress, false);
+--- a/net/unix/scm.c
++++ b/net/unix/scm.c
+@@ -51,10 +51,10 @@ void unix_inflight(struct user_struct *u
+
+ if (u) {
+ if (!u->inflight) {
+- BUG_ON(!list_empty(&u->link));
++ WARN_ON_ONCE(!list_empty(&u->link));
+ list_add_tail(&u->link, &gc_inflight_list);
+ } else {
+- BUG_ON(list_empty(&u->link));
++ WARN_ON_ONCE(list_empty(&u->link));
+ }
+ u->inflight++;
+ /* Paired with READ_ONCE() in wait_for_unix_gc() */
+@@ -71,8 +71,8 @@ void unix_notinflight(struct user_struct
+ spin_lock(&unix_gc_lock);
+
+ if (u) {
+- BUG_ON(!u->inflight);
+- BUG_ON(list_empty(&u->link));
++ WARN_ON_ONCE(!u->inflight);
++ WARN_ON_ONCE(list_empty(&u->link));
+
+ u->inflight--;
+ if (!u->inflight)
--- /dev/null
+From stable+bounces-145867-greg=kroah.com@vger.kernel.org Wed May 21 16:59:26 2025
+From: Lee Jones <lee@kernel.org>
+Date: Wed, 21 May 2025 14:45:28 +0000
+Subject: af_unix: Replace garbage collection algorithm.
+To: lee@kernel.org, "David S. Miller" <davem@davemloft.net>, Eric Dumazet <edumazet@google.com>, Jakub Kicinski <kuba@kernel.org>, Paolo Abeni <pabeni@redhat.com>, Kuniyuki Iwashima <kuniyu@amazon.com>, Jens Axboe <axboe@kernel.dk>, Sasha Levin <sashal@kernel.org>, Michal Luczaj <mhal@rbox.co>, Rao Shoaib <Rao.Shoaib@oracle.com>, Pavel Begunkov <asml.silence@gmail.com>, linux-kernel@vger.kernel.org, netdev@vger.kernel.org
+Cc: stable@vger.kernel.org
+Message-ID: <20250521144803.2050504-21-lee@kernel.org>
+
+From: Kuniyuki Iwashima <kuniyu@amazon.com>
+
+commit 4090fa373f0e763c43610853d2774b5979915959 upstream.
+
+If we find a dead SCC during iteration, we call unix_collect_skb()
+to splice all skbs in the SCC onto the global sk_buff_head, hitlist.
+
+After iterating over all SCCs, we unlock unix_gc_lock and purge the
+queue.
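+
+This is the usual splice-then-purge pattern: collect under the lock,
+free outside it (sketch with a hypothetical dead queue, mirroring the
+hunks below; freeing an skb that carries fds ends up in
+unix_del_edges(), which takes unix_gc_lock itself):
+
+  struct sk_buff_head hitlist;
+
+  __skb_queue_head_init(&hitlist);
+
+  spin_lock(&unix_gc_lock);
+  /* For each dead SCC, splice its receive queues onto hitlist. */
+  skb_queue_splice_init(&dead_queue, &hitlist);  /* hypothetical queue */
+  spin_unlock(&unix_gc_lock);
+
+  /* Free everything with unix_gc_lock released. */
+  __skb_queue_purge(&hitlist);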
+
+Signed-off-by: Kuniyuki Iwashima <kuniyu@amazon.com>
+Acked-by: Paolo Abeni <pabeni@redhat.com>
+Link: https://lore.kernel.org/r/20240325202425.60930-15-kuniyu@amazon.com
+Signed-off-by: Jakub Kicinski <kuba@kernel.org>
+Signed-off-by: Lee Jones <lee@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ include/net/af_unix.h | 8 -
+ net/unix/af_unix.c | 12 -
+ net/unix/garbage.c | 318 ++++++++++----------------------------------------
+ 3 files changed, 64 insertions(+), 274 deletions(-)
+
+--- a/include/net/af_unix.h
++++ b/include/net/af_unix.h
+@@ -19,9 +19,6 @@ static inline struct unix_sock *unix_get
+
+ extern spinlock_t unix_gc_lock;
+ extern unsigned int unix_tot_inflight;
+-
+-void unix_inflight(struct user_struct *user, struct file *fp);
+-void unix_notinflight(struct user_struct *user, struct file *fp);
+ void unix_add_edges(struct scm_fp_list *fpl, struct unix_sock *receiver);
+ void unix_del_edges(struct scm_fp_list *fpl);
+ void unix_update_edges(struct unix_sock *receiver);
+@@ -85,12 +82,7 @@ struct unix_sock {
+ struct sock *peer;
+ struct sock *listener;
+ struct unix_vertex *vertex;
+- struct list_head link;
+- unsigned long inflight;
+ spinlock_t lock;
+- unsigned long gc_flags;
+-#define UNIX_GC_CANDIDATE 0
+-#define UNIX_GC_MAYBE_CYCLE 1
+ struct socket_wq peer_wq;
+ wait_queue_entry_t peer_wake;
+ struct scm_stat scm_stat;
+--- a/net/unix/af_unix.c
++++ b/net/unix/af_unix.c
+@@ -979,12 +979,10 @@ static struct sock *unix_create1(struct
+ sk->sk_destruct = unix_sock_destructor;
+ u = unix_sk(sk);
+ u->listener = NULL;
+- u->inflight = 0;
+ u->vertex = NULL;
+ u->path.dentry = NULL;
+ u->path.mnt = NULL;
+ spin_lock_init(&u->lock);
+- INIT_LIST_HEAD(&u->link);
+ mutex_init(&u->iolock); /* single task reading lock */
+ mutex_init(&u->bindlock); /* single task binding lock */
+ init_waitqueue_head(&u->peer_wait);
+@@ -1770,8 +1768,6 @@ static inline bool too_many_unix_fds(str
+
+ static int unix_attach_fds(struct scm_cookie *scm, struct sk_buff *skb)
+ {
+- int i;
+-
+ if (too_many_unix_fds(current))
+ return -ETOOMANYREFS;
+
+@@ -1783,9 +1779,6 @@ static int unix_attach_fds(struct scm_co
+ if (!UNIXCB(skb).fp)
+ return -ENOMEM;
+
+- for (i = scm->fp->count - 1; i >= 0; i--)
+- unix_inflight(scm->fp->user, scm->fp->fp[i]);
+-
+ if (unix_prepare_fpl(UNIXCB(skb).fp))
+ return -ENOMEM;
+
+@@ -1794,15 +1787,10 @@ static int unix_attach_fds(struct scm_co
+
+ static void unix_detach_fds(struct scm_cookie *scm, struct sk_buff *skb)
+ {
+- int i;
+-
+ scm->fp = UNIXCB(skb).fp;
+ UNIXCB(skb).fp = NULL;
+
+ unix_destroy_fpl(scm->fp);
+-
+- for (i = scm->fp->count - 1; i >= 0; i--)
+- unix_notinflight(scm->fp->user, scm->fp->fp[i]);
+ }
+
+ static void unix_peek_fds(struct scm_cookie *scm, struct sk_buff *skb)
+--- a/net/unix/garbage.c
++++ b/net/unix/garbage.c
+@@ -322,6 +322,52 @@ static bool unix_vertex_dead(struct unix
+ return true;
+ }
+
++enum unix_recv_queue_lock_class {
++ U_RECVQ_LOCK_NORMAL,
++ U_RECVQ_LOCK_EMBRYO,
++};
++
++static void unix_collect_skb(struct list_head *scc, struct sk_buff_head *hitlist)
++{
++ struct unix_vertex *vertex;
++
++ list_for_each_entry_reverse(vertex, scc, scc_entry) {
++ struct sk_buff_head *queue;
++ struct unix_edge *edge;
++ struct unix_sock *u;
++
++ edge = list_first_entry(&vertex->edges, typeof(*edge), vertex_entry);
++ u = edge->predecessor;
++ queue = &u->sk.sk_receive_queue;
++
++ spin_lock(&queue->lock);
++
++ if (u->sk.sk_state == TCP_LISTEN) {
++ struct sk_buff *skb;
++
++ skb_queue_walk(queue, skb) {
++ struct sk_buff_head *embryo_queue = &skb->sk->sk_receive_queue;
++
++ /* listener -> embryo order, the inversion never happens. */
++ spin_lock_nested(&embryo_queue->lock, U_RECVQ_LOCK_EMBRYO);
++ skb_queue_splice_init(embryo_queue, hitlist);
++ spin_unlock(&embryo_queue->lock);
++ }
++ } else {
++ skb_queue_splice_init(queue, hitlist);
++
++#if IS_ENABLED(CONFIG_AF_UNIX_OOB)
++ if (u->oob_skb) {
++ kfree_skb(u->oob_skb);
++ u->oob_skb = NULL;
++ }
++#endif
++ }
++
++ spin_unlock(&queue->lock);
++ }
++}
++
+ static bool unix_scc_cyclic(struct list_head *scc)
+ {
+ struct unix_vertex *vertex;
+@@ -345,7 +391,8 @@ static bool unix_scc_cyclic(struct list_
+ static LIST_HEAD(unix_visited_vertices);
+ static unsigned long unix_vertex_grouped_index = UNIX_VERTEX_INDEX_MARK2;
+
+-static void __unix_walk_scc(struct unix_vertex *vertex, unsigned long *last_index)
++static void __unix_walk_scc(struct unix_vertex *vertex, unsigned long *last_index,
++ struct sk_buff_head *hitlist)
+ {
+ LIST_HEAD(vertex_stack);
+ struct unix_edge *edge;
+@@ -430,7 +477,9 @@ prev_vertex:
+ scc_dead = unix_vertex_dead(vertex);
+ }
+
+- if (!unix_graph_maybe_cyclic)
++ if (scc_dead)
++ unix_collect_skb(&scc, hitlist);
++ else if (!unix_graph_maybe_cyclic)
+ unix_graph_maybe_cyclic = unix_scc_cyclic(&scc);
+
+ list_del(&scc);
+@@ -441,7 +490,7 @@ prev_vertex:
+ goto prev_vertex;
+ }
+
+-static void unix_walk_scc(void)
++static void unix_walk_scc(struct sk_buff_head *hitlist)
+ {
+ unsigned long last_index = UNIX_VERTEX_INDEX_START;
+
+@@ -454,7 +503,7 @@ static void unix_walk_scc(void)
+ struct unix_vertex *vertex;
+
+ vertex = list_first_entry(&unix_unvisited_vertices, typeof(*vertex), entry);
+- __unix_walk_scc(vertex, &last_index);
++ __unix_walk_scc(vertex, &last_index, hitlist);
+ }
+
+ list_replace_init(&unix_visited_vertices, &unix_unvisited_vertices);
+@@ -463,7 +512,7 @@ static void unix_walk_scc(void)
+ unix_graph_grouped = true;
+ }
+
+-static void unix_walk_scc_fast(void)
++static void unix_walk_scc_fast(struct sk_buff_head *hitlist)
+ {
+ while (!list_empty(&unix_unvisited_vertices)) {
+ struct unix_vertex *vertex;
+@@ -480,279 +529,40 @@ static void unix_walk_scc_fast(void)
+ scc_dead = unix_vertex_dead(vertex);
+ }
+
++ if (scc_dead)
++ unix_collect_skb(&scc, hitlist);
++
+ list_del(&scc);
+ }
+
+ list_replace_init(&unix_visited_vertices, &unix_unvisited_vertices);
+ }
+
+-static LIST_HEAD(gc_candidates);
+-static LIST_HEAD(gc_inflight_list);
+-
+-/* Keep the number of times in flight count for the file
+- * descriptor if it is for an AF_UNIX socket.
+- */
+-void unix_inflight(struct user_struct *user, struct file *filp)
+-{
+- struct unix_sock *u = unix_get_socket(filp);
+-
+- spin_lock(&unix_gc_lock);
+-
+- if (u) {
+- if (!u->inflight) {
+- WARN_ON_ONCE(!list_empty(&u->link));
+- list_add_tail(&u->link, &gc_inflight_list);
+- } else {
+- WARN_ON_ONCE(list_empty(&u->link));
+- }
+- u->inflight++;
+- }
+-
+- spin_unlock(&unix_gc_lock);
+-}
+-
+-void unix_notinflight(struct user_struct *user, struct file *filp)
+-{
+- struct unix_sock *u = unix_get_socket(filp);
+-
+- spin_lock(&unix_gc_lock);
+-
+- if (u) {
+- WARN_ON_ONCE(!u->inflight);
+- WARN_ON_ONCE(list_empty(&u->link));
+-
+- u->inflight--;
+- if (!u->inflight)
+- list_del_init(&u->link);
+- }
+-
+- spin_unlock(&unix_gc_lock);
+-}
+-
+-static void scan_inflight(struct sock *x, void (*func)(struct unix_sock *),
+- struct sk_buff_head *hitlist)
+-{
+- struct sk_buff *skb;
+- struct sk_buff *next;
+-
+- spin_lock(&x->sk_receive_queue.lock);
+- skb_queue_walk_safe(&x->sk_receive_queue, skb, next) {
+- /* Do we have file descriptors ? */
+- if (UNIXCB(skb).fp) {
+- bool hit = false;
+- /* Process the descriptors of this socket */
+- int nfd = UNIXCB(skb).fp->count;
+- struct file **fp = UNIXCB(skb).fp->fp;
+-
+- while (nfd--) {
+- /* Get the socket the fd matches if it indeed does so */
+- struct unix_sock *u = unix_get_socket(*fp++);
+-
+- /* Ignore non-candidates, they could have been added
+- * to the queues after starting the garbage collection
+- */
+- if (u && test_bit(UNIX_GC_CANDIDATE, &u->gc_flags)) {
+- hit = true;
+-
+- func(u);
+- }
+- }
+- if (hit && hitlist != NULL) {
+- __skb_unlink(skb, &x->sk_receive_queue);
+- __skb_queue_tail(hitlist, skb);
+- }
+- }
+- }
+- spin_unlock(&x->sk_receive_queue.lock);
+-}
+-
+-static void scan_children(struct sock *x, void (*func)(struct unix_sock *),
+- struct sk_buff_head *hitlist)
+-{
+- if (x->sk_state != TCP_LISTEN) {
+- scan_inflight(x, func, hitlist);
+- } else {
+- struct sk_buff *skb;
+- struct sk_buff *next;
+- struct unix_sock *u;
+- LIST_HEAD(embryos);
+-
+- /* For a listening socket collect the queued embryos
+- * and perform a scan on them as well.
+- */
+- spin_lock(&x->sk_receive_queue.lock);
+- skb_queue_walk_safe(&x->sk_receive_queue, skb, next) {
+- u = unix_sk(skb->sk);
+-
+- /* An embryo cannot be in-flight, so it's safe
+- * to use the list link.
+- */
+- WARN_ON_ONCE(!list_empty(&u->link));
+- list_add_tail(&u->link, &embryos);
+- }
+- spin_unlock(&x->sk_receive_queue.lock);
+-
+- while (!list_empty(&embryos)) {
+- u = list_entry(embryos.next, struct unix_sock, link);
+- scan_inflight(&u->sk, func, hitlist);
+- list_del_init(&u->link);
+- }
+- }
+-}
+-
+-static void dec_inflight(struct unix_sock *usk)
+-{
+- usk->inflight--;
+-}
+-
+-static void inc_inflight(struct unix_sock *usk)
+-{
+- usk->inflight++;
+-}
+-
+-static void inc_inflight_move_tail(struct unix_sock *u)
+-{
+- u->inflight++;
+-
+- /* If this still might be part of a cycle, move it to the end
+- * of the list, so that it's checked even if it was already
+- * passed over
+- */
+- if (test_bit(UNIX_GC_MAYBE_CYCLE, &u->gc_flags))
+- list_move_tail(&u->link, &gc_candidates);
+-}
+-
+ static bool gc_in_progress;
+
+ static void __unix_gc(struct work_struct *work)
+ {
+ struct sk_buff_head hitlist;
+- struct unix_sock *u, *next;
+- LIST_HEAD(not_cycle_list);
+- struct list_head cursor;
+
+ spin_lock(&unix_gc_lock);
+
+- if (!unix_graph_maybe_cyclic)
++ if (!unix_graph_maybe_cyclic) {
++ spin_unlock(&unix_gc_lock);
+ goto skip_gc;
+-
+- if (unix_graph_grouped)
+- unix_walk_scc_fast();
+- else
+- unix_walk_scc();
+-
+- /* First, select candidates for garbage collection. Only
+- * in-flight sockets are considered, and from those only ones
+- * which don't have any external reference.
+- *
+- * Holding unix_gc_lock will protect these candidates from
+- * being detached, and hence from gaining an external
+- * reference. Since there are no possible receivers, all
+- * buffers currently on the candidates' queues stay there
+- * during the garbage collection.
+- *
+- * We also know that no new candidate can be added onto the
+- * receive queues. Other, non candidate sockets _can_ be
+- * added to queue, so we must make sure only to touch
+- * candidates.
+- *
+- * Embryos, though never candidates themselves, affect which
+- * candidates are reachable by the garbage collector. Before
+- * being added to a listener's queue, an embryo may already
+- * receive data carrying SCM_RIGHTS, potentially making the
+- * passed socket a candidate that is not yet reachable by the
+- * collector. It becomes reachable once the embryo is
+- * enqueued. Therefore, we must ensure that no SCM-laden
+- * embryo appears in a (candidate) listener's queue between
+- * consecutive scan_children() calls.
+- */
+- list_for_each_entry_safe(u, next, &gc_inflight_list, link) {
+- struct sock *sk = &u->sk;
+- long total_refs;
+-
+- total_refs = file_count(sk->sk_socket->file);
+-
+- WARN_ON_ONCE(!u->inflight);
+- WARN_ON_ONCE(total_refs < u->inflight);
+- if (total_refs == u->inflight) {
+- list_move_tail(&u->link, &gc_candidates);
+- __set_bit(UNIX_GC_CANDIDATE, &u->gc_flags);
+- __set_bit(UNIX_GC_MAYBE_CYCLE, &u->gc_flags);
+-
+- if (sk->sk_state == TCP_LISTEN) {
+- unix_state_lock_nested(sk, U_LOCK_GC_LISTENER);
+- unix_state_unlock(sk);
+- }
+- }
+- }
+-
+- /* Now remove all internal in-flight reference to children of
+- * the candidates.
+- */
+- list_for_each_entry(u, &gc_candidates, link)
+- scan_children(&u->sk, dec_inflight, NULL);
+-
+- /* Restore the references for children of all candidates,
+- * which have remaining references. Do this recursively, so
+- * only those remain, which form cyclic references.
+- *
+- * Use a "cursor" link, to make the list traversal safe, even
+- * though elements might be moved about.
+- */
+- list_add(&cursor, &gc_candidates);
+- while (cursor.next != &gc_candidates) {
+- u = list_entry(cursor.next, struct unix_sock, link);
+-
+- /* Move cursor to after the current position. */
+- list_move(&cursor, &u->link);
+-
+- if (u->inflight) {
+- list_move_tail(&u->link, ¬_cycle_list);
+- __clear_bit(UNIX_GC_MAYBE_CYCLE, &u->gc_flags);
+- scan_children(&u->sk, inc_inflight_move_tail, NULL);
+- }
+ }
+- list_del(&cursor);
+
+- /* Now gc_candidates contains only garbage. Restore original
+- * inflight counters for these as well, and remove the skbuffs
+- * which are creating the cycle(s).
+- */
+- skb_queue_head_init(&hitlist);
+- list_for_each_entry(u, &gc_candidates, link) {
+- scan_children(&u->sk, inc_inflight, &hitlist);
+-
+-#if IS_ENABLED(CONFIG_AF_UNIX_OOB)
+- if (u->oob_skb) {
+- kfree_skb(u->oob_skb);
+- u->oob_skb = NULL;
+- }
+-#endif
+- }
++ __skb_queue_head_init(&hitlist);
+
+- /* not_cycle_list contains those sockets which do not make up a
+- * cycle. Restore these to the inflight list.
+- */
+- while (!list_empty(¬_cycle_list)) {
+- u = list_entry(not_cycle_list.next, struct unix_sock, link);
+- __clear_bit(UNIX_GC_CANDIDATE, &u->gc_flags);
+- list_move_tail(&u->link, &gc_inflight_list);
+- }
++ if (unix_graph_grouped)
++ unix_walk_scc_fast(&hitlist);
++ else
++ unix_walk_scc(&hitlist);
+
+ spin_unlock(&unix_gc_lock);
+
+- /* Here we are. Hitlist is filled. Die. */
+ __skb_queue_purge(&hitlist);
+-
+- spin_lock(&unix_gc_lock);
+-
+- /* All candidates should have been detached by now. */
+- WARN_ON_ONCE(!list_empty(&gc_candidates));
+ skip_gc:
+- /* Paired with READ_ONCE() in wait_for_unix_gc(). */
+ WRITE_ONCE(gc_in_progress, false);
+-
+- spin_unlock(&unix_gc_lock);
+ }
+
+ static DECLARE_WORK(unix_gc_work, __unix_gc);
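+
+The control flow above reduces to a simple pattern: gather everything
+that must die onto a private list while holding the global lock, then
+free it only after dropping the lock. A minimal userspace sketch of
+that pattern follows (a pthread mutex and a plain linked list stand in
+for unix_gc_lock and the skb hitlist; all names are illustrative, not
+kernel APIs):
+
+#include <pthread.h>
+#include <stdlib.h>
+
+struct node { struct node *next; };
+
+static pthread_mutex_t gc_lock = PTHREAD_MUTEX_INITIALIZER;
+static struct node *inflight;           /* protected by gc_lock */
+
+static void gc_run(void)
+{
+        struct node *hitlist;
+
+        pthread_mutex_lock(&gc_lock);
+        /* Move all garbage onto a private hitlist. */
+        hitlist = inflight;
+        inflight = NULL;
+        pthread_mutex_unlock(&gc_lock);
+
+        /* Freeing can be slow, so it happens outside the lock,
+         * like __skb_queue_purge() after spin_unlock().
+         */
+        while (hitlist) {
+                struct node *n = hitlist;
+
+                hitlist = n->next;
+                free(n);
+        }
+}
+
+int main(void)
+{
+        inflight = calloc(1, sizeof(*inflight));  /* one fake candidate */
+        gc_run();
+        return 0;
+}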
--- /dev/null
+From stable+bounces-145848-greg=kroah.com@vger.kernel.org Wed May 21 16:50:06 2025
+From: Lee Jones <lee@kernel.org>
+Date: Wed, 21 May 2025 14:45:09 +0000
+Subject: af_unix: Return struct unix_sock from unix_get_socket().
+To: lee@kernel.org, "David S. Miller" <davem@davemloft.net>, Eric Dumazet <edumazet@google.com>, Jakub Kicinski <kuba@kernel.org>, Paolo Abeni <pabeni@redhat.com>, Kuniyuki Iwashima <kuniyu@amazon.com>, Jens Axboe <axboe@kernel.dk>, Sasha Levin <sashal@kernel.org>, Michal Luczaj <mhal@rbox.co>, Rao Shoaib <Rao.Shoaib@oracle.com>, Pavel Begunkov <asml.silence@gmail.com>, linux-kernel@vger.kernel.org, netdev@vger.kernel.org
+Cc: stable@vger.kernel.org, Simon Horman <horms@kernel.org>
+Message-ID: <20250521144803.2050504-2-lee@kernel.org>
+
+From: Kuniyuki Iwashima <kuniyu@amazon.com>
+
+commit 5b17307bd0789edea0675d524a2b277b93bbde62 upstream.
+
+Currently, unix_get_socket() returns struct sock, but after calling
+it, we always cast the result with unix_sk().
+
+Let's return struct unix_sock from unix_get_socket().
+
+Signed-off-by: Kuniyuki Iwashima <kuniyu@amazon.com>
+Acked-by: Pavel Begunkov <asml.silence@gmail.com>
+Reviewed-by: Simon Horman <horms@kernel.org>
+Link: https://lore.kernel.org/r/20240123170856.41348-4-kuniyu@amazon.com
+Signed-off-by: Jakub Kicinski <kuba@kernel.org>
+Signed-off-by: Lee Jones <lee@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ include/net/af_unix.h | 2 +-
+ net/unix/garbage.c | 19 +++++++------------
+ net/unix/scm.c | 19 +++++++------------
+ 3 files changed, 15 insertions(+), 25 deletions(-)
+
+--- a/include/net/af_unix.h
++++ b/include/net/af_unix.h
+@@ -14,7 +14,7 @@ void unix_destruct_scm(struct sk_buff *s
+ void io_uring_destruct_scm(struct sk_buff *skb);
+ void unix_gc(void);
+ void wait_for_unix_gc(void);
+-struct sock *unix_get_socket(struct file *filp);
++struct unix_sock *unix_get_socket(struct file *filp);
+ struct sock *unix_peer_get(struct sock *sk);
+
+ #define UNIX_HASH_MOD (256 - 1)
+--- a/net/unix/garbage.c
++++ b/net/unix/garbage.c
+@@ -105,20 +105,15 @@ static void scan_inflight(struct sock *x
+
+ while (nfd--) {
+ /* Get the socket the fd matches if it indeed does so */
+- struct sock *sk = unix_get_socket(*fp++);
++ struct unix_sock *u = unix_get_socket(*fp++);
+
+- if (sk) {
+- struct unix_sock *u = unix_sk(sk);
++ /* Ignore non-candidates, they could have been added
++ * to the queues after starting the garbage collection
++ */
++ if (u && test_bit(UNIX_GC_CANDIDATE, &u->gc_flags)) {
++ hit = true;
+
+- /* Ignore non-candidates, they could
+- * have been added to the queues after
+- * starting the garbage collection
+- */
+- if (test_bit(UNIX_GC_CANDIDATE, &u->gc_flags)) {
+- hit = true;
+-
+- func(u);
+- }
++ func(u);
+ }
+ }
+ if (hit && hitlist != NULL) {
+--- a/net/unix/scm.c
++++ b/net/unix/scm.c
+@@ -21,9 +21,8 @@ EXPORT_SYMBOL(gc_inflight_list);
+ DEFINE_SPINLOCK(unix_gc_lock);
+ EXPORT_SYMBOL(unix_gc_lock);
+
+-struct sock *unix_get_socket(struct file *filp)
++struct unix_sock *unix_get_socket(struct file *filp)
+ {
+- struct sock *u_sock = NULL;
+ struct inode *inode = file_inode(filp);
+
+ /* Socket ? */
+@@ -34,10 +33,10 @@ struct sock *unix_get_socket(struct file
+
+ /* PF_UNIX ? */
+ if (s && ops && ops->family == PF_UNIX)
+- u_sock = s;
++ return unix_sk(s);
+ }
+
+- return u_sock;
++ return NULL;
+ }
+ EXPORT_SYMBOL(unix_get_socket);
+
+@@ -46,13 +45,11 @@ EXPORT_SYMBOL(unix_get_socket);
+ */
+ void unix_inflight(struct user_struct *user, struct file *fp)
+ {
+- struct sock *s = unix_get_socket(fp);
++ struct unix_sock *u = unix_get_socket(fp);
+
+ spin_lock(&unix_gc_lock);
+
+- if (s) {
+- struct unix_sock *u = unix_sk(s);
+-
++ if (u) {
+ if (!u->inflight) {
+ BUG_ON(!list_empty(&u->link));
+ list_add_tail(&u->link, &gc_inflight_list);
+@@ -69,13 +66,11 @@ void unix_inflight(struct user_struct *u
+
+ void unix_notinflight(struct user_struct *user, struct file *fp)
+ {
+- struct sock *s = unix_get_socket(fp);
++ struct unix_sock *u = unix_get_socket(fp);
+
+ spin_lock(&unix_gc_lock);
+
+- if (s) {
+- struct unix_sock *u = unix_sk(s);
+-
++ if (u) {
+ BUG_ON(!u->inflight);
+ BUG_ON(list_empty(&u->link));
+
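+
+The pattern is the classic one of hoisting a type cast into the lookup
+helper so that callers stop repeating it. A small, purely illustrative
+C sketch of the resulting shape (simplified types, not the kernel's):
+
+#include <stdio.h>
+
+#define PF_UNIX 1
+
+struct sock { int family; };
+struct unix_sock { struct sock sk; };   /* sk is the first member */
+
+static struct unix_sock *unix_get_socket(struct sock *s)
+{
+        if (s && s->family == PF_UNIX)
+                return (struct unix_sock *)s;   /* unix_sk()-style cast */
+
+        return NULL;
+}
+
+int main(void)
+{
+        struct unix_sock u = { .sk = { .family = PF_UNIX } };
+
+        printf("%s\n", unix_get_socket(&u.sk) ? "PF_UNIX" : "other");
+        return 0;
+}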
--- /dev/null
+From stable+bounces-145849-greg=kroah.com@vger.kernel.org Wed May 21 16:50:23 2025
+From: Lee Jones <lee@kernel.org>
+Date: Wed, 21 May 2025 14:45:10 +0000
+Subject: af_unix: Run GC on only one CPU.
+To: lee@kernel.org, "David S. Miller" <davem@davemloft.net>, Eric Dumazet <edumazet@google.com>, Jakub Kicinski <kuba@kernel.org>, Paolo Abeni <pabeni@redhat.com>, Kuniyuki Iwashima <kuniyu@amazon.com>, Jens Axboe <axboe@kernel.dk>, Sasha Levin <sashal@kernel.org>, Michal Luczaj <mhal@rbox.co>, Rao Shoaib <Rao.Shoaib@oracle.com>, Simon Horman <horms@kernel.org>, linux-kernel@vger.kernel.org, netdev@vger.kernel.org
+Cc: stable@vger.kernel.org
+Message-ID: <20250521144803.2050504-3-lee@kernel.org>
+
+From: Kuniyuki Iwashima <kuniyu@amazon.com>
+
+commit 8b90a9f819dc2a06baae4ec1a64d875e53b824ec upstream.
+
+If more than 16000 inflight AF_UNIX sockets exist and the garbage
+collector is not running, unix_(dgram|stream)_sendmsg() call unix_gc().
+Also, they wait for unix_gc() to complete.
+
+In unix_gc(), all inflight AF_UNIX sockets are traversed at least once,
+and more than once if they are GC candidates. Thus, sendmsg()
+significantly slows down with too many inflight AF_UNIX sockets.
+
+There is also a small window in which multiple unix_gc() instances can
+be invoked; all but one of them are then blocked on the same spinlock.
+
+Let's convert unix_gc() to use a work item so that it will not consume
+CPUs unnecessarily.
+
+Note that WRITE_ONCE(gc_in_progress, true) is moved to before running
+the GC. If we left the WRITE_ONCE() where it was and used the following
+test to decide whether to call flush_work(), a process might miss it:
+
+ CPU 0 CPU 1
+ --- ---
+ start work and call __unix_gc()
+ if (work_pending(&unix_gc_work) || <-- false
+ READ_ONCE(gc_in_progress)) <-- false
+ flush_work(); <-- missed!
+ WRITE_ONCE(gc_in_progress, true)
+
+Signed-off-by: Kuniyuki Iwashima <kuniyu@amazon.com>
+Link: https://lore.kernel.org/r/20240123170856.41348-5-kuniyu@amazon.com
+Signed-off-by: Jakub Kicinski <kuba@kernel.org>
+Signed-off-by: Lee Jones <lee@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ net/unix/garbage.c | 54 ++++++++++++++++++++++++++---------------------------
+ 1 file changed, 27 insertions(+), 27 deletions(-)
+
+--- a/net/unix/garbage.c
++++ b/net/unix/garbage.c
+@@ -86,7 +86,6 @@
+ /* Internal data structures and random procedures: */
+
+ static LIST_HEAD(gc_candidates);
+-static DECLARE_WAIT_QUEUE_HEAD(unix_gc_wait);
+
+ static void scan_inflight(struct sock *x, void (*func)(struct unix_sock *),
+ struct sk_buff_head *hitlist)
+@@ -182,23 +181,8 @@ static void inc_inflight_move_tail(struc
+ }
+
+ static bool gc_in_progress;
+-#define UNIX_INFLIGHT_TRIGGER_GC 16000
+-
+-void wait_for_unix_gc(void)
+-{
+- /* If number of inflight sockets is insane,
+- * force a garbage collect right now.
+- * Paired with the WRITE_ONCE() in unix_inflight(),
+- * unix_notinflight() and gc_in_progress().
+- */
+- if (READ_ONCE(unix_tot_inflight) > UNIX_INFLIGHT_TRIGGER_GC &&
+- !READ_ONCE(gc_in_progress))
+- unix_gc();
+- wait_event(unix_gc_wait, !READ_ONCE(gc_in_progress));
+-}
+
+-/* The external entry point: unix_gc() */
+-void unix_gc(void)
++static void __unix_gc(struct work_struct *work)
+ {
+ struct sk_buff *next_skb, *skb;
+ struct unix_sock *u;
+@@ -209,13 +193,6 @@ void unix_gc(void)
+
+ spin_lock(&unix_gc_lock);
+
+- /* Avoid a recursive GC. */
+- if (gc_in_progress)
+- goto out;
+-
+- /* Paired with READ_ONCE() in wait_for_unix_gc(). */
+- WRITE_ONCE(gc_in_progress, true);
+-
+ /* First, select candidates for garbage collection. Only
+ * in-flight sockets are considered, and from those only ones
+ * which don't have any external reference.
+@@ -346,8 +323,31 @@ void unix_gc(void)
+ /* Paired with READ_ONCE() in wait_for_unix_gc(). */
+ WRITE_ONCE(gc_in_progress, false);
+
+- wake_up(&unix_gc_wait);
+-
+- out:
+ spin_unlock(&unix_gc_lock);
+ }
++
++static DECLARE_WORK(unix_gc_work, __unix_gc);
++
++void unix_gc(void)
++{
++ WRITE_ONCE(gc_in_progress, true);
++ queue_work(system_unbound_wq, &unix_gc_work);
++}
++
++#define UNIX_INFLIGHT_TRIGGER_GC 16000
++
++void wait_for_unix_gc(void)
++{
++ /* If number of inflight sockets is insane,
++ * force a garbage collect right now.
++ *
++ * Paired with the WRITE_ONCE() in unix_inflight(),
++ * unix_notinflight(), and __unix_gc().
++ */
++ if (READ_ONCE(unix_tot_inflight) > UNIX_INFLIGHT_TRIGGER_GC &&
++ !READ_ONCE(gc_in_progress))
++ unix_gc();
++
++ if (READ_ONCE(gc_in_progress))
++ flush_work(&unix_gc_work);
++}
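+
+The ordering constraint in the diagram above can be reproduced in a
+userspace sketch: the in-progress flag must be set before the work is
+started, and waiters flush only if they see the flag. A minimal
+pthread-based illustration (pthread_join stands in for flush_work();
+names are illustrative, not kernel APIs):
+
+#include <pthread.h>
+#include <stdatomic.h>
+#include <stdio.h>
+
+static atomic_bool gc_in_progress;
+
+static void *gc_worker(void *arg)
+{
+        (void)arg;
+        /* ... the actual collection would run here ... */
+        atomic_store(&gc_in_progress, false);  /* last step, as in __unix_gc() */
+        return NULL;
+}
+
+static void unix_gc(pthread_t *t)
+{
+        atomic_store(&gc_in_progress, true);   /* before starting the worker */
+        pthread_create(t, NULL, gc_worker, NULL);
+}
+
+int main(void)
+{
+        pthread_t t;
+
+        unix_gc(&t);
+        /* A real waiter would check the flag first, e.g.
+         * if (atomic_load(&gc_in_progress)) flush_work(...);
+         * here we simply join so the demo always reaps the thread.
+         */
+        pthread_join(t, NULL);
+        puts("gc done");
+        return 0;
+}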
--- /dev/null
+From stable+bounces-145860-greg=kroah.com@vger.kernel.org Wed May 21 16:55:46 2025
+From: Lee Jones <lee@kernel.org>
+Date: Wed, 21 May 2025 14:45:21 +0000
+Subject: af_unix: Save listener for embryo socket.
+To: lee@kernel.org, "David S. Miller" <davem@davemloft.net>, Eric Dumazet <edumazet@google.com>, Jakub Kicinski <kuba@kernel.org>, Paolo Abeni <pabeni@redhat.com>, Kuniyuki Iwashima <kuniyu@amazon.com>, Jens Axboe <axboe@kernel.dk>, Sasha Levin <sashal@kernel.org>, Michal Luczaj <mhal@rbox.co>, Rao Shoaib <Rao.Shoaib@oracle.com>, Pavel Begunkov <asml.silence@gmail.com>, linux-kernel@vger.kernel.org, netdev@vger.kernel.org
+Cc: stable@vger.kernel.org
+Message-ID: <20250521144803.2050504-14-lee@kernel.org>
+
+From: Kuniyuki Iwashima <kuniyu@amazon.com>
+
+commit aed6ecef55d70de3762ce41c561b7f547dbaf107 upstream.
+
+This is a prep patch for the following change, where we need to
+fetch the listening socket from the successor embryo socket
+during GC.
+
+We add a new field to struct unix_sock to save a pointer to a
+listening socket.
+
+We set it when connect() creates a new socket, and clear it when
+accept() is called.
+
+Signed-off-by: Kuniyuki Iwashima <kuniyu@amazon.com>
+Acked-by: Paolo Abeni <pabeni@redhat.com>
+Link: https://lore.kernel.org/r/20240325202425.60930-8-kuniyu@amazon.com
+Signed-off-by: Jakub Kicinski <kuba@kernel.org>
+Signed-off-by: Lee Jones <lee@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ include/net/af_unix.h | 1 +
+ net/unix/af_unix.c | 5 ++++-
+ 2 files changed, 5 insertions(+), 1 deletion(-)
+
+--- a/include/net/af_unix.h
++++ b/include/net/af_unix.h
+@@ -83,6 +83,7 @@ struct unix_sock {
+ struct path path;
+ struct mutex iolock, bindlock;
+ struct sock *peer;
++ struct sock *listener;
+ struct unix_vertex *vertex;
+ struct list_head link;
+ unsigned long inflight;
+--- a/net/unix/af_unix.c
++++ b/net/unix/af_unix.c
+@@ -978,6 +978,7 @@ static struct sock *unix_create1(struct
+ sk->sk_max_ack_backlog = READ_ONCE(net->unx.sysctl_max_dgram_qlen);
+ sk->sk_destruct = unix_sock_destructor;
+ u = unix_sk(sk);
++ u->listener = NULL;
+ u->inflight = 0;
+ u->vertex = NULL;
+ u->path.dentry = NULL;
+@@ -1582,6 +1583,7 @@ restart:
+ newsk->sk_type = sk->sk_type;
+ init_peercred(newsk);
+ newu = unix_sk(newsk);
++ newu->listener = other;
+ RCU_INIT_POINTER(newsk->sk_wq, &newu->peer_wq);
+ otheru = unix_sk(other);
+
+@@ -1677,8 +1679,8 @@ static int unix_accept(struct socket *so
+ bool kern)
+ {
+ struct sock *sk = sock->sk;
+- struct sock *tsk;
+ struct sk_buff *skb;
++ struct sock *tsk;
+ int err;
+
+ err = -EOPNOTSUPP;
+@@ -1703,6 +1705,7 @@ static int unix_accept(struct socket *so
+ }
+
+ tsk = skb->sk;
++ unix_sk(tsk)->listener = NULL;
+ skb_free_datagram(sk, skb);
+ wake_up_interruptible(&unix_sk(sk)->peer_wait);
+
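+
+The lifecycle of the new field is short: it is set when connect()
+creates the embryo and cleared when accept() hands the socket to
+userspace. A tiny illustrative sketch (simplified types, not the
+kernel's):
+
+#include <stdio.h>
+
+struct sock { int dummy; };
+struct unix_sock { struct sock *listener; };
+
+static void on_connect(struct unix_sock *embryo, struct sock *listener)
+{
+        embryo->listener = listener;    /* newu->listener = other; */
+}
+
+static void on_accept(struct unix_sock *embryo)
+{
+        embryo->listener = NULL;        /* unix_sk(tsk)->listener = NULL; */
+}
+
+int main(void)
+{
+        struct sock l = { 0 };
+        struct unix_sock embryo = { 0 };
+
+        on_connect(&embryo, &l);
+        on_accept(&embryo);
+        printf("cleared: %d\n", embryo.listener == NULL);
+        return 0;
+}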
--- /dev/null
+From stable+bounces-145862-greg=kroah.com@vger.kernel.org Wed May 21 16:54:49 2025
+From: Lee Jones <lee@kernel.org>
+Date: Wed, 21 May 2025 14:45:23 +0000
+Subject: af_unix: Save O(n) setup of Tarjan's algo.
+To: lee@kernel.org, "David S. Miller" <davem@davemloft.net>, Eric Dumazet <edumazet@google.com>, Jakub Kicinski <kuba@kernel.org>, Paolo Abeni <pabeni@redhat.com>, Kuniyuki Iwashima <kuniyu@amazon.com>, Jens Axboe <axboe@kernel.dk>, Sasha Levin <sashal@kernel.org>, Michal Luczaj <mhal@rbox.co>, Rao Shoaib <Rao.Shoaib@oracle.com>, Simon Horman <horms@kernel.org>, linux-kernel@vger.kernel.org, netdev@vger.kernel.org
+Cc: stable@vger.kernel.org
+Message-ID: <20250521144803.2050504-16-lee@kernel.org>
+
+From: Kuniyuki Iwashima <kuniyu@amazon.com>
+
+commit ba31b4a4e1018f5844c6eb31734976e2184f2f9a upstream.
+
+Before starting Tarjan's algorithm, we need to mark all vertices
+as unvisited. We can save this O(n) setup by reserving two special
+indices (0, 1) and using two variables.
+
+The first time we link a vertex to unix_unvisited_vertices, we set
+its index to unix_vertex_unvisited_index.
+
+During DFS, we can tell that a vertex is unvisited because its index
+is still unix_vertex_unvisited_index.
+
+When we finalise an SCC later, we set each vertex's index to
+unix_vertex_grouped_index.
+
+Then, for a visited vertex, we know (i) that it is on the stack if its
+index is >= 2 and (ii) that it is off the stack and belongs to a
+different SCC if its index is unix_vertex_grouped_index.
+
+After the whole algorithm, the indices of all vertices are set to
+unix_vertex_grouped_index.
+
+Next time we start DFS, we know that all unvisited vertices have
+unix_vertex_grouped_index, and we can use unix_vertex_unvisited_index
+as the not-on-stack marker.
+
+To use the same variable in __unix_walk_scc(), we can swap
+unix_vertex_(grouped|unvisited)_index at the end of Tarjan's
+algorithm.
+
+Signed-off-by: Kuniyuki Iwashima <kuniyu@amazon.com>
+Acked-by: Paolo Abeni <pabeni@redhat.com>
+Link: https://lore.kernel.org/r/20240325202425.60930-10-kuniyu@amazon.com
+Signed-off-by: Jakub Kicinski <kuba@kernel.org>
+Signed-off-by: Lee Jones <lee@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ include/net/af_unix.h | 1 -
+ net/unix/garbage.c | 26 +++++++++++++++-----------
+ 2 files changed, 15 insertions(+), 12 deletions(-)
+
+--- a/include/net/af_unix.h
++++ b/include/net/af_unix.h
+@@ -37,7 +37,6 @@ struct unix_vertex {
+ unsigned long out_degree;
+ unsigned long index;
+ unsigned long lowlink;
+- bool on_stack;
+ };
+
+ struct unix_edge {
+--- a/net/unix/garbage.c
++++ b/net/unix/garbage.c
+@@ -115,16 +115,20 @@ static struct unix_vertex *unix_edge_suc
+ static LIST_HEAD(unix_unvisited_vertices);
+
+ enum unix_vertex_index {
+- UNIX_VERTEX_INDEX_UNVISITED,
++ UNIX_VERTEX_INDEX_MARK1,
++ UNIX_VERTEX_INDEX_MARK2,
+ UNIX_VERTEX_INDEX_START,
+ };
+
++static unsigned long unix_vertex_unvisited_index = UNIX_VERTEX_INDEX_MARK1;
++
+ static void unix_add_edge(struct scm_fp_list *fpl, struct unix_edge *edge)
+ {
+ struct unix_vertex *vertex = edge->predecessor->vertex;
+
+ if (!vertex) {
+ vertex = list_first_entry(&fpl->vertices, typeof(*vertex), entry);
++ vertex->index = unix_vertex_unvisited_index;
+ vertex->out_degree = 0;
+ INIT_LIST_HEAD(&vertex->edges);
+
+@@ -265,6 +269,7 @@ void unix_destroy_fpl(struct scm_fp_list
+ }
+
+ static LIST_HEAD(unix_visited_vertices);
++static unsigned long unix_vertex_grouped_index = UNIX_VERTEX_INDEX_MARK2;
+
+ static void __unix_walk_scc(struct unix_vertex *vertex)
+ {
+@@ -274,10 +279,10 @@ static void __unix_walk_scc(struct unix_
+ LIST_HEAD(edge_stack);
+
+ next_vertex:
+- /* Push vertex to vertex_stack.
++ /* Push vertex to vertex_stack and mark it as on-stack
++ * (index >= UNIX_VERTEX_INDEX_START).
+ * The vertex will be popped when finalising SCC later.
+ */
+- vertex->on_stack = true;
+ list_add(&vertex->scc_entry, &vertex_stack);
+
+ vertex->index = index;
+@@ -291,7 +296,7 @@ next_vertex:
+ if (!next_vertex)
+ continue;
+
+- if (next_vertex->index == UNIX_VERTEX_INDEX_UNVISITED) {
++ if (next_vertex->index == unix_vertex_unvisited_index) {
+ /* Iterative deepening depth first search
+ *
+ * 1. Push a forward edge to edge_stack and set
+@@ -317,7 +322,7 @@ prev_vertex:
+ * to skip SCC finalisation.
+ */
+ vertex->lowlink = min(vertex->lowlink, next_vertex->lowlink);
+- } else if (next_vertex->on_stack) {
++ } else if (next_vertex->index != unix_vertex_grouped_index) {
+ /* Loop detected by a back/cross edge.
+ *
+ * The successor is on vertex_stack, so two vertices are
+@@ -344,7 +349,8 @@ prev_vertex:
+ /* Don't restart DFS from this vertex in unix_walk_scc(). */
+ list_move_tail(&vertex->entry, &unix_visited_vertices);
+
+- vertex->on_stack = false;
++ /* Mark vertex as off-stack. */
++ vertex->index = unix_vertex_grouped_index;
+ }
+
+ list_del(&scc);
+@@ -357,20 +363,18 @@ prev_vertex:
+
+ static void unix_walk_scc(void)
+ {
+- struct unix_vertex *vertex;
+-
+- list_for_each_entry(vertex, &unix_unvisited_vertices, entry)
+- vertex->index = UNIX_VERTEX_INDEX_UNVISITED;
+-
+ /* Visit every vertex exactly once.
+ * __unix_walk_scc() moves visited vertices to unix_visited_vertices.
+ */
+ while (!list_empty(&unix_unvisited_vertices)) {
++ struct unix_vertex *vertex;
++
+ vertex = list_first_entry(&unix_unvisited_vertices, typeof(*vertex), entry);
+ __unix_walk_scc(vertex);
+ }
+
+ list_replace_init(&unix_visited_vertices, &unix_unvisited_vertices);
++ swap(unix_vertex_unvisited_index, unix_vertex_grouped_index);
+ }
+
+ static LIST_HEAD(gc_candidates);
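+
+The trick generalises to any repeated graph traversal: rather than
+resetting every node's visited state before each run, keep two
+generation marks and swap them when the run finishes. A minimal
+self-contained sketch of that idea (illustrative names only):
+
+#include <stdio.h>
+
+#define MARK1           0UL
+#define MARK2           1UL
+#define INDEX_START     2UL
+
+static unsigned long unvisited_mark = MARK1;
+static unsigned long grouped_mark = MARK2;
+
+struct vertex { unsigned long index; };
+
+static void visit(struct vertex *v, unsigned long *next_index)
+{
+        if (v->index != unvisited_mark)
+                return;                         /* already grouped this run */
+
+        v->index = (*next_index)++;             /* >= INDEX_START: on stack */
+        /* ... DFS over successors would go here ... */
+        v->index = grouped_mark;                /* finalised: off stack */
+}
+
+int main(void)
+{
+        struct vertex g[3] = { { MARK1 }, { MARK1 }, { MARK1 } };
+        unsigned long next = INDEX_START, tmp;
+        int i;
+
+        for (i = 0; i < 3; i++)
+                visit(&g[i], &next);
+
+        /* Swap marks: the next run needs no O(n) reset. */
+        tmp = unvisited_mark;
+        unvisited_mark = grouped_mark;
+        grouped_mark = tmp;
+
+        printf("next run's unvisited mark: %lu\n", unvisited_mark);
+        return 0;
+}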
--- /dev/null
+From stable+bounces-145863-greg=kroah.com@vger.kernel.org Wed May 21 16:57:29 2025
+From: Lee Jones <lee@kernel.org>
+Date: Wed, 21 May 2025 14:45:24 +0000
+Subject: af_unix: Skip GC if no cycle exists.
+To: lee@kernel.org, "David S. Miller" <davem@davemloft.net>, Eric Dumazet <edumazet@google.com>, Jakub Kicinski <kuba@kernel.org>, Paolo Abeni <pabeni@redhat.com>, Kuniyuki Iwashima <kuniyu@amazon.com>, Jens Axboe <axboe@kernel.dk>, Sasha Levin <sashal@kernel.org>, Michal Luczaj <mhal@rbox.co>, Rao Shoaib <Rao.Shoaib@oracle.com>, Pavel Begunkov <asml.silence@gmail.com>, linux-kernel@vger.kernel.org, netdev@vger.kernel.org
+Cc: stable@vger.kernel.org
+Message-ID: <20250521144803.2050504-17-lee@kernel.org>
+
+From: Kuniyuki Iwashima <kuniyu@amazon.com>
+
+commit 77e5593aebba823bcbcf2c4b58b07efcd63933b8 upstream.
+
+We do not need to run GC if there is no possible cyclic reference.
+We use unix_graph_maybe_cyclic to decide if we should run GC.
+
+If an fd of an AF_UNIX socket is passed to an already inflight AF_UNIX
+socket, they could form a cyclic reference. In that case, we set
+unix_graph_maybe_cyclic to true and later run Tarjan's algorithm to
+group the sockets into SCCs.
+
+Once we run Tarjan's algorithm, we are 100% sure whether cyclic
+references exist or not. If there is no cycle, we set
+unix_graph_maybe_cyclic to false and can skip the entire garbage
+collection next time.
+
+When finalising an SCC, we set unix_graph_maybe_cyclic to true if the
+SCC consists of multiple vertices.
+
+Even if an SCC is a single vertex, a cycle might still exist in the
+form of self-fd passing. Given that this corner case is rare, we
+detect it by checking all edges of the vertex and then set
+unix_graph_maybe_cyclic to true.
+
+With this change, __unix_gc() is just a spin_lock() dance in normal
+usage.
+
+Signed-off-by: Kuniyuki Iwashima <kuniyu@amazon.com>
+Acked-by: Paolo Abeni <pabeni@redhat.com>
+Link: https://lore.kernel.org/r/20240325202425.60930-11-kuniyu@amazon.com
+Signed-off-by: Jakub Kicinski <kuba@kernel.org>
+Signed-off-by: Lee Jones <lee@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ net/unix/garbage.c | 48 +++++++++++++++++++++++++++++++++++++++++++++++-
+ 1 file changed, 47 insertions(+), 1 deletion(-)
+
+--- a/net/unix/garbage.c
++++ b/net/unix/garbage.c
+@@ -112,6 +112,19 @@ static struct unix_vertex *unix_edge_suc
+ return edge->successor->vertex;
+ }
+
++static bool unix_graph_maybe_cyclic;
++
++static void unix_update_graph(struct unix_vertex *vertex)
++{
++ /* If the receiver socket is not inflight, no cyclic
++ * reference could be formed.
++ */
++ if (!vertex)
++ return;
++
++ unix_graph_maybe_cyclic = true;
++}
++
+ static LIST_HEAD(unix_unvisited_vertices);
+
+ enum unix_vertex_index {
+@@ -138,12 +151,16 @@ static void unix_add_edge(struct scm_fp_
+
+ vertex->out_degree++;
+ list_add_tail(&edge->vertex_entry, &vertex->edges);
++
++ unix_update_graph(unix_edge_successor(edge));
+ }
+
+ static void unix_del_edge(struct scm_fp_list *fpl, struct unix_edge *edge)
+ {
+ struct unix_vertex *vertex = edge->predecessor->vertex;
+
++ unix_update_graph(unix_edge_successor(edge));
++
+ list_del(&edge->vertex_entry);
+ vertex->out_degree--;
+
+@@ -227,6 +244,7 @@ out:
+ void unix_update_edges(struct unix_sock *receiver)
+ {
+ spin_lock(&unix_gc_lock);
++ unix_update_graph(unix_sk(receiver->listener)->vertex);
+ receiver->listener = NULL;
+ spin_unlock(&unix_gc_lock);
+ }
+@@ -268,6 +286,26 @@ void unix_destroy_fpl(struct scm_fp_list
+ unix_free_vertices(fpl);
+ }
+
++static bool unix_scc_cyclic(struct list_head *scc)
++{
++ struct unix_vertex *vertex;
++ struct unix_edge *edge;
++
++ /* SCC containing multiple vertices ? */
++ if (!list_is_singular(scc))
++ return true;
++
++ vertex = list_first_entry(scc, typeof(*vertex), scc_entry);
++
++ /* Self-reference or a embryo-listener circle ? */
++ list_for_each_entry(edge, &vertex->edges, vertex_entry) {
++ if (unix_edge_successor(edge) == vertex)
++ return true;
++ }
++
++ return false;
++}
++
+ static LIST_HEAD(unix_visited_vertices);
+ static unsigned long unix_vertex_grouped_index = UNIX_VERTEX_INDEX_MARK2;
+
+@@ -353,6 +391,9 @@ prev_vertex:
+ vertex->index = unix_vertex_grouped_index;
+ }
+
++ if (!unix_graph_maybe_cyclic)
++ unix_graph_maybe_cyclic = unix_scc_cyclic(&scc);
++
+ list_del(&scc);
+ }
+
+@@ -363,6 +404,8 @@ prev_vertex:
+
+ static void unix_walk_scc(void)
+ {
++ unix_graph_maybe_cyclic = false;
++
+ /* Visit every vertex exactly once.
+ * __unix_walk_scc() moves visited vertices to unix_visited_vertices.
+ */
+@@ -524,6 +567,9 @@ static void __unix_gc(struct work_struct
+
+ spin_lock(&unix_gc_lock);
+
++ if (!unix_graph_maybe_cyclic)
++ goto skip_gc;
++
+ unix_walk_scc();
+
+ /* First, select candidates for garbage collection. Only
+@@ -633,7 +679,7 @@ static void __unix_gc(struct work_struct
+
+ /* All candidates should have been detached by now. */
+ WARN_ON_ONCE(!list_empty(&gc_candidates));
+-
++skip_gc:
+ /* Paired with READ_ONCE() in wait_for_unix_gc(). */
+ WRITE_ONCE(gc_in_progress, false);
+
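+
+The check in unix_scc_cyclic() boils down to two cases: an SCC with
+more than one vertex always contains a cycle, and a single-vertex SCC
+is cyclic only if the vertex has an edge back to itself. A compact
+illustrative sketch (simplified data structures, not the kernel's):
+
+#include <stdbool.h>
+#include <stdio.h>
+
+struct vertex {
+        struct vertex **succ;   /* successor array */
+        int nsucc;
+};
+
+static bool scc_cyclic(struct vertex **scc, int n)
+{
+        int i;
+
+        if (n > 1)
+                return true;    /* multiple vertices: a cycle by definition */
+
+        for (i = 0; i < scc[0]->nsucc; i++)
+                if (scc[0]->succ[i] == scc[0])
+                        return true;    /* self-fd-passing style self-loop */
+
+        return false;
+}
+
+int main(void)
+{
+        struct vertex v = { 0 };
+        struct vertex *edges[] = { &v };        /* v -> v */
+        struct vertex *scc[] = { &v };
+
+        v.succ = edges;
+        v.nsucc = 1;
+
+        printf("cyclic: %d\n", scc_cyclic(scc, 1));
+        return 0;
+}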
--- /dev/null
+From stable+bounces-145869-greg=kroah.com@vger.kernel.org Wed May 21 17:00:36 2025
+From: Lee Jones <lee@kernel.org>
+Date: Wed, 21 May 2025 14:45:30 +0000
+Subject: af_unix: Try not to hold unix_gc_lock during accept().
+To: lee@kernel.org, "David S. Miller" <davem@davemloft.net>, Eric Dumazet <edumazet@google.com>, Jakub Kicinski <kuba@kernel.org>, Paolo Abeni <pabeni@redhat.com>, Kuniyuki Iwashima <kuniyu@amazon.com>, Jens Axboe <axboe@kernel.dk>, Sasha Levin <sashal@kernel.org>, Michal Luczaj <mhal@rbox.co>, Rao Shoaib <Rao.Shoaib@oracle.com>, Pavel Begunkov <asml.silence@gmail.com>, linux-kernel@vger.kernel.org, netdev@vger.kernel.org
+Cc: stable@vger.kernel.org, kernel test robot <oliver.sang@intel.com>
+Message-ID: <20250521144803.2050504-23-lee@kernel.org>
+
+From: Kuniyuki Iwashima <kuniyu@amazon.com>
+
+commit fd86344823b521149bb31d91eba900ba3525efa6 upstream.
+
+Commit dcf70df2048d ("af_unix: Fix up unix_edge.successor for embryo
+socket.") added spin_lock(&unix_gc_lock) to the accept() path, which
+caused a regression in a stress test, as reported by the kernel test
+robot.
+
+If the embryo socket is not part of the inflight graph, we need not
+hold the lock.
+
+To decide that in O(1) time and avoid the regression in the normal
+use case,
+
+ 1. add a new stat unix_sk(sk)->scm_stat.nr_unix_fds
+
+ 2. count the number of inflight AF_UNIX sockets in the receive
+ queue under unix_state_lock()
+
+ 3. move unix_update_edges() call under unix_state_lock()
+
+ 4. avoid locking if nr_unix_fds is 0 in unix_update_edges()
+
+Reported-by: kernel test robot <oliver.sang@intel.com>
+Closes: https://lore.kernel.org/oe-lkp/202404101427.92a08551-oliver.sang@intel.com
+Signed-off-by: Kuniyuki Iwashima <kuniyu@amazon.com>
+Link: https://lore.kernel.org/r/20240413021928.20946-1-kuniyu@amazon.com
+Signed-off-by: Paolo Abeni <pabeni@redhat.com>
+Signed-off-by: Lee Jones <lee@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ include/net/af_unix.h | 1 +
+ net/unix/af_unix.c | 2 +-
+ net/unix/garbage.c | 20 ++++++++++++++++----
+ 3 files changed, 18 insertions(+), 5 deletions(-)
+
+--- a/include/net/af_unix.h
++++ b/include/net/af_unix.h
+@@ -67,6 +67,7 @@ struct unix_skb_parms {
+
+ struct scm_stat {
+ atomic_t nr_fds;
++ unsigned long nr_unix_fds;
+ };
+
+ #define UNIXCB(skb) (*(struct unix_skb_parms *)&((skb)->cb))
+--- a/net/unix/af_unix.c
++++ b/net/unix/af_unix.c
+@@ -1703,12 +1703,12 @@ static int unix_accept(struct socket *so
+ }
+
+ tsk = skb->sk;
+- unix_update_edges(unix_sk(tsk));
+ skb_free_datagram(sk, skb);
+ wake_up_interruptible(&unix_sk(sk)->peer_wait);
+
+ /* attach accepted sock to socket */
+ unix_state_lock(tsk);
++ unix_update_edges(unix_sk(tsk));
+ newsock->state = SS_CONNECTED;
+ unix_sock_inherit_flags(sock, newsock);
+ sock_graft(tsk, newsock);
+--- a/net/unix/garbage.c
++++ b/net/unix/garbage.c
+@@ -209,6 +209,7 @@ void unix_add_edges(struct scm_fp_list *
+ unix_add_edge(fpl, edge);
+ } while (i < fpl->count_unix);
+
++ receiver->scm_stat.nr_unix_fds += fpl->count_unix;
+ WRITE_ONCE(unix_tot_inflight, unix_tot_inflight + fpl->count_unix);
+ out:
+ WRITE_ONCE(fpl->user->unix_inflight, fpl->user->unix_inflight + fpl->count);
+@@ -222,6 +223,7 @@ out:
+
+ void unix_del_edges(struct scm_fp_list *fpl)
+ {
++ struct unix_sock *receiver;
+ int i = 0;
+
+ spin_lock(&unix_gc_lock);
+@@ -235,6 +237,8 @@ void unix_del_edges(struct scm_fp_list *
+ unix_del_edge(fpl, edge);
+ } while (i < fpl->count_unix);
+
++ receiver = fpl->edges[0].successor;
++ receiver->scm_stat.nr_unix_fds -= fpl->count_unix;
+ WRITE_ONCE(unix_tot_inflight, unix_tot_inflight - fpl->count_unix);
+ out:
+ WRITE_ONCE(fpl->user->unix_inflight, fpl->user->unix_inflight - fpl->count);
+@@ -246,10 +250,18 @@ out:
+
+ void unix_update_edges(struct unix_sock *receiver)
+ {
+- spin_lock(&unix_gc_lock);
+- unix_update_graph(unix_sk(receiver->listener)->vertex);
+- receiver->listener = NULL;
+- spin_unlock(&unix_gc_lock);
++ /* nr_unix_fds is only updated under unix_state_lock().
++ * If it's 0 here, the embryo socket is not part of the
++ * inflight graph, and GC will not see it, so no lock needed.
++ */
++ if (!receiver->scm_stat.nr_unix_fds) {
++ receiver->listener = NULL;
++ } else {
++ spin_lock(&unix_gc_lock);
++ unix_update_graph(unix_sk(receiver->listener)->vertex);
++ receiver->listener = NULL;
++ spin_unlock(&unix_gc_lock);
++ }
+ }
+
+ int unix_prepare_fpl(struct scm_fp_list *fpl)
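+
+The fast path above is a conditional-locking pattern: a counter
+maintained under one lock (the socket lock) lets another path prove in
+O(1) that taking the global lock is unnecessary. A minimal
+illustrative sketch (a pthread mutex stands in for unix_gc_lock; names
+are not kernel APIs):
+
+#include <pthread.h>
+#include <stdio.h>
+
+static pthread_mutex_t gc_lock = PTHREAD_MUTEX_INITIALIZER;
+
+struct usock {
+        unsigned long nr_unix_fds;      /* updated under the socket lock */
+        void *listener;
+};
+
+static void update_edges(struct usock *u)
+{
+        if (!u->nr_unix_fds) {
+                u->listener = NULL;     /* GC cannot see this socket */
+        } else {
+                pthread_mutex_lock(&gc_lock);
+                /* ... unix_update_graph(...) would run here ... */
+                u->listener = NULL;
+                pthread_mutex_unlock(&gc_lock);
+        }
+}
+
+int main(void)
+{
+        struct usock u = { .nr_unix_fds = 0, .listener = &u };
+
+        update_edges(&u);               /* fast path: gc_lock untouched */
+        printf("cleared: %d\n", u.listener == NULL);
+        return 0;
+}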
--- /dev/null
+From stable+bounces-145850-greg=kroah.com@vger.kernel.org Wed May 21 16:50:41 2025
+From: Lee Jones <lee@kernel.org>
+Date: Wed, 21 May 2025 14:45:11 +0000
+Subject: af_unix: Try to run GC async.
+To: lee@kernel.org, "David S. Miller" <davem@davemloft.net>, Eric Dumazet <edumazet@google.com>, Jakub Kicinski <kuba@kernel.org>, Paolo Abeni <pabeni@redhat.com>, Kuniyuki Iwashima <kuniyu@amazon.com>, Jens Axboe <axboe@kernel.dk>, Sasha Levin <sashal@kernel.org>, Michal Luczaj <mhal@rbox.co>, Rao Shoaib <Rao.Shoaib@oracle.com>, Simon Horman <horms@kernel.org>, linux-kernel@vger.kernel.org, netdev@vger.kernel.org
+Cc: stable@vger.kernel.org
+Message-ID: <20250521144803.2050504-4-lee@kernel.org>
+
+From: Kuniyuki Iwashima <kuniyu@amazon.com>
+
+commit d9f21b3613337b55cc9d4a6ead484dca68475143 upstream.
+
+If more than 16000 inflight AF_UNIX sockets exist and the garbage
+collector is not running, unix_(dgram|stream)_sendmsg() call unix_gc().
+Also, they wait for unix_gc() to complete.
+
+In unix_gc(), all inflight AF_UNIX sockets are traversed at least once,
+and more than once if they are GC candidates. Thus, sendmsg()
+significantly slows down with too many inflight AF_UNIX sockets.
+
+However, if a process sends data with no AF_UNIX FD, the sendmsg() call
+does not need to wait for GC. After this change, only a process that
+meets both conditions below will be blocked in such a situation:
+
+ 1) the cmsg contains an AF_UNIX socket
+ 2) more than 32 AF_UNIX sockets sent by the same user are still inflight
+
+Note that even a sendmsg() call that does not meet the conditions but
+carries an AF_UNIX FD will still be blocked later in unix_scm_to_skb()
+by the spinlock, but we accept that as a minor cost for sane users.
+
+The results below are the time spent in unix_dgram_sendmsg() sending 1
+byte of data with no FD 4096 times on a host where 32K inflight AF_UNIX
+sockets exist.
+
+Without this series: a sane sendmsg() needs to wait for GC unreasonably.
+
+ $ sudo /usr/share/bcc/tools/funclatency -p 11165 unix_dgram_sendmsg
+ Tracing 1 functions for "unix_dgram_sendmsg"... Hit Ctrl-C to end.
+ ^C
+ nsecs : count distribution
+ [...]
+ 524288 -> 1048575 : 0 | |
+ 1048576 -> 2097151 : 3881 |****************************************|
+ 2097152 -> 4194303 : 214 |** |
+ 4194304 -> 8388607 : 1 | |
+
+ avg = 1825567 nsecs, total: 7477526027 nsecs, count: 4096
+
+With this series: a sane sendmsg() can finish much faster.
+
+ $ sudo /usr/share/bcc/tools/funclatency -p 8702 unix_dgram_sendmsg
+ Tracing 1 functions for "unix_dgram_sendmsg"... Hit Ctrl-C to end.
+ ^C
+ nsecs : count distribution
+ [...]
+ 128 -> 255 : 0 | |
+ 256 -> 511 : 4092 |****************************************|
+ 512 -> 1023 : 2 | |
+ 1024 -> 2047 : 0 | |
+ 2048 -> 4095 : 0 | |
+ 4096 -> 8191 : 1 | |
+ 8192 -> 16383 : 1 | |
+
+ avg = 410 nsecs, total: 1680510 nsecs, count: 4096
+
+Signed-off-by: Kuniyuki Iwashima <kuniyu@amazon.com>
+Link: https://lore.kernel.org/r/20240123170856.41348-6-kuniyu@amazon.com
+Signed-off-by: Jakub Kicinski <kuba@kernel.org>
+Signed-off-by: Lee Jones <lee@kernel.org>
+Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
+---
+ include/net/af_unix.h | 12 ++++++++++--
+ include/net/scm.h | 1 +
+ net/core/scm.c | 5 +++++
+ net/unix/af_unix.c | 6 ++++--
+ net/unix/garbage.c | 10 +++++++++-
+ 5 files changed, 29 insertions(+), 5 deletions(-)
+
+--- a/include/net/af_unix.h
++++ b/include/net/af_unix.h
+@@ -8,13 +8,21 @@
+ #include <linux/refcount.h>
+ #include <net/sock.h>
+
++#if IS_ENABLED(CONFIG_UNIX)
++struct unix_sock *unix_get_socket(struct file *filp);
++#else
++static inline struct unix_sock *unix_get_socket(struct file *filp)
++{
++ return NULL;
++}
++#endif
++
+ void unix_inflight(struct user_struct *user, struct file *fp);
+ void unix_notinflight(struct user_struct *user, struct file *fp);
+ void unix_destruct_scm(struct sk_buff *skb);
+ void io_uring_destruct_scm(struct sk_buff *skb);
+ void unix_gc(void);
+-void wait_for_unix_gc(void);
+-struct unix_sock *unix_get_socket(struct file *filp);
++void wait_for_unix_gc(struct scm_fp_list *fpl);
+ struct sock *unix_peer_get(struct sock *sk);
+
+ #define UNIX_HASH_MOD (256 - 1)
+--- a/include/net/scm.h
++++ b/include/net/scm.h
+@@ -24,6 +24,7 @@ struct scm_creds {
+
+ struct scm_fp_list {
+ short count;
++ short count_unix;
+ short max;
+ struct user_struct *user;
+ struct file *fp[SCM_MAX_FD];
+--- a/net/core/scm.c
++++ b/net/core/scm.c
+@@ -36,6 +36,7 @@
+ #include <net/compat.h>
+ #include <net/scm.h>
+ #include <net/cls_cgroup.h>
++#include <net/af_unix.h>
+
+
+ /*
+@@ -85,6 +86,7 @@ static int scm_fp_copy(struct cmsghdr *c
+ return -ENOMEM;
+ *fplp = fpl;
+ fpl->count = 0;
++ fpl->count_unix = 0;
+ fpl->max = SCM_MAX_FD;
+ fpl->user = NULL;
+ }
+@@ -109,6 +111,9 @@ static int scm_fp_copy(struct cmsghdr *c
+ fput(file);
+ return -EINVAL;
+ }
++ if (unix_get_socket(file))
++ fpl->count_unix++;
++
+ *fpp++ = file;
+ fpl->count++;
+ }
+--- a/net/unix/af_unix.c
++++ b/net/unix/af_unix.c
+@@ -1885,11 +1885,12 @@ static int unix_dgram_sendmsg(struct soc
+ long timeo;
+ int err;
+
+- wait_for_unix_gc();
+ err = scm_send(sock, msg, &scm, false);
+ if (err < 0)
+ return err;
+
++ wait_for_unix_gc(scm.fp);
++
+ err = -EOPNOTSUPP;
+ if (msg->msg_flags&MSG_OOB)
+ goto out;
+@@ -2157,11 +2158,12 @@ static int unix_stream_sendmsg(struct so
+ bool fds_sent = false;
+ int data_len;
+
+- wait_for_unix_gc();
+ err = scm_send(sock, msg, &scm, false);
+ if (err < 0)
+ return err;
+
++ wait_for_unix_gc(scm.fp);
++
+ err = -EOPNOTSUPP;
+ if (msg->msg_flags & MSG_OOB) {
+ #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
+--- a/net/unix/garbage.c
++++ b/net/unix/garbage.c
+@@ -335,8 +335,9 @@ void unix_gc(void)
+ }
+
+ #define UNIX_INFLIGHT_TRIGGER_GC 16000
++#define UNIX_INFLIGHT_SANE_USER (SCM_MAX_FD * 8)
+
+-void wait_for_unix_gc(void)
++void wait_for_unix_gc(struct scm_fp_list *fpl)
+ {
+ /* If number of inflight sockets is insane,
+ * force a garbage collect right now.
+@@ -348,6 +349,13 @@ void wait_for_unix_gc(void)
+ !READ_ONCE(gc_in_progress))
+ unix_gc();
+
++ /* Penalise users who want to send AF_UNIX sockets
++ * but whose sockets have not been received yet.
++ */
++ if (!fpl || !fpl->count_unix ||
++ READ_ONCE(fpl->user->unix_inflight) < UNIX_INFLIGHT_SANE_USER)
++ return;
++
+ if (READ_ONCE(gc_in_progress))
+ flush_work(&unix_gc_work);
+ }
can-kvaser_pciefd-force-irq-edge-in-case-of-nested-irq.patch
hrtimers-force-migrate-away-hrtimers-queued-after-cpuhp_ap_hrtimers_dying.patch
btrfs-check-folio-mapping-after-unlock-in-relocate_one_folio.patch
+af_unix-return-struct-unix_sock-from-unix_get_socket.patch
+af_unix-run-gc-on-only-one-cpu.patch
+af_unix-try-to-run-gc-async.patch
+af_unix-replace-bug_on-with-warn_on_once.patch
+af_unix-remove-io_uring-code-for-gc.patch
+af_unix-remove-config_unix_scm.patch
+af_unix-allocate-struct-unix_vertex-for-each-inflight-af_unix-fd.patch
+af_unix-allocate-struct-unix_edge-for-each-inflight-af_unix-fd.patch
+af_unix-link-struct-unix_edge-when-queuing-skb.patch
+af_unix-bulk-update-unix_tot_inflight-unix_inflight-when-queuing-skb.patch
+af_unix-iterate-all-vertices-by-dfs.patch
+af_unix-detect-strongly-connected-components.patch
+af_unix-save-listener-for-embryo-socket.patch
+af_unix-fix-up-unix_edge.successor-for-embryo-socket.patch
+af_unix-save-o-n-setup-of-tarjan-s-algo.patch
+af_unix-skip-gc-if-no-cycle-exists.patch
+af_unix-avoid-tarjan-s-algorithm-if-unnecessary.patch
+af_unix-assign-a-unique-index-to-scc.patch
+af_unix-detect-dead-scc.patch
+af_unix-replace-garbage-collection-algorithm.patch
+af_unix-remove-lock-dance-in-unix_peek_fds.patch
+af_unix-try-not-to-hold-unix_gc_lock-during-accept.patch
+af_unix-don-t-access-successor-in-unix_del_edges-during-gc.patch
+af_unix-add-dead-flag-to-struct-scm_fp_list.patch
+af_unix-fix-garbage-collection-of-embryos-carrying-oob-with-scm_rights.patch
+af_unix-fix-uninit-value-in-__unix_walk_scc.patch