From: Greg Kroah-Hartman Date: Thu, 29 May 2025 12:22:08 +0000 (+0200) Subject: 6.6-stable patches X-Git-Tag: v5.4.294~39 X-Git-Url: http://git.ipfire.org/?a=commitdiff_plain;h=1a288402c1f766419ae9af2ff23d347d5d35a2b0;p=thirdparty%2Fkernel%2Fstable-queue.git 6.6-stable patches added patches: af_unix-add-dead-flag-to-struct-scm_fp_list.patch af_unix-allocate-struct-unix_edge-for-each-inflight-af_unix-fd.patch af_unix-allocate-struct-unix_vertex-for-each-inflight-af_unix-fd.patch af_unix-assign-a-unique-index-to-scc.patch af_unix-avoid-tarjan-s-algorithm-if-unnecessary.patch af_unix-bulk-update-unix_tot_inflight-unix_inflight-when-queuing-skb.patch af_unix-detect-dead-scc.patch af_unix-detect-strongly-connected-components.patch af_unix-don-t-access-successor-in-unix_del_edges-during-gc.patch af_unix-fix-garbage-collection-of-embryos-carrying-oob-with-scm_rights.patch af_unix-fix-uninit-value-in-__unix_walk_scc.patch af_unix-fix-up-unix_edge.successor-for-embryo-socket.patch af_unix-iterate-all-vertices-by-dfs.patch af_unix-link-struct-unix_edge-when-queuing-skb.patch af_unix-remove-config_unix_scm.patch af_unix-remove-io_uring-code-for-gc.patch af_unix-remove-lock-dance-in-unix_peek_fds.patch af_unix-replace-bug_on-with-warn_on_once.patch af_unix-replace-garbage-collection-algorithm.patch af_unix-return-struct-unix_sock-from-unix_get_socket.patch af_unix-run-gc-on-only-one-cpu.patch af_unix-save-listener-for-embryo-socket.patch af_unix-save-o-n-setup-of-tarjan-s-algo.patch af_unix-skip-gc-if-no-cycle-exists.patch af_unix-try-not-to-hold-unix_gc_lock-during-accept.patch af_unix-try-to-run-gc-async.patch --- diff --git a/queue-6.6/af_unix-add-dead-flag-to-struct-scm_fp_list.patch b/queue-6.6/af_unix-add-dead-flag-to-struct-scm_fp_list.patch new file mode 100644 index 0000000000..cb74c7b7e0 --- /dev/null +++ b/queue-6.6/af_unix-add-dead-flag-to-struct-scm_fp_list.patch @@ -0,0 +1,110 @@ +From stable+bounces-145871-greg=kroah.com@vger.kernel.org Wed May 21 16:58:24 2025 +From: Lee Jones +Date: Wed, 21 May 2025 14:45:32 +0000 +Subject: af_unix: Add dead flag to struct scm_fp_list. +To: lee@kernel.org, "David S. Miller" , Eric Dumazet , Jakub Kicinski , Paolo Abeni , Kuniyuki Iwashima , Jens Axboe , Sasha Levin , Michal Luczaj , Rao Shoaib , Pavel Begunkov , linux-kernel@vger.kernel.org, netdev@vger.kernel.org +Cc: stable@vger.kernel.org +Message-ID: <20250521144803.2050504-25-lee@kernel.org> + +From: Kuniyuki Iwashima + +commit 7172dc93d621d5dc302d007e95ddd1311ec64283 upstream. + +Commit 1af2dface5d2 ("af_unix: Don't access successor in unix_del_edges() +during GC.") fixed use-after-free by avoid accessing edge->successor while +GC is in progress. + +However, there could be a small race window where another process could +call unix_del_edges() while gc_in_progress is true and __skb_queue_purge() +is on the way. + +So, we need another marker for struct scm_fp_list which indicates if the +skb is garbage-collected. + +This patch adds dead flag in struct scm_fp_list and set it true before +calling __skb_queue_purge(). 
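+
+The race window with the global flag, roughly (an illustrative
+timeline, not taken from the patch itself):
+
+  /* CPU0 (__unix_gc)                 CPU1 (unix_del_edges() for an
+   *                                   skb that is NOT in the hitlist)
+   *
+   * gc_in_progress = true
+   * ...collect hitlist...
+   *                                  if (!gc_in_progress)  // false
+   *                                      // graph update wrongly
+   *                                      // skipped for a live skb
+   * __skb_queue_purge(&hitlist)
+   *
+   * With the per-skb fpl->dead flag, only skbs actually moved to the
+   * hitlist skip the graph update.
+   */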
+ +Fixes: 1af2dface5d2 ("af_unix: Don't access successor in unix_del_edges() during GC.") +Signed-off-by: Kuniyuki Iwashima +Acked-by: Paolo Abeni +Link: https://lore.kernel.org/r/20240508171150.50601-1-kuniyu@amazon.com +Signed-off-by: Jakub Kicinski +Signed-off-by: Lee Jones +Signed-off-by: Greg Kroah-Hartman +--- + include/net/scm.h | 1 + + net/core/scm.c | 1 + + net/unix/garbage.c | 14 ++++++++++---- + 3 files changed, 12 insertions(+), 4 deletions(-) + +--- a/include/net/scm.h ++++ b/include/net/scm.h +@@ -32,6 +32,7 @@ struct scm_fp_list { + short max; + #ifdef CONFIG_UNIX + bool inflight; ++ bool dead; + struct list_head vertices; + struct unix_edge *edges; + #endif +--- a/net/core/scm.c ++++ b/net/core/scm.c +@@ -91,6 +91,7 @@ static int scm_fp_copy(struct cmsghdr *c + fpl->user = NULL; + #if IS_ENABLED(CONFIG_UNIX) + fpl->inflight = false; ++ fpl->dead = false; + fpl->edges = NULL; + INIT_LIST_HEAD(&fpl->vertices); + #endif +--- a/net/unix/garbage.c ++++ b/net/unix/garbage.c +@@ -158,13 +158,11 @@ static void unix_add_edge(struct scm_fp_ + unix_update_graph(unix_edge_successor(edge)); + } + +-static bool gc_in_progress; +- + static void unix_del_edge(struct scm_fp_list *fpl, struct unix_edge *edge) + { + struct unix_vertex *vertex = edge->predecessor->vertex; + +- if (!gc_in_progress) ++ if (!fpl->dead) + unix_update_graph(unix_edge_successor(edge)); + + list_del(&edge->vertex_entry); +@@ -240,7 +238,7 @@ void unix_del_edges(struct scm_fp_list * + unix_del_edge(fpl, edge); + } while (i < fpl->count_unix); + +- if (!gc_in_progress) { ++ if (!fpl->dead) { + receiver = fpl->edges[0].successor; + receiver->scm_stat.nr_unix_fds -= fpl->count_unix; + } +@@ -559,9 +557,12 @@ static void unix_walk_scc_fast(struct sk + list_replace_init(&unix_visited_vertices, &unix_unvisited_vertices); + } + ++static bool gc_in_progress; ++ + static void __unix_gc(struct work_struct *work) + { + struct sk_buff_head hitlist; ++ struct sk_buff *skb; + + spin_lock(&unix_gc_lock); + +@@ -579,6 +580,11 @@ static void __unix_gc(struct work_struct + + spin_unlock(&unix_gc_lock); + ++ skb_queue_walk(&hitlist, skb) { ++ if (UNIXCB(skb).fp) ++ UNIXCB(skb).fp->dead = true; ++ } ++ + __skb_queue_purge(&hitlist); + skip_gc: + WRITE_ONCE(gc_in_progress, false); diff --git a/queue-6.6/af_unix-allocate-struct-unix_edge-for-each-inflight-af_unix-fd.patch b/queue-6.6/af_unix-allocate-struct-unix_edge-for-each-inflight-af_unix-fd.patch new file mode 100644 index 0000000000..8233e90bc4 --- /dev/null +++ b/queue-6.6/af_unix-allocate-struct-unix_edge-for-each-inflight-af_unix-fd.patch @@ -0,0 +1,115 @@ +From stable+bounces-145855-greg=kroah.com@vger.kernel.org Wed May 21 16:53:51 2025 +From: Lee Jones +Date: Wed, 21 May 2025 14:45:16 +0000 +Subject: af_unix: Allocate struct unix_edge for each inflight AF_UNIX fd. +To: lee@kernel.org, "David S. Miller" , Eric Dumazet , Jakub Kicinski , Paolo Abeni , Kuniyuki Iwashima , Jens Axboe , Sasha Levin , Michal Luczaj , Rao Shoaib , Pavel Begunkov , linux-kernel@vger.kernel.org, netdev@vger.kernel.org +Cc: stable@vger.kernel.org +Message-ID: <20250521144803.2050504-9-lee@kernel.org> + +From: Kuniyuki Iwashima + +commit 29b64e354029cfcf1eea4d91b146c7b769305930 upstream. + +As with the previous patch, we preallocate to skb's scm_fp_list an +array of struct unix_edge in the number of inflight AF_UNIX fds. + +There we just preallocate memory and do not use immediately because +sendmsg() could fail after this point. The actual use will be in +the next patch. 
+ +When we queue skb with inflight edges, we will set the inflight +socket's unix_sock as unix_edge->predecessor and the receiver's +unix_sock as successor, and then we will link the edge to the +inflight socket's unix_vertex.edges. + +Note that we set NULL to cloned scm_fp_list.edges in scm_fp_dup() +so that MSG_PEEK does not change the shape of the directed graph. + +Signed-off-by: Kuniyuki Iwashima +Acked-by: Paolo Abeni +Link: https://lore.kernel.org/r/20240325202425.60930-3-kuniyu@amazon.com +Signed-off-by: Jakub Kicinski +Signed-off-by: Lee Jones +Signed-off-by: Greg Kroah-Hartman +--- + include/net/af_unix.h | 6 ++++++ + include/net/scm.h | 5 +++++ + net/core/scm.c | 2 ++ + net/unix/garbage.c | 6 ++++++ + 4 files changed, 19 insertions(+) + +--- a/include/net/af_unix.h ++++ b/include/net/af_unix.h +@@ -33,6 +33,12 @@ struct unix_vertex { + unsigned long out_degree; + }; + ++struct unix_edge { ++ struct unix_sock *predecessor; ++ struct unix_sock *successor; ++ struct list_head vertex_entry; ++}; ++ + struct sock *unix_peer_get(struct sock *sk); + + #define UNIX_HASH_MOD (256 - 1) +--- a/include/net/scm.h ++++ b/include/net/scm.h +@@ -22,12 +22,17 @@ struct scm_creds { + kgid_t gid; + }; + ++#ifdef CONFIG_UNIX ++struct unix_edge; ++#endif ++ + struct scm_fp_list { + short count; + short count_unix; + short max; + #ifdef CONFIG_UNIX + struct list_head vertices; ++ struct unix_edge *edges; + #endif + struct user_struct *user; + struct file *fp[SCM_MAX_FD]; +--- a/net/core/scm.c ++++ b/net/core/scm.c +@@ -90,6 +90,7 @@ static int scm_fp_copy(struct cmsghdr *c + fpl->max = SCM_MAX_FD; + fpl->user = NULL; + #if IS_ENABLED(CONFIG_UNIX) ++ fpl->edges = NULL; + INIT_LIST_HEAD(&fpl->vertices); + #endif + } +@@ -383,6 +384,7 @@ struct scm_fp_list *scm_fp_dup(struct sc + new_fpl->max = new_fpl->count; + new_fpl->user = get_uid(fpl->user); + #if IS_ENABLED(CONFIG_UNIX) ++ new_fpl->edges = NULL; + INIT_LIST_HEAD(&new_fpl->vertices); + #endif + } +--- a/net/unix/garbage.c ++++ b/net/unix/garbage.c +@@ -127,6 +127,11 @@ int unix_prepare_fpl(struct scm_fp_list + list_add(&vertex->entry, &fpl->vertices); + } + ++ fpl->edges = kvmalloc_array(fpl->count_unix, sizeof(*fpl->edges), ++ GFP_KERNEL_ACCOUNT); ++ if (!fpl->edges) ++ goto err; ++ + return 0; + + err: +@@ -136,6 +141,7 @@ err: + + void unix_destroy_fpl(struct scm_fp_list *fpl) + { ++ kvfree(fpl->edges); + unix_free_vertices(fpl); + } + diff --git a/queue-6.6/af_unix-allocate-struct-unix_vertex-for-each-inflight-af_unix-fd.patch b/queue-6.6/af_unix-allocate-struct-unix_vertex-for-each-inflight-af_unix-fd.patch new file mode 100644 index 0000000000..9767815042 --- /dev/null +++ b/queue-6.6/af_unix-allocate-struct-unix_vertex-for-each-inflight-af_unix-fd.patch @@ -0,0 +1,202 @@ +From stable+bounces-145854-greg=kroah.com@vger.kernel.org Wed May 21 16:53:21 2025 +From: Lee Jones +Date: Wed, 21 May 2025 14:45:15 +0000 +Subject: af_unix: Allocate struct unix_vertex for each inflight AF_UNIX fd. +To: lee@kernel.org, "David S. Miller" , Eric Dumazet , Jakub Kicinski , Paolo Abeni , Kuniyuki Iwashima , Jens Axboe , Sasha Levin , Michal Luczaj , Rao Shoaib , Pavel Begunkov , linux-kernel@vger.kernel.org, netdev@vger.kernel.org +Cc: stable@vger.kernel.org +Message-ID: <20250521144803.2050504-8-lee@kernel.org> + +From: Kuniyuki Iwashima + +commit 1fbfdfaa590248c1d86407f578e40e5c65136330 upstream. 
+ +We will replace the garbage collection algorithm for AF_UNIX, where +we will consider each inflight AF_UNIX socket as a vertex and its file +descriptor as an edge in a directed graph. + +This patch introduces a new struct unix_vertex representing a vertex +in the graph and adds its pointer to struct unix_sock. + +When we send a fd using the SCM_RIGHTS message, we allocate struct +scm_fp_list to struct scm_cookie in scm_fp_copy(). Then, we bump +each refcount of the inflight fds' struct file and save them in +scm_fp_list.fp. + +After that, unix_attach_fds() inexplicably clones scm_fp_list of +scm_cookie and sets it to skb. (We will remove this part after +replacing GC.) + +Here, we add a new function call in unix_attach_fds() to preallocate +struct unix_vertex per inflight AF_UNIX fd and link each vertex to +skb's scm_fp_list.vertices. + +When sendmsg() succeeds later, if the socket of the inflight fd is +still not inflight yet, we will set the preallocated vertex to struct +unix_sock.vertex and link it to a global list unix_unvisited_vertices +under spin_lock(&unix_gc_lock). + +If the socket is already inflight, we free the preallocated vertex. +This is to avoid taking the lock unnecessarily when sendmsg() could +fail later. + +In the following patch, we will similarly allocate another struct +per edge, which will finally be linked to the inflight socket's +unix_vertex.edges. + +And then, we will count the number of edges as unix_vertex.out_degree. + +Signed-off-by: Kuniyuki Iwashima +Acked-by: Paolo Abeni +Link: https://lore.kernel.org/r/20240325202425.60930-2-kuniyu@amazon.com +Signed-off-by: Jakub Kicinski +Signed-off-by: Lee Jones +Signed-off-by: Greg Kroah-Hartman +--- + include/net/af_unix.h | 9 +++++++++ + include/net/scm.h | 3 +++ + net/core/scm.c | 7 +++++++ + net/unix/af_unix.c | 6 ++++++ + net/unix/garbage.c | 38 ++++++++++++++++++++++++++++++++++++++ + 5 files changed, 63 insertions(+) + +--- a/include/net/af_unix.h ++++ b/include/net/af_unix.h +@@ -22,9 +22,17 @@ extern unsigned int unix_tot_inflight; + + void unix_inflight(struct user_struct *user, struct file *fp); + void unix_notinflight(struct user_struct *user, struct file *fp); ++int unix_prepare_fpl(struct scm_fp_list *fpl); ++void unix_destroy_fpl(struct scm_fp_list *fpl); + void unix_gc(void); + void wait_for_unix_gc(struct scm_fp_list *fpl); + ++struct unix_vertex { ++ struct list_head edges; ++ struct list_head entry; ++ unsigned long out_degree; ++}; ++ + struct sock *unix_peer_get(struct sock *sk); + + #define UNIX_HASH_MOD (256 - 1) +@@ -62,6 +70,7 @@ struct unix_sock { + struct path path; + struct mutex iolock, bindlock; + struct sock *peer; ++ struct unix_vertex *vertex; + struct list_head link; + unsigned long inflight; + spinlock_t lock; +--- a/include/net/scm.h ++++ b/include/net/scm.h +@@ -26,6 +26,9 @@ struct scm_fp_list { + short count; + short count_unix; + short max; ++#ifdef CONFIG_UNIX ++ struct list_head vertices; ++#endif + struct user_struct *user; + struct file *fp[SCM_MAX_FD]; + }; +--- a/net/core/scm.c ++++ b/net/core/scm.c +@@ -89,6 +89,9 @@ static int scm_fp_copy(struct cmsghdr *c + fpl->count_unix = 0; + fpl->max = SCM_MAX_FD; + fpl->user = NULL; ++#if IS_ENABLED(CONFIG_UNIX) ++ INIT_LIST_HEAD(&fpl->vertices); ++#endif + } + fpp = &fpl->fp[fpl->count]; + +@@ -376,8 +379,12 @@ struct scm_fp_list *scm_fp_dup(struct sc + if (new_fpl) { + for (i = 0; i < fpl->count; i++) + get_file(fpl->fp[i]); ++ + new_fpl->max = new_fpl->count; + new_fpl->user = get_uid(fpl->user); ++#if 
IS_ENABLED(CONFIG_UNIX) ++ INIT_LIST_HEAD(&new_fpl->vertices); ++#endif + } + return new_fpl; + } +--- a/net/unix/af_unix.c ++++ b/net/unix/af_unix.c +@@ -979,6 +979,7 @@ static struct sock *unix_create1(struct + sk->sk_destruct = unix_sock_destructor; + u = unix_sk(sk); + u->inflight = 0; ++ u->vertex = NULL; + u->path.dentry = NULL; + u->path.mnt = NULL; + spin_lock_init(&u->lock); +@@ -1782,6 +1783,9 @@ static int unix_attach_fds(struct scm_co + for (i = scm->fp->count - 1; i >= 0; i--) + unix_inflight(scm->fp->user, scm->fp->fp[i]); + ++ if (unix_prepare_fpl(UNIXCB(skb).fp)) ++ return -ENOMEM; ++ + return 0; + } + +@@ -1792,6 +1796,8 @@ static void unix_detach_fds(struct scm_c + scm->fp = UNIXCB(skb).fp; + UNIXCB(skb).fp = NULL; + ++ unix_destroy_fpl(scm->fp); ++ + for (i = scm->fp->count - 1; i >= 0; i--) + unix_notinflight(scm->fp->user, scm->fp->fp[i]); + } +--- a/net/unix/garbage.c ++++ b/net/unix/garbage.c +@@ -101,6 +101,44 @@ struct unix_sock *unix_get_socket(struct + return NULL; + } + ++static void unix_free_vertices(struct scm_fp_list *fpl) ++{ ++ struct unix_vertex *vertex, *next_vertex; ++ ++ list_for_each_entry_safe(vertex, next_vertex, &fpl->vertices, entry) { ++ list_del(&vertex->entry); ++ kfree(vertex); ++ } ++} ++ ++int unix_prepare_fpl(struct scm_fp_list *fpl) ++{ ++ struct unix_vertex *vertex; ++ int i; ++ ++ if (!fpl->count_unix) ++ return 0; ++ ++ for (i = 0; i < fpl->count_unix; i++) { ++ vertex = kmalloc(sizeof(*vertex), GFP_KERNEL); ++ if (!vertex) ++ goto err; ++ ++ list_add(&vertex->entry, &fpl->vertices); ++ } ++ ++ return 0; ++ ++err: ++ unix_free_vertices(fpl); ++ return -ENOMEM; ++} ++ ++void unix_destroy_fpl(struct scm_fp_list *fpl) ++{ ++ unix_free_vertices(fpl); ++} ++ + DEFINE_SPINLOCK(unix_gc_lock); + unsigned int unix_tot_inflight; + static LIST_HEAD(gc_candidates); diff --git a/queue-6.6/af_unix-assign-a-unique-index-to-scc.patch b/queue-6.6/af_unix-assign-a-unique-index-to-scc.patch new file mode 100644 index 0000000000..c5422540d3 --- /dev/null +++ b/queue-6.6/af_unix-assign-a-unique-index-to-scc.patch @@ -0,0 +1,166 @@ +From stable+bounces-145865-greg=kroah.com@vger.kernel.org Wed May 21 16:55:41 2025 +From: Lee Jones +Date: Wed, 21 May 2025 14:45:26 +0000 +Subject: af_unix: Assign a unique index to SCC. +To: lee@kernel.org, "David S. Miller" , Eric Dumazet , Jakub Kicinski , Paolo Abeni , Kuniyuki Iwashima , Jens Axboe , Sasha Levin , Michal Luczaj , Rao Shoaib , Simon Horman , linux-kernel@vger.kernel.org, netdev@vger.kernel.org +Cc: stable@vger.kernel.org +Message-ID: <20250521144803.2050504-19-lee@kernel.org> + +From: Kuniyuki Iwashima + +commit bfdb01283ee8f2f3089656c3ff8f62bb072dabb2 upstream. + +The definition of the lowlink in Tarjan's algorithm is the +smallest index of a vertex that is reachable with at most one +back-edge in SCC. This is not useful for a cross-edge. + +If we start traversing from A in the following graph, the final +lowlink of D is 3. The cross-edge here is one between D and C. + + A -> B -> D D = (4, 3) (index, lowlink) + ^ | | C = (3, 1) + | V | B = (2, 1) + `--- C <--' A = (1, 1) + +This is because the lowlink of D is updated with the index of C. + +In the following patch, we detect a dead SCC by checking two +conditions for each vertex. + + 1) vertex has no edge directed to another SCC (no bridge) + 2) vertex's out_degree is the same as the refcount of its file + +If 1) is false, there is a receiver of all fds of the SCC and +its ancestor SCC. 
+ +To evaluate 1), we need to assign a unique index to each SCC and +assign it to all vertices in the SCC. + +This patch changes the lowlink update logic for cross-edge so +that in the example above, the lowlink of D is updated with the +lowlink of C. + + A -> B -> D D = (4, 1) (index, lowlink) + ^ | | C = (3, 1) + | V | B = (2, 1) + `--- C <--' A = (1, 1) + +Then, all vertices in the same SCC have the same lowlink, and we +can quickly find the bridge connecting to different SCC if exists. + +However, it is no longer called lowlink, so we rename it to +scc_index. (It's sometimes called lowpoint.) + +Also, we add a global variable to hold the last index used in DFS +so that we do not reset the initial index in each DFS. + +This patch can be squashed to the SCC detection patch but is +split deliberately for anyone wondering why lowlink is not used +as used in the original Tarjan's algorithm and many reference +implementations. + +Signed-off-by: Kuniyuki Iwashima +Acked-by: Paolo Abeni +Link: https://lore.kernel.org/r/20240325202425.60930-13-kuniyu@amazon.com +Signed-off-by: Jakub Kicinski +Signed-off-by: Lee Jones +Signed-off-by: Greg Kroah-Hartman +--- + include/net/af_unix.h | 2 +- + net/unix/garbage.c | 29 +++++++++++++++-------------- + 2 files changed, 16 insertions(+), 15 deletions(-) + +--- a/include/net/af_unix.h ++++ b/include/net/af_unix.h +@@ -36,7 +36,7 @@ struct unix_vertex { + struct list_head scc_entry; + unsigned long out_degree; + unsigned long index; +- unsigned long lowlink; ++ unsigned long scc_index; + }; + + struct unix_edge { +--- a/net/unix/garbage.c ++++ b/net/unix/garbage.c +@@ -312,9 +312,8 @@ static bool unix_scc_cyclic(struct list_ + static LIST_HEAD(unix_visited_vertices); + static unsigned long unix_vertex_grouped_index = UNIX_VERTEX_INDEX_MARK2; + +-static void __unix_walk_scc(struct unix_vertex *vertex) ++static void __unix_walk_scc(struct unix_vertex *vertex, unsigned long *last_index) + { +- unsigned long index = UNIX_VERTEX_INDEX_START; + LIST_HEAD(vertex_stack); + struct unix_edge *edge; + LIST_HEAD(edge_stack); +@@ -326,9 +325,9 @@ next_vertex: + */ + list_add(&vertex->scc_entry, &vertex_stack); + +- vertex->index = index; +- vertex->lowlink = index; +- index++; ++ vertex->index = *last_index; ++ vertex->scc_index = *last_index; ++ (*last_index)++; + + /* Explore neighbour vertices (receivers of the current vertex's fd). */ + list_for_each_entry(edge, &vertex->edges, vertex_entry) { +@@ -358,30 +357,30 @@ prev_vertex: + next_vertex = vertex; + vertex = edge->predecessor->vertex; + +- /* If the successor has a smaller lowlink, two vertices +- * are in the same SCC, so propagate the smaller lowlink ++ /* If the successor has a smaller scc_index, two vertices ++ * are in the same SCC, so propagate the smaller scc_index + * to skip SCC finalisation. + */ +- vertex->lowlink = min(vertex->lowlink, next_vertex->lowlink); ++ vertex->scc_index = min(vertex->scc_index, next_vertex->scc_index); + } else if (next_vertex->index != unix_vertex_grouped_index) { + /* Loop detected by a back/cross edge. + * +- * The successor is on vertex_stack, so two vertices are +- * in the same SCC. If the successor has a smaller index, ++ * The successor is on vertex_stack, so two vertices are in ++ * the same SCC. If the successor has a smaller *scc_index*, + * propagate it to skip SCC finalisation. 
+ */ +- vertex->lowlink = min(vertex->lowlink, next_vertex->index); ++ vertex->scc_index = min(vertex->scc_index, next_vertex->scc_index); + } else { + /* The successor was already grouped as another SCC */ + } + } + +- if (vertex->index == vertex->lowlink) { ++ if (vertex->index == vertex->scc_index) { + struct list_head scc; + + /* SCC finalised. + * +- * If the lowlink was not updated, all the vertices above on ++ * If the scc_index was not updated, all the vertices above on + * vertex_stack are in the same SCC. Group them using scc_entry. + */ + __list_cut_position(&scc, &vertex_stack, &vertex->scc_entry); +@@ -407,6 +406,8 @@ prev_vertex: + + static void unix_walk_scc(void) + { ++ unsigned long last_index = UNIX_VERTEX_INDEX_START; ++ + unix_graph_maybe_cyclic = false; + + /* Visit every vertex exactly once. +@@ -416,7 +417,7 @@ static void unix_walk_scc(void) + struct unix_vertex *vertex; + + vertex = list_first_entry(&unix_unvisited_vertices, typeof(*vertex), entry); +- __unix_walk_scc(vertex); ++ __unix_walk_scc(vertex, &last_index); + } + + list_replace_init(&unix_visited_vertices, &unix_unvisited_vertices); diff --git a/queue-6.6/af_unix-avoid-tarjan-s-algorithm-if-unnecessary.patch b/queue-6.6/af_unix-avoid-tarjan-s-algorithm-if-unnecessary.patch new file mode 100644 index 0000000000..6a20fb07a4 --- /dev/null +++ b/queue-6.6/af_unix-avoid-tarjan-s-algorithm-if-unnecessary.patch @@ -0,0 +1,106 @@ +From stable+bounces-145864-greg=kroah.com@vger.kernel.org Wed May 21 16:58:08 2025 +From: Lee Jones +Date: Wed, 21 May 2025 14:45:25 +0000 +Subject: af_unix: Avoid Tarjan's algorithm if unnecessary. +To: lee@kernel.org, "David S. Miller" , Eric Dumazet , Jakub Kicinski , Paolo Abeni , Kuniyuki Iwashima , Jens Axboe , Sasha Levin , Michal Luczaj , Rao Shoaib , Simon Horman , linux-kernel@vger.kernel.org, netdev@vger.kernel.org +Cc: stable@vger.kernel.org +Message-ID: <20250521144803.2050504-18-lee@kernel.org> + +From: Kuniyuki Iwashima + +commit ad081928a8b0f57f269df999a28087fce6f2b6ce upstream. + +Once a cyclic reference is formed, we need to run GC to check if +there is dead SCC. + +However, we do not need to run Tarjan's algorithm if we know that +the shape of the inflight graph has not been changed. + +If an edge is added/updated/deleted and the edge's successor is +inflight, we set false to unix_graph_grouped, which means we need +to re-classify SCC. + +Once we finalise SCC, we set true to unix_graph_grouped. + +While unix_graph_grouped is true, we can iterate the grouped +SCC using vertex->scc_entry in unix_walk_scc_fast(). + +list_add() and list_for_each_entry_reverse() uses seem weird, but +they are to keep the vertex order consistent and make writing test +easier. 
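+
+The resulting state machine, summarised as a sketch (illustrative,
+not part of the patch):
+
+  /* unix_graph_maybe_cyclic == false: skip GC entirely.
+   * unix_graph_grouped == true: the SCCs found by the last Tarjan
+   *                             run are still valid, so walk them
+   *                             via scc_entry in unix_walk_scc_fast().
+   *
+   * unix_update_graph(), called when an edge whose successor is
+   * inflight is added/updated/deleted, sets
+   *
+   *     unix_graph_maybe_cyclic = true;
+   *     unix_graph_grouped = false;
+   *
+   * which forces a full Tarjan pass on the next GC run.
+   */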
+ +Signed-off-by: Kuniyuki Iwashima +Acked-by: Paolo Abeni +Link: https://lore.kernel.org/r/20240325202425.60930-12-kuniyu@amazon.com +Signed-off-by: Jakub Kicinski +Signed-off-by: Lee Jones +Signed-off-by: Greg Kroah-Hartman +--- + net/unix/garbage.c | 28 +++++++++++++++++++++++++++- + 1 file changed, 27 insertions(+), 1 deletion(-) + +--- a/net/unix/garbage.c ++++ b/net/unix/garbage.c +@@ -113,6 +113,7 @@ static struct unix_vertex *unix_edge_suc + } + + static bool unix_graph_maybe_cyclic; ++static bool unix_graph_grouped; + + static void unix_update_graph(struct unix_vertex *vertex) + { +@@ -123,6 +124,7 @@ static void unix_update_graph(struct uni + return; + + unix_graph_maybe_cyclic = true; ++ unix_graph_grouped = false; + } + + static LIST_HEAD(unix_unvisited_vertices); +@@ -144,6 +146,7 @@ static void unix_add_edge(struct scm_fp_ + vertex->index = unix_vertex_unvisited_index; + vertex->out_degree = 0; + INIT_LIST_HEAD(&vertex->edges); ++ INIT_LIST_HEAD(&vertex->scc_entry); + + list_move_tail(&vertex->entry, &unix_unvisited_vertices); + edge->predecessor->vertex = vertex; +@@ -418,6 +421,26 @@ static void unix_walk_scc(void) + + list_replace_init(&unix_visited_vertices, &unix_unvisited_vertices); + swap(unix_vertex_unvisited_index, unix_vertex_grouped_index); ++ ++ unix_graph_grouped = true; ++} ++ ++static void unix_walk_scc_fast(void) ++{ ++ while (!list_empty(&unix_unvisited_vertices)) { ++ struct unix_vertex *vertex; ++ struct list_head scc; ++ ++ vertex = list_first_entry(&unix_unvisited_vertices, typeof(*vertex), entry); ++ list_add(&scc, &vertex->scc_entry); ++ ++ list_for_each_entry_reverse(vertex, &scc, scc_entry) ++ list_move_tail(&vertex->entry, &unix_visited_vertices); ++ ++ list_del(&scc); ++ } ++ ++ list_replace_init(&unix_visited_vertices, &unix_unvisited_vertices); + } + + static LIST_HEAD(gc_candidates); +@@ -570,7 +593,10 @@ static void __unix_gc(struct work_struct + if (!unix_graph_maybe_cyclic) + goto skip_gc; + +- unix_walk_scc(); ++ if (unix_graph_grouped) ++ unix_walk_scc_fast(); ++ else ++ unix_walk_scc(); + + /* First, select candidates for garbage collection. Only + * in-flight sockets are considered, and from those only ones diff --git a/queue-6.6/af_unix-bulk-update-unix_tot_inflight-unix_inflight-when-queuing-skb.patch b/queue-6.6/af_unix-bulk-update-unix_tot_inflight-unix_inflight-when-queuing-skb.patch new file mode 100644 index 0000000000..973204661b --- /dev/null +++ b/queue-6.6/af_unix-bulk-update-unix_tot_inflight-unix_inflight-when-queuing-skb.patch @@ -0,0 +1,102 @@ +From stable+bounces-145857-greg=kroah.com@vger.kernel.org Wed May 21 16:54:19 2025 +From: Lee Jones +Date: Wed, 21 May 2025 14:45:18 +0000 +Subject: af_unix: Bulk update unix_tot_inflight/unix_inflight when queuing skb. +To: lee@kernel.org, "David S. Miller" , Eric Dumazet , Jakub Kicinski , Paolo Abeni , Kuniyuki Iwashima , Jens Axboe , Sasha Levin , Michal Luczaj , Rao Shoaib , Pavel Begunkov , linux-kernel@vger.kernel.org, netdev@vger.kernel.org +Cc: stable@vger.kernel.org +Message-ID: <20250521144803.2050504-11-lee@kernel.org> + +From: Kuniyuki Iwashima + +commit 22c3c0c52d32f41cc38cd936ea0c93f22ced3315 upstream. + +Currently, we track the number of inflight sockets in two variables. +unix_tot_inflight is the total number of inflight AF_UNIX sockets on +the host, and user->unix_inflight is the number of inflight fds per +user. + +We update them one by one in unix_inflight(), which can be done once +in batch. 
Also, sendmsg() could fail even after unix_inflight(), then +we need to acquire unix_gc_lock only to decrement the counters. + +Let's bulk update the counters in unix_add_edges() and unix_del_edges(), +which is called only for successfully passed fds. + +Signed-off-by: Kuniyuki Iwashima +Acked-by: Paolo Abeni +Link: https://lore.kernel.org/r/20240325202425.60930-5-kuniyu@amazon.com +Signed-off-by: Jakub Kicinski +Signed-off-by: Lee Jones +Signed-off-by: Greg Kroah-Hartman +--- + net/unix/garbage.c | 18 +++++++----------- + 1 file changed, 7 insertions(+), 11 deletions(-) + +--- a/net/unix/garbage.c ++++ b/net/unix/garbage.c +@@ -144,6 +144,7 @@ static void unix_free_vertices(struct sc + } + + DEFINE_SPINLOCK(unix_gc_lock); ++unsigned int unix_tot_inflight; + + void unix_add_edges(struct scm_fp_list *fpl, struct unix_sock *receiver) + { +@@ -168,7 +169,10 @@ void unix_add_edges(struct scm_fp_list * + unix_add_edge(fpl, edge); + } while (i < fpl->count_unix); + ++ WRITE_ONCE(unix_tot_inflight, unix_tot_inflight + fpl->count_unix); + out: ++ WRITE_ONCE(fpl->user->unix_inflight, fpl->user->unix_inflight + fpl->count); ++ + spin_unlock(&unix_gc_lock); + + fpl->inflight = true; +@@ -191,7 +195,10 @@ void unix_del_edges(struct scm_fp_list * + unix_del_edge(fpl, edge); + } while (i < fpl->count_unix); + ++ WRITE_ONCE(unix_tot_inflight, unix_tot_inflight - fpl->count_unix); + out: ++ WRITE_ONCE(fpl->user->unix_inflight, fpl->user->unix_inflight - fpl->count); ++ + spin_unlock(&unix_gc_lock); + + fpl->inflight = false; +@@ -234,7 +241,6 @@ void unix_destroy_fpl(struct scm_fp_list + unix_free_vertices(fpl); + } + +-unsigned int unix_tot_inflight; + static LIST_HEAD(gc_candidates); + static LIST_HEAD(gc_inflight_list); + +@@ -255,13 +261,8 @@ void unix_inflight(struct user_struct *u + WARN_ON_ONCE(list_empty(&u->link)); + } + u->inflight++; +- +- /* Paired with READ_ONCE() in wait_for_unix_gc() */ +- WRITE_ONCE(unix_tot_inflight, unix_tot_inflight + 1); + } + +- WRITE_ONCE(user->unix_inflight, user->unix_inflight + 1); +- + spin_unlock(&unix_gc_lock); + } + +@@ -278,13 +279,8 @@ void unix_notinflight(struct user_struct + u->inflight--; + if (!u->inflight) + list_del_init(&u->link); +- +- /* Paired with READ_ONCE() in wait_for_unix_gc() */ +- WRITE_ONCE(unix_tot_inflight, unix_tot_inflight - 1); + } + +- WRITE_ONCE(user->unix_inflight, user->unix_inflight - 1); +- + spin_unlock(&unix_gc_lock); + } + diff --git a/queue-6.6/af_unix-detect-dead-scc.patch b/queue-6.6/af_unix-detect-dead-scc.patch new file mode 100644 index 0000000000..a7c720f383 --- /dev/null +++ b/queue-6.6/af_unix-detect-dead-scc.patch @@ -0,0 +1,112 @@ +From stable+bounces-145866-greg=kroah.com@vger.kernel.org Wed May 21 16:59:06 2025 +From: Lee Jones +Date: Wed, 21 May 2025 14:45:27 +0000 +Subject: af_unix: Detect dead SCC. +To: lee@kernel.org, "David S. Miller" , Eric Dumazet , Jakub Kicinski , Paolo Abeni , Kuniyuki Iwashima , Jens Axboe , Sasha Levin , Michal Luczaj , Rao Shoaib , Simon Horman , linux-kernel@vger.kernel.org, netdev@vger.kernel.org +Cc: stable@vger.kernel.org +Message-ID: <20250521144803.2050504-20-lee@kernel.org> + +From: Kuniyuki Iwashima + +commit a15702d8b3aad8ce5268c565bd29f0e02fd2db83 upstream. + +When iterating SCC, we call unix_vertex_dead() for each vertex +to check if the vertex is close()d and has no bridge to another +SCC. + +If both conditions are true for every vertex in SCC, we can +execute garbage collection for all skb in the SCC. 
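+
+A worked example of the two conditions (illustrative):
+
+  /* A's fd is sent to B, B's fd is sent to A, and both sockets are
+   * then close()d:
+   *
+   *     A <-> B        out_degree(A) == out_degree(B) == 1
+   *
+   * Each vertex's only edge points inside the SCC, so condition 1)
+   * holds.  After close(), the only remaining reference to each
+   * file is the one held by the inflight fd, so
+   * file_count() == out_degree == 1 and condition 2) holds, too.
+   * The SCC is dead, and its skbs can be garbage-collected.
+   */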
+ +The actual garbage collection is done in the following patch, +replacing the old implementation. + +Signed-off-by: Kuniyuki Iwashima +Acked-by: Paolo Abeni +Link: https://lore.kernel.org/r/20240325202425.60930-14-kuniyu@amazon.com +Signed-off-by: Jakub Kicinski +Signed-off-by: Lee Jones +Signed-off-by: Greg Kroah-Hartman +--- + net/unix/garbage.c | 44 +++++++++++++++++++++++++++++++++++++++++++- + 1 file changed, 43 insertions(+), 1 deletion(-) + +--- a/net/unix/garbage.c ++++ b/net/unix/garbage.c +@@ -289,6 +289,39 @@ void unix_destroy_fpl(struct scm_fp_list + unix_free_vertices(fpl); + } + ++static bool unix_vertex_dead(struct unix_vertex *vertex) ++{ ++ struct unix_edge *edge; ++ struct unix_sock *u; ++ long total_ref; ++ ++ list_for_each_entry(edge, &vertex->edges, vertex_entry) { ++ struct unix_vertex *next_vertex = unix_edge_successor(edge); ++ ++ /* The vertex's fd can be received by a non-inflight socket. */ ++ if (!next_vertex) ++ return false; ++ ++ /* The vertex's fd can be received by an inflight socket in ++ * another SCC. ++ */ ++ if (next_vertex->scc_index != vertex->scc_index) ++ return false; ++ } ++ ++ /* No receiver exists out of the same SCC. */ ++ ++ edge = list_first_entry(&vertex->edges, typeof(*edge), vertex_entry); ++ u = edge->predecessor; ++ total_ref = file_count(u->sk.sk_socket->file); ++ ++ /* If not close()d, total_ref > out_degree. */ ++ if (total_ref != vertex->out_degree) ++ return false; ++ ++ return true; ++} ++ + static bool unix_scc_cyclic(struct list_head *scc) + { + struct unix_vertex *vertex; +@@ -377,6 +410,7 @@ prev_vertex: + + if (vertex->index == vertex->scc_index) { + struct list_head scc; ++ bool scc_dead = true; + + /* SCC finalised. + * +@@ -391,6 +425,9 @@ prev_vertex: + + /* Mark vertex as off-stack. */ + vertex->index = unix_vertex_grouped_index; ++ ++ if (scc_dead) ++ scc_dead = unix_vertex_dead(vertex); + } + + if (!unix_graph_maybe_cyclic) +@@ -431,13 +468,18 @@ static void unix_walk_scc_fast(void) + while (!list_empty(&unix_unvisited_vertices)) { + struct unix_vertex *vertex; + struct list_head scc; ++ bool scc_dead = true; + + vertex = list_first_entry(&unix_unvisited_vertices, typeof(*vertex), entry); + list_add(&scc, &vertex->scc_entry); + +- list_for_each_entry_reverse(vertex, &scc, scc_entry) ++ list_for_each_entry_reverse(vertex, &scc, scc_entry) { + list_move_tail(&vertex->entry, &unix_visited_vertices); + ++ if (scc_dead) ++ scc_dead = unix_vertex_dead(vertex); ++ } ++ + list_del(&scc); + } + diff --git a/queue-6.6/af_unix-detect-strongly-connected-components.patch b/queue-6.6/af_unix-detect-strongly-connected-components.patch new file mode 100644 index 0000000000..68ff173b58 --- /dev/null +++ b/queue-6.6/af_unix-detect-strongly-connected-components.patch @@ -0,0 +1,198 @@ +From stable+bounces-145859-greg=kroah.com@vger.kernel.org Wed May 21 16:55:45 2025 +From: Lee Jones +Date: Wed, 21 May 2025 14:45:20 +0000 +Subject: af_unix: Detect Strongly Connected Components. +To: lee@kernel.org, "David S. Miller" , Eric Dumazet , Jakub Kicinski , Paolo Abeni , Kuniyuki Iwashima , Jens Axboe , Sasha Levin , Michal Luczaj , Rao Shoaib , Simon Horman , linux-kernel@vger.kernel.org, netdev@vger.kernel.org +Cc: stable@vger.kernel.org +Message-ID: <20250521144803.2050504-13-lee@kernel.org> + +From: Kuniyuki Iwashima + +commit 3484f063172dd88776b062046d721d7c2ae1af7c upstream. + +In the new GC, we use a simple graph algorithm, Tarjan's Strongly +Connected Components (SCC) algorithm, to find cyclic references. 
+ +The algorithm visits every vertex exactly once using depth-first +search (DFS). + +DFS starts by pushing an input vertex to a stack and assigning it +a unique number. Two fields, index and lowlink, are initialised +with the number, but lowlink could be updated later during DFS. + +If a vertex has an edge to an unvisited inflight vertex, we visit +it and do the same processing. So, we will have vertices in the +stack in the order they appear and number them consecutively in +the same order. + +If a vertex has a back-edge to a visited vertex in the stack, +we update the predecessor's lowlink with the successor's index. + +After iterating edges from the vertex, we check if its index +equals its lowlink. + +If the lowlink is different from the index, it shows there was a +back-edge. Then, we go backtracking and propagate the lowlink to +its predecessor and resume the previous edge iteration from the +next edge. + +If the lowlink is the same as the index, we pop vertices before +and including the vertex from the stack. Then, the set of vertices +is SCC, possibly forming a cycle. At the same time, we move the +vertices to unix_visited_vertices. + +When we finish the algorithm, all vertices in each SCC will be +linked via unix_vertex.scc_entry. + +Let's take an example. We have a graph including five inflight +vertices (F is not inflight): + + A -> B -> C -> D -> E (-> F) + ^ | + `---------' + +Suppose that we start DFS from C. We will visit C, D, and B first +and initialise their index and lowlink. Then, the stack looks like +this: + + > B = (3, 3) (index, lowlink) + D = (2, 2) + C = (1, 1) + +When checking B's edge to C, we update B's lowlink with C's index +and propagate it to D. + + B = (3, 1) (index, lowlink) + > D = (2, 1) + C = (1, 1) + +Next, we visit E, which has no edge to an inflight vertex. + + > E = (4, 4) (index, lowlink) + B = (3, 1) + D = (2, 1) + C = (1, 1) + +When we leave from E, its index and lowlink are the same, so we +pop E from the stack as single-vertex SCC. Next, we leave from +B and D but do nothing because their lowlink are different from +their index. + + B = (3, 1) (index, lowlink) + D = (2, 1) + > C = (1, 1) + +Then, we leave from C, whose index and lowlink are the same, so +we pop B, D and C as SCC. + +Last, we do DFS for the rest of vertices, A, which is also a +single-vertex SCC. + +Finally, each unix_vertex.scc_entry is linked as follows: + + A -. B -> C -> D E -. + ^ | ^ | ^ | + `--' `---------' `--' + +We use SCC later to decide whether we can garbage-collect the +sockets. + +Note that we still cannot detect SCC properly if an edge points +to an embryo socket. The following two patches will sort it out. 
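+
+For readers unfamiliar with the algorithm, the index/lowlink
+bookkeeping above can be condensed into a self-contained user-space
+sketch (illustrative only; recursive for brevity, whereas the kernel
+implementation is deliberately non-recursive, and all names below are
+made up for the example):
+
+  #include <stdio.h>
+
+  #define NV 5                       /* vertices A..E as above */
+
+  static int adj[NV][NV];            /* adj[v][w]: edge v -> w */
+  static int idx[NV], low[NV], on_stack[NV];
+  static int stack[NV], top, next_index = 1;
+
+  static void tarjan(int v)
+  {
+          int w;
+
+          idx[v] = low[v] = next_index++;   /* index == lowlink first */
+          stack[top++] = v;
+          on_stack[v] = 1;
+
+          for (w = 0; w < NV; w++) {
+                  if (!adj[v][w])
+                          continue;
+                  if (!idx[w]) {            /* unvisited: go deeper */
+                          tarjan(w);
+                          if (low[w] < low[v])
+                                  low[v] = low[w];
+                  } else if (on_stack[w]) { /* back/cross edge in stack */
+                          if (idx[w] < low[v])
+                                  low[v] = idx[w];
+                  }                         /* else: already grouped */
+          }
+
+          if (idx[v] == low[v]) {           /* v is the root of an SCC */
+                  printf("SCC:");
+                  do {
+                          w = stack[--top];
+                          on_stack[w] = 0;
+                          printf(" %c", 'A' + w);
+                  } while (w != v);
+                  printf("\n");
+          }
+  }
+
+  int main(void)
+  {
+          int v;
+
+          adj[0][1] = 1;    /* A -> B */
+          adj[1][2] = 1;    /* B -> C */
+          adj[2][3] = 1;    /* C -> D */
+          adj[3][4] = 1;    /* D -> E */
+          adj[3][1] = 1;    /* D -> B */
+
+          tarjan(2);        /* start from C, as in the walkthrough */
+          for (v = 0; v < NV; v++)
+                  if (!idx[v])
+                          tarjan(v);
+          return 0;
+  }
+
+This prints "SCC: E", then "SCC: B D C", then "SCC: A", matching the
+walkthrough above.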
+ +Signed-off-by: Kuniyuki Iwashima +Acked-by: Paolo Abeni +Link: https://lore.kernel.org/r/20240325202425.60930-7-kuniyu@amazon.com +Signed-off-by: Jakub Kicinski +Signed-off-by: Lee Jones +Signed-off-by: Greg Kroah-Hartman +--- + include/net/af_unix.h | 3 +++ + net/unix/garbage.c | 46 ++++++++++++++++++++++++++++++++++++++++++++-- + 2 files changed, 47 insertions(+), 2 deletions(-) + +--- a/include/net/af_unix.h ++++ b/include/net/af_unix.h +@@ -32,8 +32,11 @@ void wait_for_unix_gc(struct scm_fp_list + struct unix_vertex { + struct list_head edges; + struct list_head entry; ++ struct list_head scc_entry; + unsigned long out_degree; + unsigned long index; ++ unsigned long lowlink; ++ bool on_stack; + }; + + struct unix_edge { +--- a/net/unix/garbage.c ++++ b/net/unix/garbage.c +@@ -251,11 +251,19 @@ static LIST_HEAD(unix_visited_vertices); + static void __unix_walk_scc(struct unix_vertex *vertex) + { + unsigned long index = UNIX_VERTEX_INDEX_START; ++ LIST_HEAD(vertex_stack); + struct unix_edge *edge; + LIST_HEAD(edge_stack); + + next_vertex: ++ /* Push vertex to vertex_stack. ++ * The vertex will be popped when finalising SCC later. ++ */ ++ vertex->on_stack = true; ++ list_add(&vertex->scc_entry, &vertex_stack); ++ + vertex->index = index; ++ vertex->lowlink = index; + index++; + + /* Explore neighbour vertices (receivers of the current vertex's fd). */ +@@ -283,12 +291,46 @@ prev_vertex: + edge = list_first_entry(&edge_stack, typeof(*edge), stack_entry); + list_del_init(&edge->stack_entry); + ++ next_vertex = vertex; + vertex = edge->predecessor->vertex; ++ ++ /* If the successor has a smaller lowlink, two vertices ++ * are in the same SCC, so propagate the smaller lowlink ++ * to skip SCC finalisation. ++ */ ++ vertex->lowlink = min(vertex->lowlink, next_vertex->lowlink); ++ } else if (next_vertex->on_stack) { ++ /* Loop detected by a back/cross edge. ++ * ++ * The successor is on vertex_stack, so two vertices are ++ * in the same SCC. If the successor has a smaller index, ++ * propagate it to skip SCC finalisation. ++ */ ++ vertex->lowlink = min(vertex->lowlink, next_vertex->index); ++ } else { ++ /* The successor was already grouped as another SCC */ + } + } + +- /* Don't restart DFS from this vertex in unix_walk_scc(). */ +- list_move_tail(&vertex->entry, &unix_visited_vertices); ++ if (vertex->index == vertex->lowlink) { ++ struct list_head scc; ++ ++ /* SCC finalised. ++ * ++ * If the lowlink was not updated, all the vertices above on ++ * vertex_stack are in the same SCC. Group them using scc_entry. ++ */ ++ __list_cut_position(&scc, &vertex_stack, &vertex->scc_entry); ++ ++ list_for_each_entry_reverse(vertex, &scc, scc_entry) { ++ /* Don't restart DFS from this vertex in unix_walk_scc(). */ ++ list_move_tail(&vertex->entry, &unix_visited_vertices); ++ ++ vertex->on_stack = false; ++ } ++ ++ list_del(&scc); ++ } + + /* Need backtracking ? */ + if (!list_empty(&edge_stack)) diff --git a/queue-6.6/af_unix-don-t-access-successor-in-unix_del_edges-during-gc.patch b/queue-6.6/af_unix-don-t-access-successor-in-unix_del_edges-during-gc.patch new file mode 100644 index 0000000000..f7ba0448e1 --- /dev/null +++ b/queue-6.6/af_unix-don-t-access-successor-in-unix_del_edges-during-gc.patch @@ -0,0 +1,230 @@ +From stable+bounces-145870-greg=kroah.com@vger.kernel.org Wed May 21 17:01:08 2025 +From: Lee Jones +Date: Wed, 21 May 2025 14:45:31 +0000 +Subject: af_unix: Don't access successor in unix_del_edges() during GC. +To: lee@kernel.org, "David S. 
Miller" , Eric Dumazet , Jakub Kicinski , Paolo Abeni , Kuniyuki Iwashima , Jens Axboe , Sasha Levin , Michal Luczaj , Rao Shoaib , Simon Horman , linux-kernel@vger.kernel.org, netdev@vger.kernel.org +Cc: stable@vger.kernel.org, syzbot+f3f3eef1d2100200e593@syzkaller.appspotmail.com +Message-ID: <20250521144803.2050504-24-lee@kernel.org> + +From: Kuniyuki Iwashima + +commit 1af2dface5d286dd1f2f3405a0d6fa9f2c8fb998 upstream. + +syzbot reported use-after-free in unix_del_edges(). [0] + +What the repro does is basically repeat the following quickly. + + 1. pass a fd of an AF_UNIX socket to itself + + socketpair(AF_UNIX, SOCK_DGRAM, 0, [3, 4]) = 0 + sendmsg(3, {..., msg_control=[{cmsg_len=20, cmsg_level=SOL_SOCKET, + cmsg_type=SCM_RIGHTS, cmsg_data=[4]}], ...}, 0) = 0 + + 2. pass other fds of AF_UNIX sockets to the socket above + + socketpair(AF_UNIX, SOCK_SEQPACKET, 0, [5, 6]) = 0 + sendmsg(3, {..., msg_control=[{cmsg_len=48, cmsg_level=SOL_SOCKET, + cmsg_type=SCM_RIGHTS, cmsg_data=[5, 6]}], ...}, 0) = 0 + + 3. close all sockets + +Here, two skb are created, and every unix_edge->successor is the first +socket. Then, __unix_gc() will garbage-collect the two skb: + + (a) free skb with self-referencing fd + (b) free skb holding other sockets + +After (a), the self-referencing socket will be scheduled to be freed +later by the delayed_fput() task. + +syzbot repeated the sequences above (1. ~ 3.) quickly and triggered +the task concurrently while GC was running. + +So, at (b), the socket was already freed, and accessing it was illegal. + +unix_del_edges() accesses the receiver socket as edge->successor to +optimise GC. However, we should not do it during GC. + +Garbage-collecting sockets does not change the shape of the rest +of the graph, so we need not call unix_update_graph() to update +unix_graph_grouped when we purge skb. + +However, if we clean up all loops in the unix_walk_scc_fast() path, +unix_graph_maybe_cyclic remains unchanged (true), and __unix_gc() +will call unix_walk_scc_fast() continuously even though there is no +socket to garbage-collect. + +To keep that optimisation while fixing UAF, let's add the same +updating logic of unix_graph_maybe_cyclic in unix_walk_scc_fast() +as done in unix_walk_scc() and __unix_walk_scc(). 
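+
+The reported sequence, compressed into a timeline (illustrative):
+
+  /* __unix_gc()                        delayed_fput() worker
+   *
+   * (a) purge skb carrying the
+   *     self-referencing fd
+   *       -> last fput() of the
+   *          socket is deferred
+   *                                    __fput() -> unix_release_sock()
+   *                                      -> sock_put() frees the socket
+   * (b) purge skb holding the other
+   *     sockets
+   *       -> unix_del_edges() reads
+   *          edge->successor, which
+   *          points to the freed
+   *          socket: use-after-free
+   */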
+ +Note that when unix_del_edges() is called from other places, the +receiver socket is always alive: + + - sendmsg: the successor's sk_refcnt is bumped by sock_hold() + unix_find_other() for SOCK_DGRAM, connect() for SOCK_STREAM + + - recvmsg: the successor is the receiver, and its fd is alive + +[0]: +BUG: KASAN: slab-use-after-free in unix_edge_successor net/unix/garbage.c:109 [inline] +BUG: KASAN: slab-use-after-free in unix_del_edge net/unix/garbage.c:165 [inline] +BUG: KASAN: slab-use-after-free in unix_del_edges+0x148/0x630 net/unix/garbage.c:237 +Read of size 8 at addr ffff888079c6e640 by task kworker/u8:6/1099 + +CPU: 0 PID: 1099 Comm: kworker/u8:6 Not tainted 6.9.0-rc4-next-20240418-syzkaller #0 +Hardware name: Google Google Compute Engine/Google Compute Engine, BIOS Google 03/27/2024 +Workqueue: events_unbound __unix_gc +Call Trace: + + __dump_stack lib/dump_stack.c:88 [inline] + dump_stack_lvl+0x241/0x360 lib/dump_stack.c:114 + print_address_description mm/kasan/report.c:377 [inline] + print_report+0x169/0x550 mm/kasan/report.c:488 + kasan_report+0x143/0x180 mm/kasan/report.c:601 + unix_edge_successor net/unix/garbage.c:109 [inline] + unix_del_edge net/unix/garbage.c:165 [inline] + unix_del_edges+0x148/0x630 net/unix/garbage.c:237 + unix_destroy_fpl+0x59/0x210 net/unix/garbage.c:298 + unix_detach_fds net/unix/af_unix.c:1811 [inline] + unix_destruct_scm+0x13e/0x210 net/unix/af_unix.c:1826 + skb_release_head_state+0x100/0x250 net/core/skbuff.c:1127 + skb_release_all net/core/skbuff.c:1138 [inline] + __kfree_skb net/core/skbuff.c:1154 [inline] + kfree_skb_reason+0x16d/0x3b0 net/core/skbuff.c:1190 + __skb_queue_purge_reason include/linux/skbuff.h:3251 [inline] + __skb_queue_purge include/linux/skbuff.h:3256 [inline] + __unix_gc+0x1732/0x1830 net/unix/garbage.c:575 + process_one_work kernel/workqueue.c:3218 [inline] + process_scheduled_works+0xa2c/0x1830 kernel/workqueue.c:3299 + worker_thread+0x86d/0xd70 kernel/workqueue.c:3380 + kthread+0x2f0/0x390 kernel/kthread.c:389 + ret_from_fork+0x4b/0x80 arch/x86/kernel/process.c:147 + ret_from_fork_asm+0x1a/0x30 arch/x86/entry/entry_64.S:244 + + +Allocated by task 14427: + kasan_save_stack mm/kasan/common.c:47 [inline] + kasan_save_track+0x3f/0x80 mm/kasan/common.c:68 + unpoison_slab_object mm/kasan/common.c:312 [inline] + __kasan_slab_alloc+0x66/0x80 mm/kasan/common.c:338 + kasan_slab_alloc include/linux/kasan.h:201 [inline] + slab_post_alloc_hook mm/slub.c:3897 [inline] + slab_alloc_node mm/slub.c:3957 [inline] + kmem_cache_alloc_noprof+0x135/0x290 mm/slub.c:3964 + sk_prot_alloc+0x58/0x210 net/core/sock.c:2074 + sk_alloc+0x38/0x370 net/core/sock.c:2133 + unix_create1+0xb4/0x770 + unix_create+0x14e/0x200 net/unix/af_unix.c:1034 + __sock_create+0x490/0x920 net/socket.c:1571 + sock_create net/socket.c:1622 [inline] + __sys_socketpair+0x33e/0x720 net/socket.c:1773 + __do_sys_socketpair net/socket.c:1822 [inline] + __se_sys_socketpair net/socket.c:1819 [inline] + __x64_sys_socketpair+0x9b/0xb0 net/socket.c:1819 + do_syscall_x64 arch/x86/entry/common.c:52 [inline] + do_syscall_64+0xf5/0x240 arch/x86/entry/common.c:83 + entry_SYSCALL_64_after_hwframe+0x77/0x7f + +Freed by task 1805: + kasan_save_stack mm/kasan/common.c:47 [inline] + kasan_save_track+0x3f/0x80 mm/kasan/common.c:68 + kasan_save_free_info+0x40/0x50 mm/kasan/generic.c:579 + poison_slab_object+0xe0/0x150 mm/kasan/common.c:240 + __kasan_slab_free+0x37/0x60 mm/kasan/common.c:256 + kasan_slab_free include/linux/kasan.h:184 [inline] + slab_free_hook mm/slub.c:2190 [inline] + slab_free 
mm/slub.c:4393 [inline] + kmem_cache_free+0x145/0x340 mm/slub.c:4468 + sk_prot_free net/core/sock.c:2114 [inline] + __sk_destruct+0x467/0x5f0 net/core/sock.c:2208 + sock_put include/net/sock.h:1948 [inline] + unix_release_sock+0xa8b/0xd20 net/unix/af_unix.c:665 + unix_release+0x91/0xc0 net/unix/af_unix.c:1049 + __sock_release net/socket.c:659 [inline] + sock_close+0xbc/0x240 net/socket.c:1421 + __fput+0x406/0x8b0 fs/file_table.c:422 + delayed_fput+0x59/0x80 fs/file_table.c:445 + process_one_work kernel/workqueue.c:3218 [inline] + process_scheduled_works+0xa2c/0x1830 kernel/workqueue.c:3299 + worker_thread+0x86d/0xd70 kernel/workqueue.c:3380 + kthread+0x2f0/0x390 kernel/kthread.c:389 + ret_from_fork+0x4b/0x80 arch/x86/kernel/process.c:147 + ret_from_fork_asm+0x1a/0x30 arch/x86/entry/entry_64.S:244 + +The buggy address belongs to the object at ffff888079c6e000 + which belongs to the cache UNIX of size 1920 +The buggy address is located 1600 bytes inside of + freed 1920-byte region [ffff888079c6e000, ffff888079c6e780) + +Reported-by: syzbot+f3f3eef1d2100200e593@syzkaller.appspotmail.com +Closes: https://syzkaller.appspot.com/bug?extid=f3f3eef1d2100200e593 +Fixes: 77e5593aebba ("af_unix: Skip GC if no cycle exists.") +Fixes: fd86344823b5 ("af_unix: Try not to hold unix_gc_lock during accept().") +Signed-off-by: Kuniyuki Iwashima +Link: https://lore.kernel.org/r/20240419235102.31707-1-kuniyu@amazon.com +Signed-off-by: Paolo Abeni +Signed-off-by: Lee Jones +Signed-off-by: Greg Kroah-Hartman +--- + net/unix/garbage.c | 17 ++++++++++++----- + 1 file changed, 12 insertions(+), 5 deletions(-) + +--- a/net/unix/garbage.c ++++ b/net/unix/garbage.c +@@ -158,11 +158,14 @@ static void unix_add_edge(struct scm_fp_ + unix_update_graph(unix_edge_successor(edge)); + } + ++static bool gc_in_progress; ++ + static void unix_del_edge(struct scm_fp_list *fpl, struct unix_edge *edge) + { + struct unix_vertex *vertex = edge->predecessor->vertex; + +- unix_update_graph(unix_edge_successor(edge)); ++ if (!gc_in_progress) ++ unix_update_graph(unix_edge_successor(edge)); + + list_del(&edge->vertex_entry); + vertex->out_degree--; +@@ -237,8 +240,10 @@ void unix_del_edges(struct scm_fp_list * + unix_del_edge(fpl, edge); + } while (i < fpl->count_unix); + +- receiver = fpl->edges[0].successor; +- receiver->scm_stat.nr_unix_fds -= fpl->count_unix; ++ if (!gc_in_progress) { ++ receiver = fpl->edges[0].successor; ++ receiver->scm_stat.nr_unix_fds -= fpl->count_unix; ++ } + WRITE_ONCE(unix_tot_inflight, unix_tot_inflight - fpl->count_unix); + out: + WRITE_ONCE(fpl->user->unix_inflight, fpl->user->unix_inflight - fpl->count); +@@ -526,6 +531,8 @@ static void unix_walk_scc(struct sk_buff + + static void unix_walk_scc_fast(struct sk_buff_head *hitlist) + { ++ unix_graph_maybe_cyclic = false; ++ + while (!list_empty(&unix_unvisited_vertices)) { + struct unix_vertex *vertex; + struct list_head scc; +@@ -543,6 +550,8 @@ static void unix_walk_scc_fast(struct sk + + if (scc_dead) + unix_collect_skb(&scc, hitlist); ++ else if (!unix_graph_maybe_cyclic) ++ unix_graph_maybe_cyclic = unix_scc_cyclic(&scc); + + list_del(&scc); + } +@@ -550,8 +559,6 @@ static void unix_walk_scc_fast(struct sk + list_replace_init(&unix_visited_vertices, &unix_unvisited_vertices); + } + +-static bool gc_in_progress; +- + static void __unix_gc(struct work_struct *work) + { + struct sk_buff_head hitlist; diff --git a/queue-6.6/af_unix-fix-garbage-collection-of-embryos-carrying-oob-with-scm_rights.patch 
b/queue-6.6/af_unix-fix-garbage-collection-of-embryos-carrying-oob-with-scm_rights.patch new file mode 100644 index 0000000000..782f94589c --- /dev/null +++ b/queue-6.6/af_unix-fix-garbage-collection-of-embryos-carrying-oob-with-scm_rights.patch @@ -0,0 +1,99 @@ +From stable+bounces-145872-greg=kroah.com@vger.kernel.org Wed May 21 17:02:13 2025 +From: Lee Jones +Date: Wed, 21 May 2025 14:45:33 +0000 +Subject: af_unix: Fix garbage collection of embryos carrying OOB with SCM_RIGHTS +To: lee@kernel.org, "David S. Miller" , Eric Dumazet , Jakub Kicinski , Paolo Abeni , Kuniyuki Iwashima , Jens Axboe , Sasha Levin , Michal Luczaj , Rao Shoaib , Pavel Begunkov , linux-kernel@vger.kernel.org, netdev@vger.kernel.org +Cc: stable@vger.kernel.org +Message-ID: <20250521144803.2050504-26-lee@kernel.org> + +From: Michal Luczaj + +commit 041933a1ec7b4173a8e638cae4f8e394331d7e54 upstream. + +GC attempts to explicitly drop oob_skb's reference before purging the hit +list. + +The problem is with embryos: kfree_skb(u->oob_skb) is never called on an +embryo socket. + +The python script below [0] sends a listener's fd to its embryo as OOB +data. While GC does collect the embryo's queue, it fails to drop the OOB +skb's refcount. The skb which was in embryo's receive queue stays as +unix_sk(sk)->oob_skb and keeps the listener's refcount [1]. + +Tell GC to dispose embryo's oob_skb. + +[0]: +from array import array +from socket import * + +addr = '\x00unix-oob' +lis = socket(AF_UNIX, SOCK_STREAM) +lis.bind(addr) +lis.listen(1) + +s = socket(AF_UNIX, SOCK_STREAM) +s.connect(addr) +scm = (SOL_SOCKET, SCM_RIGHTS, array('i', [lis.fileno()])) +s.sendmsg([b'x'], [scm], MSG_OOB) +lis.close() + +[1] +$ grep unix-oob /proc/net/unix +$ ./unix-oob.py +$ grep unix-oob /proc/net/unix +0000000000000000: 00000002 00000000 00000000 0001 02 0 @unix-oob +0000000000000000: 00000002 00000000 00010000 0001 01 6072 @unix-oob + +Fixes: 4090fa373f0e ("af_unix: Replace garbage collection algorithm.") +Signed-off-by: Michal Luczaj +Reviewed-by: Kuniyuki Iwashima +Signed-off-by: Paolo Abeni +Signed-off-by: Lee Jones +Signed-off-by: Greg Kroah-Hartman +--- + net/unix/garbage.c | 23 ++++++++++++++--------- + 1 file changed, 14 insertions(+), 9 deletions(-) + +--- a/net/unix/garbage.c ++++ b/net/unix/garbage.c +@@ -342,6 +342,18 @@ enum unix_recv_queue_lock_class { + U_RECVQ_LOCK_EMBRYO, + }; + ++static void unix_collect_queue(struct unix_sock *u, struct sk_buff_head *hitlist) ++{ ++ skb_queue_splice_init(&u->sk.sk_receive_queue, hitlist); ++ ++#if IS_ENABLED(CONFIG_AF_UNIX_OOB) ++ if (u->oob_skb) { ++ WARN_ON_ONCE(skb_unref(u->oob_skb)); ++ u->oob_skb = NULL; ++ } ++#endif ++} ++ + static void unix_collect_skb(struct list_head *scc, struct sk_buff_head *hitlist) + { + struct unix_vertex *vertex; +@@ -365,18 +377,11 @@ static void unix_collect_skb(struct list + + /* listener -> embryo order, the inversion never happens. 
*/ + spin_lock_nested(&embryo_queue->lock, U_RECVQ_LOCK_EMBRYO); +- skb_queue_splice_init(embryo_queue, hitlist); ++ unix_collect_queue(unix_sk(skb->sk), hitlist); + spin_unlock(&embryo_queue->lock); + } + } else { +- skb_queue_splice_init(queue, hitlist); +- +-#if IS_ENABLED(CONFIG_AF_UNIX_OOB) +- if (u->oob_skb) { +- kfree_skb(u->oob_skb); +- u->oob_skb = NULL; +- } +-#endif ++ unix_collect_queue(u, hitlist); + } + + spin_unlock(&queue->lock); diff --git a/queue-6.6/af_unix-fix-uninit-value-in-__unix_walk_scc.patch b/queue-6.6/af_unix-fix-uninit-value-in-__unix_walk_scc.patch new file mode 100644 index 0000000000..8f8d3a7dae --- /dev/null +++ b/queue-6.6/af_unix-fix-uninit-value-in-__unix_walk_scc.patch @@ -0,0 +1,106 @@ +From stable+bounces-145873-greg=kroah.com@vger.kernel.org Wed May 21 17:02:42 2025 +From: Lee Jones +Date: Wed, 21 May 2025 14:45:34 +0000 +Subject: af_unix: Fix uninit-value in __unix_walk_scc() +To: lee@kernel.org, "David S. Miller" , Eric Dumazet , Jakub Kicinski , Paolo Abeni , Kuniyuki Iwashima , Jens Axboe , Sasha Levin , Michal Luczaj , Rao Shoaib , Simon Horman , linux-kernel@vger.kernel.org, netdev@vger.kernel.org +Cc: stable@vger.kernel.org, Shigeru Yoshida , syzkaller +Message-ID: <20250521144803.2050504-27-lee@kernel.org> + +From: Shigeru Yoshida + +commit 927fa5b3e4f52e0967bfc859afc98ad1c523d2d5 upstream. + +KMSAN reported uninit-value access in __unix_walk_scc() [1]. + +In the list_for_each_entry_reverse() loop, when the vertex's index +equals it's scc_index, the loop uses the variable vertex as a +temporary variable that points to a vertex in scc. And when the loop +is finished, the variable vertex points to the list head, in this case +scc, which is a local variable on the stack (more precisely, it's not +even scc and might underflow the call stack of __unix_walk_scc(): +container_of(&scc, struct unix_vertex, scc_entry)). + +However, the variable vertex is used under the label prev_vertex. So +if the edge_stack is not empty and the function jumps to the +prev_vertex label, the function will access invalid data on the +stack. This causes the uninit-value access issue. + +Fix this by introducing a new temporary variable for the loop. 
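+
+The pitfall, reduced to a generic sketch (illustrative):
+
+  /* list_for_each_entry_reverse(vertex, &scc, scc_entry) reuses
+   * 'vertex' as its cursor.  When the loop terminates, 'vertex' is
+   *
+   *     container_of(&scc, struct unix_vertex, scc_entry)
+   *
+   * i.e. a pointer computed from the list head itself -- and 'scc'
+   * is a local variable on the stack.  Jumping to the prev_vertex
+   * label afterwards dereferences that bogus pointer.  Iterating
+   * with the dedicated variable 'v' leaves 'vertex' intact.
+   */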
+ +[1] +BUG: KMSAN: uninit-value in __unix_walk_scc net/unix/garbage.c:478 [inline] +BUG: KMSAN: uninit-value in unix_walk_scc net/unix/garbage.c:526 [inline] +BUG: KMSAN: uninit-value in __unix_gc+0x2589/0x3c20 net/unix/garbage.c:584 + __unix_walk_scc net/unix/garbage.c:478 [inline] + unix_walk_scc net/unix/garbage.c:526 [inline] + __unix_gc+0x2589/0x3c20 net/unix/garbage.c:584 + process_one_work kernel/workqueue.c:3231 [inline] + process_scheduled_works+0xade/0x1bf0 kernel/workqueue.c:3312 + worker_thread+0xeb6/0x15b0 kernel/workqueue.c:3393 + kthread+0x3c4/0x530 kernel/kthread.c:389 + ret_from_fork+0x6e/0x90 arch/x86/kernel/process.c:147 + ret_from_fork_asm+0x1a/0x30 arch/x86/entry/entry_64.S:244 + +Uninit was stored to memory at: + unix_walk_scc net/unix/garbage.c:526 [inline] + __unix_gc+0x2adf/0x3c20 net/unix/garbage.c:584 + process_one_work kernel/workqueue.c:3231 [inline] + process_scheduled_works+0xade/0x1bf0 kernel/workqueue.c:3312 + worker_thread+0xeb6/0x15b0 kernel/workqueue.c:3393 + kthread+0x3c4/0x530 kernel/kthread.c:389 + ret_from_fork+0x6e/0x90 arch/x86/kernel/process.c:147 + ret_from_fork_asm+0x1a/0x30 arch/x86/entry/entry_64.S:244 + +Local variable entries created at: + ref_tracker_free+0x48/0xf30 lib/ref_tracker.c:222 + netdev_tracker_free include/linux/netdevice.h:4058 [inline] + netdev_put include/linux/netdevice.h:4075 [inline] + dev_put include/linux/netdevice.h:4101 [inline] + update_gid_event_work_handler+0xaa/0x1b0 drivers/infiniband/core/roce_gid_mgmt.c:813 + +CPU: 1 PID: 12763 Comm: kworker/u8:31 Not tainted 6.10.0-rc4-00217-g35bb670d65fc #32 +Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.16.3-2.fc40 04/01/2014 +Workqueue: events_unbound __unix_gc + +Fixes: 3484f063172d ("af_unix: Detect Strongly Connected Components.") +Reported-by: syzkaller +Signed-off-by: Shigeru Yoshida +Reviewed-by: Kuniyuki Iwashima +Link: https://patch.msgid.link/20240702160428.10153-1-syoshida@redhat.com +Signed-off-by: Jakub Kicinski +Signed-off-by: Lee Jones +Signed-off-by: Greg Kroah-Hartman +--- + net/unix/garbage.c | 9 +++++---- + 1 file changed, 5 insertions(+), 4 deletions(-) + +--- a/net/unix/garbage.c ++++ b/net/unix/garbage.c +@@ -476,6 +476,7 @@ prev_vertex: + } + + if (vertex->index == vertex->scc_index) { ++ struct unix_vertex *v; + struct list_head scc; + bool scc_dead = true; + +@@ -486,15 +487,15 @@ prev_vertex: + */ + __list_cut_position(&scc, &vertex_stack, &vertex->scc_entry); + +- list_for_each_entry_reverse(vertex, &scc, scc_entry) { ++ list_for_each_entry_reverse(v, &scc, scc_entry) { + /* Don't restart DFS from this vertex in unix_walk_scc(). */ +- list_move_tail(&vertex->entry, &unix_visited_vertices); ++ list_move_tail(&v->entry, &unix_visited_vertices); + + /* Mark vertex as off-stack. */ +- vertex->index = unix_vertex_grouped_index; ++ v->index = unix_vertex_grouped_index; + + if (scc_dead) +- scc_dead = unix_vertex_dead(vertex); ++ scc_dead = unix_vertex_dead(v); + } + + if (scc_dead) diff --git a/queue-6.6/af_unix-fix-up-unix_edge.successor-for-embryo-socket.patch b/queue-6.6/af_unix-fix-up-unix_edge.successor-for-embryo-socket.patch new file mode 100644 index 0000000000..1c256c3580 --- /dev/null +++ b/queue-6.6/af_unix-fix-up-unix_edge.successor-for-embryo-socket.patch @@ -0,0 +1,134 @@ +From stable+bounces-145861-greg=kroah.com@vger.kernel.org Wed May 21 16:54:34 2025 +From: Lee Jones +Date: Wed, 21 May 2025 14:45:22 +0000 +Subject: af_unix: Fix up unix_edge.successor for embryo socket. +To: lee@kernel.org, "David S. 
Miller" , Eric Dumazet , Jakub Kicinski , Paolo Abeni , Kuniyuki Iwashima , Jens Axboe , Sasha Levin , Michal Luczaj , Rao Shoaib , Pavel Begunkov , linux-kernel@vger.kernel.org, netdev@vger.kernel.org +Cc: stable@vger.kernel.org +Message-ID: <20250521144803.2050504-15-lee@kernel.org> + +From: Kuniyuki Iwashima + +commit dcf70df2048d27c5d186f013f101a4aefd63aa41 upstream. + +To garbage collect inflight AF_UNIX sockets, we must define the +cyclic reference appropriately. This is a bit tricky if the loop +consists of embryo sockets. + +Suppose that the fd of AF_UNIX socket A is passed to D and the fd B +to C and that C and D are embryo sockets of A and B, respectively. +It may appear that there are two separate graphs, A (-> D) and +B (-> C), but this is not correct. + + A --. .-- B + X + C <-' `-> D + +Now, D holds A's refcount, and C has B's refcount, so unix_release() +will never be called for A and B when we close() them. However, no +one can call close() for D and C to free skbs holding refcounts of A +and B because C/D is in A/B's receive queue, which should have been +purged by unix_release() for A and B. + +So, here's another type of cyclic reference. When a fd of an AF_UNIX +socket is passed to an embryo socket, the reference is indirectly held +by its parent listening socket. + + .-> A .-> B + | `- sk_receive_queue | `- sk_receive_queue + | `- skb | `- skb + | `- sk == C | `- sk == D + | `- sk_receive_queue | `- sk_receive_queue + | `- skb +---------' `- skb +-. + | | + `---------------------------------------------------------' + +Technically, the graph must be denoted as A <-> B instead of A (-> D) +and B (-> C) to find such a cyclic reference without touching each +socket's receive queue. + + .-> A --. .-- B <-. + | X | == A <-> B + `-- C <-' `-> D --' + +We apply this fixup during GC by fetching the real successor by +unix_edge_successor(). + +When we call accept(), we clear unix_sock.listener under unix_gc_lock +not to confuse GC. + +Signed-off-by: Kuniyuki Iwashima +Acked-by: Paolo Abeni +Link: https://lore.kernel.org/r/20240325202425.60930-9-kuniyu@amazon.com +Signed-off-by: Jakub Kicinski +Signed-off-by: Lee Jones +Signed-off-by: Greg Kroah-Hartman +--- + include/net/af_unix.h | 1 + + net/unix/af_unix.c | 2 +- + net/unix/garbage.c | 20 +++++++++++++++++++- + 3 files changed, 21 insertions(+), 2 deletions(-) + +--- a/include/net/af_unix.h ++++ b/include/net/af_unix.h +@@ -24,6 +24,7 @@ void unix_inflight(struct user_struct *u + void unix_notinflight(struct user_struct *user, struct file *fp); + void unix_add_edges(struct scm_fp_list *fpl, struct unix_sock *receiver); + void unix_del_edges(struct scm_fp_list *fpl); ++void unix_update_edges(struct unix_sock *receiver); + int unix_prepare_fpl(struct scm_fp_list *fpl); + void unix_destroy_fpl(struct scm_fp_list *fpl); + void unix_gc(void); +--- a/net/unix/af_unix.c ++++ b/net/unix/af_unix.c +@@ -1705,7 +1705,7 @@ static int unix_accept(struct socket *so + } + + tsk = skb->sk; +- unix_sk(tsk)->listener = NULL; ++ unix_update_edges(unix_sk(tsk)); + skb_free_datagram(sk, skb); + wake_up_interruptible(&unix_sk(sk)->peer_wait); + +--- a/net/unix/garbage.c ++++ b/net/unix/garbage.c +@@ -101,6 +101,17 @@ struct unix_sock *unix_get_socket(struct + return NULL; + } + ++static struct unix_vertex *unix_edge_successor(struct unix_edge *edge) ++{ ++ /* If an embryo socket has a fd, ++ * the listener indirectly holds the fd's refcnt. 
++ */ ++ if (edge->successor->listener) ++ return unix_sk(edge->successor->listener)->vertex; ++ ++ return edge->successor->vertex; ++} ++ + static LIST_HEAD(unix_unvisited_vertices); + + enum unix_vertex_index { +@@ -209,6 +220,13 @@ out: + fpl->inflight = false; + } + ++void unix_update_edges(struct unix_sock *receiver) ++{ ++ spin_lock(&unix_gc_lock); ++ receiver->listener = NULL; ++ spin_unlock(&unix_gc_lock); ++} ++ + int unix_prepare_fpl(struct scm_fp_list *fpl) + { + struct unix_vertex *vertex; +@@ -268,7 +286,7 @@ next_vertex: + + /* Explore neighbour vertices (receivers of the current vertex's fd). */ + list_for_each_entry(edge, &vertex->edges, vertex_entry) { +- struct unix_vertex *next_vertex = edge->successor->vertex; ++ struct unix_vertex *next_vertex = unix_edge_successor(edge); + + if (!next_vertex) + continue; diff --git a/queue-6.6/af_unix-iterate-all-vertices-by-dfs.patch b/queue-6.6/af_unix-iterate-all-vertices-by-dfs.patch new file mode 100644 index 0000000000..1e05a4c25c --- /dev/null +++ b/queue-6.6/af_unix-iterate-all-vertices-by-dfs.patch @@ -0,0 +1,155 @@ +From stable+bounces-145858-greg=kroah.com@vger.kernel.org Wed May 21 16:53:24 2025 +From: Lee Jones +Date: Wed, 21 May 2025 14:45:19 +0000 +Subject: af_unix: Iterate all vertices by DFS. +To: lee@kernel.org, "David S. Miller" , Eric Dumazet , Jakub Kicinski , Paolo Abeni , Kuniyuki Iwashima , Jens Axboe , Sasha Levin , Michal Luczaj , Rao Shoaib , Simon Horman , linux-kernel@vger.kernel.org, netdev@vger.kernel.org +Cc: stable@vger.kernel.org +Message-ID: <20250521144803.2050504-12-lee@kernel.org> + +From: Kuniyuki Iwashima + +commit 6ba76fd2848e107594ea4f03b737230f74bc23ea upstream. + +The new GC will use a depth first search graph algorithm to find +cyclic references. The algorithm visits every vertex exactly once. + +Here, we implement the DFS part without recursion so that no one +can abuse it. + +unix_walk_scc() marks every vertex unvisited by initialising index +as UNIX_VERTEX_INDEX_UNVISITED and iterates inflight vertices in +unix_unvisited_vertices and call __unix_walk_scc() to start DFS from +an arbitrary vertex. + +__unix_walk_scc() iterates all edges starting from the vertex and +explores the neighbour vertices with DFS using edge_stack. + +After visiting all neighbours, __unix_walk_scc() moves the visited +vertex to unix_visited_vertices so that unix_walk_scc() will not +restart DFS from the visited vertex. 
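
[A standalone sketch of this recursion-free DFS, in plain C over an adjacency matrix. The explicit stack plays the role of edge_stack; for brevity it stores predecessor vertices rather than edges, and it rescans a vertex's successors after backtracking (already-visited ones are skipped), whereas the kernel code resumes from the saved edge. Names and graph are illustrative:]

    #include <stdio.h>

    #define NV 5
    #define UNVISITED 0

    static int adj[NV][NV];         /* adj[u][v] != 0  =>  edge u -> v */
    static int idx[NV];             /* 0 = unvisited, else visit order */

    static void walk(int start)
    {
            int stack[NV], top = 0; /* explicit stack instead of recursion */
            int next = 1, u = start, v;

    next_vertex:
            idx[u] = next++;        /* mark visited, assign DFS index */
    scan:
            for (v = 0; v < NV; v++) {
                    if (!adj[u][v] || idx[v] != UNVISITED)
                            continue;
                    stack[top++] = u;       /* 1. push, descend into v */
                    u = v;
                    goto next_vertex;
            }
            if (top) {                      /* 2. pop and backtrack */
                    u = stack[--top];
                    goto scan;
            }
    }

    int main(void)
    {
            adj[0][1] = adj[1][2] = adj[2][0] = adj[1][3] = adj[3][4] = 1;
            walk(0);
            for (v = 0; v < NV; v++)
                    ;               /* placeholder to keep -Wall quiet */
            for (int w = 0; w < NV; w++)
                    printf("vertex %d: index %d\n", w, idx[w]);
            return 0;
    }
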
+ +Signed-off-by: Kuniyuki Iwashima +Acked-by: Paolo Abeni +Link: https://lore.kernel.org/r/20240325202425.60930-6-kuniyu@amazon.com +Signed-off-by: Jakub Kicinski +Signed-off-by: Lee Jones +Signed-off-by: Greg Kroah-Hartman +--- + include/net/af_unix.h | 2 + + net/unix/garbage.c | 74 ++++++++++++++++++++++++++++++++++++++++++++++++++ + 2 files changed, 76 insertions(+) + +--- a/include/net/af_unix.h ++++ b/include/net/af_unix.h +@@ -33,12 +33,14 @@ struct unix_vertex { + struct list_head edges; + struct list_head entry; + unsigned long out_degree; ++ unsigned long index; + }; + + struct unix_edge { + struct unix_sock *predecessor; + struct unix_sock *successor; + struct list_head vertex_entry; ++ struct list_head stack_entry; + }; + + struct sock *unix_peer_get(struct sock *sk); +--- a/net/unix/garbage.c ++++ b/net/unix/garbage.c +@@ -103,6 +103,11 @@ struct unix_sock *unix_get_socket(struct + + static LIST_HEAD(unix_unvisited_vertices); + ++enum unix_vertex_index { ++ UNIX_VERTEX_INDEX_UNVISITED, ++ UNIX_VERTEX_INDEX_START, ++}; ++ + static void unix_add_edge(struct scm_fp_list *fpl, struct unix_edge *edge) + { + struct unix_vertex *vertex = edge->predecessor->vertex; +@@ -241,6 +246,73 @@ void unix_destroy_fpl(struct scm_fp_list + unix_free_vertices(fpl); + } + ++static LIST_HEAD(unix_visited_vertices); ++ ++static void __unix_walk_scc(struct unix_vertex *vertex) ++{ ++ unsigned long index = UNIX_VERTEX_INDEX_START; ++ struct unix_edge *edge; ++ LIST_HEAD(edge_stack); ++ ++next_vertex: ++ vertex->index = index; ++ index++; ++ ++ /* Explore neighbour vertices (receivers of the current vertex's fd). */ ++ list_for_each_entry(edge, &vertex->edges, vertex_entry) { ++ struct unix_vertex *next_vertex = edge->successor->vertex; ++ ++ if (!next_vertex) ++ continue; ++ ++ if (next_vertex->index == UNIX_VERTEX_INDEX_UNVISITED) { ++ /* Iterative deepening depth first search ++ * ++ * 1. Push a forward edge to edge_stack and set ++ * the successor to vertex for the next iteration. ++ */ ++ list_add(&edge->stack_entry, &edge_stack); ++ ++ vertex = next_vertex; ++ goto next_vertex; ++ ++ /* 2. Pop the edge directed to the current vertex ++ * and restore the ancestor for backtracking. ++ */ ++prev_vertex: ++ edge = list_first_entry(&edge_stack, typeof(*edge), stack_entry); ++ list_del_init(&edge->stack_entry); ++ ++ vertex = edge->predecessor->vertex; ++ } ++ } ++ ++ /* Don't restart DFS from this vertex in unix_walk_scc(). */ ++ list_move_tail(&vertex->entry, &unix_visited_vertices); ++ ++ /* Need backtracking ? */ ++ if (!list_empty(&edge_stack)) ++ goto prev_vertex; ++} ++ ++static void unix_walk_scc(void) ++{ ++ struct unix_vertex *vertex; ++ ++ list_for_each_entry(vertex, &unix_unvisited_vertices, entry) ++ vertex->index = UNIX_VERTEX_INDEX_UNVISITED; ++ ++ /* Visit every vertex exactly once. ++ * __unix_walk_scc() moves visited vertices to unix_visited_vertices. ++ */ ++ while (!list_empty(&unix_unvisited_vertices)) { ++ vertex = list_first_entry(&unix_unvisited_vertices, typeof(*vertex), entry); ++ __unix_walk_scc(vertex); ++ } ++ ++ list_replace_init(&unix_visited_vertices, &unix_unvisited_vertices); ++} ++ + static LIST_HEAD(gc_candidates); + static LIST_HEAD(gc_inflight_list); + +@@ -388,6 +460,8 @@ static void __unix_gc(struct work_struct + + spin_lock(&unix_gc_lock); + ++ unix_walk_scc(); ++ + /* First, select candidates for garbage collection. Only + * in-flight sockets are considered, and from those only ones + * which don't have any external reference. 
diff --git a/queue-6.6/af_unix-link-struct-unix_edge-when-queuing-skb.patch b/queue-6.6/af_unix-link-struct-unix_edge-when-queuing-skb.patch new file mode 100644 index 0000000000..7ec202920d --- /dev/null +++ b/queue-6.6/af_unix-link-struct-unix_edge-when-queuing-skb.patch @@ -0,0 +1,259 @@ +From stable+bounces-145856-greg=kroah.com@vger.kernel.org Wed May 21 16:52:44 2025 +From: Lee Jones +Date: Wed, 21 May 2025 14:45:17 +0000 +Subject: af_unix: Link struct unix_edge when queuing skb. +To: lee@kernel.org, "David S. Miller" , Eric Dumazet , Jakub Kicinski , Paolo Abeni , Kuniyuki Iwashima , Jens Axboe , Sasha Levin , Michal Luczaj , Rao Shoaib , Pavel Begunkov , linux-kernel@vger.kernel.org, netdev@vger.kernel.org +Cc: stable@vger.kernel.org +Message-ID: <20250521144803.2050504-10-lee@kernel.org> + +From: Kuniyuki Iwashima + +commit 42f298c06b30bfe0a8cbee5d38644e618699e26e upstream. + +Just before queuing skb with inflight fds, we call scm_stat_add(), +which is a good place to set up the preallocated struct unix_vertex +and struct unix_edge in UNIXCB(skb).fp. + +Then, we call unix_add_edges() and construct the directed graph +as follows: + + 1. Set the inflight socket's unix_sock to unix_edge.predecessor. + 2. Set the receiver's unix_sock to unix_edge.successor. + 3. Set the preallocated vertex to inflight socket's unix_sock.vertex. + 4. Link inflight socket's unix_vertex.entry to unix_unvisited_vertices. + 5. Link unix_edge.vertex_entry to the inflight socket's unix_vertex.edges. + +Let's say we pass the fd of AF_UNIX socket A to B and the fd of B +to C. The graph looks like this: + + +-------------------------+ + | unix_unvisited_vertices | <-------------------------. + +-------------------------+ | + + | + | +--------------+ +--------------+ | +--------------+ + | | unix_sock A | <---. .---> | unix_sock B | <-|-. .---> | unix_sock C | + | +--------------+ | | +--------------+ | | | +--------------+ + | .-+ | vertex | | | .-+ | vertex | | | | | vertex | + | | +--------------+ | | | +--------------+ | | | +--------------+ + | | | | | | | | + | | +--------------+ | | | +--------------+ | | | + | '-> | unix_vertex | | | '-> | unix_vertex | | | | + | +--------------+ | | +--------------+ | | | + `---> | entry | +---------> | entry | +-' | | + |--------------| | | |--------------| | | + | edges | <-. | | | edges | <-. | | + +--------------+ | | | +--------------+ | | | + | | | | | | + .----------------------' | | .----------------------' | | + | | | | | | + | +--------------+ | | | +--------------+ | | + | | unix_edge | | | | | unix_edge | | | + | +--------------+ | | | +--------------+ | | + `-> | vertex_entry | | | `-> | vertex_entry | | | + |--------------| | | |--------------| | | + | predecessor | +---' | | predecessor | +---' | + |--------------| | |--------------| | + | successor | +-----' | successor | +-----' + +--------------+ +--------------+ + +Henceforth, we denote such a graph as A -> B (-> C). + +Now, we can express all inflight fd graphs that do not contain +embryo sockets. We will support the particular case later. 
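
[As a toy model of the construction above, with illustrative userspace types only (not the kernel structures): each inflight fd yields one edge whose predecessor is the passed socket and whose successor is the receiver, and a vertex is attached to the predecessor lazily, on its first inflight edge:]

    #include <stdio.h>
    #include <stdlib.h>

    struct sock_toy;

    struct vertex_toy {
            struct sock_toy *owner;
            int out_degree;
    };

    struct edge_toy {
            struct sock_toy *predecessor;   /* socket whose fd is inflight */
            struct sock_toy *successor;     /* socket receiving the fd */
    };

    struct sock_toy {
            const char *name;
            struct vertex_toy *vertex;      /* NULL until first inflight edge */
    };

    static void add_edge_toy(struct edge_toy *edge)
    {
            struct sock_toy *p = edge->predecessor;

            if (!p->vertex) {               /* lazily attach the vertex */
                    p->vertex = calloc(1, sizeof(*p->vertex));
                    p->vertex->owner = p;
            }
            p->vertex->out_degree++;
            printf("edge %s -> %s, out_degree(%s) = %d\n",
                   p->name, edge->successor->name, p->name,
                   p->vertex->out_degree);
    }

    int main(void)
    {
            struct sock_toy a = { "A", NULL }, b = { "B", NULL }, c = { "C", NULL };
            struct edge_toy e1 = { &a, &b }, e2 = { &b, &c };

            add_edge_toy(&e1);              /* pass A's fd to B */
            add_edge_toy(&e2);              /* pass B's fd to C: graph A -> B -> C */

            free(a.vertex);
            free(b.vertex);
            return 0;
    }
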
+ +Signed-off-by: Kuniyuki Iwashima +Acked-by: Paolo Abeni +Link: https://lore.kernel.org/r/20240325202425.60930-4-kuniyu@amazon.com +Signed-off-by: Jakub Kicinski +Signed-off-by: Lee Jones +Signed-off-by: Greg Kroah-Hartman +--- + include/net/af_unix.h | 2 + + include/net/scm.h | 1 + net/core/scm.c | 2 + + net/unix/af_unix.c | 8 +++- + net/unix/garbage.c | 90 +++++++++++++++++++++++++++++++++++++++++++++++++- + 5 files changed, 100 insertions(+), 3 deletions(-) + +--- a/include/net/af_unix.h ++++ b/include/net/af_unix.h +@@ -22,6 +22,8 @@ extern unsigned int unix_tot_inflight; + + void unix_inflight(struct user_struct *user, struct file *fp); + void unix_notinflight(struct user_struct *user, struct file *fp); ++void unix_add_edges(struct scm_fp_list *fpl, struct unix_sock *receiver); ++void unix_del_edges(struct scm_fp_list *fpl); + int unix_prepare_fpl(struct scm_fp_list *fpl); + void unix_destroy_fpl(struct scm_fp_list *fpl); + void unix_gc(void); +--- a/include/net/scm.h ++++ b/include/net/scm.h +@@ -31,6 +31,7 @@ struct scm_fp_list { + short count_unix; + short max; + #ifdef CONFIG_UNIX ++ bool inflight; + struct list_head vertices; + struct unix_edge *edges; + #endif +--- a/net/core/scm.c ++++ b/net/core/scm.c +@@ -90,6 +90,7 @@ static int scm_fp_copy(struct cmsghdr *c + fpl->max = SCM_MAX_FD; + fpl->user = NULL; + #if IS_ENABLED(CONFIG_UNIX) ++ fpl->inflight = false; + fpl->edges = NULL; + INIT_LIST_HEAD(&fpl->vertices); + #endif +@@ -384,6 +385,7 @@ struct scm_fp_list *scm_fp_dup(struct sc + new_fpl->max = new_fpl->count; + new_fpl->user = get_uid(fpl->user); + #if IS_ENABLED(CONFIG_UNIX) ++ new_fpl->inflight = false; + new_fpl->edges = NULL; + INIT_LIST_HEAD(&new_fpl->vertices); + #endif +--- a/net/unix/af_unix.c ++++ b/net/unix/af_unix.c +@@ -1920,8 +1920,10 @@ static void scm_stat_add(struct sock *sk + struct scm_fp_list *fp = UNIXCB(skb).fp; + struct unix_sock *u = unix_sk(sk); + +- if (unlikely(fp && fp->count)) ++ if (unlikely(fp && fp->count)) { + atomic_add(fp->count, &u->scm_stat.nr_fds); ++ unix_add_edges(fp, u); ++ } + } + + static void scm_stat_del(struct sock *sk, struct sk_buff *skb) +@@ -1929,8 +1931,10 @@ static void scm_stat_del(struct sock *sk + struct scm_fp_list *fp = UNIXCB(skb).fp; + struct unix_sock *u = unix_sk(sk); + +- if (unlikely(fp && fp->count)) ++ if (unlikely(fp && fp->count)) { + atomic_sub(fp->count, &u->scm_stat.nr_fds); ++ unix_del_edges(fp); ++ } + } + + /* +--- a/net/unix/garbage.c ++++ b/net/unix/garbage.c +@@ -101,6 +101,38 @@ struct unix_sock *unix_get_socket(struct + return NULL; + } + ++static LIST_HEAD(unix_unvisited_vertices); ++ ++static void unix_add_edge(struct scm_fp_list *fpl, struct unix_edge *edge) ++{ ++ struct unix_vertex *vertex = edge->predecessor->vertex; ++ ++ if (!vertex) { ++ vertex = list_first_entry(&fpl->vertices, typeof(*vertex), entry); ++ vertex->out_degree = 0; ++ INIT_LIST_HEAD(&vertex->edges); ++ ++ list_move_tail(&vertex->entry, &unix_unvisited_vertices); ++ edge->predecessor->vertex = vertex; ++ } ++ ++ vertex->out_degree++; ++ list_add_tail(&edge->vertex_entry, &vertex->edges); ++} ++ ++static void unix_del_edge(struct scm_fp_list *fpl, struct unix_edge *edge) ++{ ++ struct unix_vertex *vertex = edge->predecessor->vertex; ++ ++ list_del(&edge->vertex_entry); ++ vertex->out_degree--; ++ ++ if (!vertex->out_degree) { ++ edge->predecessor->vertex = NULL; ++ list_move_tail(&vertex->entry, &fpl->vertices); ++ } ++} ++ + static void unix_free_vertices(struct scm_fp_list *fpl) + { + struct unix_vertex *vertex, 
*next_vertex; +@@ -111,6 +143,60 @@ static void unix_free_vertices(struct sc + } + } + ++DEFINE_SPINLOCK(unix_gc_lock); ++ ++void unix_add_edges(struct scm_fp_list *fpl, struct unix_sock *receiver) ++{ ++ int i = 0, j = 0; ++ ++ spin_lock(&unix_gc_lock); ++ ++ if (!fpl->count_unix) ++ goto out; ++ ++ do { ++ struct unix_sock *inflight = unix_get_socket(fpl->fp[j++]); ++ struct unix_edge *edge; ++ ++ if (!inflight) ++ continue; ++ ++ edge = fpl->edges + i++; ++ edge->predecessor = inflight; ++ edge->successor = receiver; ++ ++ unix_add_edge(fpl, edge); ++ } while (i < fpl->count_unix); ++ ++out: ++ spin_unlock(&unix_gc_lock); ++ ++ fpl->inflight = true; ++ ++ unix_free_vertices(fpl); ++} ++ ++void unix_del_edges(struct scm_fp_list *fpl) ++{ ++ int i = 0; ++ ++ spin_lock(&unix_gc_lock); ++ ++ if (!fpl->count_unix) ++ goto out; ++ ++ do { ++ struct unix_edge *edge = fpl->edges + i++; ++ ++ unix_del_edge(fpl, edge); ++ } while (i < fpl->count_unix); ++ ++out: ++ spin_unlock(&unix_gc_lock); ++ ++ fpl->inflight = false; ++} ++ + int unix_prepare_fpl(struct scm_fp_list *fpl) + { + struct unix_vertex *vertex; +@@ -141,11 +227,13 @@ err: + + void unix_destroy_fpl(struct scm_fp_list *fpl) + { ++ if (fpl->inflight) ++ unix_del_edges(fpl); ++ + kvfree(fpl->edges); + unix_free_vertices(fpl); + } + +-DEFINE_SPINLOCK(unix_gc_lock); + unsigned int unix_tot_inflight; + static LIST_HEAD(gc_candidates); + static LIST_HEAD(gc_inflight_list); diff --git a/queue-6.6/af_unix-remove-config_unix_scm.patch b/queue-6.6/af_unix-remove-config_unix_scm.patch new file mode 100644 index 0000000000..826f7f1742 --- /dev/null +++ b/queue-6.6/af_unix-remove-config_unix_scm.patch @@ -0,0 +1,439 @@ +From stable+bounces-145853-greg=kroah.com@vger.kernel.org Wed May 21 16:51:31 2025 +From: Lee Jones +Date: Wed, 21 May 2025 14:45:14 +0000 +Subject: af_unix: Remove CONFIG_UNIX_SCM. +To: lee@kernel.org, "David S. Miller" , Eric Dumazet , Jakub Kicinski , Paolo Abeni , Kuniyuki Iwashima , Jens Axboe , Sasha Levin , Michal Luczaj , Rao Shoaib , Pavel Begunkov , linux-kernel@vger.kernel.org, netdev@vger.kernel.org +Cc: stable@vger.kernel.org +Message-ID: <20250521144803.2050504-7-lee@kernel.org> + +From: Kuniyuki Iwashima + +commit 99a7a5b9943ea2d05fb0dee38e4ae2290477ed83 upstream. + +Originally, the code related to garbage collection was all in garbage.c. + +Commit f4e65870e5ce ("net: split out functions related to registering +inflight socket files") moved some functions to scm.c for io_uring and +added CONFIG_UNIX_SCM just in case AF_UNIX was built as module. + +However, since commit 97154bcf4d1b ("af_unix: Kconfig: make CONFIG_UNIX +bool"), AF_UNIX is no longer built separately. Also, io_uring does not +support SCM_RIGHTS now. 
+ +Let's move the functions back to garbage.c + +Signed-off-by: Kuniyuki Iwashima +Acked-by: Jens Axboe +Link: https://lore.kernel.org/r/20240129190435.57228-4-kuniyu@amazon.com +Signed-off-by: Jakub Kicinski +Signed-off-by: Lee Jones +Signed-off-by: Greg Kroah-Hartman +--- + include/net/af_unix.h | 7 +- + net/Makefile | 2 + net/unix/Kconfig | 5 - + net/unix/Makefile | 2 + net/unix/af_unix.c | 63 ++++++++++++++++++++- + net/unix/garbage.c | 73 +++++++++++++++++++++++- + net/unix/scm.c | 150 -------------------------------------------------- + net/unix/scm.h | 10 --- + 8 files changed, 137 insertions(+), 175 deletions(-) + delete mode 100644 net/unix/scm.c + delete mode 100644 net/unix/scm.h + +--- a/include/net/af_unix.h ++++ b/include/net/af_unix.h +@@ -17,19 +17,20 @@ static inline struct unix_sock *unix_get + } + #endif + ++extern spinlock_t unix_gc_lock; ++extern unsigned int unix_tot_inflight; ++ + void unix_inflight(struct user_struct *user, struct file *fp); + void unix_notinflight(struct user_struct *user, struct file *fp); +-void unix_destruct_scm(struct sk_buff *skb); + void unix_gc(void); + void wait_for_unix_gc(struct scm_fp_list *fpl); ++ + struct sock *unix_peer_get(struct sock *sk); + + #define UNIX_HASH_MOD (256 - 1) + #define UNIX_HASH_SIZE (256 * 2) + #define UNIX_HASH_BITS 8 + +-extern unsigned int unix_tot_inflight; +- + struct unix_address { + refcount_t refcnt; + int len; +--- a/net/Makefile ++++ b/net/Makefile +@@ -17,7 +17,7 @@ obj-$(CONFIG_NETFILTER) += netfilter/ + obj-$(CONFIG_INET) += ipv4/ + obj-$(CONFIG_TLS) += tls/ + obj-$(CONFIG_XFRM) += xfrm/ +-obj-$(CONFIG_UNIX_SCM) += unix/ ++obj-$(CONFIG_UNIX) += unix/ + obj-y += ipv6/ + obj-$(CONFIG_BPFILTER) += bpfilter/ + obj-$(CONFIG_PACKET) += packet/ +--- a/net/unix/Kconfig ++++ b/net/unix/Kconfig +@@ -16,11 +16,6 @@ config UNIX + + Say Y unless you know what you are doing. + +-config UNIX_SCM +- bool +- depends on UNIX +- default y +- + config AF_UNIX_OOB + bool + depends on UNIX +--- a/net/unix/Makefile ++++ b/net/unix/Makefile +@@ -11,5 +11,3 @@ unix-$(CONFIG_BPF_SYSCALL) += unix_bpf.o + + obj-$(CONFIG_UNIX_DIAG) += unix_diag.o + unix_diag-y := diag.o +- +-obj-$(CONFIG_UNIX_SCM) += scm.o +--- a/net/unix/af_unix.c ++++ b/net/unix/af_unix.c +@@ -117,8 +117,6 @@ + #include + #include + +-#include "scm.h" +- + static atomic_long_t unix_nr_socks; + static struct hlist_head bsd_socket_buckets[UNIX_HASH_SIZE / 2]; + static spinlock_t bsd_socket_locks[UNIX_HASH_SIZE / 2]; +@@ -1752,6 +1750,52 @@ out: + return err; + } + ++/* The "user->unix_inflight" variable is protected by the garbage ++ * collection lock, and we just read it locklessly here. If you go ++ * over the limit, there might be a tiny race in actually noticing ++ * it across threads. Tough. ++ */ ++static inline bool too_many_unix_fds(struct task_struct *p) ++{ ++ struct user_struct *user = current_user(); ++ ++ if (unlikely(READ_ONCE(user->unix_inflight) > task_rlimit(p, RLIMIT_NOFILE))) ++ return !capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN); ++ return false; ++} ++ ++static int unix_attach_fds(struct scm_cookie *scm, struct sk_buff *skb) ++{ ++ int i; ++ ++ if (too_many_unix_fds(current)) ++ return -ETOOMANYREFS; ++ ++ /* Need to duplicate file references for the sake of garbage ++ * collection. Otherwise a socket in the fps might become a ++ * candidate for GC while the skb is not yet queued. 
++ */ ++ UNIXCB(skb).fp = scm_fp_dup(scm->fp); ++ if (!UNIXCB(skb).fp) ++ return -ENOMEM; ++ ++ for (i = scm->fp->count - 1; i >= 0; i--) ++ unix_inflight(scm->fp->user, scm->fp->fp[i]); ++ ++ return 0; ++} ++ ++static void unix_detach_fds(struct scm_cookie *scm, struct sk_buff *skb) ++{ ++ int i; ++ ++ scm->fp = UNIXCB(skb).fp; ++ UNIXCB(skb).fp = NULL; ++ ++ for (i = scm->fp->count - 1; i >= 0; i--) ++ unix_notinflight(scm->fp->user, scm->fp->fp[i]); ++} ++ + static void unix_peek_fds(struct scm_cookie *scm, struct sk_buff *skb) + { + scm->fp = scm_fp_dup(UNIXCB(skb).fp); +@@ -1799,6 +1843,21 @@ static void unix_peek_fds(struct scm_coo + spin_unlock(&unix_gc_lock); + } + ++static void unix_destruct_scm(struct sk_buff *skb) ++{ ++ struct scm_cookie scm; ++ ++ memset(&scm, 0, sizeof(scm)); ++ scm.pid = UNIXCB(skb).pid; ++ if (UNIXCB(skb).fp) ++ unix_detach_fds(&scm, skb); ++ ++ /* Alas, it calls VFS */ ++ /* So fscking what? fput() had been SMP-safe since the last Summer */ ++ scm_destroy(&scm); ++ sock_wfree(skb); ++} ++ + static int unix_scm_to_skb(struct scm_cookie *scm, struct sk_buff *skb, bool send_fds) + { + int err = 0; +--- a/net/unix/garbage.c ++++ b/net/unix/garbage.c +@@ -81,11 +81,80 @@ + #include + #include + +-#include "scm.h" ++struct unix_sock *unix_get_socket(struct file *filp) ++{ ++ struct inode *inode = file_inode(filp); + +-/* Internal data structures and random procedures: */ ++ /* Socket ? */ ++ if (S_ISSOCK(inode->i_mode) && !(filp->f_mode & FMODE_PATH)) { ++ struct socket *sock = SOCKET_I(inode); ++ const struct proto_ops *ops; ++ struct sock *sk = sock->sk; + ++ ops = READ_ONCE(sock->ops); ++ ++ /* PF_UNIX ? */ ++ if (sk && ops && ops->family == PF_UNIX) ++ return unix_sk(sk); ++ } ++ ++ return NULL; ++} ++ ++DEFINE_SPINLOCK(unix_gc_lock); ++unsigned int unix_tot_inflight; + static LIST_HEAD(gc_candidates); ++static LIST_HEAD(gc_inflight_list); ++ ++/* Keep the number of times in flight count for the file ++ * descriptor if it is for an AF_UNIX socket. 
++ */ ++void unix_inflight(struct user_struct *user, struct file *filp) ++{ ++ struct unix_sock *u = unix_get_socket(filp); ++ ++ spin_lock(&unix_gc_lock); ++ ++ if (u) { ++ if (!u->inflight) { ++ WARN_ON_ONCE(!list_empty(&u->link)); ++ list_add_tail(&u->link, &gc_inflight_list); ++ } else { ++ WARN_ON_ONCE(list_empty(&u->link)); ++ } ++ u->inflight++; ++ ++ /* Paired with READ_ONCE() in wait_for_unix_gc() */ ++ WRITE_ONCE(unix_tot_inflight, unix_tot_inflight + 1); ++ } ++ ++ WRITE_ONCE(user->unix_inflight, user->unix_inflight + 1); ++ ++ spin_unlock(&unix_gc_lock); ++} ++ ++void unix_notinflight(struct user_struct *user, struct file *filp) ++{ ++ struct unix_sock *u = unix_get_socket(filp); ++ ++ spin_lock(&unix_gc_lock); ++ ++ if (u) { ++ WARN_ON_ONCE(!u->inflight); ++ WARN_ON_ONCE(list_empty(&u->link)); ++ ++ u->inflight--; ++ if (!u->inflight) ++ list_del_init(&u->link); ++ ++ /* Paired with READ_ONCE() in wait_for_unix_gc() */ ++ WRITE_ONCE(unix_tot_inflight, unix_tot_inflight - 1); ++ } ++ ++ WRITE_ONCE(user->unix_inflight, user->unix_inflight - 1); ++ ++ spin_unlock(&unix_gc_lock); ++} + + static void scan_inflight(struct sock *x, void (*func)(struct unix_sock *), + struct sk_buff_head *hitlist) +--- a/net/unix/scm.c ++++ /dev/null +@@ -1,150 +0,0 @@ +-// SPDX-License-Identifier: GPL-2.0 +-#include +-#include +-#include +-#include +-#include +-#include +-#include +-#include +-#include +-#include +- +-#include "scm.h" +- +-unsigned int unix_tot_inflight; +-EXPORT_SYMBOL(unix_tot_inflight); +- +-LIST_HEAD(gc_inflight_list); +-EXPORT_SYMBOL(gc_inflight_list); +- +-DEFINE_SPINLOCK(unix_gc_lock); +-EXPORT_SYMBOL(unix_gc_lock); +- +-struct unix_sock *unix_get_socket(struct file *filp) +-{ +- struct inode *inode = file_inode(filp); +- +- /* Socket ? */ +- if (S_ISSOCK(inode->i_mode) && !(filp->f_mode & FMODE_PATH)) { +- struct socket *sock = SOCKET_I(inode); +- const struct proto_ops *ops = READ_ONCE(sock->ops); +- struct sock *s = sock->sk; +- +- /* PF_UNIX ? */ +- if (s && ops && ops->family == PF_UNIX) +- return unix_sk(s); +- } +- +- return NULL; +-} +-EXPORT_SYMBOL(unix_get_socket); +- +-/* Keep the number of times in flight count for the file +- * descriptor if it is for an AF_UNIX socket. +- */ +-void unix_inflight(struct user_struct *user, struct file *fp) +-{ +- struct unix_sock *u = unix_get_socket(fp); +- +- spin_lock(&unix_gc_lock); +- +- if (u) { +- if (!u->inflight) { +- WARN_ON_ONCE(!list_empty(&u->link)); +- list_add_tail(&u->link, &gc_inflight_list); +- } else { +- WARN_ON_ONCE(list_empty(&u->link)); +- } +- u->inflight++; +- /* Paired with READ_ONCE() in wait_for_unix_gc() */ +- WRITE_ONCE(unix_tot_inflight, unix_tot_inflight + 1); +- } +- WRITE_ONCE(user->unix_inflight, user->unix_inflight + 1); +- spin_unlock(&unix_gc_lock); +-} +- +-void unix_notinflight(struct user_struct *user, struct file *fp) +-{ +- struct unix_sock *u = unix_get_socket(fp); +- +- spin_lock(&unix_gc_lock); +- +- if (u) { +- WARN_ON_ONCE(!u->inflight); +- WARN_ON_ONCE(list_empty(&u->link)); +- +- u->inflight--; +- if (!u->inflight) +- list_del_init(&u->link); +- /* Paired with READ_ONCE() in wait_for_unix_gc() */ +- WRITE_ONCE(unix_tot_inflight, unix_tot_inflight - 1); +- } +- WRITE_ONCE(user->unix_inflight, user->unix_inflight - 1); +- spin_unlock(&unix_gc_lock); +-} +- +-/* +- * The "user->unix_inflight" variable is protected by the garbage +- * collection lock, and we just read it locklessly here. 
If you go +- * over the limit, there might be a tiny race in actually noticing +- * it across threads. Tough. +- */ +-static inline bool too_many_unix_fds(struct task_struct *p) +-{ +- struct user_struct *user = current_user(); +- +- if (unlikely(READ_ONCE(user->unix_inflight) > task_rlimit(p, RLIMIT_NOFILE))) +- return !capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN); +- return false; +-} +- +-int unix_attach_fds(struct scm_cookie *scm, struct sk_buff *skb) +-{ +- int i; +- +- if (too_many_unix_fds(current)) +- return -ETOOMANYREFS; +- +- /* +- * Need to duplicate file references for the sake of garbage +- * collection. Otherwise a socket in the fps might become a +- * candidate for GC while the skb is not yet queued. +- */ +- UNIXCB(skb).fp = scm_fp_dup(scm->fp); +- if (!UNIXCB(skb).fp) +- return -ENOMEM; +- +- for (i = scm->fp->count - 1; i >= 0; i--) +- unix_inflight(scm->fp->user, scm->fp->fp[i]); +- return 0; +-} +-EXPORT_SYMBOL(unix_attach_fds); +- +-void unix_detach_fds(struct scm_cookie *scm, struct sk_buff *skb) +-{ +- int i; +- +- scm->fp = UNIXCB(skb).fp; +- UNIXCB(skb).fp = NULL; +- +- for (i = scm->fp->count-1; i >= 0; i--) +- unix_notinflight(scm->fp->user, scm->fp->fp[i]); +-} +-EXPORT_SYMBOL(unix_detach_fds); +- +-void unix_destruct_scm(struct sk_buff *skb) +-{ +- struct scm_cookie scm; +- +- memset(&scm, 0, sizeof(scm)); +- scm.pid = UNIXCB(skb).pid; +- if (UNIXCB(skb).fp) +- unix_detach_fds(&scm, skb); +- +- /* Alas, it calls VFS */ +- /* So fscking what? fput() had been SMP-safe since the last Summer */ +- scm_destroy(&scm); +- sock_wfree(skb); +-} +-EXPORT_SYMBOL(unix_destruct_scm); +--- a/net/unix/scm.h ++++ /dev/null +@@ -1,10 +0,0 @@ +-#ifndef NET_UNIX_SCM_H +-#define NET_UNIX_SCM_H +- +-extern struct list_head gc_inflight_list; +-extern spinlock_t unix_gc_lock; +- +-int unix_attach_fds(struct scm_cookie *scm, struct sk_buff *skb); +-void unix_detach_fds(struct scm_cookie *scm, struct sk_buff *skb); +- +-#endif diff --git a/queue-6.6/af_unix-remove-io_uring-code-for-gc.patch b/queue-6.6/af_unix-remove-io_uring-code-for-gc.patch new file mode 100644 index 0000000000..c3f48ef07b --- /dev/null +++ b/queue-6.6/af_unix-remove-io_uring-code-for-gc.patch @@ -0,0 +1,106 @@ +From stable+bounces-145852-greg=kroah.com@vger.kernel.org Wed May 21 16:51:43 2025 +From: Lee Jones +Date: Wed, 21 May 2025 14:45:13 +0000 +Subject: af_unix: Remove io_uring code for GC. +To: lee@kernel.org, "David S. Miller" , Eric Dumazet , Jakub Kicinski , Paolo Abeni , Kuniyuki Iwashima , Jens Axboe , Sasha Levin , Michal Luczaj , Rao Shoaib , Pavel Begunkov , linux-kernel@vger.kernel.org, netdev@vger.kernel.org +Cc: stable@vger.kernel.org +Message-ID: <20250521144803.2050504-6-lee@kernel.org> + +From: Kuniyuki Iwashima + +commit 11498715f266a3fb4caabba9dd575636cbcaa8f1 upstream. + +Since commit 705318a99a13 ("io_uring/af_unix: disable sending +io_uring over sockets"), io_uring's unix socket cannot be passed +via SCM_RIGHTS, so it does not contribute to cyclic reference and +no longer be candidate for garbage collection. + +Also, commit 6e5e6d274956 ("io_uring: drop any code related to +SCM_RIGHTS") cleaned up SCM_RIGHTS code in io_uring. + +Let's do it in AF_UNIX as well by reverting commit 0091bfc81741 +("io_uring/af_unix: defer registered files gc to io_uring release") +and commit 10369080454d ("net: reclaim skb->scm_io_uring bit"). 
+ +Signed-off-by: Kuniyuki Iwashima +Acked-by: Jens Axboe +Link: https://lore.kernel.org/r/20240129190435.57228-3-kuniyu@amazon.com +Signed-off-by: Jakub Kicinski +Signed-off-by: Lee Jones +Signed-off-by: Greg Kroah-Hartman +--- + include/net/af_unix.h | 1 - + net/unix/garbage.c | 25 ++----------------------- + net/unix/scm.c | 6 ------ + 3 files changed, 2 insertions(+), 30 deletions(-) + +--- a/include/net/af_unix.h ++++ b/include/net/af_unix.h +@@ -20,7 +20,6 @@ static inline struct unix_sock *unix_get + void unix_inflight(struct user_struct *user, struct file *fp); + void unix_notinflight(struct user_struct *user, struct file *fp); + void unix_destruct_scm(struct sk_buff *skb); +-void io_uring_destruct_scm(struct sk_buff *skb); + void unix_gc(void); + void wait_for_unix_gc(struct scm_fp_list *fpl); + struct sock *unix_peer_get(struct sock *sk); +--- a/net/unix/garbage.c ++++ b/net/unix/garbage.c +@@ -184,12 +184,10 @@ static bool gc_in_progress; + + static void __unix_gc(struct work_struct *work) + { +- struct sk_buff *next_skb, *skb; +- struct unix_sock *u; +- struct unix_sock *next; + struct sk_buff_head hitlist; +- struct list_head cursor; ++ struct unix_sock *u, *next; + LIST_HEAD(not_cycle_list); ++ struct list_head cursor; + + spin_lock(&unix_gc_lock); + +@@ -293,30 +291,11 @@ static void __unix_gc(struct work_struct + + spin_unlock(&unix_gc_lock); + +- /* We need io_uring to clean its registered files, ignore all io_uring +- * originated skbs. It's fine as io_uring doesn't keep references to +- * other io_uring instances and so killing all other files in the cycle +- * will put all io_uring references forcing it to go through normal +- * release.path eventually putting registered files. +- */ +- skb_queue_walk_safe(&hitlist, skb, next_skb) { +- if (skb->destructor == io_uring_destruct_scm) { +- __skb_unlink(skb, &hitlist); +- skb_queue_tail(&skb->sk->sk_receive_queue, skb); +- } +- } +- + /* Here we are. Hitlist is filled. Die. */ + __skb_queue_purge(&hitlist); + + spin_lock(&unix_gc_lock); + +- /* There could be io_uring registered files, just push them back to +- * the inflight list +- */ +- list_for_each_entry_safe(u, next, &gc_candidates, link) +- list_move_tail(&u->link, &gc_inflight_list); +- + /* All candidates should have been detached by now. */ + WARN_ON_ONCE(!list_empty(&gc_candidates)); + +--- a/net/unix/scm.c ++++ b/net/unix/scm.c +@@ -148,9 +148,3 @@ void unix_destruct_scm(struct sk_buff *s + sock_wfree(skb); + } + EXPORT_SYMBOL(unix_destruct_scm); +- +-void io_uring_destruct_scm(struct sk_buff *skb) +-{ +- unix_destruct_scm(skb); +-} +-EXPORT_SYMBOL(io_uring_destruct_scm); diff --git a/queue-6.6/af_unix-remove-lock-dance-in-unix_peek_fds.patch b/queue-6.6/af_unix-remove-lock-dance-in-unix_peek_fds.patch new file mode 100644 index 0000000000..59578d3c86 --- /dev/null +++ b/queue-6.6/af_unix-remove-lock-dance-in-unix_peek_fds.patch @@ -0,0 +1,115 @@ +From stable+bounces-145868-greg=kroah.com@vger.kernel.org Wed May 21 16:56:33 2025 +From: Lee Jones +Date: Wed, 21 May 2025 14:45:29 +0000 +Subject: af_unix: Remove lock dance in unix_peek_fds(). +To: lee@kernel.org, "David S. Miller" , Eric Dumazet , Jakub Kicinski , Paolo Abeni , Kuniyuki Iwashima , Jens Axboe , Sasha Levin , Michal Luczaj , Rao Shoaib , Pavel Begunkov , linux-kernel@vger.kernel.org, netdev@vger.kernel.org +Cc: stable@vger.kernel.org +Message-ID: <20250521144803.2050504-22-lee@kernel.org> + +From: Kuniyuki Iwashima + +commit 118f457da9ed58a79e24b73c2ef0aa1987241f0e upstream. 
+ +In the previous GC implementation, the shape of the inflight socket +graph was not expected to change while GC was in progress. + +MSG_PEEK was tricky because it could install inflight fd silently +and transform the graph. + +Let's say we peeked a fd, which was a listening socket, and accept()ed +some embryo sockets from it. The garbage collection algorithm would +have been confused because the set of sockets visited in scan_inflight() +would change within the same GC invocation. + +That's why we placed spin_lock(&unix_gc_lock) and spin_unlock() in +unix_peek_fds() with a fat comment. + +In the new GC implementation, we no longer garbage-collect the socket +if it exists in another queue, that is, if it has a bridge to another +SCC. Also, accept() will require the lock if it has edges. + +Thus, we need not do the complicated lock dance. + +Signed-off-by: Kuniyuki Iwashima +Link: https://lore.kernel.org/r/20240401173125.92184-3-kuniyu@amazon.com +Signed-off-by: Jakub Kicinski +Signed-off-by: Lee Jones +Signed-off-by: Greg Kroah-Hartman +--- + include/net/af_unix.h | 1 - + net/unix/af_unix.c | 42 ------------------------------------------ + net/unix/garbage.c | 2 +- + 3 files changed, 1 insertion(+), 44 deletions(-) + +--- a/include/net/af_unix.h ++++ b/include/net/af_unix.h +@@ -17,7 +17,6 @@ static inline struct unix_sock *unix_get + } + #endif + +-extern spinlock_t unix_gc_lock; + extern unsigned int unix_tot_inflight; + void unix_add_edges(struct scm_fp_list *fpl, struct unix_sock *receiver); + void unix_del_edges(struct scm_fp_list *fpl); +--- a/net/unix/af_unix.c ++++ b/net/unix/af_unix.c +@@ -1796,48 +1796,6 @@ static void unix_detach_fds(struct scm_c + static void unix_peek_fds(struct scm_cookie *scm, struct sk_buff *skb) + { + scm->fp = scm_fp_dup(UNIXCB(skb).fp); +- +- /* +- * Garbage collection of unix sockets starts by selecting a set of +- * candidate sockets which have reference only from being in flight +- * (total_refs == inflight_refs). This condition is checked once during +- * the candidate collection phase, and candidates are marked as such, so +- * that non-candidates can later be ignored. While inflight_refs is +- * protected by unix_gc_lock, total_refs (file count) is not, hence this +- * is an instantaneous decision. +- * +- * Once a candidate, however, the socket must not be reinstalled into a +- * file descriptor while the garbage collection is in progress. +- * +- * If the above conditions are met, then the directed graph of +- * candidates (*) does not change while unix_gc_lock is held. +- * +- * Any operations that changes the file count through file descriptors +- * (dup, close, sendmsg) does not change the graph since candidates are +- * not installed in fds. +- * +- * Dequeing a candidate via recvmsg would install it into an fd, but +- * that takes unix_gc_lock to decrement the inflight count, so it's +- * serialized with garbage collection. +- * +- * MSG_PEEK is special in that it does not change the inflight count, +- * yet does install the socket into an fd. The following lock/unlock +- * pair is to ensure serialization with garbage collection. It must be +- * done between incrementing the file count and installing the file into +- * an fd. +- * +- * If garbage collection starts after the barrier provided by the +- * lock/unlock, then it will see the elevated refcount and not mark this +- * as a candidate. 
If a garbage collection is already in progress +- * before the file count was incremented, then the lock/unlock pair will +- * ensure that garbage collection is finished before progressing to +- * installing the fd. +- * +- * (*) A -> B where B is on the queue of A or B is on the queue of C +- * which is on the queue of listening socket A. +- */ +- spin_lock(&unix_gc_lock); +- spin_unlock(&unix_gc_lock); + } + + static void unix_destruct_scm(struct sk_buff *skb) +--- a/net/unix/garbage.c ++++ b/net/unix/garbage.c +@@ -183,7 +183,7 @@ static void unix_free_vertices(struct sc + } + } + +-DEFINE_SPINLOCK(unix_gc_lock); ++static DEFINE_SPINLOCK(unix_gc_lock); + unsigned int unix_tot_inflight; + + void unix_add_edges(struct scm_fp_list *fpl, struct unix_sock *receiver) diff --git a/queue-6.6/af_unix-replace-bug_on-with-warn_on_once.patch b/queue-6.6/af_unix-replace-bug_on-with-warn_on_once.patch new file mode 100644 index 0000000000..9e6c46ad1f --- /dev/null +++ b/queue-6.6/af_unix-replace-bug_on-with-warn_on_once.patch @@ -0,0 +1,83 @@ +From stable+bounces-145851-greg=kroah.com@vger.kernel.org Wed May 21 16:51:24 2025 +From: Lee Jones +Date: Wed, 21 May 2025 14:45:12 +0000 +Subject: af_unix: Replace BUG_ON() with WARN_ON_ONCE(). +To: lee@kernel.org, "David S. Miller" , Eric Dumazet , Jakub Kicinski , Paolo Abeni , Kuniyuki Iwashima , Jens Axboe , Sasha Levin , Michal Luczaj , Rao Shoaib , Simon Horman , linux-kernel@vger.kernel.org, netdev@vger.kernel.org +Cc: stable@vger.kernel.org +Message-ID: <20250521144803.2050504-5-lee@kernel.org> + +From: Kuniyuki Iwashima + +commit d0f6dc26346863e1f4a23117f5468614e54df064 upstream. + +This is a prep patch for the last patch in this series so that +checkpatch will not warn about BUG_ON(). + +Signed-off-by: Kuniyuki Iwashima +Acked-by: Jens Axboe +Link: https://lore.kernel.org/r/20240129190435.57228-2-kuniyu@amazon.com +Signed-off-by: Jakub Kicinski +Signed-off-by: Lee Jones +Signed-off-by: Greg Kroah-Hartman +--- + net/unix/garbage.c | 8 ++++---- + net/unix/scm.c | 8 ++++---- + 2 files changed, 8 insertions(+), 8 deletions(-) + +--- a/net/unix/garbage.c ++++ b/net/unix/garbage.c +@@ -145,7 +145,7 @@ static void scan_children(struct sock *x + /* An embryo cannot be in-flight, so it's safe + * to use the list link. + */ +- BUG_ON(!list_empty(&u->link)); ++ WARN_ON_ONCE(!list_empty(&u->link)); + list_add_tail(&u->link, &embryos); + } + spin_unlock(&x->sk_receive_queue.lock); +@@ -224,8 +224,8 @@ static void __unix_gc(struct work_struct + + total_refs = file_count(sk->sk_socket->file); + +- BUG_ON(!u->inflight); +- BUG_ON(total_refs < u->inflight); ++ WARN_ON_ONCE(!u->inflight); ++ WARN_ON_ONCE(total_refs < u->inflight); + if (total_refs == u->inflight) { + list_move_tail(&u->link, &gc_candidates); + __set_bit(UNIX_GC_CANDIDATE, &u->gc_flags); +@@ -318,7 +318,7 @@ static void __unix_gc(struct work_struct + list_move_tail(&u->link, &gc_inflight_list); + + /* All candidates should have been detached by now. */ +- BUG_ON(!list_empty(&gc_candidates)); ++ WARN_ON_ONCE(!list_empty(&gc_candidates)); + + /* Paired with READ_ONCE() in wait_for_unix_gc(). 
*/ + WRITE_ONCE(gc_in_progress, false); +--- a/net/unix/scm.c ++++ b/net/unix/scm.c +@@ -51,10 +51,10 @@ void unix_inflight(struct user_struct *u + + if (u) { + if (!u->inflight) { +- BUG_ON(!list_empty(&u->link)); ++ WARN_ON_ONCE(!list_empty(&u->link)); + list_add_tail(&u->link, &gc_inflight_list); + } else { +- BUG_ON(list_empty(&u->link)); ++ WARN_ON_ONCE(list_empty(&u->link)); + } + u->inflight++; + /* Paired with READ_ONCE() in wait_for_unix_gc() */ +@@ -71,8 +71,8 @@ void unix_notinflight(struct user_struct + spin_lock(&unix_gc_lock); + + if (u) { +- BUG_ON(!u->inflight); +- BUG_ON(list_empty(&u->link)); ++ WARN_ON_ONCE(!u->inflight); ++ WARN_ON_ONCE(list_empty(&u->link)); + + u->inflight--; + if (!u->inflight) diff --git a/queue-6.6/af_unix-replace-garbage-collection-algorithm.patch b/queue-6.6/af_unix-replace-garbage-collection-algorithm.patch new file mode 100644 index 0000000000..dbbecab704 --- /dev/null +++ b/queue-6.6/af_unix-replace-garbage-collection-algorithm.patch @@ -0,0 +1,497 @@ +From stable+bounces-145867-greg=kroah.com@vger.kernel.org Wed May 21 16:59:26 2025 +From: Lee Jones +Date: Wed, 21 May 2025 14:45:28 +0000 +Subject: af_unix: Replace garbage collection algorithm. +To: lee@kernel.org, "David S. Miller" , Eric Dumazet , Jakub Kicinski , Paolo Abeni , Kuniyuki Iwashima , Jens Axboe , Sasha Levin , Michal Luczaj , Rao Shoaib , Pavel Begunkov , linux-kernel@vger.kernel.org, netdev@vger.kernel.org +Cc: stable@vger.kernel.org +Message-ID: <20250521144803.2050504-21-lee@kernel.org> + +From: Kuniyuki Iwashima + +commit 4090fa373f0e763c43610853d2774b5979915959 upstream. + +If we find a dead SCC during iteration, we call unix_collect_skb() +to splice all skb in the SCC to the global sk_buff_head, hitlist. + +After iterating all SCC, we unlock unix_gc_lock and purge the queue. 
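
[The shape of this change is the classic collect-then-free pattern: unlink victims into a private list while holding the lock, then destroy them only after unlocking, because freeing an skb can drop the last socket reference and re-enter code that takes the same lock. A minimal userspace sketch, with a pthread mutex standing in for unix_gc_lock and all names illustrative:]

    #include <pthread.h>
    #include <stdlib.h>

    struct item { struct item *next; int dead; };

    static pthread_mutex_t gc_lock = PTHREAD_MUTEX_INITIALIZER;
    static struct item *inflight;           /* global list, like the SCCs */

    static void toy_gc(void)
    {
            struct item *hitlist = NULL, **pp, *it;

            pthread_mutex_lock(&gc_lock);
            for (pp = &inflight; (it = *pp) != NULL; ) {
                    if (it->dead) {
                            *pp = it->next;         /* unlink under the lock */
                            it->next = hitlist;     /* splice onto hitlist */
                            hitlist = it;
                    } else {
                            pp = &it->next;
                    }
            }
            pthread_mutex_unlock(&gc_lock);

            while ((it = hitlist) != NULL) {        /* purge outside the lock */
                    hitlist = it->next;
                    free(it);                       /* in the kernel, freeing
                                                     * can re-enter locked paths */
            }
    }

    int main(void)
    {
            for (int i = 0; i < 4; i++) {
                    struct item *it = malloc(sizeof(*it));
                    it->next = inflight;
                    it->dead = i & 1;               /* mark every other item dead */
                    inflight = it;
            }
            toy_gc();
            return 0;                               /* live items intentionally
                                                     * leaked in this sketch */
    }
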
+ +Signed-off-by: Kuniyuki Iwashima +Acked-by: Paolo Abeni +Link: https://lore.kernel.org/r/20240325202425.60930-15-kuniyu@amazon.com +Signed-off-by: Jakub Kicinski +Signed-off-by: Lee Jones +Signed-off-by: Greg Kroah-Hartman +--- + include/net/af_unix.h | 8 - + net/unix/af_unix.c | 12 - + net/unix/garbage.c | 318 ++++++++++---------------------------------------- + 3 files changed, 64 insertions(+), 274 deletions(-) + +--- a/include/net/af_unix.h ++++ b/include/net/af_unix.h +@@ -19,9 +19,6 @@ static inline struct unix_sock *unix_get + + extern spinlock_t unix_gc_lock; + extern unsigned int unix_tot_inflight; +- +-void unix_inflight(struct user_struct *user, struct file *fp); +-void unix_notinflight(struct user_struct *user, struct file *fp); + void unix_add_edges(struct scm_fp_list *fpl, struct unix_sock *receiver); + void unix_del_edges(struct scm_fp_list *fpl); + void unix_update_edges(struct unix_sock *receiver); +@@ -85,12 +82,7 @@ struct unix_sock { + struct sock *peer; + struct sock *listener; + struct unix_vertex *vertex; +- struct list_head link; +- unsigned long inflight; + spinlock_t lock; +- unsigned long gc_flags; +-#define UNIX_GC_CANDIDATE 0 +-#define UNIX_GC_MAYBE_CYCLE 1 + struct socket_wq peer_wq; + wait_queue_entry_t peer_wake; + struct scm_stat scm_stat; +--- a/net/unix/af_unix.c ++++ b/net/unix/af_unix.c +@@ -979,12 +979,10 @@ static struct sock *unix_create1(struct + sk->sk_destruct = unix_sock_destructor; + u = unix_sk(sk); + u->listener = NULL; +- u->inflight = 0; + u->vertex = NULL; + u->path.dentry = NULL; + u->path.mnt = NULL; + spin_lock_init(&u->lock); +- INIT_LIST_HEAD(&u->link); + mutex_init(&u->iolock); /* single task reading lock */ + mutex_init(&u->bindlock); /* single task binding lock */ + init_waitqueue_head(&u->peer_wait); +@@ -1770,8 +1768,6 @@ static inline bool too_many_unix_fds(str + + static int unix_attach_fds(struct scm_cookie *scm, struct sk_buff *skb) + { +- int i; +- + if (too_many_unix_fds(current)) + return -ETOOMANYREFS; + +@@ -1783,9 +1779,6 @@ static int unix_attach_fds(struct scm_co + if (!UNIXCB(skb).fp) + return -ENOMEM; + +- for (i = scm->fp->count - 1; i >= 0; i--) +- unix_inflight(scm->fp->user, scm->fp->fp[i]); +- + if (unix_prepare_fpl(UNIXCB(skb).fp)) + return -ENOMEM; + +@@ -1794,15 +1787,10 @@ static int unix_attach_fds(struct scm_co + + static void unix_detach_fds(struct scm_cookie *scm, struct sk_buff *skb) + { +- int i; +- + scm->fp = UNIXCB(skb).fp; + UNIXCB(skb).fp = NULL; + + unix_destroy_fpl(scm->fp); +- +- for (i = scm->fp->count - 1; i >= 0; i--) +- unix_notinflight(scm->fp->user, scm->fp->fp[i]); + } + + static void unix_peek_fds(struct scm_cookie *scm, struct sk_buff *skb) +--- a/net/unix/garbage.c ++++ b/net/unix/garbage.c +@@ -322,6 +322,52 @@ static bool unix_vertex_dead(struct unix + return true; + } + ++enum unix_recv_queue_lock_class { ++ U_RECVQ_LOCK_NORMAL, ++ U_RECVQ_LOCK_EMBRYO, ++}; ++ ++static void unix_collect_skb(struct list_head *scc, struct sk_buff_head *hitlist) ++{ ++ struct unix_vertex *vertex; ++ ++ list_for_each_entry_reverse(vertex, scc, scc_entry) { ++ struct sk_buff_head *queue; ++ struct unix_edge *edge; ++ struct unix_sock *u; ++ ++ edge = list_first_entry(&vertex->edges, typeof(*edge), vertex_entry); ++ u = edge->predecessor; ++ queue = &u->sk.sk_receive_queue; ++ ++ spin_lock(&queue->lock); ++ ++ if (u->sk.sk_state == TCP_LISTEN) { ++ struct sk_buff *skb; ++ ++ skb_queue_walk(queue, skb) { ++ struct sk_buff_head *embryo_queue = &skb->sk->sk_receive_queue; ++ ++ /* listener -> embryo 
order, the inversion never happens. */ ++ spin_lock_nested(&embryo_queue->lock, U_RECVQ_LOCK_EMBRYO); ++ skb_queue_splice_init(embryo_queue, hitlist); ++ spin_unlock(&embryo_queue->lock); ++ } ++ } else { ++ skb_queue_splice_init(queue, hitlist); ++ ++#if IS_ENABLED(CONFIG_AF_UNIX_OOB) ++ if (u->oob_skb) { ++ kfree_skb(u->oob_skb); ++ u->oob_skb = NULL; ++ } ++#endif ++ } ++ ++ spin_unlock(&queue->lock); ++ } ++} ++ + static bool unix_scc_cyclic(struct list_head *scc) + { + struct unix_vertex *vertex; +@@ -345,7 +391,8 @@ static bool unix_scc_cyclic(struct list_ + static LIST_HEAD(unix_visited_vertices); + static unsigned long unix_vertex_grouped_index = UNIX_VERTEX_INDEX_MARK2; + +-static void __unix_walk_scc(struct unix_vertex *vertex, unsigned long *last_index) ++static void __unix_walk_scc(struct unix_vertex *vertex, unsigned long *last_index, ++ struct sk_buff_head *hitlist) + { + LIST_HEAD(vertex_stack); + struct unix_edge *edge; +@@ -430,7 +477,9 @@ prev_vertex: + scc_dead = unix_vertex_dead(vertex); + } + +- if (!unix_graph_maybe_cyclic) ++ if (scc_dead) ++ unix_collect_skb(&scc, hitlist); ++ else if (!unix_graph_maybe_cyclic) + unix_graph_maybe_cyclic = unix_scc_cyclic(&scc); + + list_del(&scc); +@@ -441,7 +490,7 @@ prev_vertex: + goto prev_vertex; + } + +-static void unix_walk_scc(void) ++static void unix_walk_scc(struct sk_buff_head *hitlist) + { + unsigned long last_index = UNIX_VERTEX_INDEX_START; + +@@ -454,7 +503,7 @@ static void unix_walk_scc(void) + struct unix_vertex *vertex; + + vertex = list_first_entry(&unix_unvisited_vertices, typeof(*vertex), entry); +- __unix_walk_scc(vertex, &last_index); ++ __unix_walk_scc(vertex, &last_index, hitlist); + } + + list_replace_init(&unix_visited_vertices, &unix_unvisited_vertices); +@@ -463,7 +512,7 @@ static void unix_walk_scc(void) + unix_graph_grouped = true; + } + +-static void unix_walk_scc_fast(void) ++static void unix_walk_scc_fast(struct sk_buff_head *hitlist) + { + while (!list_empty(&unix_unvisited_vertices)) { + struct unix_vertex *vertex; +@@ -480,279 +529,40 @@ static void unix_walk_scc_fast(void) + scc_dead = unix_vertex_dead(vertex); + } + ++ if (scc_dead) ++ unix_collect_skb(&scc, hitlist); ++ + list_del(&scc); + } + + list_replace_init(&unix_visited_vertices, &unix_unvisited_vertices); + } + +-static LIST_HEAD(gc_candidates); +-static LIST_HEAD(gc_inflight_list); +- +-/* Keep the number of times in flight count for the file +- * descriptor if it is for an AF_UNIX socket. +- */ +-void unix_inflight(struct user_struct *user, struct file *filp) +-{ +- struct unix_sock *u = unix_get_socket(filp); +- +- spin_lock(&unix_gc_lock); +- +- if (u) { +- if (!u->inflight) { +- WARN_ON_ONCE(!list_empty(&u->link)); +- list_add_tail(&u->link, &gc_inflight_list); +- } else { +- WARN_ON_ONCE(list_empty(&u->link)); +- } +- u->inflight++; +- } +- +- spin_unlock(&unix_gc_lock); +-} +- +-void unix_notinflight(struct user_struct *user, struct file *filp) +-{ +- struct unix_sock *u = unix_get_socket(filp); +- +- spin_lock(&unix_gc_lock); +- +- if (u) { +- WARN_ON_ONCE(!u->inflight); +- WARN_ON_ONCE(list_empty(&u->link)); +- +- u->inflight--; +- if (!u->inflight) +- list_del_init(&u->link); +- } +- +- spin_unlock(&unix_gc_lock); +-} +- +-static void scan_inflight(struct sock *x, void (*func)(struct unix_sock *), +- struct sk_buff_head *hitlist) +-{ +- struct sk_buff *skb; +- struct sk_buff *next; +- +- spin_lock(&x->sk_receive_queue.lock); +- skb_queue_walk_safe(&x->sk_receive_queue, skb, next) { +- /* Do we have file descriptors ? 
*/ +- if (UNIXCB(skb).fp) { +- bool hit = false; +- /* Process the descriptors of this socket */ +- int nfd = UNIXCB(skb).fp->count; +- struct file **fp = UNIXCB(skb).fp->fp; +- +- while (nfd--) { +- /* Get the socket the fd matches if it indeed does so */ +- struct unix_sock *u = unix_get_socket(*fp++); +- +- /* Ignore non-candidates, they could have been added +- * to the queues after starting the garbage collection +- */ +- if (u && test_bit(UNIX_GC_CANDIDATE, &u->gc_flags)) { +- hit = true; +- +- func(u); +- } +- } +- if (hit && hitlist != NULL) { +- __skb_unlink(skb, &x->sk_receive_queue); +- __skb_queue_tail(hitlist, skb); +- } +- } +- } +- spin_unlock(&x->sk_receive_queue.lock); +-} +- +-static void scan_children(struct sock *x, void (*func)(struct unix_sock *), +- struct sk_buff_head *hitlist) +-{ +- if (x->sk_state != TCP_LISTEN) { +- scan_inflight(x, func, hitlist); +- } else { +- struct sk_buff *skb; +- struct sk_buff *next; +- struct unix_sock *u; +- LIST_HEAD(embryos); +- +- /* For a listening socket collect the queued embryos +- * and perform a scan on them as well. +- */ +- spin_lock(&x->sk_receive_queue.lock); +- skb_queue_walk_safe(&x->sk_receive_queue, skb, next) { +- u = unix_sk(skb->sk); +- +- /* An embryo cannot be in-flight, so it's safe +- * to use the list link. +- */ +- WARN_ON_ONCE(!list_empty(&u->link)); +- list_add_tail(&u->link, &embryos); +- } +- spin_unlock(&x->sk_receive_queue.lock); +- +- while (!list_empty(&embryos)) { +- u = list_entry(embryos.next, struct unix_sock, link); +- scan_inflight(&u->sk, func, hitlist); +- list_del_init(&u->link); +- } +- } +-} +- +-static void dec_inflight(struct unix_sock *usk) +-{ +- usk->inflight--; +-} +- +-static void inc_inflight(struct unix_sock *usk) +-{ +- usk->inflight++; +-} +- +-static void inc_inflight_move_tail(struct unix_sock *u) +-{ +- u->inflight++; +- +- /* If this still might be part of a cycle, move it to the end +- * of the list, so that it's checked even if it was already +- * passed over +- */ +- if (test_bit(UNIX_GC_MAYBE_CYCLE, &u->gc_flags)) +- list_move_tail(&u->link, &gc_candidates); +-} +- + static bool gc_in_progress; + + static void __unix_gc(struct work_struct *work) + { + struct sk_buff_head hitlist; +- struct unix_sock *u, *next; +- LIST_HEAD(not_cycle_list); +- struct list_head cursor; + + spin_lock(&unix_gc_lock); + +- if (!unix_graph_maybe_cyclic) ++ if (!unix_graph_maybe_cyclic) { ++ spin_unlock(&unix_gc_lock); + goto skip_gc; +- +- if (unix_graph_grouped) +- unix_walk_scc_fast(); +- else +- unix_walk_scc(); +- +- /* First, select candidates for garbage collection. Only +- * in-flight sockets are considered, and from those only ones +- * which don't have any external reference. +- * +- * Holding unix_gc_lock will protect these candidates from +- * being detached, and hence from gaining an external +- * reference. Since there are no possible receivers, all +- * buffers currently on the candidates' queues stay there +- * during the garbage collection. +- * +- * We also know that no new candidate can be added onto the +- * receive queues. Other, non candidate sockets _can_ be +- * added to queue, so we must make sure only to touch +- * candidates. +- * +- * Embryos, though never candidates themselves, affect which +- * candidates are reachable by the garbage collector. Before +- * being added to a listener's queue, an embryo may already +- * receive data carrying SCM_RIGHTS, potentially making the +- * passed socket a candidate that is not yet reachable by the +- * collector. 
It becomes reachable once the embryo is +- * enqueued. Therefore, we must ensure that no SCM-laden +- * embryo appears in a (candidate) listener's queue between +- * consecutive scan_children() calls. +- */ +- list_for_each_entry_safe(u, next, &gc_inflight_list, link) { +- struct sock *sk = &u->sk; +- long total_refs; +- +- total_refs = file_count(sk->sk_socket->file); +- +- WARN_ON_ONCE(!u->inflight); +- WARN_ON_ONCE(total_refs < u->inflight); +- if (total_refs == u->inflight) { +- list_move_tail(&u->link, &gc_candidates); +- __set_bit(UNIX_GC_CANDIDATE, &u->gc_flags); +- __set_bit(UNIX_GC_MAYBE_CYCLE, &u->gc_flags); +- +- if (sk->sk_state == TCP_LISTEN) { +- unix_state_lock_nested(sk, U_LOCK_GC_LISTENER); +- unix_state_unlock(sk); +- } +- } +- } +- +- /* Now remove all internal in-flight reference to children of +- * the candidates. +- */ +- list_for_each_entry(u, &gc_candidates, link) +- scan_children(&u->sk, dec_inflight, NULL); +- +- /* Restore the references for children of all candidates, +- * which have remaining references. Do this recursively, so +- * only those remain, which form cyclic references. +- * +- * Use a "cursor" link, to make the list traversal safe, even +- * though elements might be moved about. +- */ +- list_add(&cursor, &gc_candidates); +- while (cursor.next != &gc_candidates) { +- u = list_entry(cursor.next, struct unix_sock, link); +- +- /* Move cursor to after the current position. */ +- list_move(&cursor, &u->link); +- +- if (u->inflight) { +- list_move_tail(&u->link, ¬_cycle_list); +- __clear_bit(UNIX_GC_MAYBE_CYCLE, &u->gc_flags); +- scan_children(&u->sk, inc_inflight_move_tail, NULL); +- } + } +- list_del(&cursor); + +- /* Now gc_candidates contains only garbage. Restore original +- * inflight counters for these as well, and remove the skbuffs +- * which are creating the cycle(s). +- */ +- skb_queue_head_init(&hitlist); +- list_for_each_entry(u, &gc_candidates, link) { +- scan_children(&u->sk, inc_inflight, &hitlist); +- +-#if IS_ENABLED(CONFIG_AF_UNIX_OOB) +- if (u->oob_skb) { +- kfree_skb(u->oob_skb); +- u->oob_skb = NULL; +- } +-#endif +- } ++ __skb_queue_head_init(&hitlist); + +- /* not_cycle_list contains those sockets which do not make up a +- * cycle. Restore these to the inflight list. +- */ +- while (!list_empty(¬_cycle_list)) { +- u = list_entry(not_cycle_list.next, struct unix_sock, link); +- __clear_bit(UNIX_GC_CANDIDATE, &u->gc_flags); +- list_move_tail(&u->link, &gc_inflight_list); +- } ++ if (unix_graph_grouped) ++ unix_walk_scc_fast(&hitlist); ++ else ++ unix_walk_scc(&hitlist); + + spin_unlock(&unix_gc_lock); + +- /* Here we are. Hitlist is filled. Die. */ + __skb_queue_purge(&hitlist); +- +- spin_lock(&unix_gc_lock); +- +- /* All candidates should have been detached by now. */ +- WARN_ON_ONCE(!list_empty(&gc_candidates)); + skip_gc: +- /* Paired with READ_ONCE() in wait_for_unix_gc(). */ + WRITE_ONCE(gc_in_progress, false); +- +- spin_unlock(&unix_gc_lock); + } + + static DECLARE_WORK(unix_gc_work, __unix_gc); diff --git a/queue-6.6/af_unix-return-struct-unix_sock-from-unix_get_socket.patch b/queue-6.6/af_unix-return-struct-unix_sock-from-unix_get_socket.patch new file mode 100644 index 0000000000..d669ef5f70 --- /dev/null +++ b/queue-6.6/af_unix-return-struct-unix_sock-from-unix_get_socket.patch @@ -0,0 +1,129 @@ +From stable+bounces-145848-greg=kroah.com@vger.kernel.org Wed May 21 16:50:06 2025 +From: Lee Jones +Date: Wed, 21 May 2025 14:45:09 +0000 +Subject: af_unix: Return struct unix_sock from unix_get_socket(). 
+To: lee@kernel.org, "David S. Miller" , Eric Dumazet , Jakub Kicinski , Paolo Abeni , Kuniyuki Iwashima , Jens Axboe , Sasha Levin , Michal Luczaj , Rao Shoaib , Pavel Begunkov , linux-kernel@vger.kernel.org, netdev@vger.kernel.org +Cc: stable@vger.kernel.org, Simon Horman +Message-ID: <20250521144803.2050504-2-lee@kernel.org> + +From: Kuniyuki Iwashima + +commit 5b17307bd0789edea0675d524a2b277b93bbde62 upstream. + +Currently, unix_get_socket() returns struct sock, but after calling +it, we always cast it to unix_sk(). + +Let's return struct unix_sock from unix_get_socket(). + +Signed-off-by: Kuniyuki Iwashima +Acked-by: Pavel Begunkov +Reviewed-by: Simon Horman +Link: https://lore.kernel.org/r/20240123170856.41348-4-kuniyu@amazon.com +Signed-off-by: Jakub Kicinski +Signed-off-by: Lee Jones +Signed-off-by: Greg Kroah-Hartman +--- + include/net/af_unix.h | 2 +- + net/unix/garbage.c | 19 +++++++------------ + net/unix/scm.c | 19 +++++++------------ + 3 files changed, 15 insertions(+), 25 deletions(-) + +--- a/include/net/af_unix.h ++++ b/include/net/af_unix.h +@@ -14,7 +14,7 @@ void unix_destruct_scm(struct sk_buff *s + void io_uring_destruct_scm(struct sk_buff *skb); + void unix_gc(void); + void wait_for_unix_gc(void); +-struct sock *unix_get_socket(struct file *filp); ++struct unix_sock *unix_get_socket(struct file *filp); + struct sock *unix_peer_get(struct sock *sk); + + #define UNIX_HASH_MOD (256 - 1) +--- a/net/unix/garbage.c ++++ b/net/unix/garbage.c +@@ -105,20 +105,15 @@ static void scan_inflight(struct sock *x + + while (nfd--) { + /* Get the socket the fd matches if it indeed does so */ +- struct sock *sk = unix_get_socket(*fp++); ++ struct unix_sock *u = unix_get_socket(*fp++); + +- if (sk) { +- struct unix_sock *u = unix_sk(sk); ++ /* Ignore non-candidates, they could have been added ++ * to the queues after starting the garbage collection ++ */ ++ if (u && test_bit(UNIX_GC_CANDIDATE, &u->gc_flags)) { ++ hit = true; + +- /* Ignore non-candidates, they could +- * have been added to the queues after +- * starting the garbage collection +- */ +- if (test_bit(UNIX_GC_CANDIDATE, &u->gc_flags)) { +- hit = true; +- +- func(u); +- } ++ func(u); + } + } + if (hit && hitlist != NULL) { +--- a/net/unix/scm.c ++++ b/net/unix/scm.c +@@ -21,9 +21,8 @@ EXPORT_SYMBOL(gc_inflight_list); + DEFINE_SPINLOCK(unix_gc_lock); + EXPORT_SYMBOL(unix_gc_lock); + +-struct sock *unix_get_socket(struct file *filp) ++struct unix_sock *unix_get_socket(struct file *filp) + { +- struct sock *u_sock = NULL; + struct inode *inode = file_inode(filp); + + /* Socket ? */ +@@ -34,10 +33,10 @@ struct sock *unix_get_socket(struct file + + /* PF_UNIX ? 
*/ + if (s && ops && ops->family == PF_UNIX) +- u_sock = s; ++ return unix_sk(s); + } + +- return u_sock; ++ return NULL; + } + EXPORT_SYMBOL(unix_get_socket); + +@@ -46,13 +45,11 @@ EXPORT_SYMBOL(unix_get_socket); + */ + void unix_inflight(struct user_struct *user, struct file *fp) + { +- struct sock *s = unix_get_socket(fp); ++ struct unix_sock *u = unix_get_socket(fp); + + spin_lock(&unix_gc_lock); + +- if (s) { +- struct unix_sock *u = unix_sk(s); +- ++ if (u) { + if (!u->inflight) { + BUG_ON(!list_empty(&u->link)); + list_add_tail(&u->link, &gc_inflight_list); +@@ -69,13 +66,11 @@ void unix_inflight(struct user_struct *u + + void unix_notinflight(struct user_struct *user, struct file *fp) + { +- struct sock *s = unix_get_socket(fp); ++ struct unix_sock *u = unix_get_socket(fp); + + spin_lock(&unix_gc_lock); + +- if (s) { +- struct unix_sock *u = unix_sk(s); +- ++ if (u) { + BUG_ON(!u->inflight); + BUG_ON(list_empty(&u->link)); + diff --git a/queue-6.6/af_unix-run-gc-on-only-one-cpu.patch b/queue-6.6/af_unix-run-gc-on-only-one-cpu.patch new file mode 100644 index 0000000000..e9610ea9ff --- /dev/null +++ b/queue-6.6/af_unix-run-gc-on-only-one-cpu.patch @@ -0,0 +1,131 @@ +From stable+bounces-145849-greg=kroah.com@vger.kernel.org Wed May 21 16:50:23 2025 +From: Lee Jones +Date: Wed, 21 May 2025 14:45:10 +0000 +Subject: af_unix: Run GC on only one CPU. +To: lee@kernel.org, "David S. Miller" , Eric Dumazet , Jakub Kicinski , Paolo Abeni , Kuniyuki Iwashima , Jens Axboe , Sasha Levin , Michal Luczaj , Rao Shoaib , Simon Horman , linux-kernel@vger.kernel.org, netdev@vger.kernel.org +Cc: stable@vger.kernel.org +Message-ID: <20250521144803.2050504-3-lee@kernel.org> + +From: Kuniyuki Iwashima + +commit 8b90a9f819dc2a06baae4ec1a64d875e53b824ec upstream. + +If more than 16000 inflight AF_UNIX sockets exist and the garbage +collector is not running, unix_(dgram|stream)_sendmsg() call unix_gc(). +Also, they wait for unix_gc() to complete. + +In unix_gc(), all inflight AF_UNIX sockets are traversed at least once, +and more if they are the GC candidate. Thus, sendmsg() significantly +slows down with too many inflight AF_UNIX sockets. + +There is a small window to invoke multiple unix_gc() instances, which +will then be blocked by the same spinlock except for one. + +Let's convert unix_gc() to use struct work so that it will not consume +CPUs unnecessarily. + +Note WRITE_ONCE(gc_in_progress, true) is moved before running GC. +If we leave the WRITE_ONCE() as is and use the following test to +call flush_work(), a process might not call it. + + CPU 0 CPU 1 + --- --- + start work and call __unix_gc() + if (work_pending(&unix_gc_work) || <-- false + READ_ONCE(gc_in_progress)) <-- false + flush_work(); <-- missed! 
+ WRITE_ONCE(gc_in_progress, true) + +Signed-off-by: Kuniyuki Iwashima +Link: https://lore.kernel.org/r/20240123170856.41348-5-kuniyu@amazon.com +Signed-off-by: Jakub Kicinski +Signed-off-by: Lee Jones +Signed-off-by: Greg Kroah-Hartman +--- + net/unix/garbage.c | 54 ++++++++++++++++++++++++++--------------------------- + 1 file changed, 27 insertions(+), 27 deletions(-) + +--- a/net/unix/garbage.c ++++ b/net/unix/garbage.c +@@ -86,7 +86,6 @@ + /* Internal data structures and random procedures: */ + + static LIST_HEAD(gc_candidates); +-static DECLARE_WAIT_QUEUE_HEAD(unix_gc_wait); + + static void scan_inflight(struct sock *x, void (*func)(struct unix_sock *), + struct sk_buff_head *hitlist) +@@ -182,23 +181,8 @@ static void inc_inflight_move_tail(struc + } + + static bool gc_in_progress; +-#define UNIX_INFLIGHT_TRIGGER_GC 16000 +- +-void wait_for_unix_gc(void) +-{ +- /* If number of inflight sockets is insane, +- * force a garbage collect right now. +- * Paired with the WRITE_ONCE() in unix_inflight(), +- * unix_notinflight() and gc_in_progress(). +- */ +- if (READ_ONCE(unix_tot_inflight) > UNIX_INFLIGHT_TRIGGER_GC && +- !READ_ONCE(gc_in_progress)) +- unix_gc(); +- wait_event(unix_gc_wait, !READ_ONCE(gc_in_progress)); +-} + +-/* The external entry point: unix_gc() */ +-void unix_gc(void) ++static void __unix_gc(struct work_struct *work) + { + struct sk_buff *next_skb, *skb; + struct unix_sock *u; +@@ -209,13 +193,6 @@ void unix_gc(void) + + spin_lock(&unix_gc_lock); + +- /* Avoid a recursive GC. */ +- if (gc_in_progress) +- goto out; +- +- /* Paired with READ_ONCE() in wait_for_unix_gc(). */ +- WRITE_ONCE(gc_in_progress, true); +- + /* First, select candidates for garbage collection. Only + * in-flight sockets are considered, and from those only ones + * which don't have any external reference. +@@ -346,8 +323,31 @@ void unix_gc(void) + /* Paired with READ_ONCE() in wait_for_unix_gc(). */ + WRITE_ONCE(gc_in_progress, false); + +- wake_up(&unix_gc_wait); +- +- out: + spin_unlock(&unix_gc_lock); + } ++ ++static DECLARE_WORK(unix_gc_work, __unix_gc); ++ ++void unix_gc(void) ++{ ++ WRITE_ONCE(gc_in_progress, true); ++ queue_work(system_unbound_wq, &unix_gc_work); ++} ++ ++#define UNIX_INFLIGHT_TRIGGER_GC 16000 ++ ++void wait_for_unix_gc(void) ++{ ++ /* If number of inflight sockets is insane, ++ * force a garbage collect right now. ++ * ++ * Paired with the WRITE_ONCE() in unix_inflight(), ++ * unix_notinflight(), and __unix_gc(). ++ */ ++ if (READ_ONCE(unix_tot_inflight) > UNIX_INFLIGHT_TRIGGER_GC && ++ !READ_ONCE(gc_in_progress)) ++ unix_gc(); ++ ++ if (READ_ONCE(gc_in_progress)) ++ flush_work(&unix_gc_work); ++} diff --git a/queue-6.6/af_unix-save-listener-for-embryo-socket.patch b/queue-6.6/af_unix-save-listener-for-embryo-socket.patch new file mode 100644 index 0000000000..3e449bd79a --- /dev/null +++ b/queue-6.6/af_unix-save-listener-for-embryo-socket.patch @@ -0,0 +1,79 @@ +From stable+bounces-145860-greg=kroah.com@vger.kernel.org Wed May 21 16:55:46 2025 +From: Lee Jones +Date: Wed, 21 May 2025 14:45:21 +0000 +Subject: af_unix: Save listener for embryo socket. +To: lee@kernel.org, "David S. Miller" , Eric Dumazet , Jakub Kicinski , Paolo Abeni , Kuniyuki Iwashima , Jens Axboe , Sasha Levin , Michal Luczaj , Rao Shoaib , Pavel Begunkov , linux-kernel@vger.kernel.org, netdev@vger.kernel.org +Cc: stable@vger.kernel.org +Message-ID: <20250521144803.2050504-14-lee@kernel.org> + +From: Kuniyuki Iwashima + +commit aed6ecef55d70de3762ce41c561b7f547dbaf107 upstream. 
+ +This is a prep patch for the following change, where we need to +fetch the listening socket from the successor embryo socket +during GC. + +We add a new field to struct unix_sock to save a pointer to a +listening socket. + +We set it when connect() creates a new socket, and clear it when +accept() is called. + +Signed-off-by: Kuniyuki Iwashima +Acked-by: Paolo Abeni +Link: https://lore.kernel.org/r/20240325202425.60930-8-kuniyu@amazon.com +Signed-off-by: Jakub Kicinski +Signed-off-by: Lee Jones +Signed-off-by: Greg Kroah-Hartman +--- + include/net/af_unix.h | 1 + + net/unix/af_unix.c | 5 ++++- + 2 files changed, 5 insertions(+), 1 deletion(-) + +--- a/include/net/af_unix.h ++++ b/include/net/af_unix.h +@@ -83,6 +83,7 @@ struct unix_sock { + struct path path; + struct mutex iolock, bindlock; + struct sock *peer; ++ struct sock *listener; + struct unix_vertex *vertex; + struct list_head link; + unsigned long inflight; +--- a/net/unix/af_unix.c ++++ b/net/unix/af_unix.c +@@ -978,6 +978,7 @@ static struct sock *unix_create1(struct + sk->sk_max_ack_backlog = READ_ONCE(net->unx.sysctl_max_dgram_qlen); + sk->sk_destruct = unix_sock_destructor; + u = unix_sk(sk); ++ u->listener = NULL; + u->inflight = 0; + u->vertex = NULL; + u->path.dentry = NULL; +@@ -1582,6 +1583,7 @@ restart: + newsk->sk_type = sk->sk_type; + init_peercred(newsk); + newu = unix_sk(newsk); ++ newu->listener = other; + RCU_INIT_POINTER(newsk->sk_wq, &newu->peer_wq); + otheru = unix_sk(other); + +@@ -1677,8 +1679,8 @@ static int unix_accept(struct socket *so + bool kern) + { + struct sock *sk = sock->sk; +- struct sock *tsk; + struct sk_buff *skb; ++ struct sock *tsk; + int err; + + err = -EOPNOTSUPP; +@@ -1703,6 +1705,7 @@ static int unix_accept(struct socket *so + } + + tsk = skb->sk; ++ unix_sk(tsk)->listener = NULL; + skb_free_datagram(sk, skb); + wake_up_interruptible(&unix_sk(sk)->peer_wait); + diff --git a/queue-6.6/af_unix-save-o-n-setup-of-tarjan-s-algo.patch b/queue-6.6/af_unix-save-o-n-setup-of-tarjan-s-algo.patch new file mode 100644 index 0000000000..64a019f4e3 --- /dev/null +++ b/queue-6.6/af_unix-save-o-n-setup-of-tarjan-s-algo.patch @@ -0,0 +1,158 @@ +From stable+bounces-145862-greg=kroah.com@vger.kernel.org Wed May 21 16:54:49 2025 +From: Lee Jones +Date: Wed, 21 May 2025 14:45:23 +0000 +Subject: af_unix: Save O(n) setup of Tarjan's algo. +To: lee@kernel.org, "David S. Miller" , Eric Dumazet , Jakub Kicinski , Paolo Abeni , Kuniyuki Iwashima , Jens Axboe , Sasha Levin , Michal Luczaj , Rao Shoaib , Simon Horman , linux-kernel@vger.kernel.org, netdev@vger.kernel.org +Cc: stable@vger.kernel.org +Message-ID: <20250521144803.2050504-16-lee@kernel.org> + +From: Kuniyuki Iwashima + +commit ba31b4a4e1018f5844c6eb31734976e2184f2f9a upstream. + +Before starting Tarjan's algorithm, we need to mark all vertices +as unvisited. We can save this O(n) setup by reserving two special +indices (0, 1) and using two variables. + +The first time we link a vertex to unix_unvisited_vertices, we set +unix_vertex_unvisited_index to index. + +During DFS, we can see that the index of unvisited vertices is the +same as unix_vertex_unvisited_index. + +When we finalise SCC later, we set unix_vertex_grouped_index to each +vertex's index. + +Then, we can know (i) that the vertex is on the stack if the index +of a visited vertex is >= 2 and (ii) that it is not on the stack and +belongs to a different SCC if the index is unix_vertex_grouped_index. + +After the whole algorithm, all indices of vertices are set as +unix_vertex_grouped_index. 
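+
+For illustration only (a simplified pseudo-helper sketched here, not
+code this patch adds), the resulting on-stack test can be read off
+the index alone:
+
+  /* Within one unix_walk_scc() pass, a vertex is in exactly one
+   * state: unvisited, on vertex_stack, or grouped into an SCC.
+   */
+  static bool unix_vertex_on_stack(const struct unix_vertex *vertex)
+  {
+      return vertex->index != unix_vertex_unvisited_index &&
+             vertex->index != unix_vertex_grouped_index;
+  }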
+ +Next time we start DFS, we know that all unvisited vertices have +unix_vertex_grouped_index, and we can use unix_vertex_unvisited_index +as the not-on-stack marker. + +To use the same variable in __unix_walk_scc(), we can swap +unix_vertex_(grouped|unvisited)_index at the end of Tarjan's +algorithm. + +Signed-off-by: Kuniyuki Iwashima +Acked-by: Paolo Abeni +Link: https://lore.kernel.org/r/20240325202425.60930-10-kuniyu@amazon.com +Signed-off-by: Jakub Kicinski +Signed-off-by: Lee Jones +Signed-off-by: Greg Kroah-Hartman +--- + include/net/af_unix.h | 1 - + net/unix/garbage.c | 26 +++++++++++++++----------- + 2 files changed, 15 insertions(+), 12 deletions(-) + +--- a/include/net/af_unix.h ++++ b/include/net/af_unix.h +@@ -37,7 +37,6 @@ struct unix_vertex { + unsigned long out_degree; + unsigned long index; + unsigned long lowlink; +- bool on_stack; + }; + + struct unix_edge { +--- a/net/unix/garbage.c ++++ b/net/unix/garbage.c +@@ -115,16 +115,20 @@ static struct unix_vertex *unix_edge_suc + static LIST_HEAD(unix_unvisited_vertices); + + enum unix_vertex_index { +- UNIX_VERTEX_INDEX_UNVISITED, ++ UNIX_VERTEX_INDEX_MARK1, ++ UNIX_VERTEX_INDEX_MARK2, + UNIX_VERTEX_INDEX_START, + }; + ++static unsigned long unix_vertex_unvisited_index = UNIX_VERTEX_INDEX_MARK1; ++ + static void unix_add_edge(struct scm_fp_list *fpl, struct unix_edge *edge) + { + struct unix_vertex *vertex = edge->predecessor->vertex; + + if (!vertex) { + vertex = list_first_entry(&fpl->vertices, typeof(*vertex), entry); ++ vertex->index = unix_vertex_unvisited_index; + vertex->out_degree = 0; + INIT_LIST_HEAD(&vertex->edges); + +@@ -265,6 +269,7 @@ void unix_destroy_fpl(struct scm_fp_list + } + + static LIST_HEAD(unix_visited_vertices); ++static unsigned long unix_vertex_grouped_index = UNIX_VERTEX_INDEX_MARK2; + + static void __unix_walk_scc(struct unix_vertex *vertex) + { +@@ -274,10 +279,10 @@ static void __unix_walk_scc(struct unix_ + LIST_HEAD(edge_stack); + + next_vertex: +- /* Push vertex to vertex_stack. ++ /* Push vertex to vertex_stack and mark it as on-stack ++ * (index >= UNIX_VERTEX_INDEX_START). + * The vertex will be popped when finalising SCC later. + */ +- vertex->on_stack = true; + list_add(&vertex->scc_entry, &vertex_stack); + + vertex->index = index; +@@ -291,7 +296,7 @@ next_vertex: + if (!next_vertex) + continue; + +- if (next_vertex->index == UNIX_VERTEX_INDEX_UNVISITED) { ++ if (next_vertex->index == unix_vertex_unvisited_index) { + /* Iterative deepening depth first search + * + * 1. Push a forward edge to edge_stack and set +@@ -317,7 +322,7 @@ prev_vertex: + * to skip SCC finalisation. + */ + vertex->lowlink = min(vertex->lowlink, next_vertex->lowlink); +- } else if (next_vertex->on_stack) { ++ } else if (next_vertex->index != unix_vertex_grouped_index) { + /* Loop detected by a back/cross edge. + * + * The successor is on vertex_stack, so two vertices are +@@ -344,7 +349,8 @@ prev_vertex: + /* Don't restart DFS from this vertex in unix_walk_scc(). */ + list_move_tail(&vertex->entry, &unix_visited_vertices); + +- vertex->on_stack = false; ++ /* Mark vertex as off-stack. */ ++ vertex->index = unix_vertex_grouped_index; + } + + list_del(&scc); +@@ -357,20 +363,18 @@ prev_vertex: + + static void unix_walk_scc(void) + { +- struct unix_vertex *vertex; +- +- list_for_each_entry(vertex, &unix_unvisited_vertices, entry) +- vertex->index = UNIX_VERTEX_INDEX_UNVISITED; +- + /* Visit every vertex exactly once. + * __unix_walk_scc() moves visited vertices to unix_visited_vertices. 
+ */ + while (!list_empty(&unix_unvisited_vertices)) { ++ struct unix_vertex *vertex; ++ + vertex = list_first_entry(&unix_unvisited_vertices, typeof(*vertex), entry); + __unix_walk_scc(vertex); + } + + list_replace_init(&unix_visited_vertices, &unix_unvisited_vertices); ++ swap(unix_vertex_unvisited_index, unix_vertex_grouped_index); + } + + static LIST_HEAD(gc_candidates); diff --git a/queue-6.6/af_unix-skip-gc-if-no-cycle-exists.patch b/queue-6.6/af_unix-skip-gc-if-no-cycle-exists.patch new file mode 100644 index 0000000000..8cc410dcb0 --- /dev/null +++ b/queue-6.6/af_unix-skip-gc-if-no-cycle-exists.patch @@ -0,0 +1,157 @@ +From stable+bounces-145863-greg=kroah.com@vger.kernel.org Wed May 21 16:57:29 2025 +From: Lee Jones +Date: Wed, 21 May 2025 14:45:24 +0000 +Subject: af_unix: Skip GC if no cycle exists. +To: lee@kernel.org, "David S. Miller" , Eric Dumazet , Jakub Kicinski , Paolo Abeni , Kuniyuki Iwashima , Jens Axboe , Sasha Levin , Michal Luczaj , Rao Shoaib , Pavel Begunkov , linux-kernel@vger.kernel.org, netdev@vger.kernel.org +Cc: stable@vger.kernel.org +Message-ID: <20250521144803.2050504-17-lee@kernel.org> + +From: Kuniyuki Iwashima + +commit 77e5593aebba823bcbcf2c4b58b07efcd63933b8 upstream. + +We do not need to run GC if there is no possible cyclic reference. +We use unix_graph_maybe_cyclic to decide if we should run GC. + +If a fd of an AF_UNIX socket is passed to an already inflight AF_UNIX +socket, they could form a cyclic reference. Then, we set true to +unix_graph_maybe_cyclic and later run Tarjan's algorithm to group +them into SCC. + +Once we run Tarjan's algorithm, we are 100% sure whether cyclic +references exist or not. If there is no cycle, we set false to +unix_graph_maybe_cyclic and can skip the entire garbage collection +next time. + +When finalising SCC, we set true to unix_graph_maybe_cyclic if SCC +consists of multiple vertices. + +Even if SCC is a single vertex, a cycle might exist as self-fd passing. +Given the corner case is rare, we detect it by checking all edges of +the vertex and set true to unix_graph_maybe_cyclic. + +With this change, __unix_gc() is just a spin_lock() dance in the normal +usage. + +Signed-off-by: Kuniyuki Iwashima +Acked-by: Paolo Abeni +Link: https://lore.kernel.org/r/20240325202425.60930-11-kuniyu@amazon.com +Signed-off-by: Jakub Kicinski +Signed-off-by: Lee Jones +Signed-off-by: Greg Kroah-Hartman +--- + net/unix/garbage.c | 48 +++++++++++++++++++++++++++++++++++++++++++++++- + 1 file changed, 47 insertions(+), 1 deletion(-) + +--- a/net/unix/garbage.c ++++ b/net/unix/garbage.c +@@ -112,6 +112,19 @@ static struct unix_vertex *unix_edge_suc + return edge->successor->vertex; + } + ++static bool unix_graph_maybe_cyclic; ++ ++static void unix_update_graph(struct unix_vertex *vertex) ++{ ++ /* If the receiver socket is not inflight, no cyclic ++ * reference could be formed. 
++ */ ++ if (!vertex) ++ return; ++ ++ unix_graph_maybe_cyclic = true; ++} ++ + static LIST_HEAD(unix_unvisited_vertices); + + enum unix_vertex_index { +@@ -138,12 +151,16 @@ static void unix_add_edge(struct scm_fp_ + + vertex->out_degree++; + list_add_tail(&edge->vertex_entry, &vertex->edges); ++ ++ unix_update_graph(unix_edge_successor(edge)); + } + + static void unix_del_edge(struct scm_fp_list *fpl, struct unix_edge *edge) + { + struct unix_vertex *vertex = edge->predecessor->vertex; + ++ unix_update_graph(unix_edge_successor(edge)); ++ + list_del(&edge->vertex_entry); + vertex->out_degree--; + +@@ -227,6 +244,7 @@ out: + void unix_update_edges(struct unix_sock *receiver) + { + spin_lock(&unix_gc_lock); ++ unix_update_graph(unix_sk(receiver->listener)->vertex); + receiver->listener = NULL; + spin_unlock(&unix_gc_lock); + } +@@ -268,6 +286,26 @@ void unix_destroy_fpl(struct scm_fp_list + unix_free_vertices(fpl); + } + ++static bool unix_scc_cyclic(struct list_head *scc) ++{ ++ struct unix_vertex *vertex; ++ struct unix_edge *edge; ++ ++ /* SCC containing multiple vertices ? */ ++ if (!list_is_singular(scc)) ++ return true; ++ ++ vertex = list_first_entry(scc, typeof(*vertex), scc_entry); ++ ++ /* Self-reference or a embryo-listener circle ? */ ++ list_for_each_entry(edge, &vertex->edges, vertex_entry) { ++ if (unix_edge_successor(edge) == vertex) ++ return true; ++ } ++ ++ return false; ++} ++ + static LIST_HEAD(unix_visited_vertices); + static unsigned long unix_vertex_grouped_index = UNIX_VERTEX_INDEX_MARK2; + +@@ -353,6 +391,9 @@ prev_vertex: + vertex->index = unix_vertex_grouped_index; + } + ++ if (!unix_graph_maybe_cyclic) ++ unix_graph_maybe_cyclic = unix_scc_cyclic(&scc); ++ + list_del(&scc); + } + +@@ -363,6 +404,8 @@ prev_vertex: + + static void unix_walk_scc(void) + { ++ unix_graph_maybe_cyclic = false; ++ + /* Visit every vertex exactly once. + * __unix_walk_scc() moves visited vertices to unix_visited_vertices. + */ +@@ -524,6 +567,9 @@ static void __unix_gc(struct work_struct + + spin_lock(&unix_gc_lock); + ++ if (!unix_graph_maybe_cyclic) ++ goto skip_gc; ++ + unix_walk_scc(); + + /* First, select candidates for garbage collection. Only +@@ -633,7 +679,7 @@ static void __unix_gc(struct work_struct + + /* All candidates should have been detached by now. */ + WARN_ON_ONCE(!list_empty(&gc_candidates)); +- ++skip_gc: + /* Paired with READ_ONCE() in wait_for_unix_gc(). */ + WRITE_ONCE(gc_in_progress, false); + diff --git a/queue-6.6/af_unix-try-not-to-hold-unix_gc_lock-during-accept.patch b/queue-6.6/af_unix-try-not-to-hold-unix_gc_lock-during-accept.patch new file mode 100644 index 0000000000..6b0fc63e1a --- /dev/null +++ b/queue-6.6/af_unix-try-not-to-hold-unix_gc_lock-during-accept.patch @@ -0,0 +1,120 @@ +From stable+bounces-145869-greg=kroah.com@vger.kernel.org Wed May 21 17:00:36 2025 +From: Lee Jones +Date: Wed, 21 May 2025 14:45:30 +0000 +Subject: af_unix: Try not to hold unix_gc_lock during accept(). +To: lee@kernel.org, "David S. Miller" , Eric Dumazet , Jakub Kicinski , Paolo Abeni , Kuniyuki Iwashima , Jens Axboe , Sasha Levin , Michal Luczaj , Rao Shoaib , Pavel Begunkov , linux-kernel@vger.kernel.org, netdev@vger.kernel.org +Cc: stable@vger.kernel.org, kernel test robot +Message-ID: <20250521144803.2050504-23-lee@kernel.org> + +From: Kuniyuki Iwashima + +commit fd86344823b521149bb31d91eba900ba3525efa6 upstream. 
+ +Commit dcf70df2048d ("af_unix: Fix up unix_edge.successor for embryo +socket.") added spin_lock(&unix_gc_lock) in accept() path, and it +caused regression in a stress test as reported by kernel test robot. + +If the embryo socket is not part of the inflight graph, we need not +hold the lock. + +To decide that in O(1) time and avoid the regression in the normal +use case, + + 1. add a new stat unix_sk(sk)->scm_stat.nr_unix_fds + + 2. count the number of inflight AF_UNIX sockets in the receive + queue under unix_state_lock() + + 3. move unix_update_edges() call under unix_state_lock() + + 4. avoid locking if nr_unix_fds is 0 in unix_update_edges() + +Reported-by: kernel test robot +Closes: https://lore.kernel.org/oe-lkp/202404101427.92a08551-oliver.sang@intel.com +Signed-off-by: Kuniyuki Iwashima +Link: https://lore.kernel.org/r/20240413021928.20946-1-kuniyu@amazon.com +Signed-off-by: Paolo Abeni +Signed-off-by: Lee Jones +Signed-off-by: Greg Kroah-Hartman +--- + include/net/af_unix.h | 1 + + net/unix/af_unix.c | 2 +- + net/unix/garbage.c | 20 ++++++++++++++++---- + 3 files changed, 18 insertions(+), 5 deletions(-) + +--- a/include/net/af_unix.h ++++ b/include/net/af_unix.h +@@ -67,6 +67,7 @@ struct unix_skb_parms { + + struct scm_stat { + atomic_t nr_fds; ++ unsigned long nr_unix_fds; + }; + + #define UNIXCB(skb) (*(struct unix_skb_parms *)&((skb)->cb)) +--- a/net/unix/af_unix.c ++++ b/net/unix/af_unix.c +@@ -1703,12 +1703,12 @@ static int unix_accept(struct socket *so + } + + tsk = skb->sk; +- unix_update_edges(unix_sk(tsk)); + skb_free_datagram(sk, skb); + wake_up_interruptible(&unix_sk(sk)->peer_wait); + + /* attach accepted sock to socket */ + unix_state_lock(tsk); ++ unix_update_edges(unix_sk(tsk)); + newsock->state = SS_CONNECTED; + unix_sock_inherit_flags(sock, newsock); + sock_graft(tsk, newsock); +--- a/net/unix/garbage.c ++++ b/net/unix/garbage.c +@@ -209,6 +209,7 @@ void unix_add_edges(struct scm_fp_list * + unix_add_edge(fpl, edge); + } while (i < fpl->count_unix); + ++ receiver->scm_stat.nr_unix_fds += fpl->count_unix; + WRITE_ONCE(unix_tot_inflight, unix_tot_inflight + fpl->count_unix); + out: + WRITE_ONCE(fpl->user->unix_inflight, fpl->user->unix_inflight + fpl->count); +@@ -222,6 +223,7 @@ out: + + void unix_del_edges(struct scm_fp_list *fpl) + { ++ struct unix_sock *receiver; + int i = 0; + + spin_lock(&unix_gc_lock); +@@ -235,6 +237,8 @@ void unix_del_edges(struct scm_fp_list * + unix_del_edge(fpl, edge); + } while (i < fpl->count_unix); + ++ receiver = fpl->edges[0].successor; ++ receiver->scm_stat.nr_unix_fds -= fpl->count_unix; + WRITE_ONCE(unix_tot_inflight, unix_tot_inflight - fpl->count_unix); + out: + WRITE_ONCE(fpl->user->unix_inflight, fpl->user->unix_inflight - fpl->count); +@@ -246,10 +250,18 @@ out: + + void unix_update_edges(struct unix_sock *receiver) + { +- spin_lock(&unix_gc_lock); +- unix_update_graph(unix_sk(receiver->listener)->vertex); +- receiver->listener = NULL; +- spin_unlock(&unix_gc_lock); ++ /* nr_unix_fds is only updated under unix_state_lock(). ++ * If it's 0 here, the embryo socket is not part of the ++ * inflight graph, and GC will not see it, so no lock needed. 
++ */ ++ if (!receiver->scm_stat.nr_unix_fds) { ++ receiver->listener = NULL; ++ } else { ++ spin_lock(&unix_gc_lock); ++ unix_update_graph(unix_sk(receiver->listener)->vertex); ++ receiver->listener = NULL; ++ spin_unlock(&unix_gc_lock); ++ } + } + + int unix_prepare_fpl(struct scm_fp_list *fpl) diff --git a/queue-6.6/af_unix-try-to-run-gc-async.patch b/queue-6.6/af_unix-try-to-run-gc-async.patch new file mode 100644 index 0000000000..ecdf887769 --- /dev/null +++ b/queue-6.6/af_unix-try-to-run-gc-async.patch @@ -0,0 +1,200 @@ +From stable+bounces-145850-greg=kroah.com@vger.kernel.org Wed May 21 16:50:41 2025 +From: Lee Jones +Date: Wed, 21 May 2025 14:45:11 +0000 +Subject: af_unix: Try to run GC async. +To: lee@kernel.org, "David S. Miller" , Eric Dumazet , Jakub Kicinski , Paolo Abeni , Kuniyuki Iwashima , Jens Axboe , Sasha Levin , Michal Luczaj , Rao Shoaib , Simon Horman , linux-kernel@vger.kernel.org, netdev@vger.kernel.org +Cc: stable@vger.kernel.org +Message-ID: <20250521144803.2050504-4-lee@kernel.org> + +From: Kuniyuki Iwashima + +commit d9f21b3613337b55cc9d4a6ead484dca68475143 upstream. + +If more than 16000 inflight AF_UNIX sockets exist and the garbage +collector is not running, unix_(dgram|stream)_sendmsg() call unix_gc(). +Also, they wait for unix_gc() to complete. + +In unix_gc(), all inflight AF_UNIX sockets are traversed at least once, +and more if they are the GC candidate. Thus, sendmsg() significantly +slows down with too many inflight AF_UNIX sockets. + +However, if a process sends data with no AF_UNIX FD, the sendmsg() call +does not need to wait for GC. After this change, only the process that +meets the condition below will be blocked under such a situation. + + 1) cmsg contains AF_UNIX socket + 2) more than 32 AF_UNIX sent by the same user are still inflight + +Note that even a sendmsg() call that does not meet the condition but has +AF_UNIX FD will be blocked later in unix_scm_to_skb() by the spinlock, +but we allow that as a bonus for sane users. + +The results below are the time spent in unix_dgram_sendmsg() sending 1 +byte of data with no FD 4096 times on a host where 32K inflight AF_UNIX +sockets exist. + +Without series: the sane sendmsg() needs to wait gc unreasonably. + + $ sudo /usr/share/bcc/tools/funclatency -p 11165 unix_dgram_sendmsg + Tracing 1 functions for "unix_dgram_sendmsg"... Hit Ctrl-C to end. + ^C + nsecs : count distribution + [...] + 524288 -> 1048575 : 0 | | + 1048576 -> 2097151 : 3881 |****************************************| + 2097152 -> 4194303 : 214 |** | + 4194304 -> 8388607 : 1 | | + + avg = 1825567 nsecs, total: 7477526027 nsecs, count: 4096 + +With series: the sane sendmsg() can finish much faster. + + $ sudo /usr/share/bcc/tools/funclatency -p 8702 unix_dgram_sendmsg + Tracing 1 functions for "unix_dgram_sendmsg"... Hit Ctrl-C to end. + ^C + nsecs : count distribution + [...] 
+ 128 -> 255 : 0 | | + 256 -> 511 : 4092 |****************************************| + 512 -> 1023 : 2 | | + 1024 -> 2047 : 0 | | + 2048 -> 4095 : 0 | | + 4096 -> 8191 : 1 | | + 8192 -> 16383 : 1 | | + + avg = 410 nsecs, total: 1680510 nsecs, count: 4096 + +Signed-off-by: Kuniyuki Iwashima +Link: https://lore.kernel.org/r/20240123170856.41348-6-kuniyu@amazon.com +Signed-off-by: Jakub Kicinski +Signed-off-by: Lee Jones +Signed-off-by: Greg Kroah-Hartman +--- + include/net/af_unix.h | 12 ++++++++++-- + include/net/scm.h | 1 + + net/core/scm.c | 5 +++++ + net/unix/af_unix.c | 6 ++++-- + net/unix/garbage.c | 10 +++++++++- + 5 files changed, 29 insertions(+), 5 deletions(-) + +--- a/include/net/af_unix.h ++++ b/include/net/af_unix.h +@@ -8,13 +8,21 @@ + #include + #include + ++#if IS_ENABLED(CONFIG_UNIX) ++struct unix_sock *unix_get_socket(struct file *filp); ++#else ++static inline struct unix_sock *unix_get_socket(struct file *filp) ++{ ++ return NULL; ++} ++#endif ++ + void unix_inflight(struct user_struct *user, struct file *fp); + void unix_notinflight(struct user_struct *user, struct file *fp); + void unix_destruct_scm(struct sk_buff *skb); + void io_uring_destruct_scm(struct sk_buff *skb); + void unix_gc(void); +-void wait_for_unix_gc(void); +-struct unix_sock *unix_get_socket(struct file *filp); ++void wait_for_unix_gc(struct scm_fp_list *fpl); + struct sock *unix_peer_get(struct sock *sk); + + #define UNIX_HASH_MOD (256 - 1) +--- a/include/net/scm.h ++++ b/include/net/scm.h +@@ -24,6 +24,7 @@ struct scm_creds { + + struct scm_fp_list { + short count; ++ short count_unix; + short max; + struct user_struct *user; + struct file *fp[SCM_MAX_FD]; +--- a/net/core/scm.c ++++ b/net/core/scm.c +@@ -36,6 +36,7 @@ + #include + #include + #include ++#include + + + /* +@@ -85,6 +86,7 @@ static int scm_fp_copy(struct cmsghdr *c + return -ENOMEM; + *fplp = fpl; + fpl->count = 0; ++ fpl->count_unix = 0; + fpl->max = SCM_MAX_FD; + fpl->user = NULL; + } +@@ -109,6 +111,9 @@ static int scm_fp_copy(struct cmsghdr *c + fput(file); + return -EINVAL; + } ++ if (unix_get_socket(file)) ++ fpl->count_unix++; ++ + *fpp++ = file; + fpl->count++; + } +--- a/net/unix/af_unix.c ++++ b/net/unix/af_unix.c +@@ -1885,11 +1885,12 @@ static int unix_dgram_sendmsg(struct soc + long timeo; + int err; + +- wait_for_unix_gc(); + err = scm_send(sock, msg, &scm, false); + if (err < 0) + return err; + ++ wait_for_unix_gc(scm.fp); ++ + err = -EOPNOTSUPP; + if (msg->msg_flags&MSG_OOB) + goto out; +@@ -2157,11 +2158,12 @@ static int unix_stream_sendmsg(struct so + bool fds_sent = false; + int data_len; + +- wait_for_unix_gc(); + err = scm_send(sock, msg, &scm, false); + if (err < 0) + return err; + ++ wait_for_unix_gc(scm.fp); ++ + err = -EOPNOTSUPP; + if (msg->msg_flags & MSG_OOB) { + #if IS_ENABLED(CONFIG_AF_UNIX_OOB) +--- a/net/unix/garbage.c ++++ b/net/unix/garbage.c +@@ -335,8 +335,9 @@ void unix_gc(void) + } + + #define UNIX_INFLIGHT_TRIGGER_GC 16000 ++#define UNIX_INFLIGHT_SANE_USER (SCM_MAX_FD * 8) + +-void wait_for_unix_gc(void) ++void wait_for_unix_gc(struct scm_fp_list *fpl) + { + /* If number of inflight sockets is insane, + * force a garbage collect right now. +@@ -348,6 +349,13 @@ void wait_for_unix_gc(void) + !READ_ONCE(gc_in_progress)) + unix_gc(); + ++ /* Penalise users who want to send AF_UNIX sockets ++ * but whose sockets have not been received yet. 
++ */ ++ if (!fpl || !fpl->count_unix || ++ READ_ONCE(fpl->user->unix_inflight) < UNIX_INFLIGHT_SANE_USER) ++ return; ++ + if (READ_ONCE(gc_in_progress)) + flush_work(&unix_gc_work); + } diff --git a/queue-6.6/series b/queue-6.6/series index 19f974e8dd..d42c7b4d03 100644 --- a/queue-6.6/series +++ b/queue-6.6/series @@ -392,3 +392,29 @@ drm-gem-internally-test-import_attach-for-imported-objects.patch can-kvaser_pciefd-force-irq-edge-in-case-of-nested-irq.patch hrtimers-force-migrate-away-hrtimers-queued-after-cpuhp_ap_hrtimers_dying.patch btrfs-check-folio-mapping-after-unlock-in-relocate_one_folio.patch +af_unix-return-struct-unix_sock-from-unix_get_socket.patch +af_unix-run-gc-on-only-one-cpu.patch +af_unix-try-to-run-gc-async.patch +af_unix-replace-bug_on-with-warn_on_once.patch +af_unix-remove-io_uring-code-for-gc.patch +af_unix-remove-config_unix_scm.patch +af_unix-allocate-struct-unix_vertex-for-each-inflight-af_unix-fd.patch +af_unix-allocate-struct-unix_edge-for-each-inflight-af_unix-fd.patch +af_unix-link-struct-unix_edge-when-queuing-skb.patch +af_unix-bulk-update-unix_tot_inflight-unix_inflight-when-queuing-skb.patch +af_unix-iterate-all-vertices-by-dfs.patch +af_unix-detect-strongly-connected-components.patch +af_unix-save-listener-for-embryo-socket.patch +af_unix-fix-up-unix_edge.successor-for-embryo-socket.patch +af_unix-save-o-n-setup-of-tarjan-s-algo.patch +af_unix-skip-gc-if-no-cycle-exists.patch +af_unix-avoid-tarjan-s-algorithm-if-unnecessary.patch +af_unix-assign-a-unique-index-to-scc.patch +af_unix-detect-dead-scc.patch +af_unix-replace-garbage-collection-algorithm.patch +af_unix-remove-lock-dance-in-unix_peek_fds.patch +af_unix-try-not-to-hold-unix_gc_lock-during-accept.patch +af_unix-don-t-access-successor-in-unix_del_edges-during-gc.patch +af_unix-add-dead-flag-to-struct-scm_fp_list.patch +af_unix-fix-garbage-collection-of-embryos-carrying-oob-with-scm_rights.patch +af_unix-fix-uninit-value-in-__unix_walk_scc.patch