// SPDX-License-Identifier: GPL-2.0-or-later
/*
 * NET4:	Implementation of BSD Unix domain sockets.
 *
 * Authors:	Alan Cox, <alan@lxorguk.ukuu.org.uk>
 *
 * Fixes:
 *		Linus Torvalds	:	Assorted bug cures.
 *		Niibe Yutaka	:	async I/O support.
 *		Carsten Paeth	:	PF_UNIX check, address fixes.
 *		Alan Cox	:	Limit size of allocated blocks.
 *		Alan Cox	:	Fixed the stupid socketpair bug.
 *		Alan Cox	:	BSD compatibility fine tuning.
 *		Alan Cox	:	Fixed a bug in connect when interrupted.
 *		Alan Cox	:	Sorted out a proper draft version of
 *					file descriptor passing hacked up from
 *					BSD.
 *		Marty Leisner	:	Fixes to fd passing.
 *		Nick Nevin	:	recvmsg bugfix.
 *		Alan Cox	:	Started proper garbage collector.
 *		Heiko EiBfeldt	:	Missing verify_area check.
 *		Alan Cox	:	Started POSIXisms.
 *		Andreas Schwab	:	Replace inode by dentry for proper
 *					reference counting.
 *		Kirk Petersen	:	Made this a module.
 *		Christoph Rohland:	Elegant non-blocking accept/connect algorithm.
 *		Alexey Kuznetsov:	Repaired (I hope) bugs introduced
 *					by the above two patches.
 *		Andrea Arcangeli:	If possible we block in connect(2)
 *					if the max backlog of the listen socket
 *					has been reached. This won't break
 *					old apps and it will avoid huge amounts
 *					of sockets hashed (this is for unix_gc()
 *					performance reasons).
 *					Security fix that limits the max
 *					number of socks to 2*max_files and
 *					the number of skbs queueable in the
 *					dgram receiver.
 *		Artur Skawina	:	Hash function optimizations.
 *		Alexey Kuznetsov:	Full scale SMP. Lots of bugs are introduced 8)
 *		Malcolm Beattie	:	Set peercred for socketpair.
 *		Michal Ostrowski:	Module initialization cleanup.
 *		Arnaldo C. Melo	:	Remove MOD_{INC,DEC}_USE_COUNT,
 *					the core infrastructure is doing that
 *					for all net proto families now (2.5.69+).
 *
 * Known differences from reference BSD that was tested:
 *
 *	ECONNREFUSED is not returned from one end of a connected() socket to the
 *		other the moment one end closes.
 *	fstat() doesn't return st_dev=0, and gives the blksize as high water mark
 *		and a fake inode identifier (nor the BSD first socket fstat twice bug).
 *	accept() returns a path name even if the connecting socket has closed
 *		in the meantime (BSD loses the path and gives up).
 *	accept() returns 0 length path for an unbound connector. BSD returns 16
 *		and a null first byte in the path (but not for gethost/peername - BSD bug??).
 *	socketpair(...SOCK_RAW..) doesn't panic the kernel.
 *	BSD af_unix apparently has connect forgetting to block properly.
 *		(need to check this with the POSIX spec in detail)
 *
 * Differences from 2.0.0-11-... (ANK)
 *	Bug fixes and improvements.
 *		- client shutdown killed server socket.
 *		- removed all useless cli/sti pairs.
 *
 *	Semantic changes/extensions.
 *		- generic control message passing.
 *		- SCM_CREDENTIALS control message.
 *		- "Abstract" (not FS based) socket bindings.
 *		  Abstract names are sequences of bytes (not zero terminated)
 *		  started by 0, so that this name space does not intersect
 *		  with BSD names.
 */

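/* Illustrative userspace sketch (not part of this file's build; the abstract
 * name "example" below is made up): binding to an abstract address as
 * described above.  The name starts with a NUL byte and its length is passed
 * explicitly through addrlen, so it may contain embedded zeros and is never
 * NUL terminated.
 *
 *	#include <stddef.h>
 *	#include <string.h>
 *	#include <sys/socket.h>
 *	#include <sys/un.h>
 *
 *	struct sockaddr_un sun = { .sun_family = AF_UNIX };
 *	socklen_t len = offsetof(struct sockaddr_un, sun_path) + 1 + 7;
 *	int fd = socket(AF_UNIX, SOCK_STREAM, 0);
 *
 *	sun.sun_path[0] = '\0';
 *	memcpy(sun.sun_path + 1, "example", 7);
 *	bind(fd, (struct sockaddr *)&sun, len);
 *
 * getsockname() reports the same length back, and the name disappears
 * automatically once the last socket bound to it is closed.
 */
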
#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt

#include <linux/bpf-cgroup.h>
#include <linux/btf_ids.h>
#include <linux/dcache.h>
#include <linux/errno.h>
#include <linux/fcntl.h>
#include <linux/file.h>
#include <linux/filter.h>
#include <linux/fs_struct.h>
#include <linux/init.h>
#include <linux/kernel.h>
#include <linux/mount.h>
#include <linux/namei.h>
#include <linux/net.h>
#include <linux/pidfs.h>
#include <linux/poll.h>
#include <linux/proc_fs.h>
#include <linux/sched/signal.h>
#include <linux/security.h>
#include <linux/seq_file.h>
#include <linux/skbuff.h>
#include <linux/slab.h>
#include <linux/socket.h>
#include <linux/splice.h>
#include <linux/string.h>
#include <linux/uaccess.h>
#include <net/af_unix.h>
#include <net/net_namespace.h>
#include <net/tcp_states.h>
#include <uapi/linux/sockios.h>
#include <uapi/linux/termios.h>

static atomic_long_t unix_nr_socks;
static struct hlist_head bsd_socket_buckets[UNIX_HASH_SIZE / 2];
static spinlock_t bsd_socket_locks[UNIX_HASH_SIZE / 2];

/* SMP locking strategy:
 * hash table is protected with spinlock.
 * each socket state is protected by separate spinlock.
 */

#ifdef CONFIG_PROVE_LOCKING
#define cmp_ptr(l, r)	(((l) > (r)) - ((l) < (r)))

static int unix_table_lock_cmp_fn(const struct lockdep_map *a,
				  const struct lockdep_map *b)
{
	return cmp_ptr(a, b);
}

static int unix_state_lock_cmp_fn(const struct lockdep_map *_a,
				  const struct lockdep_map *_b)
{
	const struct unix_sock *a, *b;

	a = container_of(_a, struct unix_sock, lock.dep_map);
	b = container_of(_b, struct unix_sock, lock.dep_map);

	if (a->sk.sk_state == TCP_LISTEN) {
		/* unix_stream_connect(): Before the 2nd unix_state_lock(),
		 *
		 *   1. a is TCP_LISTEN.
		 *   3. concurrent connect(b -> a) must fail.
		 *
		 * Except for 2. & 3., the b's state can be any possible
		 * value due to concurrent connect() or listen().
		 *
		 * 2. is detected in debug_spin_lock_before(), and 3. cannot
		 * be expressed as lock_cmp_fn.
		 */
		switch (b->sk.sk_state) {
		case TCP_ESTABLISHED:
			return -1;
		}
	}

	/* Should never happen.  Just to be symmetric. */
	if (b->sk.sk_state == TCP_LISTEN) {
		switch (b->sk.sk_state) {
		case TCP_ESTABLISHED:
			return 1;
		}
	}

	/* unix_state_double_lock(): ascending address order. */
	return cmp_ptr(a, b);
}

static int unix_recvq_lock_cmp_fn(const struct lockdep_map *_a,
				  const struct lockdep_map *_b)
{
	const struct sock *a, *b;

	a = container_of(_a, struct sock, sk_receive_queue.lock.dep_map);
	b = container_of(_b, struct sock, sk_receive_queue.lock.dep_map);

	/* unix_collect_skb(): listener -> embryo order. */
	if (a->sk_state == TCP_LISTEN && unix_sk(b)->listener == a)
		return -1;

	/* Should never happen.  Just to be symmetric. */
	if (b->sk_state == TCP_LISTEN && unix_sk(a)->listener == b)
		return 1;

	return 0;
}
#endif

static unsigned int unix_unbound_hash(struct sock *sk)
{
	unsigned long hash = (unsigned long)sk;

	hash ^= hash >> 16;
	hash ^= hash >> 8;
	hash ^= sk->sk_type;

	return hash & UNIX_HASH_MOD;
}

static unsigned int unix_bsd_hash(struct inode *i)
{
	return i->i_ino & UNIX_HASH_MOD;
}

static unsigned int unix_abstract_hash(struct sockaddr_un *sunaddr,
				       int addr_len, int type)
{
	__wsum csum = csum_partial(sunaddr, addr_len, 0);
	unsigned int hash;

	hash = (__force unsigned int)csum_fold(csum);
	hash ^= hash >> 8;
	hash ^= type;

	return UNIX_HASH_MOD + 1 + (hash & UNIX_HASH_MOD);
}

static void unix_table_double_lock(struct net *net,
				   unsigned int hash1, unsigned int hash2)
{
	if (hash1 == hash2) {
		spin_lock(&net->unx.table.locks[hash1]);
		return;
	}

	if (hash1 > hash2)
		swap(hash1, hash2);

	spin_lock(&net->unx.table.locks[hash1]);
	spin_lock(&net->unx.table.locks[hash2]);
}

static void unix_table_double_unlock(struct net *net,
				     unsigned int hash1, unsigned int hash2)
{
	if (hash1 == hash2) {
		spin_unlock(&net->unx.table.locks[hash1]);
		return;
	}

	spin_unlock(&net->unx.table.locks[hash1]);
	spin_unlock(&net->unx.table.locks[hash2]);
}

#ifdef CONFIG_SECURITY_NETWORK
static void unix_get_secdata(struct scm_cookie *scm, struct sk_buff *skb)
{
	UNIXCB(skb).secid = scm->secid;
}

static inline void unix_set_secdata(struct scm_cookie *scm, struct sk_buff *skb)
{
	scm->secid = UNIXCB(skb).secid;
}

static inline bool unix_secdata_eq(struct scm_cookie *scm, struct sk_buff *skb)
{
	return (scm->secid == UNIXCB(skb).secid);
}
#else
static inline void unix_get_secdata(struct scm_cookie *scm, struct sk_buff *skb)
{ }

static inline void unix_set_secdata(struct scm_cookie *scm, struct sk_buff *skb)
{ }

static inline bool unix_secdata_eq(struct scm_cookie *scm, struct sk_buff *skb)
{
	return true;
}
#endif /* CONFIG_SECURITY_NETWORK */

static inline int unix_may_send(struct sock *sk, struct sock *osk)
{
	return !unix_peer(osk) || unix_peer(osk) == sk;
}

static inline int unix_recvq_full_lockless(const struct sock *sk)
{
	return skb_queue_len_lockless(&sk->sk_receive_queue) > sk->sk_max_ack_backlog;
}

struct sock *unix_peer_get(struct sock *s)
{
	struct sock *peer;

	unix_state_lock(s);
	peer = unix_peer(s);
	if (peer)
		sock_hold(peer);
	unix_state_unlock(s);

	return peer;
}
EXPORT_SYMBOL_GPL(unix_peer_get);

static struct unix_address *unix_create_addr(struct sockaddr_un *sunaddr,
					     int addr_len)
{
	struct unix_address *addr;

	addr = kmalloc(sizeof(*addr) + addr_len, GFP_KERNEL);
	if (!addr)
		return NULL;

	refcount_set(&addr->refcnt, 1);
	addr->len = addr_len;
	memcpy(addr->name, sunaddr, addr_len);

	return addr;
}

static inline void unix_release_addr(struct unix_address *addr)
{
	if (refcount_dec_and_test(&addr->refcnt))
		kfree(addr);
}

/*
 *	Check unix socket name:
 *		- should not be zero length.
 *		- if it starts with a non-zero byte, it should be NULL terminated (FS object)
 *		- if it starts with zero, it is an abstract name.
 */

static int unix_validate_addr(struct sockaddr_un *sunaddr, int addr_len)
{
	if (addr_len <= offsetof(struct sockaddr_un, sun_path) ||
	    addr_len > sizeof(*sunaddr))
		return -EINVAL;

	if (sunaddr->sun_family != AF_UNIX)
		return -EINVAL;

	return 0;
}

static int unix_mkname_bsd(struct sockaddr_un *sunaddr, int addr_len)
{
	struct sockaddr_storage *addr = (struct sockaddr_storage *)sunaddr;
	short offset = offsetof(struct sockaddr_storage, __data);

	BUILD_BUG_ON(offset != offsetof(struct sockaddr_un, sun_path));

	/* This may look like an off by one error but it is a bit more
	 * subtle.  108 is the longest valid AF_UNIX path for a binding.
	 * sun_path[108] doesn't as such exist.  However in kernel space
	 * we are guaranteed that it is a valid memory location in our
	 * kernel address buffer because syscall functions always pass
	 * a pointer of struct sockaddr_storage which has a bigger buffer
	 * than 108.  Also, we must terminate sun_path for strlen().
	 */
	addr->__data[addr_len - offset] = 0;

	/* Don't pass sunaddr->sun_path to strlen().  Otherwise, 108 will
	 * cause panic if CONFIG_FORTIFY_SOURCE=y.  Let __fortify_strlen()
	 * know the actual buffer.
	 */
	return strlen(addr->__data) + offset + 1;
}

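/* Worked example with hypothetical values: for a bind to "/tmp/x", userspace
 * typically passes addr_len = offsetof(struct sockaddr_un, sun_path) +
 * strlen("/tmp/x") + 1 = 2 + 6 + 1 = 9, but it may also pass up to
 * sizeof(struct sockaddr_un).  Either way, after the NUL store above,
 * unix_mkname_bsd() returns offset + strlen("/tmp/x") + 1 = 9, i.e. the
 * length is normalized to cover exactly the family field, the path, and one
 * trailing NUL byte.
 */
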
372 static void __unix_remove_socket(struct sock
*sk
)
374 sk_del_node_init(sk
);
377 static void __unix_insert_socket(struct net
*net
, struct sock
*sk
)
379 DEBUG_NET_WARN_ON_ONCE(!sk_unhashed(sk
));
380 sk_add_node(sk
, &net
->unx
.table
.buckets
[sk
->sk_hash
]);
383 static void __unix_set_addr_hash(struct net
*net
, struct sock
*sk
,
384 struct unix_address
*addr
, unsigned int hash
)
386 __unix_remove_socket(sk
);
387 smp_store_release(&unix_sk(sk
)->addr
, addr
);
390 __unix_insert_socket(net
, sk
);
393 static void unix_remove_socket(struct net
*net
, struct sock
*sk
)
395 spin_lock(&net
->unx
.table
.locks
[sk
->sk_hash
]);
396 __unix_remove_socket(sk
);
397 spin_unlock(&net
->unx
.table
.locks
[sk
->sk_hash
]);
400 static void unix_insert_unbound_socket(struct net
*net
, struct sock
*sk
)
402 spin_lock(&net
->unx
.table
.locks
[sk
->sk_hash
]);
403 __unix_insert_socket(net
, sk
);
404 spin_unlock(&net
->unx
.table
.locks
[sk
->sk_hash
]);
407 static void unix_insert_bsd_socket(struct sock
*sk
)
409 spin_lock(&bsd_socket_locks
[sk
->sk_hash
]);
410 sk_add_bind_node(sk
, &bsd_socket_buckets
[sk
->sk_hash
]);
411 spin_unlock(&bsd_socket_locks
[sk
->sk_hash
]);
414 static void unix_remove_bsd_socket(struct sock
*sk
)
416 if (!hlist_unhashed(&sk
->sk_bind_node
)) {
417 spin_lock(&bsd_socket_locks
[sk
->sk_hash
]);
418 __sk_del_bind_node(sk
);
419 spin_unlock(&bsd_socket_locks
[sk
->sk_hash
]);
421 sk_node_init(&sk
->sk_bind_node
);
425 static struct sock
*__unix_find_socket_byname(struct net
*net
,
426 struct sockaddr_un
*sunname
,
427 int len
, unsigned int hash
)
431 sk_for_each(s
, &net
->unx
.table
.buckets
[hash
]) {
432 struct unix_sock
*u
= unix_sk(s
);
434 if (u
->addr
->len
== len
&&
435 !memcmp(u
->addr
->name
, sunname
, len
))
441 static inline struct sock
*unix_find_socket_byname(struct net
*net
,
442 struct sockaddr_un
*sunname
,
443 int len
, unsigned int hash
)
447 spin_lock(&net
->unx
.table
.locks
[hash
]);
448 s
= __unix_find_socket_byname(net
, sunname
, len
, hash
);
451 spin_unlock(&net
->unx
.table
.locks
[hash
]);
455 static struct sock
*unix_find_socket_byinode(struct inode
*i
)
457 unsigned int hash
= unix_bsd_hash(i
);
460 spin_lock(&bsd_socket_locks
[hash
]);
461 sk_for_each_bound(s
, &bsd_socket_buckets
[hash
]) {
462 struct dentry
*dentry
= unix_sk(s
)->path
.dentry
;
464 if (dentry
&& d_backing_inode(dentry
) == i
) {
466 spin_unlock(&bsd_socket_locks
[hash
]);
470 spin_unlock(&bsd_socket_locks
[hash
]);
/* Support code for asymmetrically connected dgram sockets
 *
 * If a datagram socket is connected to a socket not itself connected
 * to the first socket (eg, /dev/log), clients may only enqueue more
 * messages if the present receive queue of the server socket is not
 * "too large". This means there's a second writeability condition
 * poll and sendmsg need to test. The dgram recv code will do a wake
 * up on the peer_wait wait queue of a socket upon reception of a
 * datagram which needs to be propagated to sleeping would-be writers
 * since these might not have sent anything so far. This can't be
 * accomplished via poll_wait because the lifetime of the server
 * socket might be less than that of its clients if these break their
 * association with it or if the server socket is closed while clients
 * are still connected to it and there's no way to inform "a polling
 * implementation" that it should let go of a certain wait queue.
 *
 * In order to propagate a wake up, a wait_queue_entry_t of the client
 * socket is enqueued on the peer_wait queue of the server socket
 * whose wake function does a wake_up on the ordinary client socket
 * wait queue. This connection is established whenever a write (or
 * poll for write) hits the flow control condition and is broken when
 * the association to the server socket is dissolved or after a wake
 * up was relayed.
 */

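/* Illustrative userspace view of the relay below (client_fd, buf and len are
 * made-up names): a sender connected to a congested datagram server can wait
 * for POLLOUT, and the relayed wake up is what ends that wait once the
 * server drains its receive queue.
 *
 *	struct pollfd pfd = { .fd = client_fd, .events = POLLOUT };
 *
 *	while (send(client_fd, buf, len, MSG_DONTWAIT) < 0 && errno == EAGAIN)
 *		poll(&pfd, 1, -1);	// woken when the peer's queue drains
 */
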
499 static int unix_dgram_peer_wake_relay(wait_queue_entry_t
*q
, unsigned mode
, int flags
,
503 wait_queue_head_t
*u_sleep
;
505 u
= container_of(q
, struct unix_sock
, peer_wake
);
507 __remove_wait_queue(&unix_sk(u
->peer_wake
.private)->peer_wait
,
509 u
->peer_wake
.private = NULL
;
511 /* relaying can only happen while the wq still exists */
512 u_sleep
= sk_sleep(&u
->sk
);
514 wake_up_interruptible_poll(u_sleep
, key_to_poll(key
));
519 static int unix_dgram_peer_wake_connect(struct sock
*sk
, struct sock
*other
)
521 struct unix_sock
*u
, *u_other
;
525 u_other
= unix_sk(other
);
527 spin_lock(&u_other
->peer_wait
.lock
);
529 if (!u
->peer_wake
.private) {
530 u
->peer_wake
.private = other
;
531 __add_wait_queue(&u_other
->peer_wait
, &u
->peer_wake
);
536 spin_unlock(&u_other
->peer_wait
.lock
);
540 static void unix_dgram_peer_wake_disconnect(struct sock
*sk
,
543 struct unix_sock
*u
, *u_other
;
546 u_other
= unix_sk(other
);
547 spin_lock(&u_other
->peer_wait
.lock
);
549 if (u
->peer_wake
.private == other
) {
550 __remove_wait_queue(&u_other
->peer_wait
, &u
->peer_wake
);
551 u
->peer_wake
.private = NULL
;
554 spin_unlock(&u_other
->peer_wait
.lock
);
557 static void unix_dgram_peer_wake_disconnect_wakeup(struct sock
*sk
,
560 unix_dgram_peer_wake_disconnect(sk
, other
);
561 wake_up_interruptible_poll(sk_sleep(sk
),
568 * - unix_peer(sk) == other
569 * - association is stable
571 static int unix_dgram_peer_wake_me(struct sock
*sk
, struct sock
*other
)
575 connected
= unix_dgram_peer_wake_connect(sk
, other
);
577 /* If other is SOCK_DEAD, we want to make sure we signal
578 * POLLOUT, such that a subsequent write() can get a
579 * -ECONNREFUSED. Otherwise, if we haven't queued any skbs
580 * to other and its full, we will hang waiting for POLLOUT.
582 if (unix_recvq_full_lockless(other
) && !sock_flag(other
, SOCK_DEAD
))
586 unix_dgram_peer_wake_disconnect(sk
, other
);
591 static int unix_writable(const struct sock
*sk
, unsigned char state
)
593 return state
!= TCP_LISTEN
&&
594 (refcount_read(&sk
->sk_wmem_alloc
) << 2) <= READ_ONCE(sk
->sk_sndbuf
);
597 static void unix_write_space(struct sock
*sk
)
599 struct socket_wq
*wq
;
602 if (unix_writable(sk
, READ_ONCE(sk
->sk_state
))) {
603 wq
= rcu_dereference(sk
->sk_wq
);
604 if (skwq_has_sleeper(wq
))
605 wake_up_interruptible_sync_poll(&wq
->wait
,
606 EPOLLOUT
| EPOLLWRNORM
| EPOLLWRBAND
);
607 sk_wake_async_rcu(sk
, SOCK_WAKE_SPACE
, POLL_OUT
);
/* When a dgram socket disconnects (or changes its peer), we clear its receive
 * queue of packets that arrived from the previous peer.  First, this allows us
 * to do flow control based only on wmem_alloc; second, an sk connected to a
 * peer may receive messages only from that peer. */
616 static void unix_dgram_disconnected(struct sock
*sk
, struct sock
*other
)
618 if (!skb_queue_empty(&sk
->sk_receive_queue
)) {
619 skb_queue_purge_reason(&sk
->sk_receive_queue
,
620 SKB_DROP_REASON_UNIX_DISCONNECT
);
622 wake_up_interruptible_all(&unix_sk(sk
)->peer_wait
);
	/* If one link of a bidirectional dgram pipe is disconnected,
	 * we signal an error.  Messages are lost.  Don't do this when
	 * the peer was not connected to us.
	 */
628 if (!sock_flag(other
, SOCK_DEAD
) && unix_peer(other
) == sk
) {
629 WRITE_ONCE(other
->sk_err
, ECONNRESET
);
630 sk_error_report(other
);
635 static void unix_sock_destructor(struct sock
*sk
)
637 struct unix_sock
*u
= unix_sk(sk
);
639 skb_queue_purge_reason(&sk
->sk_receive_queue
, SKB_DROP_REASON_SOCKET_CLOSE
);
641 DEBUG_NET_WARN_ON_ONCE(refcount_read(&sk
->sk_wmem_alloc
));
642 DEBUG_NET_WARN_ON_ONCE(!sk_unhashed(sk
));
643 DEBUG_NET_WARN_ON_ONCE(sk
->sk_socket
);
644 if (!sock_flag(sk
, SOCK_DEAD
)) {
645 pr_info("Attempt to release alive unix socket: %p\n", sk
);
650 pidfs_put_pid(sk
->sk_peer_pid
);
653 unix_release_addr(u
->addr
);
655 atomic_long_dec(&unix_nr_socks
);
656 sock_prot_inuse_add(sock_net(sk
), sk
->sk_prot
, -1);
657 #ifdef UNIX_REFCNT_DEBUG
658 pr_debug("UNIX %p is destroyed, %ld are still alive.\n", sk
,
659 atomic_long_read(&unix_nr_socks
));
663 static unsigned int unix_skb_len(const struct sk_buff
*skb
)
665 return skb
->len
- UNIXCB(skb
).consumed
;
668 static void unix_release_sock(struct sock
*sk
, int embrion
)
670 struct unix_sock
*u
= unix_sk(sk
);
676 unix_remove_socket(sock_net(sk
), sk
);
677 unix_remove_bsd_socket(sk
);
682 WRITE_ONCE(sk
->sk_shutdown
, SHUTDOWN_MASK
);
684 u
->path
.dentry
= NULL
;
686 state
= sk
->sk_state
;
687 WRITE_ONCE(sk
->sk_state
, TCP_CLOSE
);
689 skpair
= unix_peer(sk
);
690 unix_peer(sk
) = NULL
;
692 unix_state_unlock(sk
);
694 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
698 wake_up_interruptible_all(&u
->peer_wait
);
700 if (skpair
!= NULL
) {
701 if (sk
->sk_type
== SOCK_STREAM
|| sk
->sk_type
== SOCK_SEQPACKET
) {
702 struct sk_buff
*skb
= skb_peek(&sk
->sk_receive_queue
);
704 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
705 if (skb
&& !unix_skb_len(skb
))
706 skb
= skb_peek_next(skb
, &sk
->sk_receive_queue
);
708 unix_state_lock(skpair
);
710 WRITE_ONCE(skpair
->sk_shutdown
, SHUTDOWN_MASK
);
712 WRITE_ONCE(skpair
->sk_err
, ECONNRESET
);
713 unix_state_unlock(skpair
);
714 skpair
->sk_state_change(skpair
);
715 sk_wake_async(skpair
, SOCK_WAKE_WAITD
, POLL_HUP
);
718 unix_dgram_peer_wake_disconnect(sk
, skpair
);
719 sock_put(skpair
); /* It may now die */
722 /* Try to flush out this socket. Throw out buffers at least */
724 while ((skb
= skb_dequeue(&sk
->sk_receive_queue
)) != NULL
) {
725 if (state
== TCP_LISTEN
)
726 unix_release_sock(skb
->sk
, 1);
728 /* passed fds are erased in the kfree_skb hook */
729 kfree_skb_reason(skb
, SKB_DROP_REASON_SOCKET_CLOSE
);
737 /* ---- Socket is dead now and most probably destroyed ---- */
	/*
	 * Fixme: BSD difference: In BSD all sockets connected to us get
	 *	  ECONNRESET and we die on the spot. In Linux we behave
	 *	  like files and pipes do and wait for the last
	 *	  dereference.
	 *
	 * Can't we simply set sock->err?
	 *
	 *	  What does the above comment talk about? --ANK(980817)
	 */
750 if (READ_ONCE(unix_tot_inflight
))
751 unix_gc(); /* Garbage collect fds */
754 struct unix_peercred
{
755 struct pid
*peer_pid
;
756 const struct cred
*peer_cred
;
759 static inline int prepare_peercred(struct unix_peercred
*peercred
)
764 pid
= task_tgid(current
);
765 err
= pidfs_register_pid(pid
);
767 peercred
->peer_pid
= get_pid(pid
);
768 peercred
->peer_cred
= get_current_cred();
773 static void drop_peercred(struct unix_peercred
*peercred
)
775 const struct cred
*cred
= NULL
;
776 struct pid
*pid
= NULL
;
780 swap(peercred
->peer_pid
, pid
);
781 swap(peercred
->peer_cred
, cred
);
788 static inline void init_peercred(struct sock
*sk
,
789 const struct unix_peercred
*peercred
)
791 sk
->sk_peer_pid
= peercred
->peer_pid
;
792 sk
->sk_peer_cred
= peercred
->peer_cred
;
795 static void update_peercred(struct sock
*sk
, struct unix_peercred
*peercred
)
797 const struct cred
*old_cred
;
800 spin_lock(&sk
->sk_peer_lock
);
801 old_pid
= sk
->sk_peer_pid
;
802 old_cred
= sk
->sk_peer_cred
;
803 init_peercred(sk
, peercred
);
804 spin_unlock(&sk
->sk_peer_lock
);
806 peercred
->peer_pid
= old_pid
;
807 peercred
->peer_cred
= old_cred
;
810 static void copy_peercred(struct sock
*sk
, struct sock
*peersk
)
812 lockdep_assert_held(&unix_sk(peersk
)->lock
);
814 spin_lock(&sk
->sk_peer_lock
);
815 sk
->sk_peer_pid
= get_pid(peersk
->sk_peer_pid
);
816 pidfs_get_pid(sk
->sk_peer_pid
);
817 sk
->sk_peer_cred
= get_cred(peersk
->sk_peer_cred
);
818 spin_unlock(&sk
->sk_peer_lock
);
821 static bool unix_may_passcred(const struct sock
*sk
)
823 return sk
->sk_scm_credentials
|| sk
->sk_scm_pidfd
;
826 static int unix_listen(struct socket
*sock
, int backlog
)
829 struct sock
*sk
= sock
->sk
;
830 struct unix_sock
*u
= unix_sk(sk
);
831 struct unix_peercred peercred
= {};
834 if (sock
->type
!= SOCK_STREAM
&& sock
->type
!= SOCK_SEQPACKET
)
835 goto out
; /* Only stream/seqpacket sockets accept */
837 if (!READ_ONCE(u
->addr
))
838 goto out
; /* No listens on an unbound socket */
839 err
= prepare_peercred(&peercred
);
843 if (sk
->sk_state
!= TCP_CLOSE
&& sk
->sk_state
!= TCP_LISTEN
)
845 if (backlog
> sk
->sk_max_ack_backlog
)
846 wake_up_interruptible_all(&u
->peer_wait
);
847 sk
->sk_max_ack_backlog
= backlog
;
848 WRITE_ONCE(sk
->sk_state
, TCP_LISTEN
);
850 /* set credentials so connect can copy them */
851 update_peercred(sk
, &peercred
);
855 unix_state_unlock(sk
);
856 drop_peercred(&peercred
);
861 static int unix_release(struct socket
*);
862 static int unix_bind(struct socket
*, struct sockaddr
*, int);
863 static int unix_stream_connect(struct socket
*, struct sockaddr
*,
864 int addr_len
, int flags
);
865 static int unix_socketpair(struct socket
*, struct socket
*);
866 static int unix_accept(struct socket
*, struct socket
*, struct proto_accept_arg
*arg
);
867 static int unix_getname(struct socket
*, struct sockaddr
*, int);
868 static __poll_t
unix_poll(struct file
*, struct socket
*, poll_table
*);
869 static __poll_t
unix_dgram_poll(struct file
*, struct socket
*,
871 static int unix_ioctl(struct socket
*, unsigned int, unsigned long);
873 static int unix_compat_ioctl(struct socket
*sock
, unsigned int cmd
, unsigned long arg
);
875 static int unix_shutdown(struct socket
*, int);
876 static int unix_stream_sendmsg(struct socket
*, struct msghdr
*, size_t);
877 static int unix_stream_recvmsg(struct socket
*, struct msghdr
*, size_t, int);
878 static ssize_t
unix_stream_splice_read(struct socket
*, loff_t
*ppos
,
879 struct pipe_inode_info
*, size_t size
,
881 static int unix_dgram_sendmsg(struct socket
*, struct msghdr
*, size_t);
882 static int unix_dgram_recvmsg(struct socket
*, struct msghdr
*, size_t, int);
883 static int unix_read_skb(struct sock
*sk
, skb_read_actor_t recv_actor
);
884 static int unix_stream_read_skb(struct sock
*sk
, skb_read_actor_t recv_actor
);
885 static int unix_dgram_connect(struct socket
*, struct sockaddr
*,
887 static int unix_seqpacket_sendmsg(struct socket
*, struct msghdr
*, size_t);
888 static int unix_seqpacket_recvmsg(struct socket
*, struct msghdr
*, size_t,
891 #ifdef CONFIG_PROC_FS
892 static int unix_count_nr_fds(struct sock
*sk
)
898 spin_lock(&sk
->sk_receive_queue
.lock
);
899 skb
= skb_peek(&sk
->sk_receive_queue
);
901 u
= unix_sk(skb
->sk
);
902 nr_fds
+= atomic_read(&u
->scm_stat
.nr_fds
);
903 skb
= skb_peek_next(skb
, &sk
->sk_receive_queue
);
905 spin_unlock(&sk
->sk_receive_queue
.lock
);
910 static void unix_show_fdinfo(struct seq_file
*m
, struct socket
*sock
)
912 struct sock
*sk
= sock
->sk
;
913 unsigned char s_state
;
918 s_state
= READ_ONCE(sk
->sk_state
);
921 /* SOCK_STREAM and SOCK_SEQPACKET sockets never change their
922 * sk_state after switching to TCP_ESTABLISHED or TCP_LISTEN.
923 * SOCK_DGRAM is ordinary. So, no lock is needed.
925 if (sock
->type
== SOCK_DGRAM
|| s_state
== TCP_ESTABLISHED
)
926 nr_fds
= atomic_read(&u
->scm_stat
.nr_fds
);
927 else if (s_state
== TCP_LISTEN
)
928 nr_fds
= unix_count_nr_fds(sk
);
930 seq_printf(m
, "scm_fds: %u\n", nr_fds
);
934 #define unix_show_fdinfo NULL
937 static const struct proto_ops unix_stream_ops
= {
939 .owner
= THIS_MODULE
,
940 .release
= unix_release
,
942 .connect
= unix_stream_connect
,
943 .socketpair
= unix_socketpair
,
944 .accept
= unix_accept
,
945 .getname
= unix_getname
,
949 .compat_ioctl
= unix_compat_ioctl
,
951 .listen
= unix_listen
,
952 .shutdown
= unix_shutdown
,
953 .sendmsg
= unix_stream_sendmsg
,
954 .recvmsg
= unix_stream_recvmsg
,
955 .read_skb
= unix_stream_read_skb
,
956 .mmap
= sock_no_mmap
,
957 .splice_read
= unix_stream_splice_read
,
958 .set_peek_off
= sk_set_peek_off
,
959 .show_fdinfo
= unix_show_fdinfo
,
962 static const struct proto_ops unix_dgram_ops
= {
964 .owner
= THIS_MODULE
,
965 .release
= unix_release
,
967 .connect
= unix_dgram_connect
,
968 .socketpair
= unix_socketpair
,
969 .accept
= sock_no_accept
,
970 .getname
= unix_getname
,
971 .poll
= unix_dgram_poll
,
974 .compat_ioctl
= unix_compat_ioctl
,
976 .listen
= sock_no_listen
,
977 .shutdown
= unix_shutdown
,
978 .sendmsg
= unix_dgram_sendmsg
,
979 .read_skb
= unix_read_skb
,
980 .recvmsg
= unix_dgram_recvmsg
,
981 .mmap
= sock_no_mmap
,
982 .set_peek_off
= sk_set_peek_off
,
983 .show_fdinfo
= unix_show_fdinfo
,
986 static const struct proto_ops unix_seqpacket_ops
= {
988 .owner
= THIS_MODULE
,
989 .release
= unix_release
,
991 .connect
= unix_stream_connect
,
992 .socketpair
= unix_socketpair
,
993 .accept
= unix_accept
,
994 .getname
= unix_getname
,
995 .poll
= unix_dgram_poll
,
998 .compat_ioctl
= unix_compat_ioctl
,
1000 .listen
= unix_listen
,
1001 .shutdown
= unix_shutdown
,
1002 .sendmsg
= unix_seqpacket_sendmsg
,
1003 .recvmsg
= unix_seqpacket_recvmsg
,
1004 .mmap
= sock_no_mmap
,
1005 .set_peek_off
= sk_set_peek_off
,
1006 .show_fdinfo
= unix_show_fdinfo
,
1009 static void unix_close(struct sock
*sk
, long timeout
)
1011 /* Nothing to do here, unix socket does not need a ->close().
1012 * This is merely for sockmap.
1016 static bool unix_bpf_bypass_getsockopt(int level
, int optname
)
1018 if (level
== SOL_SOCKET
) {
1030 struct proto unix_dgram_proto
= {
1032 .owner
= THIS_MODULE
,
1033 .obj_size
= sizeof(struct unix_sock
),
1034 .close
= unix_close
,
1035 .bpf_bypass_getsockopt
= unix_bpf_bypass_getsockopt
,
1036 #ifdef CONFIG_BPF_SYSCALL
1037 .psock_update_sk_prot
= unix_dgram_bpf_update_proto
,
1041 struct proto unix_stream_proto
= {
1042 .name
= "UNIX-STREAM",
1043 .owner
= THIS_MODULE
,
1044 .obj_size
= sizeof(struct unix_sock
),
1045 .close
= unix_close
,
1046 .bpf_bypass_getsockopt
= unix_bpf_bypass_getsockopt
,
1047 #ifdef CONFIG_BPF_SYSCALL
1048 .psock_update_sk_prot
= unix_stream_bpf_update_proto
,
1052 static struct sock
*unix_create1(struct net
*net
, struct socket
*sock
, int kern
, int type
)
1054 struct unix_sock
*u
;
1058 atomic_long_inc(&unix_nr_socks
);
1059 if (atomic_long_read(&unix_nr_socks
) > 2 * get_max_files()) {
1064 if (type
== SOCK_STREAM
)
1065 sk
= sk_alloc(net
, PF_UNIX
, GFP_KERNEL
, &unix_stream_proto
, kern
);
1066 else /*dgram and seqpacket */
1067 sk
= sk_alloc(net
, PF_UNIX
, GFP_KERNEL
, &unix_dgram_proto
, kern
);
1074 sock_init_data(sock
, sk
);
1076 sk
->sk_scm_rights
= 1;
1077 sk
->sk_hash
= unix_unbound_hash(sk
);
1078 sk
->sk_allocation
= GFP_KERNEL_ACCOUNT
;
1079 sk
->sk_write_space
= unix_write_space
;
1080 sk
->sk_max_ack_backlog
= READ_ONCE(net
->unx
.sysctl_max_dgram_qlen
);
1081 sk
->sk_destruct
= unix_sock_destructor
;
1082 lock_set_cmp_fn(&sk
->sk_receive_queue
.lock
, unix_recvq_lock_cmp_fn
, NULL
);
1087 u
->path
.dentry
= NULL
;
1089 spin_lock_init(&u
->lock
);
1090 lock_set_cmp_fn(&u
->lock
, unix_state_lock_cmp_fn
, NULL
);
1091 mutex_init(&u
->iolock
); /* single task reading lock */
1092 mutex_init(&u
->bindlock
); /* single task binding lock */
1093 init_waitqueue_head(&u
->peer_wait
);
1094 init_waitqueue_func_entry(&u
->peer_wake
, unix_dgram_peer_wake_relay
);
1095 memset(&u
->scm_stat
, 0, sizeof(struct scm_stat
));
1096 unix_insert_unbound_socket(net
, sk
);
1098 sock_prot_inuse_add(net
, sk
->sk_prot
, 1);
1103 atomic_long_dec(&unix_nr_socks
);
1104 return ERR_PTR(err
);
1107 static int unix_create(struct net
*net
, struct socket
*sock
, int protocol
,
1112 if (protocol
&& protocol
!= PF_UNIX
)
1113 return -EPROTONOSUPPORT
;
1115 sock
->state
= SS_UNCONNECTED
;
1117 switch (sock
->type
) {
1119 sock
->ops
= &unix_stream_ops
;
1122 * Believe it or not BSD has AF_UNIX, SOCK_RAW though
1126 sock
->type
= SOCK_DGRAM
;
1129 sock
->ops
= &unix_dgram_ops
;
1131 case SOCK_SEQPACKET
:
1132 sock
->ops
= &unix_seqpacket_ops
;
1135 return -ESOCKTNOSUPPORT
;
1138 sk
= unix_create1(net
, sock
, kern
, sock
->type
);
1145 static int unix_release(struct socket
*sock
)
1147 struct sock
*sk
= sock
->sk
;
1152 sk
->sk_prot
->close(sk
, 0);
1153 unix_release_sock(sk
, 0);
1159 static struct sock
*unix_find_bsd(struct sockaddr_un
*sunaddr
, int addr_len
,
1160 int type
, int flags
)
1162 struct inode
*inode
;
1167 unix_mkname_bsd(sunaddr
, addr_len
);
1169 if (flags
& SOCK_COREDUMP
) {
1170 const struct cred
*cred
;
1174 kcred
= prepare_kernel_cred(&init_task
);
1180 task_lock(&init_task
);
1181 get_fs_root(init_task
.fs
, &root
);
1182 task_unlock(&init_task
);
1184 cred
= override_creds(kcred
);
1185 err
= vfs_path_lookup(root
.dentry
, root
.mnt
, sunaddr
->sun_path
,
1186 LOOKUP_BENEATH
| LOOKUP_NO_SYMLINKS
|
1187 LOOKUP_NO_MAGICLINKS
, &path
);
1188 put_cred(revert_creds(cred
));
1193 err
= kern_path(sunaddr
->sun_path
, LOOKUP_FOLLOW
, &path
);
1197 err
= path_permission(&path
, MAY_WRITE
);
1202 err
= -ECONNREFUSED
;
1203 inode
= d_backing_inode(path
.dentry
);
1204 if (!S_ISSOCK(inode
->i_mode
))
1207 sk
= unix_find_socket_byinode(inode
);
1212 if (sk
->sk_type
== type
)
1226 return ERR_PTR(err
);
1229 static struct sock
*unix_find_abstract(struct net
*net
,
1230 struct sockaddr_un
*sunaddr
,
1231 int addr_len
, int type
)
1233 unsigned int hash
= unix_abstract_hash(sunaddr
, addr_len
, type
);
1234 struct dentry
*dentry
;
1237 sk
= unix_find_socket_byname(net
, sunaddr
, addr_len
, hash
);
1239 return ERR_PTR(-ECONNREFUSED
);
1241 dentry
= unix_sk(sk
)->path
.dentry
;
1243 touch_atime(&unix_sk(sk
)->path
);
1248 static struct sock
*unix_find_other(struct net
*net
,
1249 struct sockaddr_un
*sunaddr
,
1250 int addr_len
, int type
, int flags
)
1254 if (sunaddr
->sun_path
[0])
1255 sk
= unix_find_bsd(sunaddr
, addr_len
, type
, flags
);
1257 sk
= unix_find_abstract(net
, sunaddr
, addr_len
, type
);
1262 static int unix_autobind(struct sock
*sk
)
1264 struct unix_sock
*u
= unix_sk(sk
);
1265 unsigned int new_hash
, old_hash
;
1266 struct net
*net
= sock_net(sk
);
1267 struct unix_address
*addr
;
1268 u32 lastnum
, ordernum
;
1271 err
= mutex_lock_interruptible(&u
->bindlock
);
1279 addr
= kzalloc(sizeof(*addr
) +
1280 offsetof(struct sockaddr_un
, sun_path
) + 16, GFP_KERNEL
);
1284 addr
->len
= offsetof(struct sockaddr_un
, sun_path
) + 6;
1285 addr
->name
->sun_family
= AF_UNIX
;
1286 refcount_set(&addr
->refcnt
, 1);
1288 old_hash
= sk
->sk_hash
;
1289 ordernum
= get_random_u32();
1290 lastnum
= ordernum
& 0xFFFFF;
1292 ordernum
= (ordernum
+ 1) & 0xFFFFF;
1293 sprintf(addr
->name
->sun_path
+ 1, "%05x", ordernum
);
1295 new_hash
= unix_abstract_hash(addr
->name
, addr
->len
, sk
->sk_type
);
1296 unix_table_double_lock(net
, old_hash
, new_hash
);
1298 if (__unix_find_socket_byname(net
, addr
->name
, addr
->len
, new_hash
)) {
1299 unix_table_double_unlock(net
, old_hash
, new_hash
);
1301 /* __unix_find_socket_byname() may take long time if many names
1302 * are already in use.
1306 if (ordernum
== lastnum
) {
1307 /* Give up if all names seems to be in use. */
1309 unix_release_addr(addr
);
1316 __unix_set_addr_hash(net
, sk
, addr
, new_hash
);
1317 unix_table_double_unlock(net
, old_hash
, new_hash
);
1320 out
: mutex_unlock(&u
->bindlock
);
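/* Userspace sketch of the autobind above (illustrative only; srv is a made-up
 * peer address): sending or connecting from a never-bound socket with
 * credential passing enabled binds it to "\0" followed by five hex digits,
 * which getsockname() then reports.
 *
 *	int fd = socket(AF_UNIX, SOCK_DGRAM, 0);
 *	int on = 1;
 *	struct sockaddr_un sun;
 *	socklen_t len = sizeof(sun);
 *
 *	setsockopt(fd, SOL_SOCKET, SO_PASSCRED, &on, sizeof(on));
 *	connect(fd, (struct sockaddr *)&srv, sizeof(srv));	// triggers autobind
 *	getsockname(fd, (struct sockaddr *)&sun, &len);
 *	// len == offsetof(struct sockaddr_un, sun_path) + 6, sun.sun_path[0] == '\0'
 */
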
1324 static int unix_bind_bsd(struct sock
*sk
, struct sockaddr_un
*sunaddr
,
1327 umode_t mode
= S_IFSOCK
|
1328 (SOCK_INODE(sk
->sk_socket
)->i_mode
& ~current_umask());
1329 struct unix_sock
*u
= unix_sk(sk
);
1330 unsigned int new_hash
, old_hash
;
1331 struct net
*net
= sock_net(sk
);
1332 struct mnt_idmap
*idmap
;
1333 struct unix_address
*addr
;
1334 struct dentry
*dentry
;
1338 addr_len
= unix_mkname_bsd(sunaddr
, addr_len
);
1339 addr
= unix_create_addr(sunaddr
, addr_len
);
1344 * Get the parent directory, calculate the hash for last
1347 dentry
= kern_path_create(AT_FDCWD
, addr
->name
->sun_path
, &parent
, 0);
1348 if (IS_ERR(dentry
)) {
1349 err
= PTR_ERR(dentry
);
1354 * All right, let's create it.
1356 idmap
= mnt_idmap(parent
.mnt
);
1357 err
= security_path_mknod(&parent
, dentry
, mode
, 0);
1359 err
= vfs_mknod(idmap
, d_inode(parent
.dentry
), dentry
, mode
, 0);
1362 err
= mutex_lock_interruptible(&u
->bindlock
);
1368 old_hash
= sk
->sk_hash
;
1369 new_hash
= unix_bsd_hash(d_backing_inode(dentry
));
1370 unix_table_double_lock(net
, old_hash
, new_hash
);
1371 u
->path
.mnt
= mntget(parent
.mnt
);
1372 u
->path
.dentry
= dget(dentry
);
1373 __unix_set_addr_hash(net
, sk
, addr
, new_hash
);
1374 unix_table_double_unlock(net
, old_hash
, new_hash
);
1375 unix_insert_bsd_socket(sk
);
1376 mutex_unlock(&u
->bindlock
);
1377 done_path_create(&parent
, dentry
);
1381 mutex_unlock(&u
->bindlock
);
1384 /* failed after successful mknod? unlink what we'd created... */
1385 vfs_unlink(idmap
, d_inode(parent
.dentry
), dentry
, NULL
);
1387 done_path_create(&parent
, dentry
);
1389 unix_release_addr(addr
);
1390 return err
== -EEXIST
? -EADDRINUSE
: err
;
1393 static int unix_bind_abstract(struct sock
*sk
, struct sockaddr_un
*sunaddr
,
1396 struct unix_sock
*u
= unix_sk(sk
);
1397 unsigned int new_hash
, old_hash
;
1398 struct net
*net
= sock_net(sk
);
1399 struct unix_address
*addr
;
1402 addr
= unix_create_addr(sunaddr
, addr_len
);
1406 err
= mutex_lock_interruptible(&u
->bindlock
);
1415 old_hash
= sk
->sk_hash
;
1416 new_hash
= unix_abstract_hash(addr
->name
, addr
->len
, sk
->sk_type
);
1417 unix_table_double_lock(net
, old_hash
, new_hash
);
1419 if (__unix_find_socket_byname(net
, addr
->name
, addr
->len
, new_hash
))
1422 __unix_set_addr_hash(net
, sk
, addr
, new_hash
);
1423 unix_table_double_unlock(net
, old_hash
, new_hash
);
1424 mutex_unlock(&u
->bindlock
);
1428 unix_table_double_unlock(net
, old_hash
, new_hash
);
1431 mutex_unlock(&u
->bindlock
);
1433 unix_release_addr(addr
);
1437 static int unix_bind(struct socket
*sock
, struct sockaddr
*uaddr
, int addr_len
)
1439 struct sockaddr_un
*sunaddr
= (struct sockaddr_un
*)uaddr
;
1440 struct sock
*sk
= sock
->sk
;
1443 if (addr_len
== offsetof(struct sockaddr_un
, sun_path
) &&
1444 sunaddr
->sun_family
== AF_UNIX
)
1445 return unix_autobind(sk
);
1447 err
= unix_validate_addr(sunaddr
, addr_len
);
1451 if (sunaddr
->sun_path
[0])
1452 err
= unix_bind_bsd(sk
, sunaddr
, addr_len
);
1454 err
= unix_bind_abstract(sk
, sunaddr
, addr_len
);
1459 static void unix_state_double_lock(struct sock
*sk1
, struct sock
*sk2
)
1461 if (unlikely(sk1
== sk2
) || !sk2
) {
1462 unix_state_lock(sk1
);
1469 unix_state_lock(sk1
);
1470 unix_state_lock(sk2
);
1473 static void unix_state_double_unlock(struct sock
*sk1
, struct sock
*sk2
)
1475 if (unlikely(sk1
== sk2
) || !sk2
) {
1476 unix_state_unlock(sk1
);
1479 unix_state_unlock(sk1
);
1480 unix_state_unlock(sk2
);
1483 static int unix_dgram_connect(struct socket
*sock
, struct sockaddr
*addr
,
1484 int alen
, int flags
)
1486 struct sockaddr_un
*sunaddr
= (struct sockaddr_un
*)addr
;
1487 struct sock
*sk
= sock
->sk
;
1492 if (alen
< offsetofend(struct sockaddr
, sa_family
))
1495 if (addr
->sa_family
!= AF_UNSPEC
) {
1496 err
= unix_validate_addr(sunaddr
, alen
);
1500 err
= BPF_CGROUP_RUN_PROG_UNIX_CONNECT_LOCK(sk
, addr
, &alen
);
1504 if (unix_may_passcred(sk
) && !READ_ONCE(unix_sk(sk
)->addr
)) {
1505 err
= unix_autobind(sk
);
1511 other
= unix_find_other(sock_net(sk
), sunaddr
, alen
, sock
->type
, 0);
1512 if (IS_ERR(other
)) {
1513 err
= PTR_ERR(other
);
1517 unix_state_double_lock(sk
, other
);
1519 /* Apparently VFS overslept socket death. Retry. */
1520 if (sock_flag(other
, SOCK_DEAD
)) {
1521 unix_state_double_unlock(sk
, other
);
1527 if (!unix_may_send(sk
, other
))
1530 err
= security_unix_may_send(sk
->sk_socket
, other
->sk_socket
);
1534 WRITE_ONCE(sk
->sk_state
, TCP_ESTABLISHED
);
1535 WRITE_ONCE(other
->sk_state
, TCP_ESTABLISHED
);
1538 * 1003.1g breaking connected state with AF_UNSPEC
1541 unix_state_double_lock(sk
, other
);
1545 * If it was connected, reconnect.
1547 if (unix_peer(sk
)) {
1548 struct sock
*old_peer
= unix_peer(sk
);
1550 unix_peer(sk
) = other
;
1552 WRITE_ONCE(sk
->sk_state
, TCP_CLOSE
);
1553 unix_dgram_peer_wake_disconnect_wakeup(sk
, old_peer
);
1555 unix_state_double_unlock(sk
, other
);
1557 if (other
!= old_peer
) {
1558 unix_dgram_disconnected(sk
, old_peer
);
1560 unix_state_lock(old_peer
);
1561 if (!unix_peer(old_peer
))
1562 WRITE_ONCE(old_peer
->sk_state
, TCP_CLOSE
);
1563 unix_state_unlock(old_peer
);
1568 unix_peer(sk
) = other
;
1569 unix_state_double_unlock(sk
, other
);
1575 unix_state_double_unlock(sk
, other
);
1581 static long unix_wait_for_peer(struct sock
*other
, long timeo
)
1583 struct unix_sock
*u
= unix_sk(other
);
1587 prepare_to_wait_exclusive(&u
->peer_wait
, &wait
, TASK_INTERRUPTIBLE
);
1589 sched
= !sock_flag(other
, SOCK_DEAD
) &&
1590 !(other
->sk_shutdown
& RCV_SHUTDOWN
) &&
1591 unix_recvq_full_lockless(other
);
1593 unix_state_unlock(other
);
1596 timeo
= schedule_timeout(timeo
);
1598 finish_wait(&u
->peer_wait
, &wait
);
1602 static int unix_stream_connect(struct socket
*sock
, struct sockaddr
*uaddr
,
1603 int addr_len
, int flags
)
1605 struct sockaddr_un
*sunaddr
= (struct sockaddr_un
*)uaddr
;
1606 struct sock
*sk
= sock
->sk
, *newsk
= NULL
, *other
= NULL
;
1607 struct unix_sock
*u
= unix_sk(sk
), *newu
, *otheru
;
1608 struct unix_peercred peercred
= {};
1609 struct net
*net
= sock_net(sk
);
1610 struct sk_buff
*skb
= NULL
;
1611 unsigned char state
;
1615 err
= unix_validate_addr(sunaddr
, addr_len
);
1619 err
= BPF_CGROUP_RUN_PROG_UNIX_CONNECT_LOCK(sk
, uaddr
, &addr_len
);
1623 if (unix_may_passcred(sk
) && !READ_ONCE(u
->addr
)) {
1624 err
= unix_autobind(sk
);
1629 timeo
= sock_sndtimeo(sk
, flags
& O_NONBLOCK
);
1631 /* First of all allocate resources.
1632 * If we will make it after state is locked,
1633 * we will have to recheck all again in any case.
1636 /* create new sock for complete connection */
1637 newsk
= unix_create1(net
, NULL
, 0, sock
->type
);
1638 if (IS_ERR(newsk
)) {
1639 err
= PTR_ERR(newsk
);
1643 err
= prepare_peercred(&peercred
);
1647 /* Allocate skb for sending to listening sock */
1648 skb
= sock_wmalloc(newsk
, 1, 0, GFP_KERNEL
);
1655 /* Find listening sock. */
1656 other
= unix_find_other(net
, sunaddr
, addr_len
, sk
->sk_type
, flags
);
1657 if (IS_ERR(other
)) {
1658 err
= PTR_ERR(other
);
1662 unix_state_lock(other
);
1664 /* Apparently VFS overslept socket death. Retry. */
1665 if (sock_flag(other
, SOCK_DEAD
)) {
1666 unix_state_unlock(other
);
1671 if (other
->sk_state
!= TCP_LISTEN
||
1672 other
->sk_shutdown
& RCV_SHUTDOWN
) {
1673 err
= -ECONNREFUSED
;
1677 if (unix_recvq_full_lockless(other
)) {
1683 timeo
= unix_wait_for_peer(other
, timeo
);
1686 err
= sock_intr_errno(timeo
);
1687 if (signal_pending(current
))
1693 /* self connect and simultaneous connect are eliminated
1694 * by rejecting TCP_LISTEN socket to avoid deadlock.
1696 state
= READ_ONCE(sk
->sk_state
);
1697 if (unlikely(state
!= TCP_CLOSE
)) {
1698 err
= state
== TCP_ESTABLISHED
? -EISCONN
: -EINVAL
;
1702 unix_state_lock(sk
);
1704 if (unlikely(sk
->sk_state
!= TCP_CLOSE
)) {
1705 err
= sk
->sk_state
== TCP_ESTABLISHED
? -EISCONN
: -EINVAL
;
1706 unix_state_unlock(sk
);
1710 err
= security_unix_stream_connect(sk
, other
, newsk
);
1712 unix_state_unlock(sk
);
1716 /* The way is open! Fastly set all the necessary fields... */
1719 unix_peer(newsk
) = sk
;
1720 newsk
->sk_state
= TCP_ESTABLISHED
;
1721 newsk
->sk_type
= sk
->sk_type
;
1722 newsk
->sk_scm_recv_flags
= other
->sk_scm_recv_flags
;
1723 init_peercred(newsk
, &peercred
);
1725 newu
= unix_sk(newsk
);
1726 newu
->listener
= other
;
1727 RCU_INIT_POINTER(newsk
->sk_wq
, &newu
->peer_wq
);
1728 otheru
= unix_sk(other
);
1730 /* copy address information from listening to new sock
1732 * The contents of *(otheru->addr) and otheru->path
1733 * are seen fully set up here, since we have found
1734 * otheru in hash under its lock. Insertion into the
1735 * hash chain we'd found it in had been done in an
1736 * earlier critical area protected by the chain's lock,
1737 * the same one where we'd set *(otheru->addr) contents,
1738 * as well as otheru->path and otheru->addr itself.
1740 * Using smp_store_release() here to set newu->addr
1741 * is enough to make those stores, as well as stores
1742 * to newu->path visible to anyone who gets newu->addr
1743 * by smp_load_acquire(). IOW, the same warranties
1744 * as for unix_sock instances bound in unix_bind() or
1745 * in unix_autobind().
1747 if (otheru
->path
.dentry
) {
1748 path_get(&otheru
->path
);
1749 newu
->path
= otheru
->path
;
1751 refcount_inc(&otheru
->addr
->refcnt
);
1752 smp_store_release(&newu
->addr
, otheru
->addr
);
1754 /* Set credentials */
1755 copy_peercred(sk
, other
);
1757 sock
->state
= SS_CONNECTED
;
1758 WRITE_ONCE(sk
->sk_state
, TCP_ESTABLISHED
);
1761 smp_mb__after_atomic(); /* sock_hold() does an atomic_inc() */
1762 unix_peer(sk
) = newsk
;
1764 unix_state_unlock(sk
);
1766 /* take ten and send info to listening sock */
1767 spin_lock(&other
->sk_receive_queue
.lock
);
1768 __skb_queue_tail(&other
->sk_receive_queue
, skb
);
1769 spin_unlock(&other
->sk_receive_queue
.lock
);
1770 unix_state_unlock(other
);
1771 other
->sk_data_ready(other
);
1776 unix_state_unlock(other
);
1781 unix_release_sock(newsk
, 0);
1783 drop_peercred(&peercred
);
1787 static int unix_socketpair(struct socket
*socka
, struct socket
*sockb
)
1789 struct unix_peercred ska_peercred
= {}, skb_peercred
= {};
1790 struct sock
*ska
= socka
->sk
, *skb
= sockb
->sk
;
1793 err
= prepare_peercred(&ska_peercred
);
1797 err
= prepare_peercred(&skb_peercred
);
1799 drop_peercred(&ska_peercred
);
1803 /* Join our sockets back to back */
1806 unix_peer(ska
) = skb
;
1807 unix_peer(skb
) = ska
;
1808 init_peercred(ska
, &ska_peercred
);
1809 init_peercred(skb
, &skb_peercred
);
1811 ska
->sk_state
= TCP_ESTABLISHED
;
1812 skb
->sk_state
= TCP_ESTABLISHED
;
1813 socka
->state
= SS_CONNECTED
;
1814 sockb
->state
= SS_CONNECTED
;
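/* Minimal userspace usage sketch for the path above (illustrative only):
 * both descriptors come back already connected, and SO_PEERCRED on either
 * end reflects the creating task, as set up by init_peercred() above.
 *
 *	int sv[2];
 *	struct ucred peer;
 *	socklen_t len = sizeof(peer);
 *
 *	if (socketpair(AF_UNIX, SOCK_SEQPACKET, 0, sv) == 0) {
 *		getsockopt(sv[0], SOL_SOCKET, SO_PEERCRED, &peer, &len);
 *		// peer.pid is the caller's tgid, peer.uid/gid its credentials
 *	}
 */
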
1818 static int unix_accept(struct socket
*sock
, struct socket
*newsock
,
1819 struct proto_accept_arg
*arg
)
1821 struct sock
*sk
= sock
->sk
;
1822 struct sk_buff
*skb
;
1825 arg
->err
= -EOPNOTSUPP
;
1826 if (sock
->type
!= SOCK_STREAM
&& sock
->type
!= SOCK_SEQPACKET
)
1830 if (READ_ONCE(sk
->sk_state
) != TCP_LISTEN
)
1833 /* If socket state is TCP_LISTEN it cannot change (for now...),
1834 * so that no locks are necessary.
1837 skb
= skb_recv_datagram(sk
, (arg
->flags
& O_NONBLOCK
) ? MSG_DONTWAIT
: 0,
1840 /* This means receive shutdown. */
1847 skb_free_datagram(sk
, skb
);
1848 wake_up_interruptible(&unix_sk(sk
)->peer_wait
);
1850 /* attach accepted sock to socket */
1851 unix_state_lock(tsk
);
1852 unix_update_edges(unix_sk(tsk
));
1853 newsock
->state
= SS_CONNECTED
;
1854 sock_graft(tsk
, newsock
);
1855 unix_state_unlock(tsk
);
1863 static int unix_getname(struct socket
*sock
, struct sockaddr
*uaddr
, int peer
)
1865 struct sock
*sk
= sock
->sk
;
1866 struct unix_address
*addr
;
1867 DECLARE_SOCKADDR(struct sockaddr_un
*, sunaddr
, uaddr
);
1871 sk
= unix_peer_get(sk
);
1881 addr
= smp_load_acquire(&unix_sk(sk
)->addr
);
1883 sunaddr
->sun_family
= AF_UNIX
;
1884 sunaddr
->sun_path
[0] = 0;
1885 err
= offsetof(struct sockaddr_un
, sun_path
);
1888 memcpy(sunaddr
, addr
->name
, addr
->len
);
1891 BPF_CGROUP_RUN_SA_PROG(sk
, uaddr
, &err
,
1892 CGROUP_UNIX_GETPEERNAME
);
1894 BPF_CGROUP_RUN_SA_PROG(sk
, uaddr
, &err
,
1895 CGROUP_UNIX_GETSOCKNAME
);
/* The "user->unix_inflight" variable is protected by the garbage
 * collection lock, and we just read it locklessly here. If you go
 * over the limit, there might be a tiny race in actually noticing
 * it across threads. Tough.
 */
1907 static inline bool too_many_unix_fds(struct task_struct
*p
)
1909 struct user_struct
*user
= current_user();
1911 if (unlikely(READ_ONCE(user
->unix_inflight
) > task_rlimit(p
, RLIMIT_NOFILE
)))
1912 return !capable(CAP_SYS_RESOURCE
) && !capable(CAP_SYS_ADMIN
);
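/* Userspace sketch of the fd passing that the check above accounts for
 * (illustrative only; sock_fd and fd_to_send are made-up names).  Each
 * descriptor sent with SCM_RIGHTS stays charged to the sending user until
 * the receiver consumes it; once more than RLIMIT_NOFILE descriptors are in
 * flight, an unprivileged sendmsg() fails with ETOOMANYREFS.
 *
 *	char cbuf[CMSG_SPACE(sizeof(int))] = {};
 *	char dummy = 0;
 *	struct iovec iov = { .iov_base = &dummy, .iov_len = 1 };
 *	struct msghdr msg = { .msg_iov = &iov, .msg_iovlen = 1,
 *			      .msg_control = cbuf, .msg_controllen = sizeof(cbuf) };
 *	struct cmsghdr *cmsg = CMSG_FIRSTHDR(&msg);
 *
 *	cmsg->cmsg_level = SOL_SOCKET;
 *	cmsg->cmsg_type = SCM_RIGHTS;
 *	cmsg->cmsg_len = CMSG_LEN(sizeof(int));
 *	memcpy(CMSG_DATA(cmsg), &fd_to_send, sizeof(int));
 *	sendmsg(sock_fd, &msg, 0);
 */
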
1916 static int unix_attach_fds(struct scm_cookie
*scm
, struct sk_buff
*skb
)
1918 if (too_many_unix_fds(current
))
1919 return -ETOOMANYREFS
;
1921 UNIXCB(skb
).fp
= scm
->fp
;
1924 if (unix_prepare_fpl(UNIXCB(skb
).fp
))
1930 static void unix_detach_fds(struct scm_cookie
*scm
, struct sk_buff
*skb
)
1932 scm
->fp
= UNIXCB(skb
).fp
;
1933 UNIXCB(skb
).fp
= NULL
;
1935 unix_destroy_fpl(scm
->fp
);
1938 static void unix_peek_fds(struct scm_cookie
*scm
, struct sk_buff
*skb
)
1940 scm
->fp
= scm_fp_dup(UNIXCB(skb
).fp
);
1943 static void unix_destruct_scm(struct sk_buff
*skb
)
1945 struct scm_cookie scm
;
1947 memset(&scm
, 0, sizeof(scm
));
1948 scm
.pid
= UNIXCB(skb
).pid
;
1950 unix_detach_fds(&scm
, skb
);
1952 /* Alas, it calls VFS */
1953 /* So fscking what? fput() had been SMP-safe since the last Summer */
1958 static int unix_scm_to_skb(struct scm_cookie
*scm
, struct sk_buff
*skb
, bool send_fds
)
1962 UNIXCB(skb
).pid
= get_pid(scm
->pid
);
1963 UNIXCB(skb
).uid
= scm
->creds
.uid
;
1964 UNIXCB(skb
).gid
= scm
->creds
.gid
;
1965 UNIXCB(skb
).fp
= NULL
;
1966 unix_get_secdata(scm
, skb
);
1967 if (scm
->fp
&& send_fds
)
1968 err
= unix_attach_fds(scm
, skb
);
1970 skb
->destructor
= unix_destruct_scm
;
/*
 * Some apps rely on write() giving SCM_CREDENTIALS
 * We include credentials if source or destination socket
 * asserted SOCK_PASSCRED.
 */
1979 static void unix_maybe_add_creds(struct sk_buff
*skb
, const struct sock
*sk
,
1980 const struct sock
*other
)
1982 if (UNIXCB(skb
).pid
)
1985 if (unix_may_passcred(sk
) || unix_may_passcred(other
) ||
1986 !other
->sk_socket
) {
1987 UNIXCB(skb
).pid
= get_pid(task_tgid(current
));
1988 current_uid_gid(&UNIXCB(skb
).uid
, &UNIXCB(skb
).gid
);
1992 static bool unix_skb_scm_eq(struct sk_buff
*skb
,
1993 struct scm_cookie
*scm
)
1995 return UNIXCB(skb
).pid
== scm
->pid
&&
1996 uid_eq(UNIXCB(skb
).uid
, scm
->creds
.uid
) &&
1997 gid_eq(UNIXCB(skb
).gid
, scm
->creds
.gid
) &&
1998 unix_secdata_eq(scm
, skb
);
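/* Userspace counterpart of the credential handling above (illustrative
 * sketch; fd is a made-up name): with SO_PASSCRED enabled, recvmsg() carries
 * an SCM_CREDENTIALS control message describing the sender, and
 * unix_skb_scm_eq() is used on the receive path to keep a single stream read
 * from spanning messages sent with different credentials.
 *
 *	int on = 1;
 *	char data[128], cbuf[CMSG_SPACE(sizeof(struct ucred))];
 *	struct iovec iov = { .iov_base = data, .iov_len = sizeof(data) };
 *	struct msghdr msg = { .msg_iov = &iov, .msg_iovlen = 1,
 *			      .msg_control = cbuf, .msg_controllen = sizeof(cbuf) };
 *	struct cmsghdr *cmsg;
 *
 *	setsockopt(fd, SOL_SOCKET, SO_PASSCRED, &on, sizeof(on));
 *	recvmsg(fd, &msg, 0);
 *	for (cmsg = CMSG_FIRSTHDR(&msg); cmsg; cmsg = CMSG_NXTHDR(&msg, cmsg))
 *		if (cmsg->cmsg_level == SOL_SOCKET &&
 *		    cmsg->cmsg_type == SCM_CREDENTIALS)
 *			break;	// CMSG_DATA(cmsg) is a struct ucred {pid, uid, gid}
 */
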
2001 static void scm_stat_add(struct sock
*sk
, struct sk_buff
*skb
)
2003 struct scm_fp_list
*fp
= UNIXCB(skb
).fp
;
2004 struct unix_sock
*u
= unix_sk(sk
);
2006 if (unlikely(fp
&& fp
->count
)) {
2007 atomic_add(fp
->count
, &u
->scm_stat
.nr_fds
);
2008 unix_add_edges(fp
, u
);
2012 static void scm_stat_del(struct sock
*sk
, struct sk_buff
*skb
)
2014 struct scm_fp_list
*fp
= UNIXCB(skb
).fp
;
2015 struct unix_sock
*u
= unix_sk(sk
);
2017 if (unlikely(fp
&& fp
->count
)) {
2018 atomic_sub(fp
->count
, &u
->scm_stat
.nr_fds
);
2024 * Send AF_UNIX data.
2027 static int unix_dgram_sendmsg(struct socket
*sock
, struct msghdr
*msg
,
2030 struct sock
*sk
= sock
->sk
, *other
= NULL
;
2031 struct unix_sock
*u
= unix_sk(sk
);
2032 struct scm_cookie scm
;
2033 struct sk_buff
*skb
;
2039 err
= scm_send(sock
, msg
, &scm
, false);
2043 wait_for_unix_gc(scm
.fp
);
2045 if (msg
->msg_flags
& MSG_OOB
) {
2050 if (msg
->msg_namelen
) {
2051 err
= unix_validate_addr(msg
->msg_name
, msg
->msg_namelen
);
2055 err
= BPF_CGROUP_RUN_PROG_UNIX_SENDMSG_LOCK(sk
,
2063 if (unix_may_passcred(sk
) && !READ_ONCE(u
->addr
)) {
2064 err
= unix_autobind(sk
);
2069 if (len
> READ_ONCE(sk
->sk_sndbuf
) - 32) {
2074 if (len
> SKB_MAX_ALLOC
) {
2075 data_len
= min_t(size_t,
2076 len
- SKB_MAX_ALLOC
,
2077 MAX_SKB_FRAGS
* PAGE_SIZE
);
2078 data_len
= PAGE_ALIGN(data_len
);
2080 BUILD_BUG_ON(SKB_MAX_ALLOC
< PAGE_SIZE
);
2083 skb
= sock_alloc_send_pskb(sk
, len
- data_len
, data_len
,
2084 msg
->msg_flags
& MSG_DONTWAIT
, &err
,
2085 PAGE_ALLOC_COSTLY_ORDER
);
2089 err
= unix_scm_to_skb(&scm
, skb
, true);
2093 skb_put(skb
, len
- data_len
);
2094 skb
->data_len
= data_len
;
2096 err
= skb_copy_datagram_from_iter(skb
, 0, &msg
->msg_iter
, len
);
2100 timeo
= sock_sndtimeo(sk
, msg
->msg_flags
& MSG_DONTWAIT
);
2102 if (msg
->msg_namelen
) {
2104 other
= unix_find_other(sock_net(sk
), msg
->msg_name
,
2105 msg
->msg_namelen
, sk
->sk_type
, 0);
2106 if (IS_ERR(other
)) {
2107 err
= PTR_ERR(other
);
2111 other
= unix_peer_get(sk
);
2118 if (sk_filter(other
, skb
) < 0) {
2119 /* Toss the packet but do not return any error to the sender */
2126 unix_state_lock(other
);
2129 if (!unix_may_send(sk
, other
)) {
2134 if (unlikely(sock_flag(other
, SOCK_DEAD
))) {
2135 /* Check with 1003.1g - what should datagram error */
2137 unix_state_unlock(other
);
2139 if (sk
->sk_type
== SOCK_SEQPACKET
) {
2140 /* We are here only when racing with unix_release_sock()
2141 * is clearing @other. Never change state to TCP_CLOSE
2142 * unlike SOCK_DGRAM wants.
2149 unix_state_lock(sk
);
2151 if (unix_peer(sk
) == other
) {
2152 unix_peer(sk
) = NULL
;
2153 unix_dgram_peer_wake_disconnect_wakeup(sk
, other
);
2155 WRITE_ONCE(sk
->sk_state
, TCP_CLOSE
);
2156 unix_state_unlock(sk
);
2158 unix_dgram_disconnected(sk
, other
);
2160 err
= -ECONNREFUSED
;
2164 unix_state_unlock(sk
);
2166 if (!msg
->msg_namelen
) {
2175 if (other
->sk_shutdown
& RCV_SHUTDOWN
) {
2180 if (UNIXCB(skb
).fp
&& !other
->sk_scm_rights
) {
2185 if (sk
->sk_type
!= SOCK_SEQPACKET
) {
2186 err
= security_unix_may_send(sk
->sk_socket
, other
->sk_socket
);
2191 /* other == sk && unix_peer(other) != sk if
2192 * - unix_peer(sk) == NULL, destination address bound to sk
2193 * - unix_peer(sk) == sk by time of get but disconnected before lock
2196 unlikely(unix_peer(other
) != sk
&&
2197 unix_recvq_full_lockless(other
))) {
2199 timeo
= unix_wait_for_peer(other
, timeo
);
2201 err
= sock_intr_errno(timeo
);
2202 if (signal_pending(current
))
2209 unix_state_unlock(other
);
2210 unix_state_double_lock(sk
, other
);
2213 if (unix_peer(sk
) != other
||
2214 unix_dgram_peer_wake_me(sk
, other
)) {
2222 goto restart_locked
;
2226 if (unlikely(sk_locked
))
2227 unix_state_unlock(sk
);
2229 if (sock_flag(other
, SOCK_RCVTSTAMP
))
2230 __net_timestamp(skb
);
2232 unix_maybe_add_creds(skb
, sk
, other
);
2233 scm_stat_add(other
, skb
);
2234 skb_queue_tail(&other
->sk_receive_queue
, skb
);
2235 unix_state_unlock(other
);
2236 other
->sk_data_ready(other
);
2243 unix_state_unlock(sk
);
2244 unix_state_unlock(other
);
2254 /* We use paged skbs for stream sockets, and limit occupancy to 32768
2255 * bytes, and a minimum of a full page.
2257 #define UNIX_SKB_FRAGS_SZ (PAGE_SIZE << get_order(32768))
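/* Worked example: with 4 KiB pages, get_order(32768) == 3, so
 * UNIX_SKB_FRAGS_SZ == PAGE_SIZE << 3 == 32768 bytes; with 64 KiB pages,
 * get_order(32768) == 0 and the limit is one 64 KiB page.  Either way the
 * cap is at least a full page and at least 32768 bytes.
 */
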
2259 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
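/* Illustrative userspace sketch of the out-of-band path implemented below
 * (fd and peer_fd are made-up names): the last byte of a MSG_OOB send is
 * marked urgent, the receiver can be signalled with SIGURG (given F_SETOWN),
 * and that byte is read ahead of the ordinary data with MSG_OOB.
 *
 *	// sender
 *	send(fd, "x", 1, MSG_OOB);
 *
 *	// receiver
 *	char c;
 *	if (recv(peer_fd, &c, 1, MSG_OOB) == 1)
 *		;	// c == 'x'; in-band data stays in order
 */
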
2260 static int queue_oob(struct sock
*sk
, struct msghdr
*msg
, struct sock
*other
,
2261 struct scm_cookie
*scm
, bool fds_sent
)
2263 struct unix_sock
*ousk
= unix_sk(other
);
2264 struct sk_buff
*skb
;
2267 skb
= sock_alloc_send_skb(sk
, 1, msg
->msg_flags
& MSG_DONTWAIT
, &err
);
2272 err
= unix_scm_to_skb(scm
, skb
, !fds_sent
);
2277 err
= skb_copy_datagram_from_iter(skb
, 0, &msg
->msg_iter
, 1);
2282 unix_state_lock(other
);
2284 if (sock_flag(other
, SOCK_DEAD
) ||
2285 (other
->sk_shutdown
& RCV_SHUTDOWN
)) {
2290 if (UNIXCB(skb
).fp
&& !other
->sk_scm_rights
) {
2295 unix_maybe_add_creds(skb
, sk
, other
);
2296 scm_stat_add(other
, skb
);
2298 spin_lock(&other
->sk_receive_queue
.lock
);
2299 WRITE_ONCE(ousk
->oob_skb
, skb
);
2300 __skb_queue_tail(&other
->sk_receive_queue
, skb
);
2301 spin_unlock(&other
->sk_receive_queue
.lock
);
2303 sk_send_sigurg(other
);
2304 unix_state_unlock(other
);
2305 other
->sk_data_ready(other
);
2309 unix_state_unlock(other
);
2316 static int unix_stream_sendmsg(struct socket
*sock
, struct msghdr
*msg
,
2319 struct sock
*sk
= sock
->sk
;
2320 struct sk_buff
*skb
= NULL
;
2321 struct sock
*other
= NULL
;
2322 struct scm_cookie scm
;
2323 bool fds_sent
= false;
2326 err
= scm_send(sock
, msg
, &scm
, false);
2330 wait_for_unix_gc(scm
.fp
);
2332 if (msg
->msg_flags
& MSG_OOB
) {
2334 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
2342 if (msg
->msg_namelen
) {
2343 err
= READ_ONCE(sk
->sk_state
) == TCP_ESTABLISHED
? -EISCONN
: -EOPNOTSUPP
;
2346 other
= unix_peer(sk
);
2353 if (READ_ONCE(sk
->sk_shutdown
) & SEND_SHUTDOWN
)
2356 while (sent
< len
) {
2357 int size
= len
- sent
;
2360 if (unlikely(msg
->msg_flags
& MSG_SPLICE_PAGES
)) {
2361 skb
= sock_alloc_send_pskb(sk
, 0, 0,
2362 msg
->msg_flags
& MSG_DONTWAIT
,
2365 /* Keep two messages in the pipe so it schedules better */
2366 size
= min_t(int, size
, (READ_ONCE(sk
->sk_sndbuf
) >> 1) - 64);
2368 /* allow fallback to order-0 allocations */
2369 size
= min_t(int, size
, SKB_MAX_HEAD(0) + UNIX_SKB_FRAGS_SZ
);
2371 data_len
= max_t(int, 0, size
- SKB_MAX_HEAD(0));
2373 data_len
= min_t(size_t, size
, PAGE_ALIGN(data_len
));
2375 skb
= sock_alloc_send_pskb(sk
, size
- data_len
, data_len
,
2376 msg
->msg_flags
& MSG_DONTWAIT
, &err
,
2377 get_order(UNIX_SKB_FRAGS_SZ
));
2382 /* Only send the fds in the first buffer */
2383 err
= unix_scm_to_skb(&scm
, skb
, !fds_sent
);
2389 if (unlikely(msg
->msg_flags
& MSG_SPLICE_PAGES
)) {
2390 skb
->ip_summed
= CHECKSUM_UNNECESSARY
;
2391 err
= skb_splice_from_iter(skb
, &msg
->msg_iter
, size
,
2397 refcount_add(size
, &sk
->sk_wmem_alloc
);
2399 skb_put(skb
, size
- data_len
);
2400 skb
->data_len
= data_len
;
2402 err
= skb_copy_datagram_from_iter(skb
, 0, &msg
->msg_iter
, size
);
2407 unix_state_lock(other
);
2409 if (sock_flag(other
, SOCK_DEAD
) ||
2410 (other
->sk_shutdown
& RCV_SHUTDOWN
))
2411 goto out_pipe_unlock
;
2413 if (UNIXCB(skb
).fp
&& !other
->sk_scm_rights
) {
2414 unix_state_unlock(other
);
2419 unix_maybe_add_creds(skb
, sk
, other
);
2420 scm_stat_add(other
, skb
);
2421 skb_queue_tail(&other
->sk_receive_queue
, skb
);
2422 unix_state_unlock(other
);
2423 other
->sk_data_ready(other
);
2427 #if IS_ENABLED(CONFIG_AF_UNIX_OOB)
2428 if (msg
->msg_flags
& MSG_OOB
) {
2429 err
= queue_oob(sk
, msg
, other
, &scm
, fds_sent
);
2441 unix_state_unlock(other
);
2443 if (!sent
&& !(msg
->msg_flags
& MSG_NOSIGNAL
))
2444 send_sig(SIGPIPE
, current
, 0);
2450 return sent
? : err
;
2453 static int unix_seqpacket_sendmsg(struct socket
*sock
, struct msghdr
*msg
,
2457 struct sock
*sk
= sock
->sk
;
2459 err
= sock_error(sk
);
2463 if (READ_ONCE(sk
->sk_state
) != TCP_ESTABLISHED
)
2466 if (msg
->msg_namelen
)
2467 msg
->msg_namelen
= 0;
2469 return unix_dgram_sendmsg(sock
, msg
, len
);
2472 static int unix_seqpacket_recvmsg(struct socket
*sock
, struct msghdr
*msg
,
2473 size_t size
, int flags
)
2475 struct sock
*sk
= sock
->sk
;
2477 if (READ_ONCE(sk
->sk_state
) != TCP_ESTABLISHED
)
2480 return unix_dgram_recvmsg(sock
, msg
, size
, flags
);
2483 static void unix_copy_addr(struct msghdr
*msg
, struct sock
*sk
)
2485 struct unix_address
*addr
= smp_load_acquire(&unix_sk(sk
)->addr
);
2488 msg
->msg_namelen
= addr
->len
;
2489 memcpy(msg
->msg_name
, addr
->name
, addr
->len
);
int __unix_dgram_recvmsg(struct sock *sk, struct msghdr *msg, size_t size,
			 int flags)
{
	struct scm_cookie scm;
	struct socket *sock = sk->sk_socket;
	struct unix_sock *u = unix_sk(sk);
	struct sk_buff *skb, *last;
	long timeo;
	int skip;
	int err;

	timeo = sock_rcvtimeo(sk, flags & MSG_DONTWAIT);

	do {
		mutex_lock(&u->iolock);

		skip = sk_peek_offset(sk, flags);
		skb = __skb_try_recv_datagram(sk, &sk->sk_receive_queue, flags,
					      &skip, &err, &last);
		if (skb) {
			if (!(flags & MSG_PEEK))
				scm_stat_del(sk, skb);
			break;
		}

		mutex_unlock(&u->iolock);

		if (err != -EAGAIN)
			break;
	} while (timeo &&
		 !__skb_wait_for_more_packets(sk, &sk->sk_receive_queue,
					      &err, &timeo, last));

	if (!skb) { /* implies iolock unlocked */
		unix_state_lock(sk);
		/* Signal EOF on disconnected non-blocking SEQPACKET socket. */
		if (sk->sk_type == SOCK_SEQPACKET && err == -EAGAIN &&
		    (sk->sk_shutdown & RCV_SHUTDOWN))
			err = 0;
		unix_state_unlock(sk);
		goto out;
	}

	if (wq_has_sleeper(&u->peer_wait))
		wake_up_interruptible_sync_poll(&u->peer_wait,
						EPOLLOUT | EPOLLWRNORM |
						EPOLLWRBAND);

	if (msg->msg_name) {
		unix_copy_addr(msg, skb->sk);

		BPF_CGROUP_RUN_PROG_UNIX_RECVMSG_LOCK(sk,
						      msg->msg_name,
						      &msg->msg_namelen);
	}

	if (size > skb->len - skip)
		size = skb->len - skip;
	else if (size < skb->len - skip)
		msg->msg_flags |= MSG_TRUNC;

	err = skb_copy_datagram_msg(skb, skip, msg, size);
	if (err)
		goto out_free;

	if (sock_flag(sk, SOCK_RCVTSTAMP))
		__sock_recv_timestamp(msg, sk, skb);

	memset(&scm, 0, sizeof(scm));

	scm_set_cred(&scm, UNIXCB(skb).pid, UNIXCB(skb).uid, UNIXCB(skb).gid);
	unix_set_secdata(&scm, skb);

	if (!(flags & MSG_PEEK)) {
		if (UNIXCB(skb).fp)
			unix_detach_fds(&scm, skb);

		sk_peek_offset_bwd(sk, skb->len);
	} else {
		/* It is questionable: on PEEK we could:
		   - do not return fds - good, but too simple 8)
		   - return fds, and do not return them on read (old strategy,
		     apparently wrong)
		   - clone fds (I chose it for now, it is the most universal
		     solution)

		   POSIX 1003.1g does not actually define this clearly
		   at all. POSIX 1003.1g doesn't define a lot of things
		   clearly however!
		*/

		sk_peek_offset_fwd(sk, size);

		if (UNIXCB(skb).fp)
			unix_peek_fds(&scm, skb);
	}
	err = (flags & MSG_TRUNC) ? skb->len - skip : size;

	scm_recv_unix(sock, msg, &scm, flags);

out_free:
	skb_free_datagram(sk, skb);
	mutex_unlock(&u->iolock);
out:
	return err;
}
static int unix_dgram_recvmsg(struct socket *sock, struct msghdr *msg, size_t size,
			      int flags)
{
	struct sock *sk = sock->sk;

#ifdef CONFIG_BPF_SYSCALL
	const struct proto *prot = READ_ONCE(sk->sk_prot);

	if (prot != &unix_dgram_proto)
		return prot->recvmsg(sk, msg, size, flags, NULL);
#endif
	return __unix_dgram_recvmsg(sk, msg, size, flags);
}
static int unix_read_skb(struct sock *sk, skb_read_actor_t recv_actor)
{
	struct unix_sock *u = unix_sk(sk);
	struct sk_buff *skb;
	int err;

	mutex_lock(&u->iolock);
	skb = skb_recv_datagram(sk, MSG_DONTWAIT, &err);
	mutex_unlock(&u->iolock);
	if (!skb)
		return err;

	return recv_actor(sk, skb);
}
/*
 *	Sleep until more data has arrived. But check for races..
 */
static long unix_stream_data_wait(struct sock *sk, long timeo,
				  struct sk_buff *last, unsigned int last_len,
				  bool freezable)
{
	unsigned int state = TASK_INTERRUPTIBLE | freezable * TASK_FREEZABLE;
	struct sk_buff *tail;
	DEFINE_WAIT(wait);

	unix_state_lock(sk);

	for (;;) {
		prepare_to_wait(sk_sleep(sk), &wait, state);

		tail = skb_peek_tail(&sk->sk_receive_queue);
		if (tail != last ||
		    (tail && tail->len != last_len) ||
		    sk->sk_err ||
		    (sk->sk_shutdown & RCV_SHUTDOWN) ||
		    signal_pending(current) ||
		    !timeo)
			break;

		sk_set_bit(SOCKWQ_ASYNC_WAITDATA, sk);
		unix_state_unlock(sk);
		timeo = schedule_timeout(timeo);
		unix_state_lock(sk);

		if (sock_flag(sk, SOCK_DEAD))
			break;

		sk_clear_bit(SOCKWQ_ASYNC_WAITDATA, sk);
	}

	finish_wait(sk_sleep(sk), &wait);
	unix_state_unlock(sk);
	return timeo;
}
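
/* State shared by the stream receive paths.  recvmsg() and splice_read()
 * provide different recv_actor callbacks but reuse the same generic read
 * loop, unix_stream_read_generic().
 */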
struct unix_stream_read_state {
	int (*recv_actor)(struct sk_buff *, int, int,
			  struct unix_stream_read_state *);
	struct socket *socket;
	struct msghdr *msg;
	struct pipe_inode_info *pipe;
	size_t size;
	int flags;
	unsigned int splice_flags;
};
#if IS_ENABLED(CONFIG_AF_UNIX_OOB)
static int unix_stream_recv_urg(struct unix_stream_read_state *state)
{
	struct sk_buff *oob_skb, *read_skb = NULL;
	struct socket *sock = state->socket;
	struct sock *sk = sock->sk;
	struct unix_sock *u = unix_sk(sk);
	int chunk = 1;

	mutex_lock(&u->iolock);
	unix_state_lock(sk);
	spin_lock(&sk->sk_receive_queue.lock);

	if (sock_flag(sk, SOCK_URGINLINE) || !u->oob_skb) {
		spin_unlock(&sk->sk_receive_queue.lock);
		unix_state_unlock(sk);
		mutex_unlock(&u->iolock);
		return -EINVAL;
	}

	oob_skb = u->oob_skb;

	if (!(state->flags & MSG_PEEK)) {
		WRITE_ONCE(u->oob_skb, NULL);

		if (oob_skb->prev != (struct sk_buff *)&sk->sk_receive_queue &&
		    !unix_skb_len(oob_skb->prev)) {
			read_skb = oob_skb->prev;
			__skb_unlink(read_skb, &sk->sk_receive_queue);
		}
	}

	spin_unlock(&sk->sk_receive_queue.lock);
	unix_state_unlock(sk);

	chunk = state->recv_actor(oob_skb, 0, chunk, state);

	if (!(state->flags & MSG_PEEK))
		UNIXCB(oob_skb).consumed += 1;

	mutex_unlock(&u->iolock);

	consume_skb(read_skb);

	if (chunk < 0)
		return -EFAULT;

	state->msg->msg_flags |= MSG_OOB;
	return 1;
}
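
/* Decide how an out-of-band skb affects the normal stream read.  Depending
 * on MSG_PEEK, SO_OOBINLINE and how much has already been copied, the OOB
 * skb is skipped, consumed or kept as the read boundary; any skb dropped
 * while skipping is freed with SKB_DROP_REASON_UNIX_SKIP_OOB.
 */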
static struct sk_buff *manage_oob(struct sk_buff *skb, struct sock *sk,
				  int flags, int copied)
{
	struct sk_buff *read_skb = NULL, *unread_skb = NULL;
	struct unix_sock *u = unix_sk(sk);

	if (likely(unix_skb_len(skb) && skb != READ_ONCE(u->oob_skb)))
		return skb;

	spin_lock(&sk->sk_receive_queue.lock);

	if (!unix_skb_len(skb)) {
		if (copied && (!u->oob_skb || skb == u->oob_skb)) {
			skb = NULL;
		} else if (flags & MSG_PEEK) {
			skb = skb_peek_next(skb, &sk->sk_receive_queue);
		} else {
			read_skb = skb;
			skb = skb_peek_next(skb, &sk->sk_receive_queue);
			__skb_unlink(read_skb, &sk->sk_receive_queue);
		}

		if (!skb)
			goto unlock;
	}

	if (skb != u->oob_skb)
		goto unlock;

	if (copied) {
		skb = NULL;
	} else if (!(flags & MSG_PEEK)) {
		WRITE_ONCE(u->oob_skb, NULL);

		if (!sock_flag(sk, SOCK_URGINLINE)) {
			__skb_unlink(skb, &sk->sk_receive_queue);
			unread_skb = skb;
			skb = skb_peek(&sk->sk_receive_queue);
		}
	} else if (!sock_flag(sk, SOCK_URGINLINE)) {
		skb = skb_peek_next(skb, &sk->sk_receive_queue);
	}

unlock:
	spin_unlock(&sk->sk_receive_queue.lock);

	consume_skb(read_skb);
	kfree_skb_reason(unread_skb, SKB_DROP_REASON_UNIX_SKIP_OOB);

	return skb;
}
#endif
static int unix_stream_read_skb(struct sock *sk, skb_read_actor_t recv_actor)
{
	struct unix_sock *u = unix_sk(sk);
	struct sk_buff *skb;
	int err;

	if (unlikely(READ_ONCE(sk->sk_state) != TCP_ESTABLISHED))
		return -ENOTCONN;

	mutex_lock(&u->iolock);
	skb = skb_recv_datagram(sk, MSG_DONTWAIT, &err);
	mutex_unlock(&u->iolock);
	if (!skb)
		return err;

#if IS_ENABLED(CONFIG_AF_UNIX_OOB)
	if (unlikely(skb == READ_ONCE(u->oob_skb))) {
		bool drop = true;

		unix_state_lock(sk);

		if (sock_flag(sk, SOCK_DEAD)) {
			unix_state_unlock(sk);
			kfree_skb_reason(skb, SKB_DROP_REASON_SOCKET_CLOSE);
			return -ECONNRESET;
		}

		spin_lock(&sk->sk_receive_queue.lock);
		if (likely(skb == u->oob_skb)) {
			WRITE_ONCE(u->oob_skb, NULL);
			drop = false;
		}
		spin_unlock(&sk->sk_receive_queue.lock);

		unix_state_unlock(sk);

		if (drop) {
			kfree_skb_reason(skb, SKB_DROP_REASON_UNIX_SKIP_OOB);
			return -EAGAIN;
		}
	}
#endif

	return recv_actor(sk, skb);
}
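
/* Generic stream receive loop shared by recvmsg() and splice_read().
 * Messages from different senders are never glued together (see the
 * unix_skb_scm_eq() check), the peek offset is honoured, and SCM_RIGHTS
 * descriptors are detached on a normal read or duplicated on MSG_PEEK.
 */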
static int unix_stream_read_generic(struct unix_stream_read_state *state,
				    bool freezable)
{
	struct scm_cookie scm;
	struct socket *sock = state->socket;
	struct sock *sk = sock->sk;
	struct unix_sock *u = unix_sk(sk);
	int copied = 0;
	int flags = state->flags;
	int noblock = flags & MSG_DONTWAIT;
	bool check_creds = false;
	int target;
	int err = 0;
	long timeo;
	int skip;
	size_t size = state->size;
	unsigned int last_len;

	if (unlikely(READ_ONCE(sk->sk_state) != TCP_ESTABLISHED)) {
		err = -EINVAL;
		goto out;
	}

	if (unlikely(flags & MSG_OOB)) {
		err = -EOPNOTSUPP;
#if IS_ENABLED(CONFIG_AF_UNIX_OOB)
		err = unix_stream_recv_urg(state);
#endif
		goto out;
	}

	target = sock_rcvlowat(sk, flags & MSG_WAITALL, size);
	timeo = sock_rcvtimeo(sk, noblock);

	memset(&scm, 0, sizeof(scm));

	/* Lock the socket to prevent queue disordering
	 * while sleeps in memcpy_tomsg
	 */
	mutex_lock(&u->iolock);

	skip = max(sk_peek_offset(sk, flags), 0);

	do {
		struct sk_buff *skb, *last;
		int chunk;

redo:
		unix_state_lock(sk);
		if (sock_flag(sk, SOCK_DEAD)) {
			err = -ECONNRESET;
			goto unlock;
		}
		last = skb = skb_peek(&sk->sk_receive_queue);
		last_len = last ? last->len : 0;

again:
#if IS_ENABLED(CONFIG_AF_UNIX_OOB)
		if (skb) {
			skb = manage_oob(skb, sk, flags, copied);
			if (!skb && copied) {
				unix_state_unlock(sk);
				break;
			}
		}
#endif
		if (skb == NULL) {
			if (copied >= target)
				goto unlock;

			/*
			 *	POSIX 1003.1g mandates this order.
			 */

			err = sock_error(sk);
			if (err)
				goto unlock;
			if (sk->sk_shutdown & RCV_SHUTDOWN)
				goto unlock;

			unix_state_unlock(sk);
			if (!timeo) {
				err = -EAGAIN;
				break;
			}

			mutex_unlock(&u->iolock);

			timeo = unix_stream_data_wait(sk, timeo, last,
						      last_len, freezable);

			if (signal_pending(current)) {
				err = sock_intr_errno(timeo);
				scm_destroy(&scm);
				goto out;
			}

			mutex_lock(&u->iolock);
			goto redo;
unlock:
			unix_state_unlock(sk);
			break;
		}

		while (skip >= unix_skb_len(skb)) {
			skip -= unix_skb_len(skb);
			last = skb;
			last_len = skb->len;
			skb = skb_peek_next(skb, &sk->sk_receive_queue);
			if (!skb)
				goto again;
		}

		unix_state_unlock(sk);

		if (check_creds) {
			/* Never glue messages from different writers */
			if (!unix_skb_scm_eq(skb, &scm))
				break;
		} else if (unix_may_passcred(sk)) {
			/* Copy credentials */
			scm_set_cred(&scm, UNIXCB(skb).pid, UNIXCB(skb).uid, UNIXCB(skb).gid);
			unix_set_secdata(&scm, skb);
			check_creds = true;
		}

		/* Copy address just once */
		if (state->msg && state->msg->msg_name) {
			DECLARE_SOCKADDR(struct sockaddr_un *, sunaddr,
					 state->msg->msg_name);
			unix_copy_addr(state->msg, skb->sk);

			BPF_CGROUP_RUN_PROG_UNIX_RECVMSG_LOCK(sk,
							      state->msg->msg_name,
							      &state->msg->msg_namelen);

			sunaddr = NULL;
		}

		chunk = min_t(unsigned int, unix_skb_len(skb) - skip, size);
		chunk = state->recv_actor(skb, skip, chunk, state);
		if (chunk < 0) {
			if (copied == 0)
				copied = -EFAULT;
			break;
		}
		copied += chunk;
		size -= chunk;

		/* Mark read part of skb as used */
		if (!(flags & MSG_PEEK)) {
			UNIXCB(skb).consumed += chunk;

			sk_peek_offset_bwd(sk, chunk);

			if (UNIXCB(skb).fp) {
				scm_stat_del(sk, skb);
				unix_detach_fds(&scm, skb);
			}

			if (unix_skb_len(skb))
				break;

			skb_unlink(skb, &sk->sk_receive_queue);
			consume_skb(skb);

			if (scm.fp)
				break;
		} else {
			/* It is questionable, see note in unix_dgram_recvmsg.
			 */
			if (UNIXCB(skb).fp)
				unix_peek_fds(&scm, skb);

			sk_peek_offset_fwd(sk, chunk);

			if (scm.fp)
				break;

			skip = 0;
			last = skb;
			last_len = skb->len;
			unix_state_lock(sk);
			skb = skb_peek_next(skb, &sk->sk_receive_queue);
			if (skb)
				goto again;
			unix_state_unlock(sk);
			break;
		}
	} while (size);

	mutex_unlock(&u->iolock);
	if (state->msg)
		scm_recv_unix(sock, state->msg, &scm, flags);
	else
		scm_destroy(&scm);
out:
	return copied ? : err;
}
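
/* recv_actor implementations: unix_stream_read_actor() copies into the
 * user msghdr, unix_stream_splice_actor() feeds a pipe via
 * skb_splice_bits().  Both start at UNIXCB(skb).consumed so a partially
 * read skb resumes where the previous read stopped.
 */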
static int unix_stream_read_actor(struct sk_buff *skb,
				  int skip, int chunk,
				  struct unix_stream_read_state *state)
{
	int ret;

	ret = skb_copy_datagram_msg(skb, UNIXCB(skb).consumed + skip,
				    state->msg, chunk);
	return ret ?: chunk;
}
int __unix_stream_recvmsg(struct sock *sk, struct msghdr *msg,
			  size_t size, int flags)
{
	struct unix_stream_read_state state = {
		.recv_actor = unix_stream_read_actor,
		.socket = sk->sk_socket,
		.msg = msg,
		.size = size,
		.flags = flags
	};

	return unix_stream_read_generic(&state, true);
}
static int unix_stream_recvmsg(struct socket *sock, struct msghdr *msg,
			       size_t size, int flags)
{
	struct unix_stream_read_state state = {
		.recv_actor = unix_stream_read_actor,
		.socket = sock,
		.msg = msg,
		.size = size,
		.flags = flags
	};

#ifdef CONFIG_BPF_SYSCALL
	struct sock *sk = sock->sk;
	const struct proto *prot = READ_ONCE(sk->sk_prot);

	if (prot != &unix_stream_proto)
		return prot->recvmsg(sk, msg, size, flags, NULL);
#endif
	return unix_stream_read_generic(&state, true);
}
static int unix_stream_splice_actor(struct sk_buff *skb,
				    int skip, int chunk,
				    struct unix_stream_read_state *state)
{
	return skb_splice_bits(skb, state->socket->sk,
			       UNIXCB(skb).consumed + skip,
			       state->pipe, chunk, state->splice_flags);
}
static ssize_t unix_stream_splice_read(struct socket *sock,  loff_t *ppos,
				       struct pipe_inode_info *pipe,
				       size_t size, unsigned int flags)
{
	struct unix_stream_read_state state = {
		.recv_actor = unix_stream_splice_actor,
		.socket = sock,
		.pipe = pipe,
		.size = size,
		.splice_flags = flags,
	};

	if (unlikely(*ppos))
		return -ESPIPE;

	if (sock->file->f_flags & O_NONBLOCK ||
	    flags & SPLICE_F_NONBLOCK)
		state.flags = MSG_DONTWAIT;

	return unix_stream_read_generic(&state, false);
}
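
/* shutdown(2): record the local shutdown bits and, for connection
 * oriented sockets, mirror them onto the peer (RCV <-> SEND) and wake it
 * up so blocked readers and writers notice.
 */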
static int unix_shutdown(struct socket *sock, int mode)
{
	struct sock *sk = sock->sk;
	struct sock *other;

	if (mode < SHUT_RD || mode > SHUT_RDWR)
		return -EINVAL;
	/* This maps:
	 * SHUT_RD   (0) -> RCV_SHUTDOWN  (1)
	 * SHUT_WR   (1) -> SEND_SHUTDOWN (2)
	 * SHUT_RDWR (2) -> SHUTDOWN_MASK (3)
	 */
	++mode;

	unix_state_lock(sk);
	WRITE_ONCE(sk->sk_shutdown, sk->sk_shutdown | mode);
	other = unix_peer(sk);
	if (other)
		sock_hold(other);
	unix_state_unlock(sk);
	sk->sk_state_change(sk);

	if (other &&
	    (sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET)) {
		int peer_mode = 0;
		const struct proto *prot = READ_ONCE(other->sk_prot);

		if (prot->unhash)
			prot->unhash(other);
		if (mode & RCV_SHUTDOWN)
			peer_mode |= SEND_SHUTDOWN;
		if (mode & SEND_SHUTDOWN)
			peer_mode |= RCV_SHUTDOWN;
		unix_state_lock(other);
		WRITE_ONCE(other->sk_shutdown, other->sk_shutdown | peer_mode);
		unix_state_unlock(other);
		other->sk_state_change(other);
		if (peer_mode == SHUTDOWN_MASK)
			sk_wake_async(other, SOCK_WAKE_WAITD, POLL_HUP);
		else if (peer_mode & RCV_SHUTDOWN)
			sk_wake_async(other, SOCK_WAKE_WAITD, POLL_IN);
	}
	if (other)
		sock_put(other);

	return 0;
}
long unix_inq_len(struct sock *sk)
{
	struct sk_buff *skb;
	long amount = 0;

	if (READ_ONCE(sk->sk_state) == TCP_LISTEN)
		return -EINVAL;

	spin_lock(&sk->sk_receive_queue.lock);
	if (sk->sk_type == SOCK_STREAM ||
	    sk->sk_type == SOCK_SEQPACKET) {
		skb_queue_walk(&sk->sk_receive_queue, skb)
			amount += unix_skb_len(skb);
	} else {
		skb = skb_peek(&sk->sk_receive_queue);
		if (skb)
			amount = skb->len;
	}
	spin_unlock(&sk->sk_receive_queue.lock);

	return amount;
}
EXPORT_SYMBOL_GPL(unix_inq_len);
long unix_outq_len(struct sock *sk)
{
	return sk_wmem_alloc_get(sk);
}
EXPORT_SYMBOL_GPL(unix_outq_len);
static int unix_open_file(struct sock *sk)
{
	struct path path;
	struct file *f;
	int fd;

	if (!ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))
		return -EPERM;

	if (!smp_load_acquire(&unix_sk(sk)->addr))
		return -ENOENT;

	path = unix_sk(sk)->path;
	if (!path.dentry)
		return -ENOENT;

	path_get(&path);

	fd = get_unused_fd_flags(O_CLOEXEC);
	if (fd < 0)
		goto out;

	f = dentry_open(&path, O_PATH, current_cred());
	if (IS_ERR(f)) {
		put_unused_fd(fd);
		fd = PTR_ERR(f);
		goto out;
	}

	fd_install(fd, f);
out:
	path_put(&path);

	return fd;
}
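
/* ioctl(2) handlers: SIOCOUTQ/SIOCINQ report queued byte counts via
 * unix_outq_len()/unix_inq_len(), SIOCUNIXFILE opens the bound path
 * (unix_open_file()), and the OOB query reports whether the read pointer
 * is at the out-of-band mark.
 */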
static int unix_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
{
	struct sock *sk = sock->sk;
	long amount = 0;
	int err;

	switch (cmd) {
	case SIOCOUTQ:
		amount = unix_outq_len(sk);
		err = put_user(amount, (int __user *)arg);
		break;
	case SIOCINQ:
		amount = unix_inq_len(sk);
		if (amount < 0)
			err = amount;
		else
			err = put_user(amount, (int __user *)arg);
		break;
	case SIOCUNIXFILE:
		err = unix_open_file(sk);
		break;
#if IS_ENABLED(CONFIG_AF_UNIX_OOB)
	case SIOCATMARK:
		{
			struct unix_sock *u = unix_sk(sk);
			struct sk_buff *skb;
			int answ = 0;

			mutex_lock(&u->iolock);

			skb = skb_peek(&sk->sk_receive_queue);
			if (skb) {
				struct sk_buff *oob_skb = READ_ONCE(u->oob_skb);
				struct sk_buff *next_skb;

				next_skb = skb_peek_next(skb, &sk->sk_receive_queue);

				if (skb == oob_skb ||
				    (!unix_skb_len(skb) &&
				     (!oob_skb || next_skb == oob_skb)))
					answ = 1;
			}

			mutex_unlock(&u->iolock);

			err = put_user(answ, (int __user *)arg);
		}
		break;
#endif
	default:
		err = -ENOIOCTLCMD;
		break;
	}
	return err;
}
#ifdef CONFIG_COMPAT
static int unix_compat_ioctl(struct socket *sock, unsigned int cmd, unsigned long arg)
{
	return unix_ioctl(sock, cmd, (unsigned long)compat_ptr(arg));
}
#endif
static __poll_t unix_poll(struct file *file, struct socket *sock, poll_table *wait)
{
	struct sock *sk = sock->sk;
	unsigned char state;
	__poll_t mask;
	u8 shutdown;

	sock_poll_wait(file, sock, wait);
	mask = 0;
	shutdown = READ_ONCE(sk->sk_shutdown);
	state = READ_ONCE(sk->sk_state);

	/* exceptional events? */
	if (READ_ONCE(sk->sk_err))
		mask |= EPOLLERR;
	if (shutdown == SHUTDOWN_MASK)
		mask |= EPOLLHUP;
	if (shutdown & RCV_SHUTDOWN)
		mask |= EPOLLRDHUP | EPOLLIN | EPOLLRDNORM;

	/* readable? */
	if (!skb_queue_empty_lockless(&sk->sk_receive_queue))
		mask |= EPOLLIN | EPOLLRDNORM;
	if (sk_is_readable(sk))
		mask |= EPOLLIN | EPOLLRDNORM;
#if IS_ENABLED(CONFIG_AF_UNIX_OOB)
	if (READ_ONCE(unix_sk(sk)->oob_skb))
		mask |= EPOLLPRI;
#endif

	/* Connection-based need to check for termination and startup */
	if ((sk->sk_type == SOCK_STREAM || sk->sk_type == SOCK_SEQPACKET) &&
	    state == TCP_CLOSE)
		mask |= EPOLLHUP;

	/*
	 * we set writable also when the other side has shut down the
	 * connection. This prevents stuck sockets.
	 */
	if (unix_writable(sk, state))
		mask |= EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND;

	return mask;
}
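
/* Datagram poll differs from the stream variant: a socket connected to a
 * peer whose receive queue is full is reported as not writable, and
 * unix_dgram_peer_wake_me() arranges a wakeup on the peer's queue instead.
 */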
static __poll_t unix_dgram_poll(struct file *file, struct socket *sock,
				poll_table *wait)
{
	struct sock *sk = sock->sk, *other;
	unsigned int writable;
	unsigned char state;
	__poll_t mask;
	u8 shutdown;

	sock_poll_wait(file, sock, wait);
	mask = 0;
	shutdown = READ_ONCE(sk->sk_shutdown);
	state = READ_ONCE(sk->sk_state);

	/* exceptional events? */
	if (READ_ONCE(sk->sk_err) ||
	    !skb_queue_empty_lockless(&sk->sk_error_queue))
		mask |= EPOLLERR |
			(sock_flag(sk, SOCK_SELECT_ERR_QUEUE) ? EPOLLPRI : 0);

	if (shutdown & RCV_SHUTDOWN)
		mask |= EPOLLRDHUP | EPOLLIN | EPOLLRDNORM;
	if (shutdown == SHUTDOWN_MASK)
		mask |= EPOLLHUP;

	/* readable? */
	if (!skb_queue_empty_lockless(&sk->sk_receive_queue))
		mask |= EPOLLIN | EPOLLRDNORM;
	if (sk_is_readable(sk))
		mask |= EPOLLIN | EPOLLRDNORM;

	/* Connection-based need to check for termination and startup */
	if (sk->sk_type == SOCK_SEQPACKET && state == TCP_CLOSE)
		mask |= EPOLLHUP;

	/* No write status requested, avoid expensive OUT tests. */
	if (!(poll_requested_events(wait) & (EPOLLWRBAND|EPOLLWRNORM|EPOLLOUT)))
		return mask;

	writable = unix_writable(sk, state);
	if (writable) {
		unix_state_lock(sk);

		other = unix_peer(sk);
		if (other && unix_peer(other) != sk &&
		    unix_recvq_full_lockless(other) &&
		    unix_dgram_peer_wake_me(sk, other))
			writable = 0;

		unix_state_unlock(sk);
	}

	if (writable)
		mask |= EPOLLOUT | EPOLLWRNORM | EPOLLWRBAND;
	else
		sk_set_bit(SOCKWQ_ASYNC_NOSPACE, sk);

	return mask;
}
#ifdef CONFIG_PROC_FS

#define BUCKET_SPACE (BITS_PER_LONG - (UNIX_HASH_BITS + 1) - 1)

#define get_bucket(x) ((x) >> BUCKET_SPACE)
#define get_offset(x) ((x) & ((1UL << BUCKET_SPACE) - 1))
#define set_bucket_offset(b, o) ((b) << BUCKET_SPACE | (o))
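
/* /proc/net/unix iterates the global hash table.  The seq_file position
 * encodes the bucket in the upper bits and the 1-based offset within the
 * bucket in the lower BUCKET_SPACE bits.
 */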
static struct sock *unix_from_bucket(struct seq_file *seq, loff_t *pos)
{
	unsigned long offset = get_offset(*pos);
	unsigned long bucket = get_bucket(*pos);
	unsigned long count = 0;
	struct sock *sk;

	for (sk = sk_head(&seq_file_net(seq)->unx.table.buckets[bucket]);
	     sk; sk = sk_next(sk)) {
		if (++count == offset)
			break;
	}

	return sk;
}
static struct sock *unix_get_first(struct seq_file *seq, loff_t *pos)
{
	unsigned long bucket = get_bucket(*pos);
	struct net *net = seq_file_net(seq);
	struct sock *sk;

	while (bucket < UNIX_HASH_SIZE) {
		spin_lock(&net->unx.table.locks[bucket]);

		sk = unix_from_bucket(seq, pos);
		if (sk)
			return sk;

		spin_unlock(&net->unx.table.locks[bucket]);

		*pos = set_bucket_offset(++bucket, 1);
	}

	return NULL;
}
static struct sock *unix_get_next(struct seq_file *seq, struct sock *sk,
				  loff_t *pos)
{
	unsigned long bucket = get_bucket(*pos);

	sk = sk_next(sk);
	if (sk)
		return sk;

	spin_unlock(&seq_file_net(seq)->unx.table.locks[bucket]);

	*pos = set_bucket_offset(++bucket, 1);

	return unix_get_first(seq, pos);
}
static void *unix_seq_start(struct seq_file *seq, loff_t *pos)
{
	if (!*pos)
		return SEQ_START_TOKEN;

	return unix_get_first(seq, pos);
}
static void *unix_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	++*pos;

	if (v == SEQ_START_TOKEN)
		return unix_get_first(seq, pos);

	return unix_get_next(seq, v, pos);
}
static void unix_seq_stop(struct seq_file *seq, void *v)
{
	struct sock *sk = v;

	if (sk)
		spin_unlock(&seq_file_net(seq)->unx.table.locks[sk->sk_hash]);
}
static int unix_seq_show(struct seq_file *seq, void *v)
{

	if (v == SEQ_START_TOKEN)
		seq_puts(seq, "Num       RefCount Protocol Flags    Type St "
			 "Inode Path\n");
	else {
		struct sock *s = v;
		struct unix_sock *u = unix_sk(s);
		unix_state_lock(s);

		seq_printf(seq, "%pK: %08X %08X %08X %04X %02X %5lu",
			s,
			refcount_read(&s->sk_refcnt),
			0,
			s->sk_state == TCP_LISTEN ? __SO_ACCEPTCON : 0,
			s->sk_type,
			s->sk_socket ?
			(s->sk_state == TCP_ESTABLISHED ? SS_CONNECTED : SS_UNCONNECTED) :
			(s->sk_state == TCP_ESTABLISHED ? SS_CONNECTING : SS_DISCONNECTING),
			sock_i_ino(s));

		if (u->addr) {	// under a hash table lock here
			int i, len;
			seq_putc(seq, ' ');

			i = 0;
			len = u->addr->len -
				offsetof(struct sockaddr_un, sun_path);
			if (u->addr->name->sun_path[0]) {
				len--;
			} else {
				seq_putc(seq, '@');
				i++;
			}
			for ( ; i < len; i++)
				seq_putc(seq, u->addr->name->sun_path[i] ?:
					 '@');
		}
		unix_state_unlock(s);
		seq_putc(seq, '\n');
	}

	return 0;
}
static const struct seq_operations unix_seq_ops = {
	.start  = unix_seq_start,
	.next   = unix_seq_next,
	.stop   = unix_seq_stop,
	.show   = unix_seq_show,
};
#ifdef CONFIG_BPF_SYSCALL
struct bpf_unix_iter_state {
	struct seq_net_private p;
	unsigned int cur_sk;
	unsigned int end_sk;
	unsigned int max_sk;
	struct sock **batch;
	bool st_bucket_done;
};
struct bpf_iter__unix {
	__bpf_md_ptr(struct bpf_iter_meta *, meta);
	__bpf_md_ptr(struct unix_sock *, unix_sk);
	uid_t uid __aligned(8);
};
static int unix_prog_seq_show(struct bpf_prog *prog, struct bpf_iter_meta *meta,
			      struct unix_sock *unix_sk, uid_t uid)
{
	struct bpf_iter__unix ctx;

	meta->seq_num--;  /* skip SEQ_START_TOKEN */
	ctx.meta = meta;
	ctx.unix_sk = unix_sk;
	ctx.uid = uid;
	return bpf_iter_run_prog(prog, &ctx);
}
static int bpf_iter_unix_hold_batch(struct seq_file *seq, struct sock *start_sk)
{
	struct bpf_unix_iter_state *iter = seq->private;
	unsigned int expected = 1;
	struct sock *sk;

	sock_hold(start_sk);
	iter->batch[iter->end_sk++] = start_sk;

	for (sk = sk_next(start_sk); sk; sk = sk_next(sk)) {
		if (iter->end_sk < iter->max_sk) {
			sock_hold(sk);
			iter->batch[iter->end_sk++] = sk;
		}

		expected++;
	}

	spin_unlock(&seq_file_net(seq)->unx.table.locks[start_sk->sk_hash]);

	return expected;
}
static void bpf_iter_unix_put_batch(struct bpf_unix_iter_state *iter)
{
	while (iter->cur_sk < iter->end_sk)
		sock_put(iter->batch[iter->cur_sk++]);
}
static int bpf_iter_unix_realloc_batch(struct bpf_unix_iter_state *iter,
				       unsigned int new_batch_sz)
{
	struct sock **new_batch;

	new_batch = kvmalloc(sizeof(*new_batch) * new_batch_sz,
			     GFP_USER | __GFP_NOWARN);
	if (!new_batch)
		return -ENOMEM;

	bpf_iter_unix_put_batch(iter);
	kvfree(iter->batch);
	iter->batch = new_batch;
	iter->max_sk = new_batch_sz;

	return 0;
}
static struct sock *bpf_iter_unix_batch(struct seq_file *seq,
					loff_t *pos)
{
	struct bpf_unix_iter_state *iter = seq->private;
	unsigned int expected;
	bool resized = false;
	struct sock *sk;

	if (iter->st_bucket_done)
		*pos = set_bucket_offset(get_bucket(*pos) + 1, 1);

again:
	/* Get a new batch */
	iter->cur_sk = 0;
	iter->end_sk = 0;

	sk = unix_get_first(seq, pos);
	if (!sk)
		return NULL; /* Done */

	expected = bpf_iter_unix_hold_batch(seq, sk);

	if (iter->end_sk == expected) {
		iter->st_bucket_done = true;
		return sk;
	}

	if (!resized && !bpf_iter_unix_realloc_batch(iter, expected * 3 / 2)) {
		resized = true;
		goto again;
	}

	return sk;
}
static void *bpf_iter_unix_seq_start(struct seq_file *seq, loff_t *pos)
{
	if (!*pos)
		return SEQ_START_TOKEN;

	/* bpf iter does not support lseek, so it always
	 * continues from where it was stop()-ped.
	 */
	return bpf_iter_unix_batch(seq, pos);
}
static void *bpf_iter_unix_seq_next(struct seq_file *seq, void *v, loff_t *pos)
{
	struct bpf_unix_iter_state *iter = seq->private;
	struct sock *sk;

	/* Whenever seq_next() is called, the iter->cur_sk is
	 * done with seq_show(), so advance to the next sk in
	 * the batch.
	 */
	if (iter->cur_sk < iter->end_sk)
		sock_put(iter->batch[iter->cur_sk++]);

	++*pos;

	if (iter->cur_sk < iter->end_sk)
		sk = iter->batch[iter->cur_sk];
	else
		sk = bpf_iter_unix_batch(seq, pos);

	return sk;
}
static int bpf_iter_unix_seq_show(struct seq_file *seq, void *v)
{
	struct bpf_iter_meta meta;
	struct bpf_prog *prog;
	struct sock *sk = v;
	uid_t uid;
	bool slow;
	int ret;

	if (v == SEQ_START_TOKEN)
		return 0;

	slow = lock_sock_fast(sk);

	if (unlikely(sk_unhashed(sk))) {
		ret = SEQ_SKIP;
		goto unlock;
	}

	uid = from_kuid_munged(seq_user_ns(seq), sock_i_uid(sk));
	meta.seq = seq;
	prog = bpf_iter_get_info(&meta, false);
	ret = unix_prog_seq_show(prog, &meta, v, uid);
unlock:
	unlock_sock_fast(sk, slow);
	return ret;
}
static void bpf_iter_unix_seq_stop(struct seq_file *seq, void *v)
{
	struct bpf_unix_iter_state *iter = seq->private;
	struct bpf_iter_meta meta;
	struct bpf_prog *prog;

	if (!v) {
		meta.seq = seq;
		prog = bpf_iter_get_info(&meta, true);
		if (prog)
			(void)unix_prog_seq_show(prog, &meta, v, 0);
	}

	if (iter->cur_sk < iter->end_sk)
		bpf_iter_unix_put_batch(iter);
}
static const struct seq_operations bpf_iter_unix_seq_ops = {
	.start	= bpf_iter_unix_seq_start,
	.next	= bpf_iter_unix_seq_next,
	.stop	= bpf_iter_unix_seq_stop,
	.show	= bpf_iter_unix_seq_show,
};
#endif /* CONFIG_BPF_SYSCALL */
#endif /* CONFIG_PROC_FS */
static const struct net_proto_family unix_family_ops = {
	.family = PF_UNIX,
	.create = unix_create,
	.owner	= THIS_MODULE,
};
static int __net_init unix_net_init(struct net *net)
{
	int i;

	net->unx.sysctl_max_dgram_qlen = 10;
	if (unix_sysctl_register(net))
		goto out;

#ifdef CONFIG_PROC_FS
	if (!proc_create_net("unix", 0, net->proc_net, &unix_seq_ops,
			     sizeof(struct seq_net_private)))
		goto err_sysctl;
#endif

	net->unx.table.locks = kvmalloc_array(UNIX_HASH_SIZE,
					      sizeof(spinlock_t), GFP_KERNEL);
	if (!net->unx.table.locks)
		goto err_proc;

	net->unx.table.buckets = kvmalloc_array(UNIX_HASH_SIZE,
						sizeof(struct hlist_head),
						GFP_KERNEL);
	if (!net->unx.table.buckets)
		goto free_locks;

	for (i = 0; i < UNIX_HASH_SIZE; i++) {
		spin_lock_init(&net->unx.table.locks[i]);
		lock_set_cmp_fn(&net->unx.table.locks[i], unix_table_lock_cmp_fn, NULL);
		INIT_HLIST_HEAD(&net->unx.table.buckets[i]);
	}

	return 0;

free_locks:
	kvfree(net->unx.table.locks);
err_proc:
#ifdef CONFIG_PROC_FS
	remove_proc_entry("unix", net->proc_net);
err_sysctl:
#endif
	unix_sysctl_unregister(net);
out:
	return -ENOMEM;
}
static void __net_exit unix_net_exit(struct net *net)
{
	kvfree(net->unx.table.buckets);
	kvfree(net->unx.table.locks);
	unix_sysctl_unregister(net);
	remove_proc_entry("unix", net->proc_net);
}
static struct pernet_operations unix_net_ops = {
	.init = unix_net_init,
	.exit = unix_net_exit,
};
#if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
DEFINE_BPF_ITER_FUNC(unix, struct bpf_iter_meta *meta,
		     struct unix_sock *unix_sk, uid_t uid)

#define INIT_BATCH_SZ 16
static int bpf_iter_init_unix(void *priv_data, struct bpf_iter_aux_info *aux)
{
	struct bpf_unix_iter_state *iter = priv_data;
	int err;

	err = bpf_iter_init_seq_net(priv_data, aux);
	if (err)
		return err;

	err = bpf_iter_unix_realloc_batch(iter, INIT_BATCH_SZ);
	if (err) {
		bpf_iter_fini_seq_net(priv_data);
		return err;
	}

	return 0;
}
static void bpf_iter_fini_unix(void *priv_data)
{
	struct bpf_unix_iter_state *iter = priv_data;

	bpf_iter_fini_seq_net(priv_data);
	kvfree(iter->batch);
}
static const struct bpf_iter_seq_info unix_seq_info = {
	.seq_ops		= &bpf_iter_unix_seq_ops,
	.init_seq_private	= bpf_iter_init_unix,
	.fini_seq_private	= bpf_iter_fini_unix,
	.seq_priv_size		= sizeof(struct bpf_unix_iter_state),
};
static const struct bpf_func_proto *
bpf_iter_unix_get_func_proto(enum bpf_func_id func_id,
			     const struct bpf_prog *prog)
{
	switch (func_id) {
	case BPF_FUNC_setsockopt:
		return &bpf_sk_setsockopt_proto;
	case BPF_FUNC_getsockopt:
		return &bpf_sk_getsockopt_proto;
	default:
		return NULL;
	}
}
static struct bpf_iter_reg unix_reg_info = {
	.target			= "unix",
	.ctx_arg_info_size	= 1,
	.ctx_arg_info		= {
		{ offsetof(struct bpf_iter__unix, unix_sk),
		  PTR_TO_BTF_ID_OR_NULL },
	},
	.get_func_proto		= bpf_iter_unix_get_func_proto,
	.seq_info		= &unix_seq_info,
};
static void __init bpf_iter_register(void)
{
	unix_reg_info.ctx_arg_info[0].btf_id = btf_sock_ids[BTF_SOCK_TYPE_UNIX];
	if (bpf_iter_reg_target(&unix_reg_info))
		pr_warn("Warning: could not register bpf iterator unix\n");
}
#endif
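
/* Module initialisation: register the dgram and stream protos, the PF_UNIX
 * family, the per-netns state and, when enabled, the BPF iterator.
 */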
static int __init af_unix_init(void)
{
	int i, rc = -1;

	BUILD_BUG_ON(sizeof(struct unix_skb_parms) > sizeof_field(struct sk_buff, cb));

	for (i = 0; i < UNIX_HASH_SIZE / 2; i++) {
		spin_lock_init(&bsd_socket_locks[i]);
		INIT_HLIST_HEAD(&bsd_socket_buckets[i]);
	}

	rc = proto_register(&unix_dgram_proto, 1);
	if (rc != 0) {
		pr_crit("%s: Cannot create unix_sock SLAB cache!\n", __func__);
		goto out;
	}

	rc = proto_register(&unix_stream_proto, 1);
	if (rc != 0) {
		pr_crit("%s: Cannot create unix_sock SLAB cache!\n", __func__);
		proto_unregister(&unix_dgram_proto);
		goto out;
	}

	sock_register(&unix_family_ops);
	register_pernet_subsys(&unix_net_ops);
	unix_bpf_build_proto();

#if defined(CONFIG_BPF_SYSCALL) && defined(CONFIG_PROC_FS)
	bpf_iter_register();
#endif

out:
	return rc;
}
/* Later than subsys_initcall() because we depend on stuff initialised there */
fs_initcall(af_unix_init);