1 // SPDX-License-Identifier: GPL-2.0-or-later
3 * INET An implementation of the TCP/IP protocol suite for the LINUX
4 * operating system. INET is implemented using the BSD Socket
5 * interface as the means of communication with the user level.
7 * Generic socket support routines. Memory allocators, socket lock/release
8 * handler for protocols to use and generic option handler.
11 * Fred N. van Kempen, <waltje@uWalt.NL.Mugnet.ORG>
12 * Florian La Roche, <flla@stud.uni-sb.de>
13 * Alan Cox, <A.Cox@swansea.ac.uk>
16 * Alan Cox : Numerous verify_area() problems
17 * Alan Cox : Connecting on a connecting socket
18 * now returns an error for tcp.
19 * Alan Cox : sock->protocol is set correctly.
20 * and is not sometimes left as 0.
21 * Alan Cox : connect handles icmp errors on a
22 * connect properly. Unfortunately there
23 * is a restart syscall nasty there. I
24 * can't match BSD without hacking the C
25 * library. Ideas urgently sought!
26 * Alan Cox : Disallow bind() to addresses that are
27 * not ours - especially broadcast ones!!
28 * Alan Cox : Socket 1024 _IS_ ok for users. (fencepost)
29 * Alan Cox : sock_wfree/sock_rfree don't destroy sockets,
30 * instead they leave that for the DESTROY timer.
31 * Alan Cox : Clean up error flag in accept
32 * Alan Cox : TCP ack handling is buggy, the DESTROY timer
33 * was buggy. Put a remove_sock() in the handler
34 * for memory when we hit 0. Also altered the timer
35 * code. The ACK stuff can wait and needs major
37 * Alan Cox : Fixed TCP ack bug, removed remove sock
38 * and fixed timer/inet_bh race.
39 * Alan Cox : Added zapped flag for TCP
40 * Alan Cox : Move kfree_skb into skbuff.c and tidied up surplus code
41 * Alan Cox : for new sk_buff allocations wmalloc/rmalloc now call alloc_skb
42 * Alan Cox : kfree_s calls now are kfree_skbmem so we can track skb resources
43 * Alan Cox : Supports socket option broadcast now as does udp. Packet and raw need fixing.
44 * Alan Cox : Added RCVBUF,SNDBUF size setting. It suddenly occurred to me how easy it was so...
45 * Rick Sladkey : Relaxed UDP rules for matching packets.
46 * C.E.Hawkins : IFF_PROMISC/SIOCGHWADDR support
47 * Pauline Middelink : identd support
48 * Alan Cox : Fixed connect() taking signals I think.
49 * Alan Cox : SO_LINGER supported
50 * Alan Cox : Error reporting fixes
51 * Anonymous : inet_create tidied up (sk->reuse setting)
52 * Alan Cox : inet sockets don't set sk->type!
53 * Alan Cox : Split socket option code
54 * Alan Cox : Callbacks
55 * Alan Cox : Nagle flag for Charles & Johannes stuff
56 * Alex : Removed restriction on inet fioctl
57 * Alan Cox : Splitting INET from NET core
58 * Alan Cox : Fixed bogus SO_TYPE handling in getsockopt()
59 * Adam Caldwell : Missing return in SO_DONTROUTE/SO_DEBUG code
60 * Alan Cox : Split IP from generic code
61 * Alan Cox : New kfree_skbmem()
62 * Alan Cox : Make SO_DEBUG superuser only.
63 * Alan Cox : Allow anyone to clear SO_DEBUG
65 * Alan Cox : Added optimistic memory grabbing for AF_UNIX throughput.
66 * Alan Cox : Allocator for a socket is settable.
67 * Alan Cox : SO_ERROR includes soft errors.
68 * Alan Cox : Allow NULL arguments on some SO_ opts
69 * Alan Cox : Generic socket allocation to make hooks
70 * easier (suggested by Craig Metz).
71 * Michael Pall : SO_ERROR returns positive errno again
72 * Steve Whitehouse: Added default destructor to free
73 * protocol private data.
74 * Steve Whitehouse: Added various other default routines
75 * common to several socket families.
76 * Chris Evans : Call suser() check last on F_SETOWN
77 * Jay Schulist : Added SO_ATTACH_FILTER and SO_DETACH_FILTER.
78 * Andi Kleen : Add sock_kmalloc()/sock_kfree_s()
79 * Andi Kleen : Fix write_space callback
80 * Chris Evans : Security fixes - signedness again
81 * Arnaldo C. Melo : cleanups, use skb_queue_purge
86 #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt
88 #include <asm/unaligned.h>
89 #include <linux/capability.h>
90 #include <linux/errno.h>
91 #include <linux/errqueue.h>
92 #include <linux/types.h>
93 #include <linux/socket.h>
95 #include <linux/kernel.h>
96 #include <linux/module.h>
97 #include <linux/proc_fs.h>
98 #include <linux/seq_file.h>
99 #include <linux/sched.h>
100 #include <linux/sched/mm.h>
101 #include <linux/timer.h>
102 #include <linux/string.h>
103 #include <linux/sockios.h>
104 #include <linux/net.h>
105 #include <linux/mm.h>
106 #include <linux/slab.h>
107 #include <linux/interrupt.h>
108 #include <linux/poll.h>
109 #include <linux/tcp.h>
110 #include <linux/udp.h>
111 #include <linux/init.h>
112 #include <linux/highmem.h>
113 #include <linux/user_namespace.h>
114 #include <linux/static_key.h>
115 #include <linux/memcontrol.h>
116 #include <linux/prefetch.h>
117 #include <linux/compat.h>
118 #include <linux/mroute.h>
119 #include <linux/mroute6.h>
120 #include <linux/icmpv6.h>
122 #include <linux/uaccess.h>
124 #include <linux/netdevice.h>
125 #include <net/protocol.h>
126 #include <linux/skbuff.h>
127 #include <net/net_namespace.h>
128 #include <net/request_sock.h>
129 #include <net/sock.h>
130 #include <linux/net_tstamp.h>
131 #include <net/xfrm.h>
132 #include <linux/ipsec.h>
133 #include <net/cls_cgroup.h>
134 #include <net/netprio_cgroup.h>
135 #include <linux/sock_diag.h>
137 #include <linux/filter.h>
138 #include <net/sock_reuseport.h>
139 #include <net/bpf_sk_storage.h>
141 #include <trace/events/sock.h>
144 #include <net/busy_poll.h>
145 #include <net/phonet/phonet.h>
147 #include <linux/ethtool.h>
/*
 * File-local protocol list and the mutex that accompanies it
 * (presumably the mutex serialises access to the list - confirm
 * against the users further down the file).
 */
static DEFINE_MUTEX(proto_list_mutex);
static LIST_HEAD(proto_list);

/* Forward declarations of the default write-space callbacks. */
static void sock_def_write_space_wfree(struct sock *sk);
static void sock_def_write_space(struct sock *sk);
158 * sk_ns_capable - General socket capability test
159 * @sk: Socket to use a capability on or through
160 * @user_ns: The user namespace of the capability to use
161 * @cap: The capability to use
163 * Test to see if the opener of the socket had when the socket was
164 * created and the current process has the capability @cap in the user
165 * namespace @user_ns.
167 bool sk_ns_capable(const struct sock
*sk
,
168 struct user_namespace
*user_ns
, int cap
)
170 return file_ns_capable(sk
->sk_socket
->file
, user_ns
, cap
) &&
171 ns_capable(user_ns
, cap
);
173 EXPORT_SYMBOL(sk_ns_capable
);
176 * sk_capable - Socket global capability test
177 * @sk: Socket to use a capability on or through
178 * @cap: The global capability to use
180 * Test to see if the opener of the socket had when the socket was
181 * created and the current process has the capability @cap in all user
184 bool sk_capable(const struct sock
*sk
, int cap
)
186 return sk_ns_capable(sk
, &init_user_ns
, cap
);
188 EXPORT_SYMBOL(sk_capable
);
191 * sk_net_capable - Network namespace socket capability test
192 * @sk: Socket to use a capability on or through
193 * @cap: The capability to use
195 * Test to see if the opener of the socket had when the socket was created
196 * and the current process has the capability @cap over the network namespace
197 * the socket is a member of.
199 bool sk_net_capable(const struct sock
*sk
, int cap
)
201 return sk_ns_capable(sk
, sock_net(sk
)->user_ns
, cap
);
203 EXPORT_SYMBOL(sk_net_capable
);
206 * Each address family might have different locking rules, so we have
207 * one slock key per address family and separate keys for internal and
210 static struct lock_class_key af_family_keys
[AF_MAX
];
211 static struct lock_class_key af_family_kern_keys
[AF_MAX
];
212 static struct lock_class_key af_family_slock_keys
[AF_MAX
];
213 static struct lock_class_key af_family_kern_slock_keys
[AF_MAX
];
216 * Make lock validator output more readable. (we pre-construct these
217 * strings build-time, so that runtime initialization of socket
221 #define _sock_locks(x) \
222 x "AF_UNSPEC", x "AF_UNIX" , x "AF_INET" , \
223 x "AF_AX25" , x "AF_IPX" , x "AF_APPLETALK", \
224 x "AF_NETROM", x "AF_BRIDGE" , x "AF_ATMPVC" , \
225 x "AF_X25" , x "AF_INET6" , x "AF_ROSE" , \
226 x "AF_DECnet", x "AF_NETBEUI" , x "AF_SECURITY" , \
227 x "AF_KEY" , x "AF_NETLINK" , x "AF_PACKET" , \
228 x "AF_ASH" , x "AF_ECONET" , x "AF_ATMSVC" , \
229 x "AF_RDS" , x "AF_SNA" , x "AF_IRDA" , \
230 x "AF_PPPOX" , x "AF_WANPIPE" , x "AF_LLC" , \
231 x "27" , x "28" , x "AF_CAN" , \
232 x "AF_TIPC" , x "AF_BLUETOOTH", x "IUCV" , \
233 x "AF_RXRPC" , x "AF_ISDN" , x "AF_PHONET" , \
234 x "AF_IEEE802154", x "AF_CAIF" , x "AF_ALG" , \
235 x "AF_NFC" , x "AF_VSOCK" , x "AF_KCM" , \
236 x "AF_QIPCRTR", x "AF_SMC" , x "AF_XDP" , \
240 static const char *const af_family_key_strings
[AF_MAX
+1] = {
241 _sock_locks("sk_lock-")
243 static const char *const af_family_slock_key_strings
[AF_MAX
+1] = {
244 _sock_locks("slock-")
246 static const char *const af_family_clock_key_strings
[AF_MAX
+1] = {
247 _sock_locks("clock-")
250 static const char *const af_family_kern_key_strings
[AF_MAX
+1] = {
251 _sock_locks("k-sk_lock-")
253 static const char *const af_family_kern_slock_key_strings
[AF_MAX
+1] = {
254 _sock_locks("k-slock-")
256 static const char *const af_family_kern_clock_key_strings
[AF_MAX
+1] = {
257 _sock_locks("k-clock-")
259 static const char *const af_family_rlock_key_strings
[AF_MAX
+1] = {
260 _sock_locks("rlock-")
262 static const char *const af_family_wlock_key_strings
[AF_MAX
+1] = {
263 _sock_locks("wlock-")
265 static const char *const af_family_elock_key_strings
[AF_MAX
+1] = {
266 _sock_locks("elock-")
270 * sk_callback_lock and sk queues locking rules are per-address-family,
271 * so split the lock classes by using a per-AF key:
273 static struct lock_class_key af_callback_keys
[AF_MAX
];
274 static struct lock_class_key af_rlock_keys
[AF_MAX
];
275 static struct lock_class_key af_wlock_keys
[AF_MAX
];
276 static struct lock_class_key af_elock_keys
[AF_MAX
];
277 static struct lock_class_key af_kern_callback_keys
[AF_MAX
];
279 /* Run time adjustable parameters. */
280 __u32 sysctl_wmem_max __read_mostly
= SK_WMEM_MAX
;
281 EXPORT_SYMBOL(sysctl_wmem_max
);
282 __u32 sysctl_rmem_max __read_mostly
= SK_RMEM_MAX
;
283 EXPORT_SYMBOL(sysctl_rmem_max
);
284 __u32 sysctl_wmem_default __read_mostly
= SK_WMEM_MAX
;
285 __u32 sysctl_rmem_default __read_mostly
= SK_RMEM_MAX
;
287 int sysctl_tstamp_allow_data __read_mostly
= 1;
289 DEFINE_STATIC_KEY_FALSE(memalloc_socks_key
);
290 EXPORT_SYMBOL_GPL(memalloc_socks_key
);
293 * sk_set_memalloc - sets %SOCK_MEMALLOC
294 * @sk: socket to set it on
296 * Set %SOCK_MEMALLOC on a socket for access to emergency reserves.
297 * It's the responsibility of the admin to adjust min_free_kbytes
298 * to meet the requirements
300 void sk_set_memalloc(struct sock
*sk
)
302 sock_set_flag(sk
, SOCK_MEMALLOC
);
303 sk
->sk_allocation
|= __GFP_MEMALLOC
;
304 static_branch_inc(&memalloc_socks_key
);
306 EXPORT_SYMBOL_GPL(sk_set_memalloc
);
308 void sk_clear_memalloc(struct sock
*sk
)
310 sock_reset_flag(sk
, SOCK_MEMALLOC
);
311 sk
->sk_allocation
&= ~__GFP_MEMALLOC
;
312 static_branch_dec(&memalloc_socks_key
);
315 * SOCK_MEMALLOC is allowed to ignore rmem limits to ensure forward
316 * progress of swapping. SOCK_MEMALLOC may be cleared while
317 * it has rmem allocations due to the last swapfile being deactivated
318 * but there is a risk that the socket is unusable due to exceeding
319 * the rmem limits. Reclaim the reserves and obey rmem limits again.
323 EXPORT_SYMBOL_GPL(sk_clear_memalloc
);
325 int __sk_backlog_rcv(struct sock
*sk
, struct sk_buff
*skb
)
328 unsigned int noreclaim_flag
;
330 /* these should have been dropped before queueing */
331 BUG_ON(!sock_flag(sk
, SOCK_MEMALLOC
));
333 noreclaim_flag
= memalloc_noreclaim_save();
334 ret
= INDIRECT_CALL_INET(sk
->sk_backlog_rcv
,
338 memalloc_noreclaim_restore(noreclaim_flag
);
342 EXPORT_SYMBOL(__sk_backlog_rcv
);
344 void sk_error_report(struct sock
*sk
)
346 sk
->sk_error_report(sk
);
348 switch (sk
->sk_family
) {
352 trace_inet_sk_error_report(sk
);
358 EXPORT_SYMBOL(sk_error_report
);
360 int sock_get_timeout(long timeo
, void *optval
, bool old_timeval
)
362 struct __kernel_sock_timeval tv
;
364 if (timeo
== MAX_SCHEDULE_TIMEOUT
) {
368 tv
.tv_sec
= timeo
/ HZ
;
369 tv
.tv_usec
= ((timeo
% HZ
) * USEC_PER_SEC
) / HZ
;
372 if (old_timeval
&& in_compat_syscall() && !COMPAT_USE_64BIT_TIME
) {
373 struct old_timeval32 tv32
= { tv
.tv_sec
, tv
.tv_usec
};
374 *(struct old_timeval32
*)optval
= tv32
;
379 struct __kernel_old_timeval old_tv
;
380 old_tv
.tv_sec
= tv
.tv_sec
;
381 old_tv
.tv_usec
= tv
.tv_usec
;
382 *(struct __kernel_old_timeval
*)optval
= old_tv
;
383 return sizeof(old_tv
);
386 *(struct __kernel_sock_timeval
*)optval
= tv
;
389 EXPORT_SYMBOL(sock_get_timeout
);
391 int sock_copy_user_timeval(struct __kernel_sock_timeval
*tv
,
392 sockptr_t optval
, int optlen
, bool old_timeval
)
394 if (old_timeval
&& in_compat_syscall() && !COMPAT_USE_64BIT_TIME
) {
395 struct old_timeval32 tv32
;
397 if (optlen
< sizeof(tv32
))
400 if (copy_from_sockptr(&tv32
, optval
, sizeof(tv32
)))
402 tv
->tv_sec
= tv32
.tv_sec
;
403 tv
->tv_usec
= tv32
.tv_usec
;
404 } else if (old_timeval
) {
405 struct __kernel_old_timeval old_tv
;
407 if (optlen
< sizeof(old_tv
))
409 if (copy_from_sockptr(&old_tv
, optval
, sizeof(old_tv
)))
411 tv
->tv_sec
= old_tv
.tv_sec
;
412 tv
->tv_usec
= old_tv
.tv_usec
;
414 if (optlen
< sizeof(*tv
))
416 if (copy_from_sockptr(tv
, optval
, sizeof(*tv
)))
422 EXPORT_SYMBOL(sock_copy_user_timeval
);
424 static int sock_set_timeout(long *timeo_p
, sockptr_t optval
, int optlen
,
427 struct __kernel_sock_timeval tv
;
428 int err
= sock_copy_user_timeval(&tv
, optval
, optlen
, old_timeval
);
434 if (tv
.tv_usec
< 0 || tv
.tv_usec
>= USEC_PER_SEC
)
438 static int warned __read_mostly
;
440 WRITE_ONCE(*timeo_p
, 0);
441 if (warned
< 10 && net_ratelimit()) {
443 pr_info("%s: `%s' (pid %d) tries to set negative timeout\n",
444 __func__
, current
->comm
, task_pid_nr(current
));
448 val
= MAX_SCHEDULE_TIMEOUT
;
449 if ((tv
.tv_sec
|| tv
.tv_usec
) &&
450 (tv
.tv_sec
< (MAX_SCHEDULE_TIMEOUT
/ HZ
- 1)))
451 val
= tv
.tv_sec
* HZ
+ DIV_ROUND_UP((unsigned long)tv
.tv_usec
,
453 WRITE_ONCE(*timeo_p
, val
);
457 static bool sock_needs_netstamp(const struct sock
*sk
)
459 switch (sk
->sk_family
) {
468 static void sock_disable_timestamp(struct sock
*sk
, unsigned long flags
)
470 if (sk
->sk_flags
& flags
) {
471 sk
->sk_flags
&= ~flags
;
472 if (sock_needs_netstamp(sk
) &&
473 !(sk
->sk_flags
& SK_FLAGS_TIMESTAMP
))
474 net_disable_timestamp();
479 int __sock_queue_rcv_skb(struct sock
*sk
, struct sk_buff
*skb
)
482 struct sk_buff_head
*list
= &sk
->sk_receive_queue
;
484 if (atomic_read(&sk
->sk_rmem_alloc
) >= sk
->sk_rcvbuf
) {
485 atomic_inc(&sk
->sk_drops
);
486 trace_sock_rcvqueue_full(sk
, skb
);
490 if (!sk_rmem_schedule(sk
, skb
, skb
->truesize
)) {
491 atomic_inc(&sk
->sk_drops
);
496 skb_set_owner_r(skb
, sk
);
498 /* we escape from rcu protected region, make sure we dont leak
503 spin_lock_irqsave(&list
->lock
, flags
);
504 sock_skb_set_dropcount(sk
, skb
);
505 __skb_queue_tail(list
, skb
);
506 spin_unlock_irqrestore(&list
->lock
, flags
);
508 if (!sock_flag(sk
, SOCK_DEAD
))
509 sk
->sk_data_ready(sk
);
512 EXPORT_SYMBOL(__sock_queue_rcv_skb
);
514 int sock_queue_rcv_skb_reason(struct sock
*sk
, struct sk_buff
*skb
,
515 enum skb_drop_reason
*reason
)
517 enum skb_drop_reason drop_reason
;
520 err
= sk_filter(sk
, skb
);
522 drop_reason
= SKB_DROP_REASON_SOCKET_FILTER
;
525 err
= __sock_queue_rcv_skb(sk
, skb
);
528 drop_reason
= SKB_DROP_REASON_SOCKET_RCVBUFF
;
531 drop_reason
= SKB_DROP_REASON_PROTO_MEM
;
534 drop_reason
= SKB_NOT_DROPPED_YET
;
539 *reason
= drop_reason
;
542 EXPORT_SYMBOL(sock_queue_rcv_skb_reason
);
544 int __sk_receive_skb(struct sock
*sk
, struct sk_buff
*skb
,
545 const int nested
, unsigned int trim_cap
, bool refcounted
)
547 int rc
= NET_RX_SUCCESS
;
549 if (sk_filter_trim_cap(sk
, skb
, trim_cap
))
550 goto discard_and_relse
;
554 if (sk_rcvqueues_full(sk
, sk
->sk_rcvbuf
)) {
555 atomic_inc(&sk
->sk_drops
);
556 goto discard_and_relse
;
559 bh_lock_sock_nested(sk
);
562 if (!sock_owned_by_user(sk
)) {
564 * trylock + unlock semantics:
566 mutex_acquire(&sk
->sk_lock
.dep_map
, 0, 1, _RET_IP_
);
568 rc
= sk_backlog_rcv(sk
, skb
);
570 mutex_release(&sk
->sk_lock
.dep_map
, _RET_IP_
);
571 } else if (sk_add_backlog(sk
, skb
, READ_ONCE(sk
->sk_rcvbuf
))) {
573 atomic_inc(&sk
->sk_drops
);
574 goto discard_and_relse
;
586 EXPORT_SYMBOL(__sk_receive_skb
);
588 INDIRECT_CALLABLE_DECLARE(struct dst_entry
*ip6_dst_check(struct dst_entry
*,
590 INDIRECT_CALLABLE_DECLARE(struct dst_entry
*ipv4_dst_check(struct dst_entry
*,
592 struct dst_entry
*__sk_dst_check(struct sock
*sk
, u32 cookie
)
594 struct dst_entry
*dst
= __sk_dst_get(sk
);
596 if (dst
&& dst
->obsolete
&&
597 INDIRECT_CALL_INET(dst
->ops
->check
, ip6_dst_check
, ipv4_dst_check
,
598 dst
, cookie
) == NULL
) {
599 sk_tx_queue_clear(sk
);
600 WRITE_ONCE(sk
->sk_dst_pending_confirm
, 0);
601 RCU_INIT_POINTER(sk
->sk_dst_cache
, NULL
);
608 EXPORT_SYMBOL(__sk_dst_check
);
610 struct dst_entry
*sk_dst_check(struct sock
*sk
, u32 cookie
)
612 struct dst_entry
*dst
= sk_dst_get(sk
);
614 if (dst
&& dst
->obsolete
&&
615 INDIRECT_CALL_INET(dst
->ops
->check
, ip6_dst_check
, ipv4_dst_check
,
616 dst
, cookie
) == NULL
) {
624 EXPORT_SYMBOL(sk_dst_check
);
626 static int sock_bindtoindex_locked(struct sock
*sk
, int ifindex
)
628 int ret
= -ENOPROTOOPT
;
629 #ifdef CONFIG_NETDEVICES
630 struct net
*net
= sock_net(sk
);
634 if (sk
->sk_bound_dev_if
&& !ns_capable(net
->user_ns
, CAP_NET_RAW
))
641 /* Paired with all READ_ONCE() done locklessly. */
642 WRITE_ONCE(sk
->sk_bound_dev_if
, ifindex
);
644 if (sk
->sk_prot
->rehash
)
645 sk
->sk_prot
->rehash(sk
);
656 int sock_bindtoindex(struct sock
*sk
, int ifindex
, bool lock_sk
)
662 ret
= sock_bindtoindex_locked(sk
, ifindex
);
668 EXPORT_SYMBOL(sock_bindtoindex
);
670 static int sock_setbindtodevice(struct sock
*sk
, sockptr_t optval
, int optlen
)
672 int ret
= -ENOPROTOOPT
;
673 #ifdef CONFIG_NETDEVICES
674 struct net
*net
= sock_net(sk
);
675 char devname
[IFNAMSIZ
];
682 /* Bind this socket to a particular device like "eth0",
683 * as specified in the passed interface name. If the
684 * name is "" or the option length is zero the socket
687 if (optlen
> IFNAMSIZ
- 1)
688 optlen
= IFNAMSIZ
- 1;
689 memset(devname
, 0, sizeof(devname
));
692 if (copy_from_sockptr(devname
, optval
, optlen
))
696 if (devname
[0] != '\0') {
697 struct net_device
*dev
;
700 dev
= dev_get_by_name_rcu(net
, devname
);
702 index
= dev
->ifindex
;
709 sockopt_lock_sock(sk
);
710 ret
= sock_bindtoindex_locked(sk
, index
);
711 sockopt_release_sock(sk
);
718 static int sock_getbindtodevice(struct sock
*sk
, sockptr_t optval
,
719 sockptr_t optlen
, int len
)
721 int ret
= -ENOPROTOOPT
;
722 #ifdef CONFIG_NETDEVICES
723 int bound_dev_if
= READ_ONCE(sk
->sk_bound_dev_if
);
724 struct net
*net
= sock_net(sk
);
725 char devname
[IFNAMSIZ
];
727 if (bound_dev_if
== 0) {
736 ret
= netdev_get_name(net
, devname
, bound_dev_if
);
740 len
= strlen(devname
) + 1;
743 if (copy_to_sockptr(optval
, devname
, len
))
748 if (copy_to_sockptr(optlen
, &len
, sizeof(int)))
759 bool sk_mc_loop(const struct sock
*sk
)
761 if (dev_recursion_level())
765 /* IPV6_ADDRFORM can change sk->sk_family under us. */
766 switch (READ_ONCE(sk
->sk_family
)) {
768 return inet_test_bit(MC_LOOP
, sk
);
769 #if IS_ENABLED(CONFIG_IPV6)
771 return inet6_test_bit(MC6_LOOP
, sk
);
777 EXPORT_SYMBOL(sk_mc_loop
);
779 void sock_set_reuseaddr(struct sock
*sk
)
782 sk
->sk_reuse
= SK_CAN_REUSE
;
785 EXPORT_SYMBOL(sock_set_reuseaddr
);
787 void sock_set_reuseport(struct sock
*sk
)
790 sk
->sk_reuseport
= true;
793 EXPORT_SYMBOL(sock_set_reuseport
);
795 void sock_no_linger(struct sock
*sk
)
798 WRITE_ONCE(sk
->sk_lingertime
, 0);
799 sock_set_flag(sk
, SOCK_LINGER
);
802 EXPORT_SYMBOL(sock_no_linger
);
804 void sock_set_priority(struct sock
*sk
, u32 priority
)
806 WRITE_ONCE(sk
->sk_priority
, priority
);
808 EXPORT_SYMBOL(sock_set_priority
);
810 void sock_set_sndtimeo(struct sock
*sk
, s64 secs
)
813 if (secs
&& secs
< MAX_SCHEDULE_TIMEOUT
/ HZ
- 1)
814 WRITE_ONCE(sk
->sk_sndtimeo
, secs
* HZ
);
816 WRITE_ONCE(sk
->sk_sndtimeo
, MAX_SCHEDULE_TIMEOUT
);
819 EXPORT_SYMBOL(sock_set_sndtimeo
);
821 static void __sock_set_timestamps(struct sock
*sk
, bool val
, bool new, bool ns
)
824 sock_valbool_flag(sk
, SOCK_TSTAMP_NEW
, new);
825 sock_valbool_flag(sk
, SOCK_RCVTSTAMPNS
, ns
);
826 sock_set_flag(sk
, SOCK_RCVTSTAMP
);
827 sock_enable_timestamp(sk
, SOCK_TIMESTAMP
);
829 sock_reset_flag(sk
, SOCK_RCVTSTAMP
);
830 sock_reset_flag(sk
, SOCK_RCVTSTAMPNS
);
834 void sock_enable_timestamps(struct sock
*sk
)
837 __sock_set_timestamps(sk
, true, false, true);
840 EXPORT_SYMBOL(sock_enable_timestamps
);
842 void sock_set_timestamp(struct sock
*sk
, int optname
, bool valbool
)
845 case SO_TIMESTAMP_OLD
:
846 __sock_set_timestamps(sk
, valbool
, false, false);
848 case SO_TIMESTAMP_NEW
:
849 __sock_set_timestamps(sk
, valbool
, true, false);
851 case SO_TIMESTAMPNS_OLD
:
852 __sock_set_timestamps(sk
, valbool
, false, true);
854 case SO_TIMESTAMPNS_NEW
:
855 __sock_set_timestamps(sk
, valbool
, true, true);
860 static int sock_timestamping_bind_phc(struct sock
*sk
, int phc_index
)
862 struct net
*net
= sock_net(sk
);
863 struct net_device
*dev
= NULL
;
868 if (sk
->sk_bound_dev_if
)
869 dev
= dev_get_by_index(net
, sk
->sk_bound_dev_if
);
872 pr_err("%s: sock not bind to device\n", __func__
);
876 num
= ethtool_get_phc_vclocks(dev
, &vclock_index
);
879 for (i
= 0; i
< num
; i
++) {
880 if (*(vclock_index
+ i
) == phc_index
) {
892 WRITE_ONCE(sk
->sk_bind_phc
, phc_index
);
897 int sock_set_timestamping(struct sock
*sk
, int optname
,
898 struct so_timestamping timestamping
)
900 int val
= timestamping
.flags
;
903 if (val
& ~SOF_TIMESTAMPING_MASK
)
906 if (val
& SOF_TIMESTAMPING_OPT_ID_TCP
&&
907 !(val
& SOF_TIMESTAMPING_OPT_ID
))
910 if (val
& SOF_TIMESTAMPING_OPT_ID
&&
911 !(sk
->sk_tsflags
& SOF_TIMESTAMPING_OPT_ID
)) {
913 if ((1 << sk
->sk_state
) &
914 (TCPF_CLOSE
| TCPF_LISTEN
))
916 if (val
& SOF_TIMESTAMPING_OPT_ID_TCP
)
917 atomic_set(&sk
->sk_tskey
, tcp_sk(sk
)->write_seq
);
919 atomic_set(&sk
->sk_tskey
, tcp_sk(sk
)->snd_una
);
921 atomic_set(&sk
->sk_tskey
, 0);
925 if (val
& SOF_TIMESTAMPING_OPT_STATS
&&
926 !(val
& SOF_TIMESTAMPING_OPT_TSONLY
))
929 if (val
& SOF_TIMESTAMPING_BIND_PHC
) {
930 ret
= sock_timestamping_bind_phc(sk
, timestamping
.bind_phc
);
935 WRITE_ONCE(sk
->sk_tsflags
, val
);
936 sock_valbool_flag(sk
, SOCK_TSTAMP_NEW
, optname
== SO_TIMESTAMPING_NEW
);
938 if (val
& SOF_TIMESTAMPING_RX_SOFTWARE
)
939 sock_enable_timestamp(sk
,
940 SOCK_TIMESTAMPING_RX_SOFTWARE
);
942 sock_disable_timestamp(sk
,
943 (1UL << SOCK_TIMESTAMPING_RX_SOFTWARE
));
947 void sock_set_keepalive(struct sock
*sk
)
950 if (sk
->sk_prot
->keepalive
)
951 sk
->sk_prot
->keepalive(sk
, true);
952 sock_valbool_flag(sk
, SOCK_KEEPOPEN
, true);
955 EXPORT_SYMBOL(sock_set_keepalive
);
957 static void __sock_set_rcvbuf(struct sock
*sk
, int val
)
959 /* Ensure val * 2 fits into an int, to prevent max_t() from treating it
960 * as a negative value.
962 val
= min_t(int, val
, INT_MAX
/ 2);
963 sk
->sk_userlocks
|= SOCK_RCVBUF_LOCK
;
965 /* We double it on the way in to account for "struct sk_buff" etc.
966 * overhead. Applications assume that the SO_RCVBUF setting they make
967 * will allow that much actual data to be received on that socket.
969 * Applications are unaware that "struct sk_buff" and other overheads
970 * allocate from the receive buffer during socket buffer allocation.
972 * And after considering the possible alternatives, returning the value
973 * we actually used in getsockopt is the most desirable behavior.
975 WRITE_ONCE(sk
->sk_rcvbuf
, max_t(int, val
* 2, SOCK_MIN_RCVBUF
));
978 void sock_set_rcvbuf(struct sock
*sk
, int val
)
981 __sock_set_rcvbuf(sk
, val
);
984 EXPORT_SYMBOL(sock_set_rcvbuf
);
986 static void __sock_set_mark(struct sock
*sk
, u32 val
)
988 if (val
!= sk
->sk_mark
) {
989 WRITE_ONCE(sk
->sk_mark
, val
);
994 void sock_set_mark(struct sock
*sk
, u32 val
)
997 __sock_set_mark(sk
, val
);
1000 EXPORT_SYMBOL(sock_set_mark
);
1002 static void sock_release_reserved_memory(struct sock
*sk
, int bytes
)
1004 /* Round down bytes to multiple of pages */
1005 bytes
= round_down(bytes
, PAGE_SIZE
);
1007 WARN_ON(bytes
> sk
->sk_reserved_mem
);
1008 WRITE_ONCE(sk
->sk_reserved_mem
, sk
->sk_reserved_mem
- bytes
);
1012 static int sock_reserve_memory(struct sock
*sk
, int bytes
)
1018 if (!mem_cgroup_sockets_enabled
|| !sk
->sk_memcg
|| !sk_has_account(sk
))
1024 pages
= sk_mem_pages(bytes
);
1026 /* pre-charge to memcg */
1027 charged
= mem_cgroup_charge_skmem(sk
->sk_memcg
, pages
,
1028 GFP_KERNEL
| __GFP_RETRY_MAYFAIL
);
1032 /* pre-charge to forward_alloc */
1033 sk_memory_allocated_add(sk
, pages
);
1034 allocated
= sk_memory_allocated(sk
);
1035 /* If the system goes into memory pressure with this
1036 * precharge, give up and return error.
1038 if (allocated
> sk_prot_mem_limits(sk
, 1)) {
1039 sk_memory_allocated_sub(sk
, pages
);
1040 mem_cgroup_uncharge_skmem(sk
->sk_memcg
, pages
);
1043 sk_forward_alloc_add(sk
, pages
<< PAGE_SHIFT
);
1045 WRITE_ONCE(sk
->sk_reserved_mem
,
1046 sk
->sk_reserved_mem
+ (pages
<< PAGE_SHIFT
));
1051 void sockopt_lock_sock(struct sock
*sk
)
1053 /* When current->bpf_ctx is set, the setsockopt is called from
1054 * a bpf prog. bpf has ensured the sk lock has been
1055 * acquired before calling setsockopt().
1057 if (has_current_bpf_ctx())
1062 EXPORT_SYMBOL(sockopt_lock_sock
);
1064 void sockopt_release_sock(struct sock
*sk
)
1066 if (has_current_bpf_ctx())
1071 EXPORT_SYMBOL(sockopt_release_sock
);
1073 bool sockopt_ns_capable(struct user_namespace
*ns
, int cap
)
1075 return has_current_bpf_ctx() || ns_capable(ns
, cap
);
1077 EXPORT_SYMBOL(sockopt_ns_capable
);
1079 bool sockopt_capable(int cap
)
1081 return has_current_bpf_ctx() || capable(cap
);
1083 EXPORT_SYMBOL(sockopt_capable
);
1086 * This is meant for all protocols to use and covers goings on
1087 * at the socket level. Everything here is generic.
1090 int sk_setsockopt(struct sock
*sk
, int level
, int optname
,
1091 sockptr_t optval
, unsigned int optlen
)
1093 struct so_timestamping timestamping
;
1094 struct socket
*sock
= sk
->sk_socket
;
1095 struct sock_txtime sk_txtime
;
1102 * Options without arguments
1105 if (optname
== SO_BINDTODEVICE
)
1106 return sock_setbindtodevice(sk
, optval
, optlen
);
1108 if (optlen
< sizeof(int))
1111 if (copy_from_sockptr(&val
, optval
, sizeof(val
)))
1114 valbool
= val
? 1 : 0;
1116 /* handle options which do not require locking the socket. */
1119 if ((val
>= 0 && val
<= 6) ||
1120 sockopt_ns_capable(sock_net(sk
)->user_ns
, CAP_NET_RAW
) ||
1121 sockopt_ns_capable(sock_net(sk
)->user_ns
, CAP_NET_ADMIN
)) {
1122 sock_set_priority(sk
, val
);
1127 assign_bit(SOCK_PASSSEC
, &sock
->flags
, valbool
);
1130 assign_bit(SOCK_PASSCRED
, &sock
->flags
, valbool
);
1133 assign_bit(SOCK_PASSPIDFD
, &sock
->flags
, valbool
);
1139 return -ENOPROTOOPT
;
1140 #ifdef CONFIG_NET_RX_BUSY_POLL
1144 WRITE_ONCE(sk
->sk_ll_usec
, val
);
1146 case SO_PREFER_BUSY_POLL
:
1147 if (valbool
&& !sockopt_capable(CAP_NET_ADMIN
))
1149 WRITE_ONCE(sk
->sk_prefer_busy_poll
, valbool
);
1151 case SO_BUSY_POLL_BUDGET
:
1152 if (val
> READ_ONCE(sk
->sk_busy_poll_budget
) &&
1153 !sockopt_capable(CAP_NET_ADMIN
))
1155 if (val
< 0 || val
> U16_MAX
)
1157 WRITE_ONCE(sk
->sk_busy_poll_budget
, val
);
1160 case SO_MAX_PACING_RATE
:
1162 unsigned long ulval
= (val
== ~0U) ? ~0UL : (unsigned int)val
;
1163 unsigned long pacing_rate
;
1165 if (sizeof(ulval
) != sizeof(val
) &&
1166 optlen
>= sizeof(ulval
) &&
1167 copy_from_sockptr(&ulval
, optval
, sizeof(ulval
))) {
1171 cmpxchg(&sk
->sk_pacing_status
,
1174 /* Pairs with READ_ONCE() from sk_getsockopt() */
1175 WRITE_ONCE(sk
->sk_max_pacing_rate
, ulval
);
1176 pacing_rate
= READ_ONCE(sk
->sk_pacing_rate
);
1177 if (ulval
< pacing_rate
)
1178 WRITE_ONCE(sk
->sk_pacing_rate
, ulval
);
1182 if (val
< -1 || val
> 1)
1184 if ((u8
)val
== SOCK_TXREHASH_DEFAULT
)
1185 val
= READ_ONCE(sock_net(sk
)->core
.sysctl_txrehash
);
1186 /* Paired with READ_ONCE() in tcp_rtx_synack()
1187 * and sk_getsockopt().
1189 WRITE_ONCE(sk
->sk_txrehash
, (u8
)val
);
1193 sockopt_lock_sock(sk
);
1197 if (val
&& !sockopt_capable(CAP_NET_ADMIN
))
1200 sock_valbool_flag(sk
, SOCK_DBG
, valbool
);
1203 sk
->sk_reuse
= (valbool
? SK_CAN_REUSE
: SK_NO_REUSE
);
1206 sk
->sk_reuseport
= valbool
;
1209 sock_valbool_flag(sk
, SOCK_LOCALROUTE
, valbool
);
1213 sock_valbool_flag(sk
, SOCK_BROADCAST
, valbool
);
1216 /* Don't error on this BSD doesn't and if you think
1217 * about it this is right. Otherwise apps have to
1218 * play 'guess the biggest size' games. RCVBUF/SNDBUF
1219 * are treated in BSD as hints
1221 val
= min_t(u32
, val
, READ_ONCE(sysctl_wmem_max
));
1223 /* Ensure val * 2 fits into an int, to prevent max_t()
1224 * from treating it as a negative value.
1226 val
= min_t(int, val
, INT_MAX
/ 2);
1227 sk
->sk_userlocks
|= SOCK_SNDBUF_LOCK
;
1228 WRITE_ONCE(sk
->sk_sndbuf
,
1229 max_t(int, val
* 2, SOCK_MIN_SNDBUF
));
1230 /* Wake up sending tasks if we upped the value. */
1231 sk
->sk_write_space(sk
);
1234 case SO_SNDBUFFORCE
:
1235 if (!sockopt_capable(CAP_NET_ADMIN
)) {
1240 /* No negative values (to prevent underflow, as val will be
1248 /* Don't error on this BSD doesn't and if you think
1249 * about it this is right. Otherwise apps have to
1250 * play 'guess the biggest size' games. RCVBUF/SNDBUF
1251 * are treated in BSD as hints
1253 __sock_set_rcvbuf(sk
, min_t(u32
, val
, READ_ONCE(sysctl_rmem_max
)));
1256 case SO_RCVBUFFORCE
:
1257 if (!sockopt_capable(CAP_NET_ADMIN
)) {
1262 /* No negative values (to prevent underflow, as val will be
1265 __sock_set_rcvbuf(sk
, max(val
, 0));
1269 if (sk
->sk_prot
->keepalive
)
1270 sk
->sk_prot
->keepalive(sk
, valbool
);
1271 sock_valbool_flag(sk
, SOCK_KEEPOPEN
, valbool
);
1275 sock_valbool_flag(sk
, SOCK_URGINLINE
, valbool
);
1279 sk
->sk_no_check_tx
= valbool
;
1283 if (optlen
< sizeof(ling
)) {
1284 ret
= -EINVAL
; /* 1003.1g */
1287 if (copy_from_sockptr(&ling
, optval
, sizeof(ling
))) {
1291 if (!ling
.l_onoff
) {
1292 sock_reset_flag(sk
, SOCK_LINGER
);
1294 unsigned long t_sec
= ling
.l_linger
;
1296 if (t_sec
>= MAX_SCHEDULE_TIMEOUT
/ HZ
)
1297 WRITE_ONCE(sk
->sk_lingertime
, MAX_SCHEDULE_TIMEOUT
);
1299 WRITE_ONCE(sk
->sk_lingertime
, t_sec
* HZ
);
1300 sock_set_flag(sk
, SOCK_LINGER
);
1307 case SO_TIMESTAMP_OLD
:
1308 case SO_TIMESTAMP_NEW
:
1309 case SO_TIMESTAMPNS_OLD
:
1310 case SO_TIMESTAMPNS_NEW
:
1311 sock_set_timestamp(sk
, optname
, valbool
);
1314 case SO_TIMESTAMPING_NEW
:
1315 case SO_TIMESTAMPING_OLD
:
1316 if (optlen
== sizeof(timestamping
)) {
1317 if (copy_from_sockptr(×tamping
, optval
,
1318 sizeof(timestamping
))) {
1323 memset(×tamping
, 0, sizeof(timestamping
));
1324 timestamping
.flags
= val
;
1326 ret
= sock_set_timestamping(sk
, optname
, timestamping
);
1331 int (*set_rcvlowat
)(struct sock
*sk
, int val
) = NULL
;
1336 set_rcvlowat
= READ_ONCE(sock
->ops
)->set_rcvlowat
;
1338 ret
= set_rcvlowat(sk
, val
);
1340 WRITE_ONCE(sk
->sk_rcvlowat
, val
? : 1);
1343 case SO_RCVTIMEO_OLD
:
1344 case SO_RCVTIMEO_NEW
:
1345 ret
= sock_set_timeout(&sk
->sk_rcvtimeo
, optval
,
1346 optlen
, optname
== SO_RCVTIMEO_OLD
);
1349 case SO_SNDTIMEO_OLD
:
1350 case SO_SNDTIMEO_NEW
:
1351 ret
= sock_set_timeout(&sk
->sk_sndtimeo
, optval
,
1352 optlen
, optname
== SO_SNDTIMEO_OLD
);
1355 case SO_ATTACH_FILTER
: {
1356 struct sock_fprog fprog
;
1358 ret
= copy_bpf_fprog_from_user(&fprog
, optval
, optlen
);
1360 ret
= sk_attach_filter(&fprog
, sk
);
1365 if (optlen
== sizeof(u32
)) {
1369 if (copy_from_sockptr(&ufd
, optval
, sizeof(ufd
)))
1372 ret
= sk_attach_bpf(ufd
, sk
);
1376 case SO_ATTACH_REUSEPORT_CBPF
: {
1377 struct sock_fprog fprog
;
1379 ret
= copy_bpf_fprog_from_user(&fprog
, optval
, optlen
);
1381 ret
= sk_reuseport_attach_filter(&fprog
, sk
);
1384 case SO_ATTACH_REUSEPORT_EBPF
:
1386 if (optlen
== sizeof(u32
)) {
1390 if (copy_from_sockptr(&ufd
, optval
, sizeof(ufd
)))
1393 ret
= sk_reuseport_attach_bpf(ufd
, sk
);
1397 case SO_DETACH_REUSEPORT_BPF
:
1398 ret
= reuseport_detach_prog(sk
);
1401 case SO_DETACH_FILTER
:
1402 ret
= sk_detach_filter(sk
);
1405 case SO_LOCK_FILTER
:
1406 if (sock_flag(sk
, SOCK_FILTER_LOCKED
) && !valbool
)
1409 sock_valbool_flag(sk
, SOCK_FILTER_LOCKED
, valbool
);
1413 if (!sockopt_ns_capable(sock_net(sk
)->user_ns
, CAP_NET_RAW
) &&
1414 !sockopt_ns_capable(sock_net(sk
)->user_ns
, CAP_NET_ADMIN
)) {
1419 __sock_set_mark(sk
, val
);
1422 sock_valbool_flag(sk
, SOCK_RCVMARK
, valbool
);
1426 sock_valbool_flag(sk
, SOCK_RXQ_OVFL
, valbool
);
1429 case SO_WIFI_STATUS
:
1430 sock_valbool_flag(sk
, SOCK_WIFI_STATUS
, valbool
);
1435 int (*set_peek_off
)(struct sock
*sk
, int val
);
1437 set_peek_off
= READ_ONCE(sock
->ops
)->set_peek_off
;
1439 ret
= set_peek_off(sk
, val
);
1446 sock_valbool_flag(sk
, SOCK_NOFCS
, valbool
);
1449 case SO_SELECT_ERR_QUEUE
:
1450 sock_valbool_flag(sk
, SOCK_SELECT_ERR_QUEUE
, valbool
);
1454 case SO_INCOMING_CPU
:
1455 reuseport_update_incoming_cpu(sk
, val
);
1460 dst_negative_advice(sk
);
1464 if (sk
->sk_family
== PF_INET
|| sk
->sk_family
== PF_INET6
) {
1465 if (!(sk_is_tcp(sk
) ||
1466 (sk
->sk_type
== SOCK_DGRAM
&&
1467 sk
->sk_protocol
== IPPROTO_UDP
)))
1469 } else if (sk
->sk_family
!= PF_RDS
) {
1473 if (val
< 0 || val
> 1)
1476 sock_valbool_flag(sk
, SOCK_ZEROCOPY
, valbool
);
1481 if (optlen
!= sizeof(struct sock_txtime
)) {
1484 } else if (copy_from_sockptr(&sk_txtime
, optval
,
1485 sizeof(struct sock_txtime
))) {
1488 } else if (sk_txtime
.flags
& ~SOF_TXTIME_FLAGS_MASK
) {
1492 /* CLOCK_MONOTONIC is only used by sch_fq, and this packet
1493 * scheduler has enough safe guards.
1495 if (sk_txtime
.clockid
!= CLOCK_MONOTONIC
&&
1496 !sockopt_ns_capable(sock_net(sk
)->user_ns
, CAP_NET_ADMIN
)) {
1500 sock_valbool_flag(sk
, SOCK_TXTIME
, true);
1501 sk
->sk_clockid
= sk_txtime
.clockid
;
1502 sk
->sk_txtime_deadline_mode
=
1503 !!(sk_txtime
.flags
& SOF_TXTIME_DEADLINE_MODE
);
1504 sk
->sk_txtime_report_errors
=
1505 !!(sk_txtime
.flags
& SOF_TXTIME_REPORT_ERRORS
);
1508 case SO_BINDTOIFINDEX
:
1509 ret
= sock_bindtoindex_locked(sk
, val
);
1513 if (val
& ~SOCK_BUF_LOCK_MASK
) {
1517 sk
->sk_userlocks
= val
| (sk
->sk_userlocks
&
1518 ~SOCK_BUF_LOCK_MASK
);
1521 case SO_RESERVE_MEM
:
1530 delta
= val
- sk
->sk_reserved_mem
;
1532 sock_release_reserved_memory(sk
, -delta
);
1534 ret
= sock_reserve_memory(sk
, delta
);
1542 sockopt_release_sock(sk
);
1546 int sock_setsockopt(struct socket
*sock
, int level
, int optname
,
1547 sockptr_t optval
, unsigned int optlen
)
1549 return sk_setsockopt(sock
->sk
, level
, optname
,
1552 EXPORT_SYMBOL(sock_setsockopt
);
1554 static const struct cred
*sk_get_peer_cred(struct sock
*sk
)
1556 const struct cred
*cred
;
1558 spin_lock(&sk
->sk_peer_lock
);
1559 cred
= get_cred(sk
->sk_peer_cred
);
1560 spin_unlock(&sk
->sk_peer_lock
);
1565 static void cred_to_ucred(struct pid
*pid
, const struct cred
*cred
,
1566 struct ucred
*ucred
)
1568 ucred
->pid
= pid_vnr(pid
);
1569 ucred
->uid
= ucred
->gid
= -1;
1571 struct user_namespace
*current_ns
= current_user_ns();
1573 ucred
->uid
= from_kuid_munged(current_ns
, cred
->euid
);
1574 ucred
->gid
= from_kgid_munged(current_ns
, cred
->egid
);
1578 static int groups_to_user(sockptr_t dst
, const struct group_info
*src
)
1580 struct user_namespace
*user_ns
= current_user_ns();
1583 for (i
= 0; i
< src
->ngroups
; i
++) {
1584 gid_t gid
= from_kgid_munged(user_ns
, src
->gid
[i
]);
1586 if (copy_to_sockptr_offset(dst
, i
* sizeof(gid
), &gid
, sizeof(gid
)))
1593 int sk_getsockopt(struct sock
*sk
, int level
, int optname
,
1594 sockptr_t optval
, sockptr_t optlen
)
1596 struct socket
*sock
= sk
->sk_socket
;
1601 unsigned long ulval
;
1603 struct old_timeval32 tm32
;
1604 struct __kernel_old_timeval tm
;
1605 struct __kernel_sock_timeval stm
;
1606 struct sock_txtime txtime
;
1607 struct so_timestamping timestamping
;
1610 int lv
= sizeof(int);
1613 if (copy_from_sockptr(&len
, optlen
, sizeof(int)))
1618 memset(&v
, 0, sizeof(v
));
1622 v
.val
= sock_flag(sk
, SOCK_DBG
);
1626 v
.val
= sock_flag(sk
, SOCK_LOCALROUTE
);
1630 v
.val
= sock_flag(sk
, SOCK_BROADCAST
);
1634 v
.val
= READ_ONCE(sk
->sk_sndbuf
);
1638 v
.val
= READ_ONCE(sk
->sk_rcvbuf
);
1642 v
.val
= sk
->sk_reuse
;
1646 v
.val
= sk
->sk_reuseport
;
1650 v
.val
= sock_flag(sk
, SOCK_KEEPOPEN
);
1654 v
.val
= sk
->sk_type
;
1658 v
.val
= sk
->sk_protocol
;
1662 v
.val
= sk
->sk_family
;
1666 v
.val
= -sock_error(sk
);
1668 v
.val
= xchg(&sk
->sk_err_soft
, 0);
1672 v
.val
= sock_flag(sk
, SOCK_URGINLINE
);
1676 v
.val
= sk
->sk_no_check_tx
;
1680 v
.val
= READ_ONCE(sk
->sk_priority
);
1684 lv
= sizeof(v
.ling
);
1685 v
.ling
.l_onoff
= sock_flag(sk
, SOCK_LINGER
);
1686 v
.ling
.l_linger
= READ_ONCE(sk
->sk_lingertime
) / HZ
;
1692 case SO_TIMESTAMP_OLD
:
1693 v
.val
= sock_flag(sk
, SOCK_RCVTSTAMP
) &&
1694 !sock_flag(sk
, SOCK_TSTAMP_NEW
) &&
1695 !sock_flag(sk
, SOCK_RCVTSTAMPNS
);
1698 case SO_TIMESTAMPNS_OLD
:
1699 v
.val
= sock_flag(sk
, SOCK_RCVTSTAMPNS
) && !sock_flag(sk
, SOCK_TSTAMP_NEW
);
1702 case SO_TIMESTAMP_NEW
:
1703 v
.val
= sock_flag(sk
, SOCK_RCVTSTAMP
) && sock_flag(sk
, SOCK_TSTAMP_NEW
);
1706 case SO_TIMESTAMPNS_NEW
:
1707 v
.val
= sock_flag(sk
, SOCK_RCVTSTAMPNS
) && sock_flag(sk
, SOCK_TSTAMP_NEW
);
1710 case SO_TIMESTAMPING_OLD
:
1711 case SO_TIMESTAMPING_NEW
:
1712 lv
= sizeof(v
.timestamping
);
1713 /* For the later-added case SO_TIMESTAMPING_NEW: Be strict about only
1714 * returning the flags when they were set through the same option.
1715 * Don't change the beviour for the old case SO_TIMESTAMPING_OLD.
1717 if (optname
== SO_TIMESTAMPING_OLD
|| sock_flag(sk
, SOCK_TSTAMP_NEW
)) {
1718 v
.timestamping
.flags
= READ_ONCE(sk
->sk_tsflags
);
1719 v
.timestamping
.bind_phc
= READ_ONCE(sk
->sk_bind_phc
);
1723 case SO_RCVTIMEO_OLD
:
1724 case SO_RCVTIMEO_NEW
:
1725 lv
= sock_get_timeout(READ_ONCE(sk
->sk_rcvtimeo
), &v
,
1726 SO_RCVTIMEO_OLD
== optname
);
1729 case SO_SNDTIMEO_OLD
:
1730 case SO_SNDTIMEO_NEW
:
1731 lv
= sock_get_timeout(READ_ONCE(sk
->sk_sndtimeo
), &v
,
1732 SO_SNDTIMEO_OLD
== optname
);
1736 v
.val
= READ_ONCE(sk
->sk_rcvlowat
);
1744 v
.val
= !!test_bit(SOCK_PASSCRED
, &sock
->flags
);
1748 v
.val
= !!test_bit(SOCK_PASSPIDFD
, &sock
->flags
);
1753 struct ucred peercred
;
1754 if (len
> sizeof(peercred
))
1755 len
= sizeof(peercred
);
1757 spin_lock(&sk
->sk_peer_lock
);
1758 cred_to_ucred(sk
->sk_peer_pid
, sk
->sk_peer_cred
, &peercred
);
1759 spin_unlock(&sk
->sk_peer_lock
);
1761 if (copy_to_sockptr(optval
, &peercred
, len
))
1768 struct pid
*peer_pid
;
1769 struct file
*pidfd_file
= NULL
;
1772 if (len
> sizeof(pidfd
))
1773 len
= sizeof(pidfd
);
1775 spin_lock(&sk
->sk_peer_lock
);
1776 peer_pid
= get_pid(sk
->sk_peer_pid
);
1777 spin_unlock(&sk
->sk_peer_lock
);
1782 pidfd
= pidfd_prepare(peer_pid
, 0, &pidfd_file
);
1787 if (copy_to_sockptr(optval
, &pidfd
, len
) ||
1788 copy_to_sockptr(optlen
, &len
, sizeof(int))) {
1789 put_unused_fd(pidfd
);
1795 fd_install(pidfd
, pidfd_file
);
1801 const struct cred
*cred
;
1804 cred
= sk_get_peer_cred(sk
);
1808 n
= cred
->group_info
->ngroups
;
1809 if (len
< n
* sizeof(gid_t
)) {
1810 len
= n
* sizeof(gid_t
);
1812 return copy_to_sockptr(optlen
, &len
, sizeof(int)) ? -EFAULT
: -ERANGE
;
1814 len
= n
* sizeof(gid_t
);
1816 ret
= groups_to_user(optval
, cred
->group_info
);
1825 struct sockaddr_storage address
;
1827 lv
= READ_ONCE(sock
->ops
)->getname(sock
, (struct sockaddr
*)&address
, 2);
1832 if (copy_to_sockptr(optval
, &address
, len
))
1837 /* Dubious BSD thing... Probably nobody even uses it, but
1838 * the UNIX standard wants it for whatever reason... -DaveM
1841 v
.val
= sk
->sk_state
== TCP_LISTEN
;
1845 v
.val
= !!test_bit(SOCK_PASSSEC
, &sock
->flags
);
1849 return security_socket_getpeersec_stream(sock
,
1850 optval
, optlen
, len
);
1853 v
.val
= READ_ONCE(sk
->sk_mark
);
1857 v
.val
= sock_flag(sk
, SOCK_RCVMARK
);
1861 v
.val
= sock_flag(sk
, SOCK_RXQ_OVFL
);
1864 case SO_WIFI_STATUS
:
1865 v
.val
= sock_flag(sk
, SOCK_WIFI_STATUS
);
1869 if (!READ_ONCE(sock
->ops
)->set_peek_off
)
1872 v
.val
= READ_ONCE(sk
->sk_peek_off
);
1875 v
.val
= sock_flag(sk
, SOCK_NOFCS
);
1878 case SO_BINDTODEVICE
:
1879 return sock_getbindtodevice(sk
, optval
, optlen
, len
);
1882 len
= sk_get_filter(sk
, optval
, len
);
1888 case SO_LOCK_FILTER
:
1889 v
.val
= sock_flag(sk
, SOCK_FILTER_LOCKED
);
1892 case SO_BPF_EXTENSIONS
:
1893 v
.val
= bpf_tell_extensions();
1896 case SO_SELECT_ERR_QUEUE
:
1897 v
.val
= sock_flag(sk
, SOCK_SELECT_ERR_QUEUE
);
1900 #ifdef CONFIG_NET_RX_BUSY_POLL
1902 v
.val
= READ_ONCE(sk
->sk_ll_usec
);
1904 case SO_PREFER_BUSY_POLL
:
1905 v
.val
= READ_ONCE(sk
->sk_prefer_busy_poll
);
1909 case SO_MAX_PACING_RATE
:
1910 /* The READ_ONCE() pair with the WRITE_ONCE() in sk_setsockopt() */
1911 if (sizeof(v
.ulval
) != sizeof(v
.val
) && len
>= sizeof(v
.ulval
)) {
1912 lv
= sizeof(v
.ulval
);
1913 v
.ulval
= READ_ONCE(sk
->sk_max_pacing_rate
);
1916 v
.val
= min_t(unsigned long, ~0U,
1917 READ_ONCE(sk
->sk_max_pacing_rate
));
1921 case SO_INCOMING_CPU
:
1922 v
.val
= READ_ONCE(sk
->sk_incoming_cpu
);
1927 u32 meminfo
[SK_MEMINFO_VARS
];
1929 sk_get_meminfo(sk
, meminfo
);
1931 len
= min_t(unsigned int, len
, sizeof(meminfo
));
1932 if (copy_to_sockptr(optval
, &meminfo
, len
))
1938 #ifdef CONFIG_NET_RX_BUSY_POLL
1939 case SO_INCOMING_NAPI_ID
:
1940 v
.val
= READ_ONCE(sk
->sk_napi_id
);
1942 /* aggregate non-NAPI IDs down to 0 */
1943 if (v
.val
< MIN_NAPI_ID
)
1953 v
.val64
= sock_gen_cookie(sk
);
1957 v
.val
= sock_flag(sk
, SOCK_ZEROCOPY
);
1961 lv
= sizeof(v
.txtime
);
1962 v
.txtime
.clockid
= sk
->sk_clockid
;
1963 v
.txtime
.flags
|= sk
->sk_txtime_deadline_mode
?
1964 SOF_TXTIME_DEADLINE_MODE
: 0;
1965 v
.txtime
.flags
|= sk
->sk_txtime_report_errors
?
1966 SOF_TXTIME_REPORT_ERRORS
: 0;
1969 case SO_BINDTOIFINDEX
:
1970 v
.val
= READ_ONCE(sk
->sk_bound_dev_if
);
1973 case SO_NETNS_COOKIE
:
1977 v
.val64
= sock_net(sk
)->net_cookie
;
1981 v
.val
= sk
->sk_userlocks
& SOCK_BUF_LOCK_MASK
;
1984 case SO_RESERVE_MEM
:
1985 v
.val
= READ_ONCE(sk
->sk_reserved_mem
);
1989 /* Paired with WRITE_ONCE() in sk_setsockopt() */
1990 v
.val
= READ_ONCE(sk
->sk_txrehash
);
1994 /* We implement the SO_SNDLOWAT etc to not be settable
1997 return -ENOPROTOOPT
;
2002 if (copy_to_sockptr(optval
, &v
, len
))
2005 if (copy_to_sockptr(optlen
, &len
, sizeof(int)))
2011 * Initialize an sk_lock.
2013 * (We also register the sk_lock with the lock validator.)
2015 static inline void sock_lock_init(struct sock
*sk
)
2017 if (sk
->sk_kern_sock
)
2018 sock_lock_init_class_and_name(
2020 af_family_kern_slock_key_strings
[sk
->sk_family
],
2021 af_family_kern_slock_keys
+ sk
->sk_family
,
2022 af_family_kern_key_strings
[sk
->sk_family
],
2023 af_family_kern_keys
+ sk
->sk_family
);
2025 sock_lock_init_class_and_name(
2027 af_family_slock_key_strings
[sk
->sk_family
],
2028 af_family_slock_keys
+ sk
->sk_family
,
2029 af_family_key_strings
[sk
->sk_family
],
2030 af_family_keys
+ sk
->sk_family
);
2034 * Copy all fields from osk to nsk but nsk->sk_refcnt must not change yet,
2035 * even temporarly, because of RCU lookups. sk_node should also be left as is.
2036 * We must not copy fields between sk_dontcopy_begin and sk_dontcopy_end
2038 static void sock_copy(struct sock
*nsk
, const struct sock
*osk
)
2040 const struct proto
*prot
= READ_ONCE(osk
->sk_prot
);
2041 #ifdef CONFIG_SECURITY_NETWORK
2042 void *sptr
= nsk
->sk_security
;
2045 /* If we move sk_tx_queue_mapping out of the private section,
2046 * we must check if sk_tx_queue_clear() is called after
2047 * sock_copy() in sk_clone_lock().
2049 BUILD_BUG_ON(offsetof(struct sock
, sk_tx_queue_mapping
) <
2050 offsetof(struct sock
, sk_dontcopy_begin
) ||
2051 offsetof(struct sock
, sk_tx_queue_mapping
) >=
2052 offsetof(struct sock
, sk_dontcopy_end
));
2054 memcpy(nsk
, osk
, offsetof(struct sock
, sk_dontcopy_begin
));
2056 memcpy(&nsk
->sk_dontcopy_end
, &osk
->sk_dontcopy_end
,
2057 prot
->obj_size
- offsetof(struct sock
, sk_dontcopy_end
));
2059 #ifdef CONFIG_SECURITY_NETWORK
2060 nsk
->sk_security
= sptr
;
2061 security_sk_clone(osk
, nsk
);
2065 static struct sock
*sk_prot_alloc(struct proto
*prot
, gfp_t priority
,
2069 struct kmem_cache
*slab
;
2073 sk
= kmem_cache_alloc(slab
, priority
& ~__GFP_ZERO
);
2076 if (want_init_on_alloc(priority
))
2077 sk_prot_clear_nulls(sk
, prot
->obj_size
);
2079 sk
= kmalloc(prot
->obj_size
, priority
);
2082 if (security_sk_alloc(sk
, family
, priority
))
2085 if (!try_module_get(prot
->owner
))
2092 security_sk_free(sk
);
2095 kmem_cache_free(slab
, sk
);
2101 static void sk_prot_free(struct proto
*prot
, struct sock
*sk
)
2103 struct kmem_cache
*slab
;
2104 struct module
*owner
;
2106 owner
= prot
->owner
;
2109 cgroup_sk_free(&sk
->sk_cgrp_data
);
2110 mem_cgroup_sk_free(sk
);
2111 security_sk_free(sk
);
2113 kmem_cache_free(slab
, sk
);
2120 * sk_alloc - All socket objects are allocated here
2121 * @net: the applicable net namespace
2122 * @family: protocol family
2123 * @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
2124 * @prot: struct proto associated with this new sock instance
2125 * @kern: is this to be a kernel socket?
2127 struct sock
*sk_alloc(struct net
*net
, int family
, gfp_t priority
,
2128 struct proto
*prot
, int kern
)
2132 sk
= sk_prot_alloc(prot
, priority
| __GFP_ZERO
, family
);
2134 sk
->sk_family
= family
;
2136 * See comment in struct sock definition to understand
2137 * why we need sk_prot_creator -acme
2139 sk
->sk_prot
= sk
->sk_prot_creator
= prot
;
2140 sk
->sk_kern_sock
= kern
;
2142 sk
->sk_net_refcnt
= kern
? 0 : 1;
2143 if (likely(sk
->sk_net_refcnt
)) {
2144 get_net_track(net
, &sk
->ns_tracker
, priority
);
2145 sock_inuse_add(net
, 1);
2147 __netns_tracker_alloc(net
, &sk
->ns_tracker
,
2151 sock_net_set(sk
, net
);
2152 refcount_set(&sk
->sk_wmem_alloc
, 1);
2154 mem_cgroup_sk_alloc(sk
);
2155 cgroup_sk_alloc(&sk
->sk_cgrp_data
);
2156 sock_update_classid(&sk
->sk_cgrp_data
);
2157 sock_update_netprioidx(&sk
->sk_cgrp_data
);
2158 sk_tx_queue_clear(sk
);
2163 EXPORT_SYMBOL(sk_alloc
);
2165 /* Sockets having SOCK_RCU_FREE will call this function after one RCU
2166 * grace period. This is the case for UDP sockets and TCP listeners.
2168 static void __sk_destruct(struct rcu_head
*head
)
2170 struct sock
*sk
= container_of(head
, struct sock
, sk_rcu
);
2171 struct sk_filter
*filter
;
2173 if (sk
->sk_destruct
)
2174 sk
->sk_destruct(sk
);
2176 filter
= rcu_dereference_check(sk
->sk_filter
,
2177 refcount_read(&sk
->sk_wmem_alloc
) == 0);
2179 sk_filter_uncharge(sk
, filter
);
2180 RCU_INIT_POINTER(sk
->sk_filter
, NULL
);
2183 sock_disable_timestamp(sk
, SK_FLAGS_TIMESTAMP
);
2185 #ifdef CONFIG_BPF_SYSCALL
2186 bpf_sk_storage_free(sk
);
2189 if (atomic_read(&sk
->sk_omem_alloc
))
2190 pr_debug("%s: optmem leakage (%d bytes) detected\n",
2191 __func__
, atomic_read(&sk
->sk_omem_alloc
));
2193 if (sk
->sk_frag
.page
) {
2194 put_page(sk
->sk_frag
.page
);
2195 sk
->sk_frag
.page
= NULL
;
2198 /* We do not need to acquire sk->sk_peer_lock, we are the last user. */
2199 put_cred(sk
->sk_peer_cred
);
2200 put_pid(sk
->sk_peer_pid
);
2202 if (likely(sk
->sk_net_refcnt
))
2203 put_net_track(sock_net(sk
), &sk
->ns_tracker
);
2205 __netns_tracker_free(sock_net(sk
), &sk
->ns_tracker
, false);
2207 sk_prot_free(sk
->sk_prot_creator
, sk
);
2210 void sk_destruct(struct sock
*sk
)
2212 bool use_call_rcu
= sock_flag(sk
, SOCK_RCU_FREE
);
2214 if (rcu_access_pointer(sk
->sk_reuseport_cb
)) {
2215 reuseport_detach_sock(sk
);
2216 use_call_rcu
= true;
2220 call_rcu(&sk
->sk_rcu
, __sk_destruct
);
2222 __sk_destruct(&sk
->sk_rcu
);
2225 static void __sk_free(struct sock
*sk
)
2227 if (likely(sk
->sk_net_refcnt
))
2228 sock_inuse_add(sock_net(sk
), -1);
2230 if (unlikely(sk
->sk_net_refcnt
&& sock_diag_has_destroy_listeners(sk
)))
2231 sock_diag_broadcast_destroy(sk
);
2236 void sk_free(struct sock
*sk
)
2239 * We subtract one from sk_wmem_alloc and can know if
2240 * some packets are still in some tx queue.
2241 * If not null, sock_wfree() will call __sk_free(sk) later
2243 if (refcount_dec_and_test(&sk
->sk_wmem_alloc
))
2246 EXPORT_SYMBOL(sk_free
);
2248 static void sk_init_common(struct sock
*sk
)
2250 skb_queue_head_init(&sk
->sk_receive_queue
);
2251 skb_queue_head_init(&sk
->sk_write_queue
);
2252 skb_queue_head_init(&sk
->sk_error_queue
);
2254 rwlock_init(&sk
->sk_callback_lock
);
2255 lockdep_set_class_and_name(&sk
->sk_receive_queue
.lock
,
2256 af_rlock_keys
+ sk
->sk_family
,
2257 af_family_rlock_key_strings
[sk
->sk_family
]);
2258 lockdep_set_class_and_name(&sk
->sk_write_queue
.lock
,
2259 af_wlock_keys
+ sk
->sk_family
,
2260 af_family_wlock_key_strings
[sk
->sk_family
]);
2261 lockdep_set_class_and_name(&sk
->sk_error_queue
.lock
,
2262 af_elock_keys
+ sk
->sk_family
,
2263 af_family_elock_key_strings
[sk
->sk_family
]);
2264 lockdep_set_class_and_name(&sk
->sk_callback_lock
,
2265 af_callback_keys
+ sk
->sk_family
,
2266 af_family_clock_key_strings
[sk
->sk_family
]);
2270 * sk_clone_lock - clone a socket, and lock its clone
2271 * @sk: the socket to clone
2272 * @priority: for allocation (%GFP_KERNEL, %GFP_ATOMIC, etc)
2274 * Caller must unlock socket even in error path (bh_unlock_sock(newsk))
2276 struct sock
*sk_clone_lock(const struct sock
*sk
, const gfp_t priority
)
2278 struct proto
*prot
= READ_ONCE(sk
->sk_prot
);
2279 struct sk_filter
*filter
;
2280 bool is_charged
= true;
2283 newsk
= sk_prot_alloc(prot
, priority
, sk
->sk_family
);
2287 sock_copy(newsk
, sk
);
2289 newsk
->sk_prot_creator
= prot
;
2292 if (likely(newsk
->sk_net_refcnt
)) {
2293 get_net_track(sock_net(newsk
), &newsk
->ns_tracker
, priority
);
2294 sock_inuse_add(sock_net(newsk
), 1);
2296 /* Kernel sockets are not elevating the struct net refcount.
2297 * Instead, use a tracker to more easily detect if a layer
2298 * is not properly dismantling its kernel sockets at netns
2301 __netns_tracker_alloc(sock_net(newsk
), &newsk
->ns_tracker
,
2304 sk_node_init(&newsk
->sk_node
);
2305 sock_lock_init(newsk
);
2306 bh_lock_sock(newsk
);
2307 newsk
->sk_backlog
.head
= newsk
->sk_backlog
.tail
= NULL
;
2308 newsk
->sk_backlog
.len
= 0;
2310 atomic_set(&newsk
->sk_rmem_alloc
, 0);
2312 /* sk_wmem_alloc set to one (see sk_free() and sock_wfree()) */
2313 refcount_set(&newsk
->sk_wmem_alloc
, 1);
2315 atomic_set(&newsk
->sk_omem_alloc
, 0);
2316 sk_init_common(newsk
);
2318 newsk
->sk_dst_cache
= NULL
;
2319 newsk
->sk_dst_pending_confirm
= 0;
2320 newsk
->sk_wmem_queued
= 0;
2321 newsk
->sk_forward_alloc
= 0;
2322 newsk
->sk_reserved_mem
= 0;
2323 atomic_set(&newsk
->sk_drops
, 0);
2324 newsk
->sk_send_head
= NULL
;
2325 newsk
->sk_userlocks
= sk
->sk_userlocks
& ~SOCK_BINDPORT_LOCK
;
2326 atomic_set(&newsk
->sk_zckey
, 0);
2328 sock_reset_flag(newsk
, SOCK_DONE
);
2330 /* sk->sk_memcg will be populated at accept() time */
2331 newsk
->sk_memcg
= NULL
;
2333 cgroup_sk_clone(&newsk
->sk_cgrp_data
);
2336 filter
= rcu_dereference(sk
->sk_filter
);
2338 /* though it's an empty new sock, the charging may fail
2339 * if sysctl_optmem_max was changed between creation of
2340 * original socket and cloning
2342 is_charged
= sk_filter_charge(newsk
, filter
);
2343 RCU_INIT_POINTER(newsk
->sk_filter
, filter
);
2346 if (unlikely(!is_charged
|| xfrm_sk_clone_policy(newsk
, sk
))) {
2347 /* We need to make sure that we don't uncharge the new
2348 * socket if we couldn't charge it in the first place
2349 * as otherwise we uncharge the parent's filter.
2352 RCU_INIT_POINTER(newsk
->sk_filter
, NULL
);
2353 sk_free_unlock_clone(newsk
);
2357 RCU_INIT_POINTER(newsk
->sk_reuseport_cb
, NULL
);
2359 if (bpf_sk_storage_clone(sk
, newsk
)) {
2360 sk_free_unlock_clone(newsk
);
2365 /* Clear sk_user_data if parent had the pointer tagged
2366 * as not suitable for copying when cloning.
2368 if (sk_user_data_is_nocopy(newsk
))
2369 newsk
->sk_user_data
= NULL
;
2372 newsk
->sk_err_soft
= 0;
2373 newsk
->sk_priority
= 0;
2374 newsk
->sk_incoming_cpu
= raw_smp_processor_id();
2376 /* Before updating sk_refcnt, we must commit prior changes to memory
2377 * (Documentation/RCU/rculist_nulls.rst for details)
2380 refcount_set(&newsk
->sk_refcnt
, 2);
2382 sk_set_socket(newsk
, NULL
);
2383 sk_tx_queue_clear(newsk
);
2384 RCU_INIT_POINTER(newsk
->sk_wq
, NULL
);
2386 if (newsk
->sk_prot
->sockets_allocated
)
2387 sk_sockets_allocated_inc(newsk
);
2389 if (sock_needs_netstamp(sk
) && newsk
->sk_flags
& SK_FLAGS_TIMESTAMP
)
2390 net_enable_timestamp();
2394 EXPORT_SYMBOL_GPL(sk_clone_lock
);
2396 void sk_free_unlock_clone(struct sock
*sk
)
2398 /* It is still raw copy of parent, so invalidate
2399 * destructor and make plain sk_free() */
2400 sk
->sk_destruct
= NULL
;
2404 EXPORT_SYMBOL_GPL(sk_free_unlock_clone
);
2406 static u32
sk_dst_gso_max_size(struct sock
*sk
, struct dst_entry
*dst
)
2408 bool is_ipv6
= false;
2411 #if IS_ENABLED(CONFIG_IPV6)
2412 is_ipv6
= (sk
->sk_family
== AF_INET6
&&
2413 !ipv6_addr_v4mapped(&sk
->sk_v6_rcv_saddr
));
2415 /* pairs with the WRITE_ONCE() in netif_set_gso(_ipv4)_max_size() */
2416 max_size
= is_ipv6
? READ_ONCE(dst
->dev
->gso_max_size
) :
2417 READ_ONCE(dst
->dev
->gso_ipv4_max_size
);
2418 if (max_size
> GSO_LEGACY_MAX_SIZE
&& !sk_is_tcp(sk
))
2419 max_size
= GSO_LEGACY_MAX_SIZE
;
2421 return max_size
- (MAX_TCP_HEADER
+ 1);
2424 void sk_setup_caps(struct sock
*sk
, struct dst_entry
*dst
)
2428 sk
->sk_route_caps
= dst
->dev
->features
;
2430 sk
->sk_route_caps
|= NETIF_F_GSO
;
2431 if (sk
->sk_route_caps
& NETIF_F_GSO
)
2432 sk
->sk_route_caps
|= NETIF_F_GSO_SOFTWARE
;
2433 if (unlikely(sk
->sk_gso_disabled
))
2434 sk
->sk_route_caps
&= ~NETIF_F_GSO_MASK
;
2435 if (sk_can_gso(sk
)) {
2436 if (dst
->header_len
&& !xfrm_dst_offload_ok(dst
)) {
2437 sk
->sk_route_caps
&= ~NETIF_F_GSO_MASK
;
2439 sk
->sk_route_caps
|= NETIF_F_SG
| NETIF_F_HW_CSUM
;
2440 sk
->sk_gso_max_size
= sk_dst_gso_max_size(sk
, dst
);
2441 /* pairs with the WRITE_ONCE() in netif_set_gso_max_segs() */
2442 max_segs
= max_t(u32
, READ_ONCE(dst
->dev
->gso_max_segs
), 1);
2445 sk
->sk_gso_max_segs
= max_segs
;
2446 sk_dst_set(sk
, dst
);
2448 EXPORT_SYMBOL_GPL(sk_setup_caps
);
2451 * Simple resource managers for sockets.
2456 * Write buffer destructor automatically called from kfree_skb.
2458 void sock_wfree(struct sk_buff
*skb
)
2460 struct sock
*sk
= skb
->sk
;
2461 unsigned int len
= skb
->truesize
;
2464 if (!sock_flag(sk
, SOCK_USE_WRITE_QUEUE
)) {
2465 if (sock_flag(sk
, SOCK_RCU_FREE
) &&
2466 sk
->sk_write_space
== sock_def_write_space
) {
2468 free
= refcount_sub_and_test(len
, &sk
->sk_wmem_alloc
);
2469 sock_def_write_space_wfree(sk
);
2477 * Keep a reference on sk_wmem_alloc, this will be released
2478 * after sk_write_space() call
2480 WARN_ON(refcount_sub_and_test(len
- 1, &sk
->sk_wmem_alloc
));
2481 sk
->sk_write_space(sk
);
2485 * if sk_wmem_alloc reaches 0, we must finish what sk_free()
2486 * could not do because of in-flight packets
2488 if (refcount_sub_and_test(len
, &sk
->sk_wmem_alloc
))
2491 EXPORT_SYMBOL(sock_wfree
);
2493 /* This variant of sock_wfree() is used by TCP,
2494 * since it sets SOCK_USE_WRITE_QUEUE.
2496 void __sock_wfree(struct sk_buff
*skb
)
2498 struct sock
*sk
= skb
->sk
;
2500 if (refcount_sub_and_test(skb
->truesize
, &sk
->sk_wmem_alloc
))
2504 void skb_set_owner_w(struct sk_buff
*skb
, struct sock
*sk
)
2509 if (unlikely(!sk_fullsock(sk
))) {
2510 skb
->destructor
= sock_edemux
;
2515 skb
->destructor
= sock_wfree
;
2516 skb_set_hash_from_sk(skb
, sk
);
2518 * We used to take a refcount on sk, but following operation
2519 * is enough to guarantee sk_free() wont free this sock until
2520 * all in-flight packets are completed
2522 refcount_add(skb
->truesize
, &sk
->sk_wmem_alloc
);
2524 EXPORT_SYMBOL(skb_set_owner_w
);
2526 static bool can_skb_orphan_partial(const struct sk_buff
*skb
)
2528 #ifdef CONFIG_TLS_DEVICE
2529 /* Drivers depend on in-order delivery for crypto offload,
2530 * partial orphan breaks out-of-order-OK logic.
2535 return (skb
->destructor
== sock_wfree
||
2536 (IS_ENABLED(CONFIG_INET
) && skb
->destructor
== tcp_wfree
));
2539 /* This helper is used by netem, as it can hold packets in its
2540 * delay queue. We want to allow the owner socket to send more
2541 * packets, as if they were already TX completed by a typical driver.
2542 * But we also want to keep skb->sk set because some packet schedulers
2543 * rely on it (sch_fq for example).
2545 void skb_orphan_partial(struct sk_buff
*skb
)
2547 if (skb_is_tcp_pure_ack(skb
))
2550 if (can_skb_orphan_partial(skb
) && skb_set_owner_sk_safe(skb
, skb
->sk
))
2555 EXPORT_SYMBOL(skb_orphan_partial
);
2558 * Read buffer destructor automatically called from kfree_skb.
2560 void sock_rfree(struct sk_buff
*skb
)
2562 struct sock
*sk
= skb
->sk
;
2563 unsigned int len
= skb
->truesize
;
2565 atomic_sub(len
, &sk
->sk_rmem_alloc
);
2566 sk_mem_uncharge(sk
, len
);
2568 EXPORT_SYMBOL(sock_rfree
);
2571 * Buffer destructor for skbs that are not used directly in read or write
2572 * path, e.g. for error handler skbs. Automatically called from kfree_skb.
2574 void sock_efree(struct sk_buff
*skb
)
2578 EXPORT_SYMBOL(sock_efree
);
2580 /* Buffer destructor for prefetch/receive path where reference count may
2581 * not be held, e.g. for listen sockets.
2584 void sock_pfree(struct sk_buff
*skb
)
2586 struct sock
*sk
= skb
->sk
;
2588 if (!sk_is_refcounted(sk
))
2591 if (sk
->sk_state
== TCP_NEW_SYN_RECV
&& inet_reqsk(sk
)->syncookie
) {
2592 inet_reqsk(sk
)->rsk_listener
= NULL
;
2593 reqsk_free(inet_reqsk(sk
));
2599 EXPORT_SYMBOL(sock_pfree
);
2600 #endif /* CONFIG_INET */
2602 kuid_t
sock_i_uid(struct sock
*sk
)
2606 read_lock_bh(&sk
->sk_callback_lock
);
2607 uid
= sk
->sk_socket
? SOCK_INODE(sk
->sk_socket
)->i_uid
: GLOBAL_ROOT_UID
;
2608 read_unlock_bh(&sk
->sk_callback_lock
);
2611 EXPORT_SYMBOL(sock_i_uid
);
2613 unsigned long __sock_i_ino(struct sock
*sk
)
2617 read_lock(&sk
->sk_callback_lock
);
2618 ino
= sk
->sk_socket
? SOCK_INODE(sk
->sk_socket
)->i_ino
: 0;
2619 read_unlock(&sk
->sk_callback_lock
);
2622 EXPORT_SYMBOL(__sock_i_ino
);
2624 unsigned long sock_i_ino(struct sock
*sk
)
2629 ino
= __sock_i_ino(sk
);
2633 EXPORT_SYMBOL(sock_i_ino
);
2636 * Allocate a skb from the socket's send buffer.
2638 struct sk_buff
*sock_wmalloc(struct sock
*sk
, unsigned long size
, int force
,
2642 refcount_read(&sk
->sk_wmem_alloc
) < READ_ONCE(sk
->sk_sndbuf
)) {
2643 struct sk_buff
*skb
= alloc_skb(size
, priority
);
2646 skb_set_owner_w(skb
, sk
);
2652 EXPORT_SYMBOL(sock_wmalloc
);
2654 static void sock_ofree(struct sk_buff
*skb
)
2656 struct sock
*sk
= skb
->sk
;
2658 atomic_sub(skb
->truesize
, &sk
->sk_omem_alloc
);
2661 struct sk_buff
*sock_omalloc(struct sock
*sk
, unsigned long size
,
2664 struct sk_buff
*skb
;
2666 /* small safe race: SKB_TRUESIZE may differ from final skb->truesize */
2667 if (atomic_read(&sk
->sk_omem_alloc
) + SKB_TRUESIZE(size
) >
2668 READ_ONCE(sock_net(sk
)->core
.sysctl_optmem_max
))
2671 skb
= alloc_skb(size
, priority
);
2675 atomic_add(skb
->truesize
, &sk
->sk_omem_alloc
);
2677 skb
->destructor
= sock_ofree
;
2682 * Allocate a memory block from the socket's option memory buffer.
2684 void *sock_kmalloc(struct sock
*sk
, int size
, gfp_t priority
)
2686 int optmem_max
= READ_ONCE(sock_net(sk
)->core
.sysctl_optmem_max
);
2688 if ((unsigned int)size
<= optmem_max
&&
2689 atomic_read(&sk
->sk_omem_alloc
) + size
< optmem_max
) {
2691 /* First do the add, to avoid the race if kmalloc
2694 atomic_add(size
, &sk
->sk_omem_alloc
);
2695 mem
= kmalloc(size
, priority
);
2698 atomic_sub(size
, &sk
->sk_omem_alloc
);
2702 EXPORT_SYMBOL(sock_kmalloc
);
2704 /* Free an option memory block. Note, we actually want the inline
2705 * here as this allows gcc to detect the nullify and fold away the
2706 * condition entirely.
2708 static inline void __sock_kfree_s(struct sock
*sk
, void *mem
, int size
,
2711 if (WARN_ON_ONCE(!mem
))
2714 kfree_sensitive(mem
);
2717 atomic_sub(size
, &sk
->sk_omem_alloc
);
2720 void sock_kfree_s(struct sock
*sk
, void *mem
, int size
)
2722 __sock_kfree_s(sk
, mem
, size
, false);
2724 EXPORT_SYMBOL(sock_kfree_s
);
2726 void sock_kzfree_s(struct sock
*sk
, void *mem
, int size
)
2728 __sock_kfree_s(sk
, mem
, size
, true);
2730 EXPORT_SYMBOL(sock_kzfree_s
);
2732 /* It is almost wait_for_tcp_memory minus release_sock/lock_sock.
2733 I think, these locks should be removed for datagram sockets.
2735 static long sock_wait_for_wmem(struct sock
*sk
, long timeo
)
2739 sk_clear_bit(SOCKWQ_ASYNC_NOSPACE
, sk
);
2743 if (signal_pending(current
))
2745 set_bit(SOCK_NOSPACE
, &sk
->sk_socket
->flags
);
2746 prepare_to_wait(sk_sleep(sk
), &wait
, TASK_INTERRUPTIBLE
);
2747 if (refcount_read(&sk
->sk_wmem_alloc
) < READ_ONCE(sk
->sk_sndbuf
))
2749 if (READ_ONCE(sk
->sk_shutdown
) & SEND_SHUTDOWN
)
2751 if (READ_ONCE(sk
->sk_err
))
2753 timeo
= schedule_timeout(timeo
);
2755 finish_wait(sk_sleep(sk
), &wait
);
2761 * Generic send/receive buffer handlers
2764 struct sk_buff
*sock_alloc_send_pskb(struct sock
*sk
, unsigned long header_len
,
2765 unsigned long data_len
, int noblock
,
2766 int *errcode
, int max_page_order
)
2768 struct sk_buff
*skb
;
2772 timeo
= sock_sndtimeo(sk
, noblock
);
2774 err
= sock_error(sk
);
2779 if (READ_ONCE(sk
->sk_shutdown
) & SEND_SHUTDOWN
)
2782 if (sk_wmem_alloc_get(sk
) < READ_ONCE(sk
->sk_sndbuf
))
2785 sk_set_bit(SOCKWQ_ASYNC_NOSPACE
, sk
);
2786 set_bit(SOCK_NOSPACE
, &sk
->sk_socket
->flags
);
2790 if (signal_pending(current
))
2792 timeo
= sock_wait_for_wmem(sk
, timeo
);
2794 skb
= alloc_skb_with_frags(header_len
, data_len
, max_page_order
,
2795 errcode
, sk
->sk_allocation
);
2797 skb_set_owner_w(skb
, sk
);
2801 err
= sock_intr_errno(timeo
);
2806 EXPORT_SYMBOL(sock_alloc_send_pskb
);
2808 int __sock_cmsg_send(struct sock
*sk
, struct cmsghdr
*cmsg
,
2809 struct sockcm_cookie
*sockc
)
2813 switch (cmsg
->cmsg_type
) {
2815 if (!ns_capable(sock_net(sk
)->user_ns
, CAP_NET_RAW
) &&
2816 !ns_capable(sock_net(sk
)->user_ns
, CAP_NET_ADMIN
))
2818 if (cmsg
->cmsg_len
!= CMSG_LEN(sizeof(u32
)))
2820 sockc
->mark
= *(u32
*)CMSG_DATA(cmsg
);
2822 case SO_TIMESTAMPING_OLD
:
2823 case SO_TIMESTAMPING_NEW
:
2824 if (cmsg
->cmsg_len
!= CMSG_LEN(sizeof(u32
)))
2827 tsflags
= *(u32
*)CMSG_DATA(cmsg
);
2828 if (tsflags
& ~SOF_TIMESTAMPING_TX_RECORD_MASK
)
2831 sockc
->tsflags
&= ~SOF_TIMESTAMPING_TX_RECORD_MASK
;
2832 sockc
->tsflags
|= tsflags
;
2835 if (!sock_flag(sk
, SOCK_TXTIME
))
2837 if (cmsg
->cmsg_len
!= CMSG_LEN(sizeof(u64
)))
2839 sockc
->transmit_time
= get_unaligned((u64
*)CMSG_DATA(cmsg
));
2841 /* SCM_RIGHTS and SCM_CREDENTIALS are semantically in SOL_UNIX. */
2843 case SCM_CREDENTIALS
:
2850 EXPORT_SYMBOL(__sock_cmsg_send
);
2852 int sock_cmsg_send(struct sock
*sk
, struct msghdr
*msg
,
2853 struct sockcm_cookie
*sockc
)
2855 struct cmsghdr
*cmsg
;
2858 for_each_cmsghdr(cmsg
, msg
) {
2859 if (!CMSG_OK(msg
, cmsg
))
2861 if (cmsg
->cmsg_level
!= SOL_SOCKET
)
2863 ret
= __sock_cmsg_send(sk
, cmsg
, sockc
);
2869 EXPORT_SYMBOL(sock_cmsg_send
);
2871 static void sk_enter_memory_pressure(struct sock
*sk
)
2873 if (!sk
->sk_prot
->enter_memory_pressure
)
2876 sk
->sk_prot
->enter_memory_pressure(sk
);
2879 static void sk_leave_memory_pressure(struct sock
*sk
)
2881 if (sk
->sk_prot
->leave_memory_pressure
) {
2882 INDIRECT_CALL_INET_1(sk
->sk_prot
->leave_memory_pressure
,
2883 tcp_leave_memory_pressure
, sk
);
2885 unsigned long *memory_pressure
= sk
->sk_prot
->memory_pressure
;
2887 if (memory_pressure
&& READ_ONCE(*memory_pressure
))
2888 WRITE_ONCE(*memory_pressure
, 0);
/*
 * Static key, false by default. skb_page_frag_refill() only attempts its
 * SKB_FRAG_PAGE_ORDER allocation while this key is not enabled.
 */
DEFINE_STATIC_KEY_FALSE(net_high_order_alloc_disable_key);
2895 * skb_page_frag_refill - check that a page_frag contains enough room
2896 * @sz: minimum size of the fragment we want to get
2897 * @pfrag: pointer to page_frag
2898 * @gfp: priority for memory allocation
2900 * Note: While this allocator tries to use high order pages, there is
2901 * no guarantee that allocations succeed. Therefore, @sz MUST be
2902 * less or equal than PAGE_SIZE.
2904 bool skb_page_frag_refill(unsigned int sz
, struct page_frag
*pfrag
, gfp_t gfp
)
2907 if (page_ref_count(pfrag
->page
) == 1) {
2911 if (pfrag
->offset
+ sz
<= pfrag
->size
)
2913 put_page(pfrag
->page
);
2917 if (SKB_FRAG_PAGE_ORDER
&&
2918 !static_branch_unlikely(&net_high_order_alloc_disable_key
)) {
2919 /* Avoid direct reclaim but allow kswapd to wake */
2920 pfrag
->page
= alloc_pages((gfp
& ~__GFP_DIRECT_RECLAIM
) |
2921 __GFP_COMP
| __GFP_NOWARN
|
2923 SKB_FRAG_PAGE_ORDER
);
2924 if (likely(pfrag
->page
)) {
2925 pfrag
->size
= PAGE_SIZE
<< SKB_FRAG_PAGE_ORDER
;
2929 pfrag
->page
= alloc_page(gfp
);
2930 if (likely(pfrag
->page
)) {
2931 pfrag
->size
= PAGE_SIZE
;
2936 EXPORT_SYMBOL(skb_page_frag_refill
);
2938 bool sk_page_frag_refill(struct sock
*sk
, struct page_frag
*pfrag
)
2940 if (likely(skb_page_frag_refill(32U, pfrag
, sk
->sk_allocation
)))
2943 sk_enter_memory_pressure(sk
);
2944 sk_stream_moderate_sndbuf(sk
);
2947 EXPORT_SYMBOL(sk_page_frag_refill
);
2949 void __lock_sock(struct sock
*sk
)
2950 __releases(&sk
->sk_lock
.slock
)
2951 __acquires(&sk
->sk_lock
.slock
)
2956 prepare_to_wait_exclusive(&sk
->sk_lock
.wq
, &wait
,
2957 TASK_UNINTERRUPTIBLE
);
2958 spin_unlock_bh(&sk
->sk_lock
.slock
);
2960 spin_lock_bh(&sk
->sk_lock
.slock
);
2961 if (!sock_owned_by_user(sk
))
2964 finish_wait(&sk
->sk_lock
.wq
, &wait
);
2967 void __release_sock(struct sock
*sk
)
2968 __releases(&sk
->sk_lock
.slock
)
2969 __acquires(&sk
->sk_lock
.slock
)
2971 struct sk_buff
*skb
, *next
;
2973 while ((skb
= sk
->sk_backlog
.head
) != NULL
) {
2974 sk
->sk_backlog
.head
= sk
->sk_backlog
.tail
= NULL
;
2976 spin_unlock_bh(&sk
->sk_lock
.slock
);
2981 DEBUG_NET_WARN_ON_ONCE(skb_dst_is_noref(skb
));
2982 skb_mark_not_on_list(skb
);
2983 sk_backlog_rcv(sk
, skb
);
2988 } while (skb
!= NULL
);
2990 spin_lock_bh(&sk
->sk_lock
.slock
);
2994 * Doing the zeroing here guarantee we can not loop forever
2995 * while a wild producer attempts to flood us.
2997 sk
->sk_backlog
.len
= 0;
3000 void __sk_flush_backlog(struct sock
*sk
)
3002 spin_lock_bh(&sk
->sk_lock
.slock
);
3005 if (sk
->sk_prot
->release_cb
)
3006 INDIRECT_CALL_INET_1(sk
->sk_prot
->release_cb
,
3007 tcp_release_cb
, sk
);
3009 spin_unlock_bh(&sk
->sk_lock
.slock
);
3011 EXPORT_SYMBOL_GPL(__sk_flush_backlog
);
3014 * sk_wait_data - wait for data to arrive at sk_receive_queue
3015 * @sk: sock to wait on
3016 * @timeo: for how long
3017 * @skb: last skb seen on sk_receive_queue
3019 * Now socket state including sk->sk_err is changed only under lock,
3020 * hence we may omit checks after joining wait queue.
3021 * We check receive queue before schedule() only as optimization;
3022 * it is very likely that release_sock() added new data.
3024 int sk_wait_data(struct sock
*sk
, long *timeo
, const struct sk_buff
*skb
)
3026 DEFINE_WAIT_FUNC(wait
, woken_wake_function
);
3029 add_wait_queue(sk_sleep(sk
), &wait
);
3030 sk_set_bit(SOCKWQ_ASYNC_WAITDATA
, sk
);
3031 rc
= sk_wait_event(sk
, timeo
, skb_peek_tail(&sk
->sk_receive_queue
) != skb
, &wait
);
3032 sk_clear_bit(SOCKWQ_ASYNC_WAITDATA
, sk
);
3033 remove_wait_queue(sk_sleep(sk
), &wait
);
3036 EXPORT_SYMBOL(sk_wait_data
);
3039 * __sk_mem_raise_allocated - increase memory_allocated
3041 * @size: memory size to allocate
3042 * @amt: pages to allocate
3043 * @kind: allocation type
3045 * Similar to __sk_mem_schedule(), but does not update sk_forward_alloc.
3047 * Unlike the globally shared limits among the sockets under same protocol,
3048 * consuming the budget of a memcg won't have direct effect on other ones.
3049 * So be optimistic about memcg's tolerance, and leave the callers to decide
3050 * whether or not to raise allocated through sk_under_memory_pressure() or
3053 int __sk_mem_raise_allocated(struct sock
*sk
, int size
, int amt
, int kind
)
3055 struct mem_cgroup
*memcg
= mem_cgroup_sockets_enabled
? sk
->sk_memcg
: NULL
;
3056 struct proto
*prot
= sk
->sk_prot
;
3057 bool charged
= false;
3060 sk_memory_allocated_add(sk
, amt
);
3061 allocated
= sk_memory_allocated(sk
);
3064 if (!mem_cgroup_charge_skmem(memcg
, amt
, gfp_memcg_charge()))
3065 goto suppress_allocation
;
3070 if (allocated
<= sk_prot_mem_limits(sk
, 0)) {
3071 sk_leave_memory_pressure(sk
);
3075 /* Under pressure. */
3076 if (allocated
> sk_prot_mem_limits(sk
, 1))
3077 sk_enter_memory_pressure(sk
);
3079 /* Over hard limit. */
3080 if (allocated
> sk_prot_mem_limits(sk
, 2))
3081 goto suppress_allocation
;
3083 /* Guarantee minimum buffer size under pressure (either global
3084 * or memcg) to make sure features described in RFC 7323 (TCP
3085 * Extensions for High Performance) work properly.
3087 * This rule does NOT stand when exceeds global or memcg's hard
3088 * limit, or else a DoS attack can be taken place by spawning
3089 * lots of sockets whose usage are under minimum buffer size.
3091 if (kind
== SK_MEM_RECV
) {
3092 if (atomic_read(&sk
->sk_rmem_alloc
) < sk_get_rmem0(sk
, prot
))
3095 } else { /* SK_MEM_SEND */
3096 int wmem0
= sk_get_wmem0(sk
, prot
);
3098 if (sk
->sk_type
== SOCK_STREAM
) {
3099 if (sk
->sk_wmem_queued
< wmem0
)
3101 } else if (refcount_read(&sk
->sk_wmem_alloc
) < wmem0
) {
3106 if (sk_has_memory_pressure(sk
)) {
3109 /* The following 'average' heuristic is within the
3110 * scope of global accounting, so it only makes
3111 * sense for global memory pressure.
3113 if (!sk_under_global_memory_pressure(sk
))
3116 /* Try to be fair among all the sockets under global
3117 * pressure by allowing the ones that below average
3120 alloc
= sk_sockets_allocated_read_positive(sk
);
3121 if (sk_prot_mem_limits(sk
, 2) > alloc
*
3122 sk_mem_pages(sk
->sk_wmem_queued
+
3123 atomic_read(&sk
->sk_rmem_alloc
) +
3124 sk
->sk_forward_alloc
))
3128 suppress_allocation
:
3130 if (kind
== SK_MEM_SEND
&& sk
->sk_type
== SOCK_STREAM
) {
3131 sk_stream_moderate_sndbuf(sk
);
3133 /* Fail only if socket is _under_ its sndbuf.
3134 * In this case we cannot block, so that we have to fail.
3136 if (sk
->sk_wmem_queued
+ size
>= sk
->sk_sndbuf
) {
3137 /* Force charge with __GFP_NOFAIL */
3138 if (memcg
&& !charged
) {
3139 mem_cgroup_charge_skmem(memcg
, amt
,
3140 gfp_memcg_charge() | __GFP_NOFAIL
);
3146 if (kind
== SK_MEM_SEND
|| (kind
== SK_MEM_RECV
&& charged
))
3147 trace_sock_exceed_buf_limit(sk
, prot
, allocated
, kind
);
3149 sk_memory_allocated_sub(sk
, amt
);
3152 mem_cgroup_uncharge_skmem(memcg
, amt
);
3158 * __sk_mem_schedule - increase sk_forward_alloc and memory_allocated
3160 * @size: memory size to allocate
3161 * @kind: allocation type
3163 * If kind is SK_MEM_SEND, it means wmem allocation. Otherwise it means
3164 * rmem allocation. This function assumes that protocols which have
3165 * memory_pressure use sk_wmem_queued as write buffer accounting.
3167 int __sk_mem_schedule(struct sock
*sk
, int size
, int kind
)
3169 int ret
, amt
= sk_mem_pages(size
);
3171 sk_forward_alloc_add(sk
, amt
<< PAGE_SHIFT
);
3172 ret
= __sk_mem_raise_allocated(sk
, size
, amt
, kind
);
3174 sk_forward_alloc_add(sk
, -(amt
<< PAGE_SHIFT
));
3177 EXPORT_SYMBOL(__sk_mem_schedule
);
3180 * __sk_mem_reduce_allocated - reclaim memory_allocated
3182 * @amount: number of quanta
3184 * Similar to __sk_mem_reclaim(), but does not update sk_forward_alloc
3186 void __sk_mem_reduce_allocated(struct sock
*sk
, int amount
)
3188 sk_memory_allocated_sub(sk
, amount
);
3190 if (mem_cgroup_sockets_enabled
&& sk
->sk_memcg
)
3191 mem_cgroup_uncharge_skmem(sk
->sk_memcg
, amount
);
3193 if (sk_under_global_memory_pressure(sk
) &&
3194 (sk_memory_allocated(sk
) < sk_prot_mem_limits(sk
, 0)))
3195 sk_leave_memory_pressure(sk
);
3199 * __sk_mem_reclaim - reclaim sk_forward_alloc and memory_allocated
3201 * @amount: number of bytes (rounded down to a PAGE_SIZE multiple)
3203 void __sk_mem_reclaim(struct sock
*sk
, int amount
)
3205 amount
>>= PAGE_SHIFT
;
3206 sk_forward_alloc_add(sk
, -(amount
<< PAGE_SHIFT
));
3207 __sk_mem_reduce_allocated(sk
, amount
);
3209 EXPORT_SYMBOL(__sk_mem_reclaim
);
3211 int sk_set_peek_off(struct sock
*sk
, int val
)
3213 WRITE_ONCE(sk
->sk_peek_off
, val
);
3216 EXPORT_SYMBOL_GPL(sk_set_peek_off
);
3219 * Set of default routines for initialising struct proto_ops when
3220 * the protocol does not support a particular function. In certain
3221 * cases where it makes no sense for a protocol to have a "do nothing"
3222 * function, some default processing is provided.
3225 int sock_no_bind(struct socket
*sock
, struct sockaddr
*saddr
, int len
)
3229 EXPORT_SYMBOL(sock_no_bind
);
3231 int sock_no_connect(struct socket
*sock
, struct sockaddr
*saddr
,
3236 EXPORT_SYMBOL(sock_no_connect
);
3238 int sock_no_socketpair(struct socket
*sock1
, struct socket
*sock2
)
3242 EXPORT_SYMBOL(sock_no_socketpair
);
3244 int sock_no_accept(struct socket
*sock
, struct socket
*newsock
, int flags
,
3249 EXPORT_SYMBOL(sock_no_accept
);
3251 int sock_no_getname(struct socket
*sock
, struct sockaddr
*saddr
,
3256 EXPORT_SYMBOL(sock_no_getname
);
3258 int sock_no_ioctl(struct socket
*sock
, unsigned int cmd
, unsigned long arg
)
3262 EXPORT_SYMBOL(sock_no_ioctl
);
3264 int sock_no_listen(struct socket
*sock
, int backlog
)
3268 EXPORT_SYMBOL(sock_no_listen
);
3270 int sock_no_shutdown(struct socket
*sock
, int how
)
3274 EXPORT_SYMBOL(sock_no_shutdown
);
3276 int sock_no_sendmsg(struct socket
*sock
, struct msghdr
*m
, size_t len
)
3280 EXPORT_SYMBOL(sock_no_sendmsg
);
3282 int sock_no_sendmsg_locked(struct sock
*sk
, struct msghdr
*m
, size_t len
)
3286 EXPORT_SYMBOL(sock_no_sendmsg_locked
);
3288 int sock_no_recvmsg(struct socket
*sock
, struct msghdr
*m
, size_t len
,
3293 EXPORT_SYMBOL(sock_no_recvmsg
);
3295 int sock_no_mmap(struct file
*file
, struct socket
*sock
, struct vm_area_struct
*vma
)
3297 /* Mirror missing mmap method error code */
3300 EXPORT_SYMBOL(sock_no_mmap
);
3303 * When a file is received (via SCM_RIGHTS, etc), we must bump the
3304 * various sock-based usage counts.
3306 void __receive_sock(struct file
*file
)
3308 struct socket
*sock
;
3310 sock
= sock_from_file(file
);
3312 sock_update_netprioidx(&sock
->sk
->sk_cgrp_data
);
3313 sock_update_classid(&sock
->sk
->sk_cgrp_data
);
3318 * Default Socket Callbacks
3321 static void sock_def_wakeup(struct sock
*sk
)
3323 struct socket_wq
*wq
;
3326 wq
= rcu_dereference(sk
->sk_wq
);
3327 if (skwq_has_sleeper(wq
))
3328 wake_up_interruptible_all(&wq
->wait
);
3332 static void sock_def_error_report(struct sock
*sk
)
3334 struct socket_wq
*wq
;
3337 wq
= rcu_dereference(sk
->sk_wq
);
3338 if (skwq_has_sleeper(wq
))
3339 wake_up_interruptible_poll(&wq
->wait
, EPOLLERR
);
3340 sk_wake_async(sk
, SOCK_WAKE_IO
, POLL_ERR
);
3344 void sock_def_readable(struct sock
*sk
)
3346 struct socket_wq
*wq
;
3348 trace_sk_data_ready(sk
);
3351 wq
= rcu_dereference(sk
->sk_wq
);
3352 if (skwq_has_sleeper(wq
))
3353 wake_up_interruptible_sync_poll(&wq
->wait
, EPOLLIN
| EPOLLPRI
|
3354 EPOLLRDNORM
| EPOLLRDBAND
);
3355 sk_wake_async(sk
, SOCK_WAKE_WAITD
, POLL_IN
);
3359 static void sock_def_write_space(struct sock
*sk
)
3361 struct socket_wq
*wq
;
3365 /* Do not wake up a writer until he can make "significant"
3368 if (sock_writeable(sk
)) {
3369 wq
= rcu_dereference(sk
->sk_wq
);
3370 if (skwq_has_sleeper(wq
))
3371 wake_up_interruptible_sync_poll(&wq
->wait
, EPOLLOUT
|
3372 EPOLLWRNORM
| EPOLLWRBAND
);
3374 /* Should agree with poll, otherwise some programs break */
3375 sk_wake_async(sk
, SOCK_WAKE_SPACE
, POLL_OUT
);
3381 /* An optimised version of sock_def_write_space(), should only be called
3382 * for SOCK_RCU_FREE sockets under RCU read section and after putting
3385 static void sock_def_write_space_wfree(struct sock
*sk
)
3387 /* Do not wake up a writer until he can make "significant"
3390 if (sock_writeable(sk
)) {
3391 struct socket_wq
*wq
= rcu_dereference(sk
->sk_wq
);
3393 /* rely on refcount_sub from sock_wfree() */
3394 smp_mb__after_atomic();
3395 if (wq
&& waitqueue_active(&wq
->wait
))
3396 wake_up_interruptible_sync_poll(&wq
->wait
, EPOLLOUT
|
3397 EPOLLWRNORM
| EPOLLWRBAND
);
3399 /* Should agree with poll, otherwise some programs break */
3400 sk_wake_async(sk
, SOCK_WAKE_SPACE
, POLL_OUT
);
/*
 * Default sk_destruct callback: intentionally does nothing.
 * sock_init_data_uid() installs it as sk->sk_destruct.
 */
static void sock_def_destruct(struct sock *sk)
{
}
3408 void sk_send_sigurg(struct sock
*sk
)
3410 if (sk
->sk_socket
&& sk
->sk_socket
->file
)
3411 if (send_sigurg(&sk
->sk_socket
->file
->f_owner
))
3412 sk_wake_async(sk
, SOCK_WAKE_URG
, POLL_PRI
);
3414 EXPORT_SYMBOL(sk_send_sigurg
);
3416 void sk_reset_timer(struct sock
*sk
, struct timer_list
* timer
,
3417 unsigned long expires
)
3419 if (!mod_timer(timer
, expires
))
3422 EXPORT_SYMBOL(sk_reset_timer
);
3424 void sk_stop_timer(struct sock
*sk
, struct timer_list
* timer
)
3426 if (del_timer(timer
))
3429 EXPORT_SYMBOL(sk_stop_timer
);
3431 void sk_stop_timer_sync(struct sock
*sk
, struct timer_list
*timer
)
3433 if (del_timer_sync(timer
))
3436 EXPORT_SYMBOL(sk_stop_timer_sync
);
3438 void sock_init_data_uid(struct socket
*sock
, struct sock
*sk
, kuid_t uid
)
3441 sk
->sk_send_head
= NULL
;
3443 timer_setup(&sk
->sk_timer
, NULL
, 0);
3445 sk
->sk_allocation
= GFP_KERNEL
;
3446 sk
->sk_rcvbuf
= READ_ONCE(sysctl_rmem_default
);
3447 sk
->sk_sndbuf
= READ_ONCE(sysctl_wmem_default
);
3448 sk
->sk_state
= TCP_CLOSE
;
3449 sk
->sk_use_task_frag
= true;
3450 sk_set_socket(sk
, sock
);
3452 sock_set_flag(sk
, SOCK_ZAPPED
);
3455 sk
->sk_type
= sock
->type
;
3456 RCU_INIT_POINTER(sk
->sk_wq
, &sock
->wq
);
3459 RCU_INIT_POINTER(sk
->sk_wq
, NULL
);
3463 rwlock_init(&sk
->sk_callback_lock
);
3464 if (sk
->sk_kern_sock
)
3465 lockdep_set_class_and_name(
3466 &sk
->sk_callback_lock
,
3467 af_kern_callback_keys
+ sk
->sk_family
,
3468 af_family_kern_clock_key_strings
[sk
->sk_family
]);
3470 lockdep_set_class_and_name(
3471 &sk
->sk_callback_lock
,
3472 af_callback_keys
+ sk
->sk_family
,
3473 af_family_clock_key_strings
[sk
->sk_family
]);
3475 sk
->sk_state_change
= sock_def_wakeup
;
3476 sk
->sk_data_ready
= sock_def_readable
;
3477 sk
->sk_write_space
= sock_def_write_space
;
3478 sk
->sk_error_report
= sock_def_error_report
;
3479 sk
->sk_destruct
= sock_def_destruct
;
3481 sk
->sk_frag
.page
= NULL
;
3482 sk
->sk_frag
.offset
= 0;
3483 sk
->sk_peek_off
= -1;
3485 sk
->sk_peer_pid
= NULL
;
3486 sk
->sk_peer_cred
= NULL
;
3487 spin_lock_init(&sk
->sk_peer_lock
);
3489 sk
->sk_write_pending
= 0;
3490 sk
->sk_rcvlowat
= 1;
3491 sk
->sk_rcvtimeo
= MAX_SCHEDULE_TIMEOUT
;
3492 sk
->sk_sndtimeo
= MAX_SCHEDULE_TIMEOUT
;
3494 sk
->sk_stamp
= SK_DEFAULT_STAMP
;
3495 #if BITS_PER_LONG==32
3496 seqlock_init(&sk
->sk_stamp_seq
);
3498 atomic_set(&sk
->sk_zckey
, 0);
3500 #ifdef CONFIG_NET_RX_BUSY_POLL
3502 sk
->sk_ll_usec
= READ_ONCE(sysctl_net_busy_read
);
3505 sk
->sk_max_pacing_rate
= ~0UL;
3506 sk
->sk_pacing_rate
= ~0UL;
3507 WRITE_ONCE(sk
->sk_pacing_shift
, 10);
3508 sk
->sk_incoming_cpu
= -1;
3510 sk_rx_queue_clear(sk
);
3512 * Before updating sk_refcnt, we must commit prior changes to memory
3513 * (Documentation/RCU/rculist_nulls.rst for details)
3516 refcount_set(&sk
->sk_refcnt
, 1);
3517 atomic_set(&sk
->sk_drops
, 0);
3519 EXPORT_SYMBOL(sock_init_data_uid
);
3521 void sock_init_data(struct socket
*sock
, struct sock
*sk
)
3524 SOCK_INODE(sock
)->i_uid
:
3525 make_kuid(sock_net(sk
)->user_ns
, 0);
3527 sock_init_data_uid(sock
, sk
, uid
);
3529 EXPORT_SYMBOL(sock_init_data
);
3531 void lock_sock_nested(struct sock
*sk
, int subclass
)
3533 /* The sk_lock has mutex_lock() semantics here. */
3534 mutex_acquire(&sk
->sk_lock
.dep_map
, subclass
, 0, _RET_IP_
);
3537 spin_lock_bh(&sk
->sk_lock
.slock
);
3538 if (sock_owned_by_user_nocheck(sk
))
3540 sk
->sk_lock
.owned
= 1;
3541 spin_unlock_bh(&sk
->sk_lock
.slock
);
3543 EXPORT_SYMBOL(lock_sock_nested
);
3545 void release_sock(struct sock
*sk
)
3547 spin_lock_bh(&sk
->sk_lock
.slock
);
3548 if (sk
->sk_backlog
.tail
)
3551 if (sk
->sk_prot
->release_cb
)
3552 INDIRECT_CALL_INET_1(sk
->sk_prot
->release_cb
,
3553 tcp_release_cb
, sk
);
3555 sock_release_ownership(sk
);
3556 if (waitqueue_active(&sk
->sk_lock
.wq
))
3557 wake_up(&sk
->sk_lock
.wq
);
3558 spin_unlock_bh(&sk
->sk_lock
.slock
);
3560 EXPORT_SYMBOL(release_sock
);
3562 bool __lock_sock_fast(struct sock
*sk
) __acquires(&sk
->sk_lock
.slock
)
3565 spin_lock_bh(&sk
->sk_lock
.slock
);
3567 if (!sock_owned_by_user_nocheck(sk
)) {
3569 * Fast path return with bottom halves disabled and
3570 * sock::sk_lock.slock held.
3572 * The 'mutex' is not contended and holding
3573 * sock::sk_lock.slock prevents all other lockers to
3574 * proceed so the corresponding unlock_sock_fast() can
3575 * avoid the slow path of release_sock() completely and
3576 * just release slock.
3578 * From a semantical POV this is equivalent to 'acquiring'
3579 * the 'mutex', hence the corresponding lockdep
3580 * mutex_release() has to happen in the fast path of
3581 * unlock_sock_fast().
3587 sk
->sk_lock
.owned
= 1;
3588 __acquire(&sk
->sk_lock
.slock
);
3589 spin_unlock_bh(&sk
->sk_lock
.slock
);
3592 EXPORT_SYMBOL(__lock_sock_fast
);
3594 int sock_gettstamp(struct socket
*sock
, void __user
*userstamp
,
3595 bool timeval
, bool time32
)
3597 struct sock
*sk
= sock
->sk
;
3598 struct timespec64 ts
;
3600 sock_enable_timestamp(sk
, SOCK_TIMESTAMP
);
3601 ts
= ktime_to_timespec64(sock_read_timestamp(sk
));
3602 if (ts
.tv_sec
== -1)
3604 if (ts
.tv_sec
== 0) {
3605 ktime_t kt
= ktime_get_real();
3606 sock_write_timestamp(sk
, kt
);
3607 ts
= ktime_to_timespec64(kt
);
3613 #ifdef CONFIG_COMPAT_32BIT_TIME
3615 return put_old_timespec32(&ts
, userstamp
);
3617 #ifdef CONFIG_SPARC64
3618 /* beware of padding in sparc64 timeval */
3619 if (timeval
&& !in_compat_syscall()) {
3620 struct __kernel_old_timeval __user tv
= {
3621 .tv_sec
= ts
.tv_sec
,
3622 .tv_usec
= ts
.tv_nsec
,
3624 if (copy_to_user(userstamp
, &tv
, sizeof(tv
)))
3629 return put_timespec64(&ts
, userstamp
);
3631 EXPORT_SYMBOL(sock_gettstamp
);
3633 void sock_enable_timestamp(struct sock
*sk
, enum sock_flags flag
)
3635 if (!sock_flag(sk
, flag
)) {
3636 unsigned long previous_flags
= sk
->sk_flags
;
3638 sock_set_flag(sk
, flag
);
3640 * we just set one of the two flags which require net
3641 * time stamping, but time stamping might have been on
3642 * already because of the other one
3644 if (sock_needs_netstamp(sk
) &&
3645 !(previous_flags
& SK_FLAGS_TIMESTAMP
))
3646 net_enable_timestamp();
3650 int sock_recv_errqueue(struct sock
*sk
, struct msghdr
*msg
, int len
,
3651 int level
, int type
)
3653 struct sock_exterr_skb
*serr
;
3654 struct sk_buff
*skb
;
3658 skb
= sock_dequeue_err_skb(sk
);
3664 msg
->msg_flags
|= MSG_TRUNC
;
3667 err
= skb_copy_datagram_msg(skb
, 0, msg
, copied
);
3671 sock_recv_timestamp(msg
, sk
, skb
);
3673 serr
= SKB_EXT_ERR(skb
);
3674 put_cmsg(msg
, level
, type
, sizeof(serr
->ee
), &serr
->ee
);
3676 msg
->msg_flags
|= MSG_ERRQUEUE
;
3684 EXPORT_SYMBOL(sock_recv_errqueue
);
3687 * Get a socket option on an socket.
3689 * FIX: POSIX 1003.1g is very ambiguous here. It states that
3690 * asynchronous errors should be reported by getsockopt. We assume
3691 * this means if you specify SO_ERROR (otherwise whats the point of it).
3693 int sock_common_getsockopt(struct socket
*sock
, int level
, int optname
,
3694 char __user
*optval
, int __user
*optlen
)
3696 struct sock
*sk
= sock
->sk
;
3698 /* IPV6_ADDRFORM can change sk->sk_prot under us. */
3699 return READ_ONCE(sk
->sk_prot
)->getsockopt(sk
, level
, optname
, optval
, optlen
);
3701 EXPORT_SYMBOL(sock_common_getsockopt
);
3703 int sock_common_recvmsg(struct socket
*sock
, struct msghdr
*msg
, size_t size
,
3706 struct sock
*sk
= sock
->sk
;
3710 err
= sk
->sk_prot
->recvmsg(sk
, msg
, size
, flags
, &addr_len
);
3712 msg
->msg_namelen
= addr_len
;
3715 EXPORT_SYMBOL(sock_common_recvmsg
);
3718 * Set socket options on an inet socket.
3720 int sock_common_setsockopt(struct socket
*sock
, int level
, int optname
,
3721 sockptr_t optval
, unsigned int optlen
)
3723 struct sock
*sk
= sock
->sk
;
3725 /* IPV6_ADDRFORM can change sk->sk_prot under us. */
3726 return READ_ONCE(sk
->sk_prot
)->setsockopt(sk
, level
, optname
, optval
, optlen
);
3728 EXPORT_SYMBOL(sock_common_setsockopt
);
3730 void sk_common_release(struct sock
*sk
)
3732 if (sk
->sk_prot
->destroy
)
3733 sk
->sk_prot
->destroy(sk
);
3736 * Observation: when sk_common_release is called, processes have
3737 * no access to socket. But net still has.
3738 * Step one, detach it from networking:
3740 * A. Remove from hash tables.
3743 sk
->sk_prot
->unhash(sk
);
3746 * In this point socket cannot receive new packets, but it is possible
3747 * that some packets are in flight because some CPU runs receiver and
3748 * did hash table lookup before we unhashed socket. They will achieve
3749 * receive queue and will be purged by socket destructor.
3751 * Also we still have packets pending on receive queue and probably,
3752 * our own packets waiting in device queues. sock_destroy will drain
3753 * receive queue, but transmitted packets will delay socket destruction
3754 * until the last reference will be released.
3759 xfrm_sk_free_policy(sk
);
3763 EXPORT_SYMBOL(sk_common_release
);
3765 void sk_get_meminfo(const struct sock
*sk
, u32
*mem
)
3767 memset(mem
, 0, sizeof(*mem
) * SK_MEMINFO_VARS
);
3769 mem
[SK_MEMINFO_RMEM_ALLOC
] = sk_rmem_alloc_get(sk
);
3770 mem
[SK_MEMINFO_RCVBUF
] = READ_ONCE(sk
->sk_rcvbuf
);
3771 mem
[SK_MEMINFO_WMEM_ALLOC
] = sk_wmem_alloc_get(sk
);
3772 mem
[SK_MEMINFO_SNDBUF
] = READ_ONCE(sk
->sk_sndbuf
);
3773 mem
[SK_MEMINFO_FWD_ALLOC
] = sk_forward_alloc_get(sk
);
3774 mem
[SK_MEMINFO_WMEM_QUEUED
] = READ_ONCE(sk
->sk_wmem_queued
);
3775 mem
[SK_MEMINFO_OPTMEM
] = atomic_read(&sk
->sk_omem_alloc
);
3776 mem
[SK_MEMINFO_BACKLOG
] = READ_ONCE(sk
->sk_backlog
.len
);
3777 mem
[SK_MEMINFO_DROPS
] = atomic_read(&sk
->sk_drops
);
3780 #ifdef CONFIG_PROC_FS
3781 static DECLARE_BITMAP(proto_inuse_idx
, PROTO_INUSE_NR
);
3783 int sock_prot_inuse_get(struct net
*net
, struct proto
*prot
)
3785 int cpu
, idx
= prot
->inuse_idx
;
3788 for_each_possible_cpu(cpu
)
3789 res
+= per_cpu_ptr(net
->core
.prot_inuse
, cpu
)->val
[idx
];
3791 return res
>= 0 ? res
: 0;
3793 EXPORT_SYMBOL_GPL(sock_prot_inuse_get
);
3795 int sock_inuse_get(struct net
*net
)
3799 for_each_possible_cpu(cpu
)
3800 res
+= per_cpu_ptr(net
->core
.prot_inuse
, cpu
)->all
;
3805 EXPORT_SYMBOL_GPL(sock_inuse_get
);
3807 static int __net_init
sock_inuse_init_net(struct net
*net
)
3809 net
->core
.prot_inuse
= alloc_percpu(struct prot_inuse
);
3810 if (net
->core
.prot_inuse
== NULL
)
3815 static void __net_exit
sock_inuse_exit_net(struct net
*net
)
3817 free_percpu(net
->core
.prot_inuse
);
3820 static struct pernet_operations net_inuse_ops
= {
3821 .init
= sock_inuse_init_net
,
3822 .exit
= sock_inuse_exit_net
,
3825 static __init
int net_inuse_init(void)
3827 if (register_pernet_subsys(&net_inuse_ops
))
3828 panic("Cannot initialize net inuse counters");
3833 core_initcall(net_inuse_init
);
3835 static int assign_proto_idx(struct proto
*prot
)
3837 prot
->inuse_idx
= find_first_zero_bit(proto_inuse_idx
, PROTO_INUSE_NR
);
3839 if (unlikely(prot
->inuse_idx
== PROTO_INUSE_NR
- 1)) {
3840 pr_err("PROTO_INUSE_NR exhausted\n");
3844 set_bit(prot
->inuse_idx
, proto_inuse_idx
);
3848 static void release_proto_idx(struct proto
*prot
)
3850 if (prot
->inuse_idx
!= PROTO_INUSE_NR
- 1)
3851 clear_bit(prot
->inuse_idx
, proto_inuse_idx
);
3854 static inline int assign_proto_idx(struct proto
*prot
)
3859 static inline void release_proto_idx(struct proto
*prot
)
3865 static void tw_prot_cleanup(struct timewait_sock_ops
*twsk_prot
)
3869 kfree(twsk_prot
->twsk_slab_name
);
3870 twsk_prot
->twsk_slab_name
= NULL
;
3871 kmem_cache_destroy(twsk_prot
->twsk_slab
);
3872 twsk_prot
->twsk_slab
= NULL
;
3875 static int tw_prot_init(const struct proto
*prot
)
3877 struct timewait_sock_ops
*twsk_prot
= prot
->twsk_prot
;
3882 twsk_prot
->twsk_slab_name
= kasprintf(GFP_KERNEL
, "tw_sock_%s",
3884 if (!twsk_prot
->twsk_slab_name
)
3887 twsk_prot
->twsk_slab
=
3888 kmem_cache_create(twsk_prot
->twsk_slab_name
,
3889 twsk_prot
->twsk_obj_size
, 0,
3890 SLAB_ACCOUNT
| prot
->slab_flags
,
3892 if (!twsk_prot
->twsk_slab
) {
3893 pr_crit("%s: Can't create timewait sock SLAB cache!\n",
3901 static void req_prot_cleanup(struct request_sock_ops
*rsk_prot
)
3905 kfree(rsk_prot
->slab_name
);
3906 rsk_prot
->slab_name
= NULL
;
3907 kmem_cache_destroy(rsk_prot
->slab
);
3908 rsk_prot
->slab
= NULL
;
3911 static int req_prot_init(const struct proto
*prot
)
3913 struct request_sock_ops
*rsk_prot
= prot
->rsk_prot
;
3918 rsk_prot
->slab_name
= kasprintf(GFP_KERNEL
, "request_sock_%s",
3920 if (!rsk_prot
->slab_name
)
3923 rsk_prot
->slab
= kmem_cache_create(rsk_prot
->slab_name
,
3924 rsk_prot
->obj_size
, 0,
3925 SLAB_ACCOUNT
| prot
->slab_flags
,
3928 if (!rsk_prot
->slab
) {
3929 pr_crit("%s: Can't create request sock SLAB cache!\n",
3936 int proto_register(struct proto
*prot
, int alloc_slab
)
3940 if (prot
->memory_allocated
&& !prot
->sysctl_mem
) {
3941 pr_err("%s: missing sysctl_mem\n", prot
->name
);
3944 if (prot
->memory_allocated
&& !prot
->per_cpu_fw_alloc
) {
3945 pr_err("%s: missing per_cpu_fw_alloc\n", prot
->name
);
3949 prot
->slab
= kmem_cache_create_usercopy(prot
->name
,
3951 SLAB_HWCACHE_ALIGN
| SLAB_ACCOUNT
|
3953 prot
->useroffset
, prot
->usersize
,
3956 if (prot
->slab
== NULL
) {
3957 pr_crit("%s: Can't create sock SLAB cache!\n",
3962 if (req_prot_init(prot
))
3963 goto out_free_request_sock_slab
;
3965 if (tw_prot_init(prot
))
3966 goto out_free_timewait_sock_slab
;
3969 mutex_lock(&proto_list_mutex
);
3970 ret
= assign_proto_idx(prot
);
3972 mutex_unlock(&proto_list_mutex
);
3973 goto out_free_timewait_sock_slab
;
3975 list_add(&prot
->node
, &proto_list
);
3976 mutex_unlock(&proto_list_mutex
);
3979 out_free_timewait_sock_slab
:
3981 tw_prot_cleanup(prot
->twsk_prot
);
3982 out_free_request_sock_slab
:
3984 req_prot_cleanup(prot
->rsk_prot
);
3986 kmem_cache_destroy(prot
->slab
);
3992 EXPORT_SYMBOL(proto_register
);
3994 void proto_unregister(struct proto
*prot
)
3996 mutex_lock(&proto_list_mutex
);
3997 release_proto_idx(prot
);
3998 list_del(&prot
->node
);
3999 mutex_unlock(&proto_list_mutex
);
4001 kmem_cache_destroy(prot
->slab
);
4004 req_prot_cleanup(prot
->rsk_prot
);
4005 tw_prot_cleanup(prot
->twsk_prot
);
4007 EXPORT_SYMBOL(proto_unregister
);
4009 int sock_load_diag_module(int family
, int protocol
)
4012 if (!sock_is_registered(family
))
4015 return request_module("net-pf-%d-proto-%d-type-%d", PF_NETLINK
,
4016 NETLINK_SOCK_DIAG
, family
);
4020 if (family
== AF_INET
&&
4021 protocol
!= IPPROTO_RAW
&&
4022 protocol
< MAX_INET_PROTOS
&&
4023 !rcu_access_pointer(inet_protos
[protocol
]))
4027 return request_module("net-pf-%d-proto-%d-type-%d-%d", PF_NETLINK
,
4028 NETLINK_SOCK_DIAG
, family
, protocol
);
4030 EXPORT_SYMBOL(sock_load_diag_module
);
4032 #ifdef CONFIG_PROC_FS
4033 static void *proto_seq_start(struct seq_file
*seq
, loff_t
*pos
)
4034 __acquires(proto_list_mutex
)
4036 mutex_lock(&proto_list_mutex
);
4037 return seq_list_start_head(&proto_list
, *pos
);
4040 static void *proto_seq_next(struct seq_file
*seq
, void *v
, loff_t
*pos
)
4042 return seq_list_next(v
, &proto_list
, pos
);
4045 static void proto_seq_stop(struct seq_file
*seq
, void *v
)
4046 __releases(proto_list_mutex
)
4048 mutex_unlock(&proto_list_mutex
);
/*
 * Map a method pointer to a one-character flag:
 * 'y' when @method is non-NULL, 'n' when it is NULL.
 */
static char proto_method_implemented(const void *method)
{
	return method == NULL ? 'n' : 'y';
}
4055 static long sock_prot_memory_allocated(struct proto
*proto
)
4057 return proto
->memory_allocated
!= NULL
? proto_memory_allocated(proto
) : -1L;
4060 static const char *sock_prot_memory_pressure(struct proto
*proto
)
4062 return proto
->memory_pressure
!= NULL
?
4063 proto_memory_pressure(proto
) ? "yes" : "no" : "NI";
4066 static void proto_seq_printf(struct seq_file
*seq
, struct proto
*proto
)
4069 seq_printf(seq
, "%-9s %4u %6d %6ld %-3s %6u %-3s %-10s "
4070 "%2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c %2c\n",
4073 sock_prot_inuse_get(seq_file_net(seq
), proto
),
4074 sock_prot_memory_allocated(proto
),
4075 sock_prot_memory_pressure(proto
),
4077 proto
->slab
== NULL
? "no" : "yes",
4078 module_name(proto
->owner
),
4079 proto_method_implemented(proto
->close
),
4080 proto_method_implemented(proto
->connect
),
4081 proto_method_implemented(proto
->disconnect
),
4082 proto_method_implemented(proto
->accept
),
4083 proto_method_implemented(proto
->ioctl
),
4084 proto_method_implemented(proto
->init
),
4085 proto_method_implemented(proto
->destroy
),
4086 proto_method_implemented(proto
->shutdown
),
4087 proto_method_implemented(proto
->setsockopt
),
4088 proto_method_implemented(proto
->getsockopt
),
4089 proto_method_implemented(proto
->sendmsg
),
4090 proto_method_implemented(proto
->recvmsg
),
4091 proto_method_implemented(proto
->bind
),
4092 proto_method_implemented(proto
->backlog_rcv
),
4093 proto_method_implemented(proto
->hash
),
4094 proto_method_implemented(proto
->unhash
),
4095 proto_method_implemented(proto
->get_port
),
4096 proto_method_implemented(proto
->enter_memory_pressure
));
4099 static int proto_seq_show(struct seq_file
*seq
, void *v
)
4101 if (v
== &proto_list
)
4102 seq_printf(seq
, "%-9s %-4s %-8s %-6s %-5s %-7s %-4s %-10s %s",
4111 "cl co di ac io in de sh ss gs se re bi br ha uh gp em\n");
4113 proto_seq_printf(seq
, list_entry(v
, struct proto
, node
));
4117 static const struct seq_operations proto_seq_ops
= {
4118 .start
= proto_seq_start
,
4119 .next
= proto_seq_next
,
4120 .stop
= proto_seq_stop
,
4121 .show
= proto_seq_show
,
4124 static __net_init
int proto_init_net(struct net
*net
)
4126 if (!proc_create_net("protocols", 0444, net
->proc_net
, &proto_seq_ops
,
4127 sizeof(struct seq_net_private
)))
4133 static __net_exit
void proto_exit_net(struct net
*net
)
4135 remove_proc_entry("protocols", net
->proc_net
);
4139 static __net_initdata
struct pernet_operations proto_net_ops
= {
4140 .init
= proto_init_net
,
4141 .exit
= proto_exit_net
,
4144 static int __init
proto_init(void)
4146 return register_pernet_subsys(&proto_net_ops
);
4149 subsys_initcall(proto_init
);
4151 #endif /* PROC_FS */
#ifdef CONFIG_NET_RX_BUSY_POLL
/*
 * Decide whether a busy-poll loop on a socket should stop.
 *
 * @p:          the struct sock being polled, passed as an opaque pointer.
 * @start_time: forwarded unchanged to sk_busy_loop_timeout().
 *
 * Returns true as soon as there is something queued for the reader: either
 * sk_receive_queue is non-empty or, for UDP sockets, the UDP reader_queue
 * is non-empty.  Both checks use the lockless queue test.  If nothing is
 * queued, the result of sk_busy_loop_timeout() is returned.
 */
bool sk_busy_loop_end(void *p, unsigned long start_time)
{
	struct sock *sk = p;

	if (!skb_queue_empty_lockless(&sk->sk_receive_queue))
		return true;

	if (sk_is_udp(sk) &&
	    !skb_queue_empty_lockless(&udp_sk(sk)->reader_queue))
		return true;

	return sk_busy_loop_timeout(sk, start_time);
}
EXPORT_SYMBOL(sk_busy_loop_end);
#endif /* CONFIG_NET_RX_BUSY_POLL */
4170 int sock_bind_add(struct sock
*sk
, struct sockaddr
*addr
, int addr_len
)
4172 if (!sk
->sk_prot
->bind_add
)
4174 return sk
->sk_prot
->bind_add(sk
, addr
, addr_len
);
4176 EXPORT_SYMBOL(sock_bind_add
);
4178 /* Copy 'size' bytes from userspace and return `size` back to userspace */
4179 int sock_ioctl_inout(struct sock
*sk
, unsigned int cmd
,
4180 void __user
*arg
, void *karg
, size_t size
)
4184 if (copy_from_user(karg
, arg
, size
))
4187 ret
= READ_ONCE(sk
->sk_prot
)->ioctl(sk
, cmd
, karg
);
4191 if (copy_to_user(arg
, karg
, size
))
4196 EXPORT_SYMBOL(sock_ioctl_inout
);
4198 /* This is the most common ioctl prep function, where the result (4 bytes) is
4199 * copied back to userspace if the ioctl() returns successfully. No input is
4200 * copied from userspace as input argument.
4202 static int sock_ioctl_out(struct sock
*sk
, unsigned int cmd
, void __user
*arg
)
4206 ret
= READ_ONCE(sk
->sk_prot
)->ioctl(sk
, cmd
, &karg
);
4210 return put_user(karg
, (int __user
*)arg
);
4213 /* A wrapper around sock ioctls, which copies the data from userspace
4214 * (depending on the protocol/ioctl), and copies back the result to userspace.
4215 * The main motivation for this function is to pass kernel memory to the
4216 * protocol ioctl callbacks, instead of userspace memory.
4218 int sk_ioctl(struct sock
*sk
, unsigned int cmd
, void __user
*arg
)
4222 if (sk
->sk_type
== SOCK_RAW
&& sk
->sk_family
== AF_INET
)
4223 rc
= ipmr_sk_ioctl(sk
, cmd
, arg
);
4224 else if (sk
->sk_type
== SOCK_RAW
&& sk
->sk_family
== AF_INET6
)
4225 rc
= ip6mr_sk_ioctl(sk
, cmd
, arg
);
4226 else if (sk_is_phonet(sk
))
4227 rc
= phonet_sk_ioctl(sk
, cmd
, arg
);
4229 /* If ioctl was processed, returns its value */
4233 /* Otherwise call the default handler */
4234 return sock_ioctl_out(sk
, cmd
, arg
);
4236 EXPORT_SYMBOL(sk_ioctl
);